From: Mike Snitzer Date: Tue, 10 Nov 2020 18:41:53 +0000 (-0500) Subject: dm: rename multipath path selector source files to have "dm-ps" prefix X-Git-Tag: baikal/mips/sdk5.9~11898^2~7 X-Git-Url: https://git.baikalelectronics.ru/sdk/?a=commitdiff_plain;h=cb6d571116c718fccf17a931d5d24b46c23b49e7;p=kernel.git dm: rename multipath path selector source files to have "dm-ps" prefix Additional prefix helps clarify that these source files implement path selectors. Required updating Makefile to still build modules _without_ the "dm-ps" prefix to preserve dm-multipath's ability to autoload path selector modules. While at it, cleaned up some DM whitespace in Makefile. Signed-off-by: Mike Snitzer --- diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 4f95f332d0156..ef7ddc27685c8 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -7,23 +7,28 @@ dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \ dm-rq.o dm-multipath-y += dm-path-selector.o dm-mpath.o +dm-historical-service-time-y += dm-ps-historical-service-time.o +dm-io-affinity-y += dm-ps-io-affinity.o +dm-queue-length-y += dm-ps-queue-length.o +dm-round-robin-y += dm-ps-round-robin.o +dm-service-time-y += dm-ps-service-time.o dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o dm-mirror-y += dm-raid1.o -dm-log-userspace-y \ - += dm-log-userspace-base.o dm-log-userspace-transfer.o +dm-log-userspace-y += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-background-tracker.o -dm-cache-smq-y += dm-cache-policy-smq.o +dm-cache-smq-y += dm-cache-policy-smq.o dm-ebs-y += dm-ebs-target.o dm-era-y += dm-era-target.o dm-clone-y += dm-clone-target.o dm-clone-metadata.o dm-verity-y += dm-verity-target.o +dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o + md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o -dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o linear-y += md-linear.o multipath-y += md-multipath.o faulty-y += md-faulty.o @@ -62,12 +67,12 @@ obj-$(CONFIG_DM_MULTIPATH_HST) += dm-historical-service-time.o obj-$(CONFIG_DM_MULTIPATH_IOA) += dm-io-affinity.o obj-$(CONFIG_DM_SWITCH) += dm-switch.o obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o -obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ +obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o obj-$(CONFIG_DM_ZERO) += dm-zero.o -obj-$(CONFIG_DM_RAID) += dm-raid.o -obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o +obj-$(CONFIG_DM_RAID) += dm-raid.o +obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o diff --git a/drivers/md/dm-historical-service-time.c b/drivers/md/dm-historical-service-time.c deleted file mode 100644 index 186f91e2752c1..0000000000000 --- a/drivers/md/dm-historical-service-time.c +++ /dev/null @@ -1,561 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Historical Service Time - * - * Keeps a time-weighted exponential moving average of the historical - * service time. Estimates future service time based on the historical - * service time and the number of outstanding requests. - * - * Marks paths stale if they have not finished within hst * - * num_paths. If a path is stale and unused, we will send a single - * request to probe in case the path has improved. This situation - * generally arises if the path is so much worse than others that it - * will never have the best estimated service time, or if the entire - * multipath device is unused. If a path is stale and in use, limit the - * number of requests it can receive with the assumption that the path - * has become degraded. - * - * To avoid repeatedly calculating exponents for time weighting, times - * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) - * ns, and the weighting is pre-calculated. - * - */ - -#include "dm.h" -#include "dm-path-selector.h" - -#include -#include -#include - - -#define DM_MSG_PREFIX "multipath historical-service-time" -#define HST_MIN_IO 1 -#define HST_VERSION "0.1.1" - -#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ -#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) -#define HST_FIXED_1 (1 << HST_FIXED_SHIFT) -#define HST_FIXED_95 972 -#define HST_MAX_INFLIGHT HST_FIXED_1 -#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ -#define HST_WEIGHT_COUNT 64ULL - -struct selector { - struct list_head valid_paths; - struct list_head failed_paths; - int valid_count; - spinlock_t lock; - - unsigned int weights[HST_WEIGHT_COUNT]; - unsigned int threshold_multiplier; -}; - -struct path_info { - struct list_head list; - struct dm_path *path; - unsigned int repeat_count; - - spinlock_t lock; - - u64 historical_service_time; /* Fixed point */ - - u64 stale_after; - u64 last_finish; - - u64 outstanding; -}; - -/** - * fixed_power - compute: x^n, in O(log n) time - * - * @x: base of the power - * @frac_bits: fractional bits of @x - * @n: power to raise @x to. - * - * By exploiting the relation between the definition of the natural power - * function: x^n := x*x*...*x (x multiplied by itself for n times), and - * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, - * (where: n_i \elem {0, 1}, the binary vector representing n), - * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is - * of course trivially computable in O(log_2 n), the length of our binary - * vector. - * - * (see: kernel/sched/loadavg.c) - */ -static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) -{ - unsigned long result = 1UL << frac_bits; - - if (n) { - for (;;) { - if (n & 1) { - result *= x; - result += 1UL << (frac_bits - 1); - result >>= frac_bits; - } - n >>= 1; - if (!n) - break; - x *= x; - x += 1UL << (frac_bits - 1); - x >>= frac_bits; - } - } - - return result; -} - -/* - * Calculate the next value of an exponential moving average - * a_1 = a_0 * e + a * (1 - e) - * - * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] - * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] - * @weight: [0, HST_FIXED_1] - * - * Note: - * To account for multiple periods in the same calculation, - * a_n = a_0 * e^n + a * (1 - e^n), - * so call fixed_ema(last, next, pow(weight, N)) - */ -static u64 fixed_ema(u64 last, u64 next, u64 weight) -{ - last *= weight; - last += next * (HST_FIXED_1 - weight); - last += 1ULL << (HST_FIXED_SHIFT - 1); - return last >> HST_FIXED_SHIFT; -} - -static struct selector *alloc_selector(void) -{ - struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->failed_paths); - spin_lock_init(&s->lock); - s->valid_count = 0; - } - - return s; -} - -/* - * Get the weight for a given time span. - */ -static u64 hst_weight(struct path_selector *ps, u64 delta) -{ - struct selector *s = ps->context; - int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, - HST_WEIGHT_COUNT - 1); - - return s->weights[bucket]; -} - -/* - * Set up the weights array. - * - * weights[len-1] = 0 - * weights[n] = base ^ (n + 1) - */ -static void hst_set_weights(struct path_selector *ps, unsigned int base) -{ - struct selector *s = ps->context; - int i; - - if (base >= HST_FIXED_1) - return; - - for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) - s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); - s->weights[HST_WEIGHT_COUNT - 1] = 0; -} - -static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) -{ - struct selector *s; - unsigned int base_weight = HST_FIXED_95; - unsigned int threshold_multiplier = 0; - char dummy; - - /* - * Arguments: [ []] - * : Base weight for ema [0, 1024) 10-bit fixed point. A - * value of 0 will completely ignore any history. - * If not given, default (HST_FIXED_95) is used. - * : Minimum threshold multiplier for paths to - * be considered different. That is, a path is - * considered different iff (p1 > N * p2) where p1 - * is the path with higher service time. A threshold - * of 1 or 0 has no effect. Defaults to 0. - */ - if (argc > 2) - return -EINVAL; - - if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || - base_weight >= HST_FIXED_1)) { - return -EINVAL; - } - - if (argc > 1 && (sscanf(argv[1], "%u%c", - &threshold_multiplier, &dummy) != 1)) { - return -EINVAL; - } - - s = alloc_selector(); - if (!s) - return -ENOMEM; - - ps->context = s; - - hst_set_weights(ps, base_weight); - s->threshold_multiplier = threshold_multiplier; - return 0; -} - -static void free_paths(struct list_head *paths) -{ - struct path_info *pi, *next; - - list_for_each_entry_safe(pi, next, paths, list) { - list_del(&pi->list); - kfree(pi); - } -} - -static void hst_destroy(struct path_selector *ps) -{ - struct selector *s = ps->context; - - free_paths(&s->valid_paths); - free_paths(&s->failed_paths); - kfree(s); - ps->context = NULL; -} - -static int hst_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned int maxlen) -{ - unsigned int sz = 0; - struct path_info *pi; - - if (!path) { - struct selector *s = ps->context; - - DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); - } else { - pi = path->pscontext; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%llu %llu %llu ", pi->historical_service_time, - pi->outstanding, pi->stale_after); - break; - case STATUSTYPE_TABLE: - DMEMIT("0 "); - break; - } - } - - return sz; -} - -static int hst_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = ps->context; - struct path_info *pi; - unsigned int repeat_count = HST_MIN_IO; - char dummy; - unsigned long flags; - - /* - * Arguments: [] - * : The number of I/Os before switching path. - * If not given, default (HST_MIN_IO) is used. - */ - if (argc > 1) { - *error = "historical-service-time ps: incorrect number of arguments"; - return -EINVAL; - } - - if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { - *error = "historical-service-time ps: invalid repeat count"; - return -EINVAL; - } - - /* allocate the path */ - pi = kmalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "historical-service-time ps: Error allocating path context"; - return -ENOMEM; - } - - pi->path = path; - pi->repeat_count = repeat_count; - - pi->historical_service_time = HST_FIXED_1; - - spin_lock_init(&pi->lock); - pi->outstanding = 0; - - pi->stale_after = 0; - pi->last_finish = 0; - - path->pscontext = pi; - - spin_lock_irqsave(&s->lock, flags); - list_add_tail(&pi->list, &s->valid_paths); - s->valid_count++; - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -static void hst_fail_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - list_move(&pi->list, &s->failed_paths); - s->valid_count--; - spin_unlock_irqrestore(&s->lock, flags); -} - -static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - list_move_tail(&pi->list, &s->valid_paths); - s->valid_count++; - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -static void hst_fill_compare(struct path_info *pi, u64 *hst, - u64 *out, u64 *stale) -{ - unsigned long flags; - - spin_lock_irqsave(&pi->lock, flags); - *hst = pi->historical_service_time; - *out = pi->outstanding; - *stale = pi->stale_after; - spin_unlock_irqrestore(&pi->lock, flags); -} - -/* - * Compare the estimated service time of 2 paths, pi1 and pi2, - * for the incoming I/O. - * - * Returns: - * < 0 : pi1 is better - * 0 : no difference between pi1 and pi2 - * > 0 : pi2 is better - * - */ -static long long hst_compare(struct path_info *pi1, struct path_info *pi2, - u64 time_now, struct path_selector *ps) -{ - struct selector *s = ps->context; - u64 hst1, hst2; - long long out1, out2, stale1, stale2; - int pi2_better, over_threshold; - - hst_fill_compare(pi1, &hst1, &out1, &stale1); - hst_fill_compare(pi2, &hst2, &out2, &stale2); - - /* Check here if estimated latency for two paths are too similar. - * If this is the case, we skip extra calculation and just compare - * outstanding requests. In this case, any unloaded paths will - * be preferred. - */ - if (hst1 > hst2) - over_threshold = hst1 > (s->threshold_multiplier * hst2); - else - over_threshold = hst2 > (s->threshold_multiplier * hst1); - - if (!over_threshold) - return out1 - out2; - - /* - * If an unloaded path is stale, choose it. If both paths are unloaded, - * choose path that is the most stale. - * (If one path is loaded, choose the other) - */ - if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || - (!out1 && !out2)) - return (!out2 * stale1) - (!out1 * stale2); - - /* Compare estimated service time. If outstanding is the same, we - * don't need to multiply - */ - if (out1 == out2) { - pi2_better = hst1 > hst2; - } else { - /* Potential overflow with out >= 1024 */ - if (unlikely(out1 >= HST_MAX_INFLIGHT || - out2 >= HST_MAX_INFLIGHT)) { - /* If over 1023 in-flights, we may overflow if hst - * is at max. (With this shift we still overflow at - * 1048576 in-flights, which is high enough). - */ - hst1 >>= HST_FIXED_SHIFT; - hst2 >>= HST_FIXED_SHIFT; - } - pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; - } - - /* In the case that the 'winner' is stale, limit to equal usage. */ - if (pi2_better) { - if (stale2 < time_now) - return out1 - out2; - return 1; - } - if (stale1 < time_now) - return out1 - out2; - return -1; -} - -static struct dm_path *hst_select_path(struct path_selector *ps, - size_t nr_bytes) -{ - struct selector *s = ps->context; - struct path_info *pi = NULL, *best = NULL; - u64 time_now = sched_clock(); - struct dm_path *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - if (list_empty(&s->valid_paths)) - goto out; - - list_for_each_entry(pi, &s->valid_paths, list) { - if (!best || (hst_compare(pi, best, time_now, ps) < 0)) - best = pi; - } - - if (!best) - goto out; - - /* Move last used path to end (least preferred in case of ties) */ - list_move_tail(&best->list, &s->valid_paths); - - ret = best->path; - -out: - spin_unlock_irqrestore(&s->lock, flags); - return ret; -} - -static int hst_start_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) -{ - struct path_info *pi = path->pscontext; - unsigned long flags; - - spin_lock_irqsave(&pi->lock, flags); - pi->outstanding++; - spin_unlock_irqrestore(&pi->lock, flags); - - return 0; -} - -static u64 path_service_time(struct path_info *pi, u64 start_time) -{ - u64 sched_now = ktime_get_ns(); - - /* if a previous disk request has finished after this IO was - * sent to the hardware, pretend the submission happened - * serially. - */ - if (time_after64(pi->last_finish, start_time)) - start_time = pi->last_finish; - - pi->last_finish = sched_now; - if (time_before64(sched_now, start_time)) - return 0; - - return sched_now - start_time; -} - -static int hst_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes, u64 start_time) -{ - struct path_info *pi = path->pscontext; - struct selector *s = ps->context; - unsigned long flags; - u64 st; - - spin_lock_irqsave(&pi->lock, flags); - - st = path_service_time(pi, start_time); - pi->outstanding--; - pi->historical_service_time = - fixed_ema(pi->historical_service_time, - min(st * HST_FIXED_1, HST_FIXED_MAX), - hst_weight(ps, st)); - - /* - * On request end, mark path as fresh. If a path hasn't - * finished any requests within the fresh period, the estimated - * service time is considered too optimistic and we limit the - * maximum requests on that path. - */ - pi->stale_after = pi->last_finish + - (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); - - spin_unlock_irqrestore(&pi->lock, flags); - - return 0; -} - -static struct path_selector_type hst_ps = { - .name = "historical-service-time", - .module = THIS_MODULE, - .table_args = 1, - .info_args = 3, - .create = hst_create, - .destroy = hst_destroy, - .status = hst_status, - .add_path = hst_add_path, - .fail_path = hst_fail_path, - .reinstate_path = hst_reinstate_path, - .select_path = hst_select_path, - .start_io = hst_start_io, - .end_io = hst_end_io, -}; - -static int __init dm_hst_init(void) -{ - int r = dm_register_path_selector(&hst_ps); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version " HST_VERSION " loaded"); - - return r; -} - -static void __exit dm_hst_exit(void) -{ - int r = dm_unregister_path_selector(&hst_ps); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_hst_init); -module_exit(dm_hst_exit); - -MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); -MODULE_AUTHOR("Khazhismel Kumykov "); -MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-io-affinity.c b/drivers/md/dm-io-affinity.c deleted file mode 100644 index 077655cd4fae6..0000000000000 --- a/drivers/md/dm-io-affinity.c +++ /dev/null @@ -1,272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2020 Oracle Corporation - * - * Module Author: Mike Christie - */ -#include "dm-path-selector.h" - -#include -#include - -#define DM_MSG_PREFIX "multipath io-affinity" - -struct path_info { - struct dm_path *path; - cpumask_var_t cpumask; - refcount_t refcount; - bool failed; -}; - -struct selector { - struct path_info **path_map; - cpumask_var_t path_mask; - atomic_t map_misses; -}; - -static void ioa_free_path(struct selector *s, unsigned int cpu) -{ - struct path_info *pi = s->path_map[cpu]; - - if (!pi) - return; - - if (refcount_dec_and_test(&pi->refcount)) { - cpumask_clear_cpu(cpu, s->path_mask); - free_cpumask_var(pi->cpumask); - kfree(pi); - - s->path_map[cpu] = NULL; - } -} - -static int ioa_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = ps->context; - struct path_info *pi = NULL; - unsigned int cpu; - int ret; - - if (argc != 1) { - *error = "io-affinity ps: invalid number of arguments"; - return -EINVAL; - } - - pi = kzalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "io-affinity ps: Error allocating path context"; - return -ENOMEM; - } - - pi->path = path; - path->pscontext = pi; - refcount_set(&pi->refcount, 1); - - if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) { - *error = "io-affinity ps: Error allocating cpumask context"; - ret = -ENOMEM; - goto free_pi; - } - - ret = cpumask_parse(argv[0], pi->cpumask); - if (ret) { - *error = "io-affinity ps: invalid cpumask"; - ret = -EINVAL; - goto free_mask; - } - - for_each_cpu(cpu, pi->cpumask) { - if (cpu >= nr_cpu_ids) { - DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u", - cpu, nr_cpu_ids); - break; - } - - if (s->path_map[cpu]) { - DMWARN("CPU mapping for %u exists. Ignoring.", cpu); - continue; - } - - cpumask_set_cpu(cpu, s->path_mask); - s->path_map[cpu] = pi; - refcount_inc(&pi->refcount); - continue; - } - - if (refcount_dec_and_test(&pi->refcount)) { - *error = "io-affinity ps: No new/valid CPU mapping found"; - ret = -EINVAL; - goto free_mask; - } - - return 0; - -free_mask: - free_cpumask_var(pi->cpumask); -free_pi: - kfree(pi); - return ret; -} - -static int ioa_create(struct path_selector *ps, unsigned argc, char **argv) -{ - struct selector *s; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *), - GFP_KERNEL); - if (!s->path_map) - goto free_selector; - - if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL)) - goto free_map; - - atomic_set(&s->map_misses, 0); - ps->context = s; - return 0; - -free_map: - kfree(s->path_map); -free_selector: - kfree(s); - return -ENOMEM; -} - -static void ioa_destroy(struct path_selector *ps) -{ - struct selector *s = ps->context; - unsigned cpu; - - for_each_cpu(cpu, s->path_mask) - ioa_free_path(s, cpu); - - free_cpumask_var(s->path_mask); - kfree(s->path_map); - kfree(s); - - ps->context = NULL; -} - -static int ioa_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned int maxlen) -{ - struct selector *s = ps->context; - struct path_info *pi; - int sz = 0; - - if (!path) { - DMEMIT("0 "); - return sz; - } - - switch(type) { - case STATUSTYPE_INFO: - DMEMIT("%d ", atomic_read(&s->map_misses)); - break; - case STATUSTYPE_TABLE: - pi = path->pscontext; - DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask)); - break; - } - - return sz; -} - -static void ioa_fail_path(struct path_selector *ps, struct dm_path *p) -{ - struct path_info *pi = p->pscontext; - - pi->failed = true; -} - -static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p) -{ - struct path_info *pi = p->pscontext; - - pi->failed = false; - return 0; -} - -static struct dm_path *ioa_select_path(struct path_selector *ps, - size_t nr_bytes) -{ - unsigned int cpu, node; - struct selector *s = ps->context; - const struct cpumask *cpumask; - struct path_info *pi; - int i; - - cpu = get_cpu(); - - pi = s->path_map[cpu]; - if (pi && !pi->failed) - goto done; - - /* - * Perf is not optimal, but we at least try the local node then just - * try not to fail. - */ - if (!pi) - atomic_inc(&s->map_misses); - - node = cpu_to_node(cpu); - cpumask = cpumask_of_node(node); - for_each_cpu(i, cpumask) { - pi = s->path_map[i]; - if (pi && !pi->failed) - goto done; - } - - for_each_cpu(i, s->path_mask) { - pi = s->path_map[i]; - if (pi && !pi->failed) - goto done; - } - pi = NULL; - -done: - put_cpu(); - return pi ? pi->path : NULL; -} - -static struct path_selector_type ioa_ps = { - .name = "io-affinity", - .module = THIS_MODULE, - .table_args = 1, - .info_args = 1, - .create = ioa_create, - .destroy = ioa_destroy, - .status = ioa_status, - .add_path = ioa_add_path, - .fail_path = ioa_fail_path, - .reinstate_path = ioa_reinstate_path, - .select_path = ioa_select_path, -}; - -static int __init dm_ioa_init(void) -{ - int ret = dm_register_path_selector(&ioa_ps); - - if (ret < 0) - DMERR("register failed %d", ret); - return ret; -} - -static void __exit dm_ioa_exit(void) -{ - int ret = dm_unregister_path_selector(&ioa_ps); - - if (ret < 0) - DMERR("unregister failed %d", ret); -} - -module_init(dm_ioa_init); -module_exit(dm_ioa_exit); - -MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on"); -MODULE_AUTHOR("Mike Christie "); -MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c new file mode 100644 index 0000000000000..186f91e2752c1 --- /dev/null +++ b/drivers/md/dm-ps-historical-service-time.c @@ -0,0 +1,561 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Historical Service Time + * + * Keeps a time-weighted exponential moving average of the historical + * service time. Estimates future service time based on the historical + * service time and the number of outstanding requests. + * + * Marks paths stale if they have not finished within hst * + * num_paths. If a path is stale and unused, we will send a single + * request to probe in case the path has improved. This situation + * generally arises if the path is so much worse than others that it + * will never have the best estimated service time, or if the entire + * multipath device is unused. If a path is stale and in use, limit the + * number of requests it can receive with the assumption that the path + * has become degraded. + * + * To avoid repeatedly calculating exponents for time weighting, times + * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) + * ns, and the weighting is pre-calculated. + * + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include +#include +#include + + +#define DM_MSG_PREFIX "multipath historical-service-time" +#define HST_MIN_IO 1 +#define HST_VERSION "0.1.1" + +#define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ +#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) +#define HST_FIXED_1 (1 << HST_FIXED_SHIFT) +#define HST_FIXED_95 972 +#define HST_MAX_INFLIGHT HST_FIXED_1 +#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ +#define HST_WEIGHT_COUNT 64ULL + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + int valid_count; + spinlock_t lock; + + unsigned int weights[HST_WEIGHT_COUNT]; + unsigned int threshold_multiplier; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned int repeat_count; + + spinlock_t lock; + + u64 historical_service_time; /* Fixed point */ + + u64 stale_after; + u64 last_finish; + + u64 outstanding; +}; + +/** + * fixed_power - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + * + * (see: kernel/sched/loadavg.c) + */ +static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) { + for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + } + + return result; +} + +/* + * Calculate the next value of an exponential moving average + * a_1 = a_0 * e + a * (1 - e) + * + * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] + * @weight: [0, HST_FIXED_1] + * + * Note: + * To account for multiple periods in the same calculation, + * a_n = a_0 * e^n + a * (1 - e^n), + * so call fixed_ema(last, next, pow(weight, N)) + */ +static u64 fixed_ema(u64 last, u64 next, u64 weight) +{ + last *= weight; + last += next * (HST_FIXED_1 - weight); + last += 1ULL << (HST_FIXED_SHIFT - 1); + return last >> HST_FIXED_SHIFT; +} + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + s->valid_count = 0; + } + + return s; +} + +/* + * Get the weight for a given time span. + */ +static u64 hst_weight(struct path_selector *ps, u64 delta) +{ + struct selector *s = ps->context; + int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, + HST_WEIGHT_COUNT - 1); + + return s->weights[bucket]; +} + +/* + * Set up the weights array. + * + * weights[len-1] = 0 + * weights[n] = base ^ (n + 1) + */ +static void hst_set_weights(struct path_selector *ps, unsigned int base) +{ + struct selector *s = ps->context; + int i; + + if (base >= HST_FIXED_1) + return; + + for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) + s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); + s->weights[HST_WEIGHT_COUNT - 1] = 0; +} + +static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) +{ + struct selector *s; + unsigned int base_weight = HST_FIXED_95; + unsigned int threshold_multiplier = 0; + char dummy; + + /* + * Arguments: [ []] + * : Base weight for ema [0, 1024) 10-bit fixed point. A + * value of 0 will completely ignore any history. + * If not given, default (HST_FIXED_95) is used. + * : Minimum threshold multiplier for paths to + * be considered different. That is, a path is + * considered different iff (p1 > N * p2) where p1 + * is the path with higher service time. A threshold + * of 1 or 0 has no effect. Defaults to 0. + */ + if (argc > 2) + return -EINVAL; + + if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || + base_weight >= HST_FIXED_1)) { + return -EINVAL; + } + + if (argc > 1 && (sscanf(argv[1], "%u%c", + &threshold_multiplier, &dummy) != 1)) { + return -EINVAL; + } + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + + hst_set_weights(ps, base_weight); + s->threshold_multiplier = threshold_multiplier; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void hst_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int hst_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + unsigned int sz = 0; + struct path_info *pi; + + if (!path) { + struct selector *s = ps->context; + + DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); + } else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%llu %llu %llu ", pi->historical_service_time, + pi->outstanding, pi->stale_after); + break; + case STATUSTYPE_TABLE: + DMEMIT("0 "); + break; + } + } + + return sz; +} + +static int hst_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned int repeat_count = HST_MIN_IO; + char dummy; + unsigned long flags; + + /* + * Arguments: [] + * : The number of I/Os before switching path. + * If not given, default (HST_MIN_IO) is used. + */ + if (argc > 1) { + *error = "historical-service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "historical-service-time ps: invalid repeat count"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "historical-service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + pi->historical_service_time = HST_FIXED_1; + + spin_lock_init(&pi->lock); + pi->outstanding = 0; + + pi->stale_after = 0; + pi->last_finish = 0; + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + s->valid_count--; + spin_unlock_irqrestore(&s->lock, flags); +} + +static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + s->valid_count++; + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void hst_fill_compare(struct path_info *pi, u64 *hst, + u64 *out, u64 *stale) +{ + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + *hst = pi->historical_service_time; + *out = pi->outstanding; + *stale = pi->stale_after; + spin_unlock_irqrestore(&pi->lock, flags); +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + */ +static long long hst_compare(struct path_info *pi1, struct path_info *pi2, + u64 time_now, struct path_selector *ps) +{ + struct selector *s = ps->context; + u64 hst1, hst2; + long long out1, out2, stale1, stale2; + int pi2_better, over_threshold; + + hst_fill_compare(pi1, &hst1, &out1, &stale1); + hst_fill_compare(pi2, &hst2, &out2, &stale2); + + /* Check here if estimated latency for two paths are too similar. + * If this is the case, we skip extra calculation and just compare + * outstanding requests. In this case, any unloaded paths will + * be preferred. + */ + if (hst1 > hst2) + over_threshold = hst1 > (s->threshold_multiplier * hst2); + else + over_threshold = hst2 > (s->threshold_multiplier * hst1); + + if (!over_threshold) + return out1 - out2; + + /* + * If an unloaded path is stale, choose it. If both paths are unloaded, + * choose path that is the most stale. + * (If one path is loaded, choose the other) + */ + if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || + (!out1 && !out2)) + return (!out2 * stale1) - (!out1 * stale2); + + /* Compare estimated service time. If outstanding is the same, we + * don't need to multiply + */ + if (out1 == out2) { + pi2_better = hst1 > hst2; + } else { + /* Potential overflow with out >= 1024 */ + if (unlikely(out1 >= HST_MAX_INFLIGHT || + out2 >= HST_MAX_INFLIGHT)) { + /* If over 1023 in-flights, we may overflow if hst + * is at max. (With this shift we still overflow at + * 1048576 in-flights, which is high enough). + */ + hst1 >>= HST_FIXED_SHIFT; + hst2 >>= HST_FIXED_SHIFT; + } + pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; + } + + /* In the case that the 'winner' is stale, limit to equal usage. */ + if (pi2_better) { + if (stale2 < time_now) + return out1 - out2; + return 1; + } + if (stale1 < time_now) + return out1 - out2; + return -1; +} + +static struct dm_path *hst_select_path(struct path_selector *ps, + size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + u64 time_now = sched_clock(); + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || (hst_compare(pi, best, time_now, ps) < 0)) + best = pi; + } + + if (!best) + goto out; + + /* Move last used path to end (least preferred in case of ties) */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; + +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int hst_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&pi->lock, flags); + pi->outstanding++; + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static u64 path_service_time(struct path_info *pi, u64 start_time) +{ + u64 sched_now = ktime_get_ns(); + + /* if a previous disk request has finished after this IO was + * sent to the hardware, pretend the submission happened + * serially. + */ + if (time_after64(pi->last_finish, start_time)) + start_time = pi->last_finish; + + pi->last_finish = sched_now; + if (time_before64(sched_now, start_time)) + return 0; + + return sched_now - start_time; +} + +static int hst_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + struct selector *s = ps->context; + unsigned long flags; + u64 st; + + spin_lock_irqsave(&pi->lock, flags); + + st = path_service_time(pi, start_time); + pi->outstanding--; + pi->historical_service_time = + fixed_ema(pi->historical_service_time, + min(st * HST_FIXED_1, HST_FIXED_MAX), + hst_weight(ps, st)); + + /* + * On request end, mark path as fresh. If a path hasn't + * finished any requests within the fresh period, the estimated + * service time is considered too optimistic and we limit the + * maximum requests on that path. + */ + pi->stale_after = pi->last_finish + + (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); + + spin_unlock_irqrestore(&pi->lock, flags); + + return 0; +} + +static struct path_selector_type hst_ps = { + .name = "historical-service-time", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 3, + .create = hst_create, + .destroy = hst_destroy, + .status = hst_status, + .add_path = hst_add_path, + .fail_path = hst_fail_path, + .reinstate_path = hst_reinstate_path, + .select_path = hst_select_path, + .start_io = hst_start_io, + .end_io = hst_end_io, +}; + +static int __init dm_hst_init(void) +{ + int r = dm_register_path_selector(&hst_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " HST_VERSION " loaded"); + + return r; +} + +static void __exit dm_hst_exit(void) +{ + int r = dm_unregister_path_selector(&hst_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_hst_init); +module_exit(dm_hst_exit); + +MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); +MODULE_AUTHOR("Khazhismel Kumykov "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-io-affinity.c b/drivers/md/dm-ps-io-affinity.c new file mode 100644 index 0000000000000..077655cd4fae6 --- /dev/null +++ b/drivers/md/dm-ps-io-affinity.c @@ -0,0 +1,272 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Oracle Corporation + * + * Module Author: Mike Christie + */ +#include "dm-path-selector.h" + +#include +#include + +#define DM_MSG_PREFIX "multipath io-affinity" + +struct path_info { + struct dm_path *path; + cpumask_var_t cpumask; + refcount_t refcount; + bool failed; +}; + +struct selector { + struct path_info **path_map; + cpumask_var_t path_mask; + atomic_t map_misses; +}; + +static void ioa_free_path(struct selector *s, unsigned int cpu) +{ + struct path_info *pi = s->path_map[cpu]; + + if (!pi) + return; + + if (refcount_dec_and_test(&pi->refcount)) { + cpumask_clear_cpu(cpu, s->path_mask); + free_cpumask_var(pi->cpumask); + kfree(pi); + + s->path_map[cpu] = NULL; + } +} + +static int ioa_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL; + unsigned int cpu; + int ret; + + if (argc != 1) { + *error = "io-affinity ps: invalid number of arguments"; + return -EINVAL; + } + + pi = kzalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "io-affinity ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + path->pscontext = pi; + refcount_set(&pi->refcount, 1); + + if (!zalloc_cpumask_var(&pi->cpumask, GFP_KERNEL)) { + *error = "io-affinity ps: Error allocating cpumask context"; + ret = -ENOMEM; + goto free_pi; + } + + ret = cpumask_parse(argv[0], pi->cpumask); + if (ret) { + *error = "io-affinity ps: invalid cpumask"; + ret = -EINVAL; + goto free_mask; + } + + for_each_cpu(cpu, pi->cpumask) { + if (cpu >= nr_cpu_ids) { + DMWARN_LIMIT("Ignoring mapping for CPU %u. Max CPU is %u", + cpu, nr_cpu_ids); + break; + } + + if (s->path_map[cpu]) { + DMWARN("CPU mapping for %u exists. Ignoring.", cpu); + continue; + } + + cpumask_set_cpu(cpu, s->path_mask); + s->path_map[cpu] = pi; + refcount_inc(&pi->refcount); + continue; + } + + if (refcount_dec_and_test(&pi->refcount)) { + *error = "io-affinity ps: No new/valid CPU mapping found"; + ret = -EINVAL; + goto free_mask; + } + + return 0; + +free_mask: + free_cpumask_var(pi->cpumask); +free_pi: + kfree(pi); + return ret; +} + +static int ioa_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->path_map = kzalloc(nr_cpu_ids * sizeof(struct path_info *), + GFP_KERNEL); + if (!s->path_map) + goto free_selector; + + if (!zalloc_cpumask_var(&s->path_mask, GFP_KERNEL)) + goto free_map; + + atomic_set(&s->map_misses, 0); + ps->context = s; + return 0; + +free_map: + kfree(s->path_map); +free_selector: + kfree(s); + return -ENOMEM; +} + +static void ioa_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + unsigned cpu; + + for_each_cpu(cpu, s->path_mask) + ioa_free_path(s, cpu); + + free_cpumask_var(s->path_mask); + kfree(s->path_map); + kfree(s); + + ps->context = NULL; +} + +static int ioa_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct selector *s = ps->context; + struct path_info *pi; + int sz = 0; + + if (!path) { + DMEMIT("0 "); + return sz; + } + + switch(type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&s->map_misses)); + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%*pb ", cpumask_pr_args(pi->cpumask)); + break; + } + + return sz; +} + +static void ioa_fail_path(struct path_selector *ps, struct dm_path *p) +{ + struct path_info *pi = p->pscontext; + + pi->failed = true; +} + +static int ioa_reinstate_path(struct path_selector *ps, struct dm_path *p) +{ + struct path_info *pi = p->pscontext; + + pi->failed = false; + return 0; +} + +static struct dm_path *ioa_select_path(struct path_selector *ps, + size_t nr_bytes) +{ + unsigned int cpu, node; + struct selector *s = ps->context; + const struct cpumask *cpumask; + struct path_info *pi; + int i; + + cpu = get_cpu(); + + pi = s->path_map[cpu]; + if (pi && !pi->failed) + goto done; + + /* + * Perf is not optimal, but we at least try the local node then just + * try not to fail. + */ + if (!pi) + atomic_inc(&s->map_misses); + + node = cpu_to_node(cpu); + cpumask = cpumask_of_node(node); + for_each_cpu(i, cpumask) { + pi = s->path_map[i]; + if (pi && !pi->failed) + goto done; + } + + for_each_cpu(i, s->path_mask) { + pi = s->path_map[i]; + if (pi && !pi->failed) + goto done; + } + pi = NULL; + +done: + put_cpu(); + return pi ? pi->path : NULL; +} + +static struct path_selector_type ioa_ps = { + .name = "io-affinity", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ioa_create, + .destroy = ioa_destroy, + .status = ioa_status, + .add_path = ioa_add_path, + .fail_path = ioa_fail_path, + .reinstate_path = ioa_reinstate_path, + .select_path = ioa_select_path, +}; + +static int __init dm_ioa_init(void) +{ + int ret = dm_register_path_selector(&ioa_ps); + + if (ret < 0) + DMERR("register failed %d", ret); + return ret; +} + +static void __exit dm_ioa_exit(void) +{ + int ret = dm_unregister_path_selector(&ioa_ps); + + if (ret < 0) + DMERR("unregister failed %d", ret); +} + +module_init(dm_ioa_init); +module_exit(dm_ioa_exit); + +MODULE_DESCRIPTION(DM_NAME " multipath path selector that selects paths based on the CPU IO is being executed on"); +MODULE_AUTHOR("Mike Christie "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-queue-length.c b/drivers/md/dm-ps-queue-length.c new file mode 100644 index 0000000000000..5fd018d184187 --- /dev/null +++ b/drivers/md/dm-ps-queue-length.c @@ -0,0 +1,283 @@ +/* + * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. + * Copyright (C) 2006-2009 NEC Corporation. + * + * dm-queue-length.c + * + * Module Author: Stefan Bader, IBM + * Modified by: Kiyoshi Ueda, NEC + * + * This file is released under the GPL. + * + * queue-length path selector - choose a path with the least number of + * in-flight I/Os. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include +#include +#include +#include +#include + +#define DM_MSG_PREFIX "multipath queue-length" +#define QL_MIN_IO 1 +#define QL_VERSION "0.2.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + spinlock_t lock; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + atomic_t qlen; /* the number of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + } + + return s; +} + +static int ql_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void ql_free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void ql_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + ql_free_paths(&s->valid_paths); + ql_free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int ql_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + /* When called with NULL path, return selector status/args. */ + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d ", atomic_read(&pi->qlen)); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +static int ql_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = QL_MIN_IO; + char dummy; + unsigned long flags; + + /* + * Arguments: [] + * : The number of I/Os before switching path. + * If not given, default (QL_MIN_IO) is used. + */ + if (argc > 1) { + *error = "queue-length ps: incorrect number of arguments"; + return -EINVAL; + } + + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "queue-length ps: invalid repeat count"; + return -EINVAL; + } + + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + + /* Allocate the path information structure */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "queue-length ps: Error allocating path information"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + atomic_set(&pi->qlen, 0); + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void ql_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); +} + +static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +/* + * Select a path having the minimum number of in-flight I/Os + */ +static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) { + if (!best || + (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) + best = pi; + + if (!atomic_read(&best->qlen)) + break; + } + + if (!best) + goto out; + + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int ql_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_inc(&pi->qlen); + + return 0; +} + +static int ql_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + + atomic_dec(&pi->qlen); + + return 0; +} + +static struct path_selector_type ql_ps = { + .name = "queue-length", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 1, + .create = ql_create, + .destroy = ql_destroy, + .status = ql_status, + .add_path = ql_add_path, + .fail_path = ql_fail_path, + .reinstate_path = ql_reinstate_path, + .select_path = ql_select_path, + .start_io = ql_start_io, + .end_io = ql_end_io, +}; + +static int __init dm_ql_init(void) +{ + int r = dm_register_path_selector(&ql_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " QL_VERSION " loaded"); + + return r; +} + +static void __exit dm_ql_exit(void) +{ + int r = dm_unregister_path_selector(&ql_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_ql_init); +module_exit(dm_ql_exit); + +MODULE_AUTHOR("Stefan Bader "); +MODULE_DESCRIPTION( + "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" + DM_NAME " path selector to balance the number of in-flight I/Os" +); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-round-robin.c b/drivers/md/dm-ps-round-robin.c new file mode 100644 index 0000000000000..bdbb7e6e8212b --- /dev/null +++ b/drivers/md/dm-ps-round-robin.c @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2003 Sistina Software. + * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. + * + * Module Author: Heinz Mauelshagen + * + * This file is released under the GPL. + * + * Round-robin path selector. + */ + +#include + +#include "dm-path-selector.h" + +#include +#include + +#define DM_MSG_PREFIX "multipath round-robin" +#define RR_MIN_IO 1 +#define RR_VERSION "1.2.0" + +/*----------------------------------------------------------------- + * Path-handling code, paths are held in lists + *---------------------------------------------------------------*/ +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; +}; + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +/*----------------------------------------------------------------- + * Round-robin selector + *---------------------------------------------------------------*/ + +struct selector { + struct list_head valid_paths; + struct list_head invalid_paths; + spinlock_t lock; +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + } + + return s; +} + +static int rr_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s; + + s = alloc_selector(); + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void rr_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->invalid_paths); + kfree(s); + ps->context = NULL; +} + +static int rr_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned int maxlen) +{ + struct path_info *pi; + int sz = 0; + + if (!path) + DMEMIT("0 "); + else { + switch(type) { + case STATUSTYPE_INFO: + break; + case STATUSTYPE_TABLE: + pi = path->pscontext; + DMEMIT("%u ", pi->repeat_count); + break; + } + } + + return sz; +} + +/* + * Called during initialisation to register each path with an + * optional repeat_count. + */ +static int rr_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = RR_MIN_IO; + char dummy; + unsigned long flags; + + if (argc > 1) { + *error = "round-robin ps: incorrect number of arguments"; + return -EINVAL; + } + + /* First path argument is number of I/Os before switching path */ + if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "round-robin ps: invalid repeat count"; + return -EINVAL; + } + + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "round-robin ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void rr_fail_path(struct path_selector *ps, struct dm_path *p) +{ + unsigned long flags; + struct selector *s = ps->context; + struct path_info *pi = p->pscontext; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->invalid_paths); + spin_unlock_irqrestore(&s->lock, flags); +} + +static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) +{ + unsigned long flags; + struct selector *s = ps->context; + struct path_info *pi = p->pscontext; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) +{ + unsigned long flags; + struct selector *s = ps->context; + struct path_info *pi = NULL; + + spin_lock_irqsave(&s->lock, flags); + if (!list_empty(&s->valid_paths)) { + pi = list_entry(s->valid_paths.next, struct path_info, list); + list_move_tail(&pi->list, &s->valid_paths); + } + spin_unlock_irqrestore(&s->lock, flags); + + return pi ? pi->path : NULL; +} + +static struct path_selector_type rr_ps = { + .name = "round-robin", + .module = THIS_MODULE, + .table_args = 1, + .info_args = 0, + .create = rr_create, + .destroy = rr_destroy, + .status = rr_status, + .add_path = rr_add_path, + .fail_path = rr_fail_path, + .reinstate_path = rr_reinstate_path, + .select_path = rr_select_path, +}; + +static int __init dm_rr_init(void) +{ + int r = dm_register_path_selector(&rr_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " RR_VERSION " loaded"); + + return r; +} + +static void __exit dm_rr_exit(void) +{ + int r = dm_unregister_path_selector(&rr_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_rr_init); +module_exit(dm_rr_exit); + +MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); +MODULE_AUTHOR("Sistina Software "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-ps-service-time.c b/drivers/md/dm-ps-service-time.c new file mode 100644 index 0000000000000..9cfda665e9ebd --- /dev/null +++ b/drivers/md/dm-ps-service-time.c @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. + * + * Module Author: Kiyoshi Ueda + * + * This file is released under the GPL. + * + * Throughput oriented path selector. + */ + +#include "dm.h" +#include "dm-path-selector.h" + +#include +#include + +#define DM_MSG_PREFIX "multipath service-time" +#define ST_MIN_IO 1 +#define ST_MAX_RELATIVE_THROUGHPUT 100 +#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 +#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) +#define ST_VERSION "0.3.0" + +struct selector { + struct list_head valid_paths; + struct list_head failed_paths; + spinlock_t lock; +}; + +struct path_info { + struct list_head list; + struct dm_path *path; + unsigned repeat_count; + unsigned relative_throughput; + atomic_t in_flight_size; /* Total size of in-flight I/Os */ +}; + +static struct selector *alloc_selector(void) +{ + struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (s) { + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); + } + + return s; +} + +static int st_create(struct path_selector *ps, unsigned argc, char **argv) +{ + struct selector *s = alloc_selector(); + + if (!s) + return -ENOMEM; + + ps->context = s; + return 0; +} + +static void free_paths(struct list_head *paths) +{ + struct path_info *pi, *next; + + list_for_each_entry_safe(pi, next, paths, list) { + list_del(&pi->list); + kfree(pi); + } +} + +static void st_destroy(struct path_selector *ps) +{ + struct selector *s = ps->context; + + free_paths(&s->valid_paths); + free_paths(&s->failed_paths); + kfree(s); + ps->context = NULL; +} + +static int st_status(struct path_selector *ps, struct dm_path *path, + status_type_t type, char *result, unsigned maxlen) +{ + unsigned sz = 0; + struct path_info *pi; + + if (!path) + DMEMIT("0 "); + else { + pi = path->pscontext; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), + pi->relative_throughput); + break; + case STATUSTYPE_TABLE: + DMEMIT("%u %u ", pi->repeat_count, + pi->relative_throughput); + break; + } + } + + return sz; +} + +static int st_add_path(struct path_selector *ps, struct dm_path *path, + int argc, char **argv, char **error) +{ + struct selector *s = ps->context; + struct path_info *pi; + unsigned repeat_count = ST_MIN_IO; + unsigned relative_throughput = 1; + char dummy; + unsigned long flags; + + /* + * Arguments: [ []] + * : The number of I/Os before switching path. + * If not given, default (ST_MIN_IO) is used. + * : The relative throughput value of + * the path among all paths in the path-group. + * The valid range: 0- + * If not given, minimum value '1' is used. + * If '0' is given, the path isn't selected while + * other paths having a positive value are + * available. + */ + if (argc > 2) { + *error = "service-time ps: incorrect number of arguments"; + return -EINVAL; + } + + if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { + *error = "service-time ps: invalid repeat count"; + return -EINVAL; + } + + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + + if ((argc == 2) && + (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || + relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { + *error = "service-time ps: invalid relative_throughput value"; + return -EINVAL; + } + + /* allocate the path */ + pi = kmalloc(sizeof(*pi), GFP_KERNEL); + if (!pi) { + *error = "service-time ps: Error allocating path context"; + return -ENOMEM; + } + + pi->path = path; + pi->repeat_count = repeat_count; + pi->relative_throughput = relative_throughput; + atomic_set(&pi->in_flight_size, 0); + + path->pscontext = pi; + + spin_lock_irqsave(&s->lock, flags); + list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +static void st_fail_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); +} + +static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) +{ + struct selector *s = ps->context; + struct path_info *pi = path->pscontext; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); + + return 0; +} + +/* + * Compare the estimated service time of 2 paths, pi1 and pi2, + * for the incoming I/O. + * + * Returns: + * < 0 : pi1 is better + * 0 : no difference between pi1 and pi2 + * > 0 : pi2 is better + * + * Description: + * Basically, the service time is estimated by: + * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' + * To reduce the calculation, some optimizations are made. + * (See comments inline) + */ +static int st_compare_load(struct path_info *pi1, struct path_info *pi2, + size_t incoming) +{ + size_t sz1, sz2, st1, st2; + + sz1 = atomic_read(&pi1->in_flight_size); + sz2 = atomic_read(&pi2->in_flight_size); + + /* + * Case 1: Both have same throughput value. Choose less loaded path. + */ + if (pi1->relative_throughput == pi2->relative_throughput) + return sz1 - sz2; + + /* + * Case 2a: Both have same load. Choose higher throughput path. + * Case 2b: One path has no throughput value. Choose the other one. + */ + if (sz1 == sz2 || + !pi1->relative_throughput || !pi2->relative_throughput) + return pi2->relative_throughput - pi1->relative_throughput; + + /* + * Case 3: Calculate service time. Choose faster path. + * Service time using pi1: + * st1 = (sz1 + incoming) / pi1->relative_throughput + * Service time using pi2: + * st2 = (sz2 + incoming) / pi2->relative_throughput + * + * To avoid the division, transform the expression to use + * multiplication. + * Because ->relative_throughput > 0 here, if st1 < st2, + * the expressions below are the same meaning: + * (sz1 + incoming) / pi1->relative_throughput < + * (sz2 + incoming) / pi2->relative_throughput + * (sz1 + incoming) * pi2->relative_throughput < + * (sz2 + incoming) * pi1->relative_throughput + * So use the later one. + */ + sz1 += incoming; + sz2 += incoming; + if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || + sz2 >= ST_MAX_INFLIGHT_SIZE)) { + /* + * Size may be too big for multiplying pi->relative_throughput + * and overflow. + * To avoid the overflow and mis-selection, shift down both. + */ + sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; + } + st1 = sz1 * pi2->relative_throughput; + st2 = sz2 * pi1->relative_throughput; + if (st1 != st2) + return st1 - st2; + + /* + * Case 4: Service time is equal. Choose higher throughput path. + */ + return pi2->relative_throughput - pi1->relative_throughput; +} + +static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) +{ + struct selector *s = ps->context; + struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&s->lock, flags); + if (list_empty(&s->valid_paths)) + goto out; + + list_for_each_entry(pi, &s->valid_paths, list) + if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) + best = pi; + + if (!best) + goto out; + + /* Move most recently used to least preferred to evenly balance. */ + list_move_tail(&best->list, &s->valid_paths); + + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; +} + +static int st_start_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes) +{ + struct path_info *pi = path->pscontext; + + atomic_add(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static int st_end_io(struct path_selector *ps, struct dm_path *path, + size_t nr_bytes, u64 start_time) +{ + struct path_info *pi = path->pscontext; + + atomic_sub(nr_bytes, &pi->in_flight_size); + + return 0; +} + +static struct path_selector_type st_ps = { + .name = "service-time", + .module = THIS_MODULE, + .table_args = 2, + .info_args = 2, + .create = st_create, + .destroy = st_destroy, + .status = st_status, + .add_path = st_add_path, + .fail_path = st_fail_path, + .reinstate_path = st_reinstate_path, + .select_path = st_select_path, + .start_io = st_start_io, + .end_io = st_end_io, +}; + +static int __init dm_st_init(void) +{ + int r = dm_register_path_selector(&st_ps); + + if (r < 0) + DMERR("register failed %d", r); + + DMINFO("version " ST_VERSION " loaded"); + + return r; +} + +static void __exit dm_st_exit(void) +{ + int r = dm_unregister_path_selector(&st_ps); + + if (r < 0) + DMERR("unregister failed %d", r); +} + +module_init(dm_st_init); +module_exit(dm_st_exit); + +MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); +MODULE_AUTHOR("Kiyoshi Ueda "); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c deleted file mode 100644 index 5fd018d184187..0000000000000 --- a/drivers/md/dm-queue-length.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved. - * Copyright (C) 2006-2009 NEC Corporation. - * - * dm-queue-length.c - * - * Module Author: Stefan Bader, IBM - * Modified by: Kiyoshi Ueda, NEC - * - * This file is released under the GPL. - * - * queue-length path selector - choose a path with the least number of - * in-flight I/Os. - */ - -#include "dm.h" -#include "dm-path-selector.h" - -#include -#include -#include -#include -#include - -#define DM_MSG_PREFIX "multipath queue-length" -#define QL_MIN_IO 1 -#define QL_VERSION "0.2.0" - -struct selector { - struct list_head valid_paths; - struct list_head failed_paths; - spinlock_t lock; -}; - -struct path_info { - struct list_head list; - struct dm_path *path; - unsigned repeat_count; - atomic_t qlen; /* the number of in-flight I/Os */ -}; - -static struct selector *alloc_selector(void) -{ - struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->failed_paths); - spin_lock_init(&s->lock); - } - - return s; -} - -static int ql_create(struct path_selector *ps, unsigned argc, char **argv) -{ - struct selector *s = alloc_selector(); - - if (!s) - return -ENOMEM; - - ps->context = s; - return 0; -} - -static void ql_free_paths(struct list_head *paths) -{ - struct path_info *pi, *next; - - list_for_each_entry_safe(pi, next, paths, list) { - list_del(&pi->list); - kfree(pi); - } -} - -static void ql_destroy(struct path_selector *ps) -{ - struct selector *s = ps->context; - - ql_free_paths(&s->valid_paths); - ql_free_paths(&s->failed_paths); - kfree(s); - ps->context = NULL; -} - -static int ql_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned maxlen) -{ - unsigned sz = 0; - struct path_info *pi; - - /* When called with NULL path, return selector status/args. */ - if (!path) - DMEMIT("0 "); - else { - pi = path->pscontext; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%d ", atomic_read(&pi->qlen)); - break; - case STATUSTYPE_TABLE: - DMEMIT("%u ", pi->repeat_count); - break; - } - } - - return sz; -} - -static int ql_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = ps->context; - struct path_info *pi; - unsigned repeat_count = QL_MIN_IO; - char dummy; - unsigned long flags; - - /* - * Arguments: [] - * : The number of I/Os before switching path. - * If not given, default (QL_MIN_IO) is used. - */ - if (argc > 1) { - *error = "queue-length ps: incorrect number of arguments"; - return -EINVAL; - } - - if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { - *error = "queue-length ps: invalid repeat count"; - return -EINVAL; - } - - if (repeat_count > 1) { - DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); - repeat_count = 1; - } - - /* Allocate the path information structure */ - pi = kmalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "queue-length ps: Error allocating path information"; - return -ENOMEM; - } - - pi->path = path; - pi->repeat_count = repeat_count; - atomic_set(&pi->qlen, 0); - - path->pscontext = pi; - - spin_lock_irqsave(&s->lock, flags); - list_add_tail(&pi->list, &s->valid_paths); - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -static void ql_fail_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - list_move(&pi->list, &s->failed_paths); - spin_unlock_irqrestore(&s->lock, flags); -} - -static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - list_move_tail(&pi->list, &s->valid_paths); - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -/* - * Select a path having the minimum number of in-flight I/Os - */ -static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) -{ - struct selector *s = ps->context; - struct path_info *pi = NULL, *best = NULL; - struct dm_path *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - if (list_empty(&s->valid_paths)) - goto out; - - list_for_each_entry(pi, &s->valid_paths, list) { - if (!best || - (atomic_read(&pi->qlen) < atomic_read(&best->qlen))) - best = pi; - - if (!atomic_read(&best->qlen)) - break; - } - - if (!best) - goto out; - - /* Move most recently used to least preferred to evenly balance. */ - list_move_tail(&best->list, &s->valid_paths); - - ret = best->path; -out: - spin_unlock_irqrestore(&s->lock, flags); - return ret; -} - -static int ql_start_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) -{ - struct path_info *pi = path->pscontext; - - atomic_inc(&pi->qlen); - - return 0; -} - -static int ql_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes, u64 start_time) -{ - struct path_info *pi = path->pscontext; - - atomic_dec(&pi->qlen); - - return 0; -} - -static struct path_selector_type ql_ps = { - .name = "queue-length", - .module = THIS_MODULE, - .table_args = 1, - .info_args = 1, - .create = ql_create, - .destroy = ql_destroy, - .status = ql_status, - .add_path = ql_add_path, - .fail_path = ql_fail_path, - .reinstate_path = ql_reinstate_path, - .select_path = ql_select_path, - .start_io = ql_start_io, - .end_io = ql_end_io, -}; - -static int __init dm_ql_init(void) -{ - int r = dm_register_path_selector(&ql_ps); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version " QL_VERSION " loaded"); - - return r; -} - -static void __exit dm_ql_exit(void) -{ - int r = dm_unregister_path_selector(&ql_ps); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_ql_init); -module_exit(dm_ql_exit); - -MODULE_AUTHOR("Stefan Bader "); -MODULE_DESCRIPTION( - "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n" - DM_NAME " path selector to balance the number of in-flight I/Os" -); -MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c deleted file mode 100644 index bdbb7e6e8212b..0000000000000 --- a/drivers/md/dm-round-robin.c +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (C) 2003 Sistina Software. - * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. - * - * Module Author: Heinz Mauelshagen - * - * This file is released under the GPL. - * - * Round-robin path selector. - */ - -#include - -#include "dm-path-selector.h" - -#include -#include - -#define DM_MSG_PREFIX "multipath round-robin" -#define RR_MIN_IO 1 -#define RR_VERSION "1.2.0" - -/*----------------------------------------------------------------- - * Path-handling code, paths are held in lists - *---------------------------------------------------------------*/ -struct path_info { - struct list_head list; - struct dm_path *path; - unsigned repeat_count; -}; - -static void free_paths(struct list_head *paths) -{ - struct path_info *pi, *next; - - list_for_each_entry_safe(pi, next, paths, list) { - list_del(&pi->list); - kfree(pi); - } -} - -/*----------------------------------------------------------------- - * Round-robin selector - *---------------------------------------------------------------*/ - -struct selector { - struct list_head valid_paths; - struct list_head invalid_paths; - spinlock_t lock; -}; - -static struct selector *alloc_selector(void) -{ - struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - spin_lock_init(&s->lock); - } - - return s; -} - -static int rr_create(struct path_selector *ps, unsigned argc, char **argv) -{ - struct selector *s; - - s = alloc_selector(); - if (!s) - return -ENOMEM; - - ps->context = s; - return 0; -} - -static void rr_destroy(struct path_selector *ps) -{ - struct selector *s = ps->context; - - free_paths(&s->valid_paths); - free_paths(&s->invalid_paths); - kfree(s); - ps->context = NULL; -} - -static int rr_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned int maxlen) -{ - struct path_info *pi; - int sz = 0; - - if (!path) - DMEMIT("0 "); - else { - switch(type) { - case STATUSTYPE_INFO: - break; - case STATUSTYPE_TABLE: - pi = path->pscontext; - DMEMIT("%u ", pi->repeat_count); - break; - } - } - - return sz; -} - -/* - * Called during initialisation to register each path with an - * optional repeat_count. - */ -static int rr_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = ps->context; - struct path_info *pi; - unsigned repeat_count = RR_MIN_IO; - char dummy; - unsigned long flags; - - if (argc > 1) { - *error = "round-robin ps: incorrect number of arguments"; - return -EINVAL; - } - - /* First path argument is number of I/Os before switching path */ - if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { - *error = "round-robin ps: invalid repeat count"; - return -EINVAL; - } - - if (repeat_count > 1) { - DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); - repeat_count = 1; - } - - /* allocate the path */ - pi = kmalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "round-robin ps: Error allocating path context"; - return -ENOMEM; - } - - pi->path = path; - pi->repeat_count = repeat_count; - - path->pscontext = pi; - - spin_lock_irqsave(&s->lock, flags); - list_add_tail(&pi->list, &s->valid_paths); - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -static void rr_fail_path(struct path_selector *ps, struct dm_path *p) -{ - unsigned long flags; - struct selector *s = ps->context; - struct path_info *pi = p->pscontext; - - spin_lock_irqsave(&s->lock, flags); - list_move(&pi->list, &s->invalid_paths); - spin_unlock_irqrestore(&s->lock, flags); -} - -static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) -{ - unsigned long flags; - struct selector *s = ps->context; - struct path_info *pi = p->pscontext; - - spin_lock_irqsave(&s->lock, flags); - list_move(&pi->list, &s->valid_paths); - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) -{ - unsigned long flags; - struct selector *s = ps->context; - struct path_info *pi = NULL; - - spin_lock_irqsave(&s->lock, flags); - if (!list_empty(&s->valid_paths)) { - pi = list_entry(s->valid_paths.next, struct path_info, list); - list_move_tail(&pi->list, &s->valid_paths); - } - spin_unlock_irqrestore(&s->lock, flags); - - return pi ? pi->path : NULL; -} - -static struct path_selector_type rr_ps = { - .name = "round-robin", - .module = THIS_MODULE, - .table_args = 1, - .info_args = 0, - .create = rr_create, - .destroy = rr_destroy, - .status = rr_status, - .add_path = rr_add_path, - .fail_path = rr_fail_path, - .reinstate_path = rr_reinstate_path, - .select_path = rr_select_path, -}; - -static int __init dm_rr_init(void) -{ - int r = dm_register_path_selector(&rr_ps); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version " RR_VERSION " loaded"); - - return r; -} - -static void __exit dm_rr_exit(void) -{ - int r = dm_unregister_path_selector(&rr_ps); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_rr_init); -module_exit(dm_rr_exit); - -MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector"); -MODULE_AUTHOR("Sistina Software "); -MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c deleted file mode 100644 index 9cfda665e9ebd..0000000000000 --- a/drivers/md/dm-service-time.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. - * - * Module Author: Kiyoshi Ueda - * - * This file is released under the GPL. - * - * Throughput oriented path selector. - */ - -#include "dm.h" -#include "dm-path-selector.h" - -#include -#include - -#define DM_MSG_PREFIX "multipath service-time" -#define ST_MIN_IO 1 -#define ST_MAX_RELATIVE_THROUGHPUT 100 -#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 -#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) -#define ST_VERSION "0.3.0" - -struct selector { - struct list_head valid_paths; - struct list_head failed_paths; - spinlock_t lock; -}; - -struct path_info { - struct list_head list; - struct dm_path *path; - unsigned repeat_count; - unsigned relative_throughput; - atomic_t in_flight_size; /* Total size of in-flight I/Os */ -}; - -static struct selector *alloc_selector(void) -{ - struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->failed_paths); - spin_lock_init(&s->lock); - } - - return s; -} - -static int st_create(struct path_selector *ps, unsigned argc, char **argv) -{ - struct selector *s = alloc_selector(); - - if (!s) - return -ENOMEM; - - ps->context = s; - return 0; -} - -static void free_paths(struct list_head *paths) -{ - struct path_info *pi, *next; - - list_for_each_entry_safe(pi, next, paths, list) { - list_del(&pi->list); - kfree(pi); - } -} - -static void st_destroy(struct path_selector *ps) -{ - struct selector *s = ps->context; - - free_paths(&s->valid_paths); - free_paths(&s->failed_paths); - kfree(s); - ps->context = NULL; -} - -static int st_status(struct path_selector *ps, struct dm_path *path, - status_type_t type, char *result, unsigned maxlen) -{ - unsigned sz = 0; - struct path_info *pi; - - if (!path) - DMEMIT("0 "); - else { - pi = path->pscontext; - - switch (type) { - case STATUSTYPE_INFO: - DMEMIT("%d %u ", atomic_read(&pi->in_flight_size), - pi->relative_throughput); - break; - case STATUSTYPE_TABLE: - DMEMIT("%u %u ", pi->repeat_count, - pi->relative_throughput); - break; - } - } - - return sz; -} - -static int st_add_path(struct path_selector *ps, struct dm_path *path, - int argc, char **argv, char **error) -{ - struct selector *s = ps->context; - struct path_info *pi; - unsigned repeat_count = ST_MIN_IO; - unsigned relative_throughput = 1; - char dummy; - unsigned long flags; - - /* - * Arguments: [ []] - * : The number of I/Os before switching path. - * If not given, default (ST_MIN_IO) is used. - * : The relative throughput value of - * the path among all paths in the path-group. - * The valid range: 0- - * If not given, minimum value '1' is used. - * If '0' is given, the path isn't selected while - * other paths having a positive value are - * available. - */ - if (argc > 2) { - *error = "service-time ps: incorrect number of arguments"; - return -EINVAL; - } - - if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { - *error = "service-time ps: invalid repeat count"; - return -EINVAL; - } - - if (repeat_count > 1) { - DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); - repeat_count = 1; - } - - if ((argc == 2) && - (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || - relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { - *error = "service-time ps: invalid relative_throughput value"; - return -EINVAL; - } - - /* allocate the path */ - pi = kmalloc(sizeof(*pi), GFP_KERNEL); - if (!pi) { - *error = "service-time ps: Error allocating path context"; - return -ENOMEM; - } - - pi->path = path; - pi->repeat_count = repeat_count; - pi->relative_throughput = relative_throughput; - atomic_set(&pi->in_flight_size, 0); - - path->pscontext = pi; - - spin_lock_irqsave(&s->lock, flags); - list_add_tail(&pi->list, &s->valid_paths); - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -static void st_fail_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - list_move(&pi->list, &s->failed_paths); - spin_unlock_irqrestore(&s->lock, flags); -} - -static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) -{ - struct selector *s = ps->context; - struct path_info *pi = path->pscontext; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - list_move_tail(&pi->list, &s->valid_paths); - spin_unlock_irqrestore(&s->lock, flags); - - return 0; -} - -/* - * Compare the estimated service time of 2 paths, pi1 and pi2, - * for the incoming I/O. - * - * Returns: - * < 0 : pi1 is better - * 0 : no difference between pi1 and pi2 - * > 0 : pi2 is better - * - * Description: - * Basically, the service time is estimated by: - * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' - * To reduce the calculation, some optimizations are made. - * (See comments inline) - */ -static int st_compare_load(struct path_info *pi1, struct path_info *pi2, - size_t incoming) -{ - size_t sz1, sz2, st1, st2; - - sz1 = atomic_read(&pi1->in_flight_size); - sz2 = atomic_read(&pi2->in_flight_size); - - /* - * Case 1: Both have same throughput value. Choose less loaded path. - */ - if (pi1->relative_throughput == pi2->relative_throughput) - return sz1 - sz2; - - /* - * Case 2a: Both have same load. Choose higher throughput path. - * Case 2b: One path has no throughput value. Choose the other one. - */ - if (sz1 == sz2 || - !pi1->relative_throughput || !pi2->relative_throughput) - return pi2->relative_throughput - pi1->relative_throughput; - - /* - * Case 3: Calculate service time. Choose faster path. - * Service time using pi1: - * st1 = (sz1 + incoming) / pi1->relative_throughput - * Service time using pi2: - * st2 = (sz2 + incoming) / pi2->relative_throughput - * - * To avoid the division, transform the expression to use - * multiplication. - * Because ->relative_throughput > 0 here, if st1 < st2, - * the expressions below are the same meaning: - * (sz1 + incoming) / pi1->relative_throughput < - * (sz2 + incoming) / pi2->relative_throughput - * (sz1 + incoming) * pi2->relative_throughput < - * (sz2 + incoming) * pi1->relative_throughput - * So use the later one. - */ - sz1 += incoming; - sz2 += incoming; - if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || - sz2 >= ST_MAX_INFLIGHT_SIZE)) { - /* - * Size may be too big for multiplying pi->relative_throughput - * and overflow. - * To avoid the overflow and mis-selection, shift down both. - */ - sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; - sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; - } - st1 = sz1 * pi2->relative_throughput; - st2 = sz2 * pi1->relative_throughput; - if (st1 != st2) - return st1 - st2; - - /* - * Case 4: Service time is equal. Choose higher throughput path. - */ - return pi2->relative_throughput - pi1->relative_throughput; -} - -static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) -{ - struct selector *s = ps->context; - struct path_info *pi = NULL, *best = NULL; - struct dm_path *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&s->lock, flags); - if (list_empty(&s->valid_paths)) - goto out; - - list_for_each_entry(pi, &s->valid_paths, list) - if (!best || (st_compare_load(pi, best, nr_bytes) < 0)) - best = pi; - - if (!best) - goto out; - - /* Move most recently used to least preferred to evenly balance. */ - list_move_tail(&best->list, &s->valid_paths); - - ret = best->path; -out: - spin_unlock_irqrestore(&s->lock, flags); - return ret; -} - -static int st_start_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes) -{ - struct path_info *pi = path->pscontext; - - atomic_add(nr_bytes, &pi->in_flight_size); - - return 0; -} - -static int st_end_io(struct path_selector *ps, struct dm_path *path, - size_t nr_bytes, u64 start_time) -{ - struct path_info *pi = path->pscontext; - - atomic_sub(nr_bytes, &pi->in_flight_size); - - return 0; -} - -static struct path_selector_type st_ps = { - .name = "service-time", - .module = THIS_MODULE, - .table_args = 2, - .info_args = 2, - .create = st_create, - .destroy = st_destroy, - .status = st_status, - .add_path = st_add_path, - .fail_path = st_fail_path, - .reinstate_path = st_reinstate_path, - .select_path = st_select_path, - .start_io = st_start_io, - .end_io = st_end_io, -}; - -static int __init dm_st_init(void) -{ - int r = dm_register_path_selector(&st_ps); - - if (r < 0) - DMERR("register failed %d", r); - - DMINFO("version " ST_VERSION " loaded"); - - return r; -} - -static void __exit dm_st_exit(void) -{ - int r = dm_unregister_path_selector(&st_ps); - - if (r < 0) - DMERR("unregister failed %d", r); -} - -module_init(dm_st_init); -module_exit(dm_st_exit); - -MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector"); -MODULE_AUTHOR("Kiyoshi Ueda "); -MODULE_LICENSE("GPL");