From 9c6fbbf4eea1b82abe28db2445beef388b4f7ab0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Apr 2012 13:38:40 +0200 Subject: [PATCH] sched: Fix more load-balancing fallout Commits 7f123faee926 ("sched: Ditch per cgroup task lists for load-balancing") and 5be01d884 ("sched: Fix load-balance wreckage") left some more wreckage. By setting loop_max unconditionally to ->nr_running load-balancing could take a lot of time on very long runqueues (hackbench!). So keep the sysctl as max limit of the amount of tasks we'll iterate. Furthermore, the min load filter for migration completely fails with cgroups since inequality in per-cpu state can easily lead to such small loads :/ Furthermore the change to add new tasks to the tail of the queue instead of the head seems to have some effect.. not quite sure I understand why. Combined these fixes solve the huge hackbench regression reported by Tim when hackbench is ran in a cgroup. Reported-by: Tim Chen Acked-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1335365763.28150.267.camel@twins [ got rid of the CONFIG_PREEMPT tuning and made small readability edits ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 ++++++++++-------- kernel/sched/features.h | 1 + 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0d97ebdc58f07..e9553640c1c3b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -3215,6 +3215,8 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); +static const unsigned int sched_nr_migrate_break = 32; + /* * move_tasks tries to move up to load_move weighted load from busiest to * this_rq, as part of a balancing operation within domain "sd". @@ -3242,7 +3244,7 @@ static int move_tasks(struct lb_env *env) /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sysctl_sched_nr_migrate; + env->loop_break += sched_nr_migrate_break; env->flags |= LBF_NEED_BREAK; break; } @@ -3252,7 +3254,7 @@ static int move_tasks(struct lb_env *env) load = task_h_load(p); - if (load < 16 && !env->sd->nr_balance_failed) + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; if ((load / 2) > env->load_move) @@ -4407,7 +4409,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_cpu = this_cpu, .dst_rq = this_rq, .idle = idle, - .loop_break = sysctl_sched_nr_migrate, + .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); @@ -4445,10 +4447,10 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.load_move = imbalance; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; - env.loop_max = busiest->nr_running; + env.load_move = imbalance; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running); more_balance: local_irq_save(flags); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e61fd73913d06..de00a486c5c69 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true) SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(LB_MIN, false) -- 2.39.5