sched: Simplify cpu-hot-unplug task migration
While discussing the need for sched_idle_next(), Oleg remarked that since try_to_wake_up() ensures sleeping tasks will end up running on a sane cpu, we can do away with migrate_live_tasks(). If we then extend the existing hack of migrating current from CPU_DYING to migrating the full rq worth of tasks from CPU_DYING, the need for the sched_idle_next() abomination disappears as well, since idle will be the only possible thread left after the migration thread stops. This greatly simplifies the hot-unplug task migration path, as can be seen from the resulting code reduction (and about half the new lines are comments). Suggested-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1289851597.2109.547.camel@laptop> Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
parent
92fd4d4d67
commit
48c5ccae88
3 changed files with 82 additions and 173 deletions
|
@ -1871,14 +1871,11 @@ extern void sched_clock_idle_sleep_event(void);
|
|||
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
|
||||
extern void idle_task_exit(void);
|
||||
#else
|
||||
static inline void idle_task_exit(void) {}
|
||||
#endif
|
||||
|
||||
extern void sched_idle_next(void);
|
||||
|
||||
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
|
||||
extern void wake_up_idle_cpu(int cpu);
|
||||
#else
|
||||
|
|
16
kernel/cpu.c
16
kernel/cpu.c
|
@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
|
|||
}
|
||||
|
||||
struct take_cpu_down_param {
|
||||
struct task_struct *caller;
|
||||
unsigned long mod;
|
||||
void *hcpu;
|
||||
};
|
||||
|
@ -208,11 +207,6 @@ static int __ref take_cpu_down(void *_param)
|
|||
|
||||
cpu_notify(CPU_DYING | param->mod, param->hcpu);
|
||||
|
||||
if (task_cpu(param->caller) == cpu)
|
||||
move_task_off_dead_cpu(cpu, param->caller);
|
||||
/* Force idle task to run as soon as we yield: it should
|
||||
immediately notice cpu is offline and die quickly. */
|
||||
sched_idle_next();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -223,7 +217,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
|
|||
void *hcpu = (void *)(long)cpu;
|
||||
unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
|
||||
struct take_cpu_down_param tcd_param = {
|
||||
.caller = current,
|
||||
.mod = mod,
|
||||
.hcpu = hcpu,
|
||||
};
|
||||
|
@ -253,9 +246,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
|
|||
}
|
||||
BUG_ON(cpu_online(cpu));
|
||||
|
||||
/* Wait for it to sleep (leaving idle task). */
|
||||
while (!idle_cpu(cpu))
|
||||
yield();
|
||||
/*
|
||||
* The migration_call() CPU_DYING callback will have removed all
|
||||
* runnable tasks from the cpu, there's only the idle task left now
|
||||
* that the migration thread is done doing the stop_machine thing.
|
||||
*/
|
||||
BUG_ON(!idle_cpu(cpu));
|
||||
|
||||
/* This actually kills the CPU. */
|
||||
__cpu_die(cpu);
|
||||
|
|
236
kernel/sched.c
236
kernel/sched.c
|
@ -2366,18 +2366,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
|
|||
return dest_cpu;
|
||||
|
||||
/* No more Mr. Nice Guy. */
|
||||
if (unlikely(dest_cpu >= nr_cpu_ids)) {
|
||||
dest_cpu = cpuset_cpus_allowed_fallback(p);
|
||||
/*
|
||||
* Don't tell them about moving exiting tasks or
|
||||
* kernel threads (both mm NULL), since they never
|
||||
* leave kernel.
|
||||
*/
|
||||
if (p->mm && printk_ratelimit()) {
|
||||
printk(KERN_INFO "process %d (%s) no "
|
||||
"longer affine to cpu%d\n",
|
||||
task_pid_nr(p), p->comm, cpu);
|
||||
}
|
||||
dest_cpu = cpuset_cpus_allowed_fallback(p);
|
||||
/*
|
||||
* Don't tell them about moving exiting tasks or
|
||||
* kernel threads (both mm NULL), since they never
|
||||
* leave kernel.
|
||||
*/
|
||||
if (p->mm && printk_ratelimit()) {
|
||||
printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
|
||||
task_pid_nr(p), p->comm, cpu);
|
||||
}
|
||||
|
||||
return dest_cpu;
|
||||
|
@ -5712,96 +5709,6 @@ static int migration_cpu_stop(void *data)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
/*
|
||||
* Figure out where task on dead CPU should go, use force if necessary.
|
||||
*/
|
||||
void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
|
||||
{
|
||||
struct rq *rq = cpu_rq(dead_cpu);
|
||||
int needs_cpu, uninitialized_var(dest_cpu);
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
raw_spin_lock(&rq->lock);
|
||||
needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
|
||||
if (needs_cpu)
|
||||
dest_cpu = select_fallback_rq(dead_cpu, p);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
/*
|
||||
* It can only fail if we race with set_cpus_allowed(),
|
||||
* in the racer should migrate the task anyway.
|
||||
*/
|
||||
if (needs_cpu)
|
||||
__migrate_task(p, dead_cpu, dest_cpu);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* While a dead CPU has no uninterruptible tasks queued at this point,
|
||||
* it might still have a nonzero ->nr_uninterruptible counter, because
|
||||
* for performance reasons the counter is not stricly tracking tasks to
|
||||
* their home CPUs. So we just add the counter to another CPU's counter,
|
||||
* to keep the global sum constant after CPU-down:
|
||||
*/
|
||||
static void migrate_nr_uninterruptible(struct rq *rq_src)
|
||||
{
|
||||
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
double_rq_lock(rq_src, rq_dest);
|
||||
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
|
||||
rq_src->nr_uninterruptible = 0;
|
||||
double_rq_unlock(rq_src, rq_dest);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
|
||||
/* Run through task list and migrate tasks from the dead cpu. */
|
||||
static void migrate_live_tasks(int src_cpu)
|
||||
{
|
||||
struct task_struct *p, *t;
|
||||
|
||||
read_lock(&tasklist_lock);
|
||||
|
||||
do_each_thread(t, p) {
|
||||
if (p == current)
|
||||
continue;
|
||||
|
||||
if (task_cpu(p) == src_cpu)
|
||||
move_task_off_dead_cpu(src_cpu, p);
|
||||
} while_each_thread(t, p);
|
||||
|
||||
read_unlock(&tasklist_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Schedules idle task to be the next runnable task on current CPU.
|
||||
* It does so by boosting its priority to highest possible.
|
||||
* Used by CPU offline code.
|
||||
*/
|
||||
void sched_idle_next(void)
|
||||
{
|
||||
int this_cpu = smp_processor_id();
|
||||
struct rq *rq = cpu_rq(this_cpu);
|
||||
struct task_struct *p = rq->idle;
|
||||
unsigned long flags;
|
||||
|
||||
/* cpu has to be offline */
|
||||
BUG_ON(cpu_online(this_cpu));
|
||||
|
||||
/*
|
||||
* Strictly not necessary since rest of the CPUs are stopped by now
|
||||
* and interrupts disabled on the current cpu.
|
||||
*/
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
|
||||
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
|
||||
|
||||
activate_task(rq, p, 0);
|
||||
|
||||
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensures that the idle task is using init_mm right before its cpu goes
|
||||
|
@ -5818,47 +5725,19 @@ void idle_task_exit(void)
|
|||
mmdrop(mm);
|
||||
}
|
||||
|
||||
/* called under rq->lock with disabled interrupts */
|
||||
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
|
||||
/*
|
||||
* While a dead CPU has no uninterruptible tasks queued at this point,
|
||||
* it might still have a nonzero ->nr_uninterruptible counter, because
|
||||
* for performance reasons the counter is not stricly tracking tasks to
|
||||
* their home CPUs. So we just add the counter to another CPU's counter,
|
||||
* to keep the global sum constant after CPU-down:
|
||||
*/
|
||||
static void migrate_nr_uninterruptible(struct rq *rq_src)
|
||||
{
|
||||
struct rq *rq = cpu_rq(dead_cpu);
|
||||
struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
|
||||
|
||||
/* Must be exiting, otherwise would be on tasklist. */
|
||||
BUG_ON(!p->exit_state);
|
||||
|
||||
/* Cannot have done final schedule yet: would have vanished. */
|
||||
BUG_ON(p->state == TASK_DEAD);
|
||||
|
||||
get_task_struct(p);
|
||||
|
||||
/*
|
||||
* Drop lock around migration; if someone else moves it,
|
||||
* that's OK. No task can be added to this CPU, so iteration is
|
||||
* fine.
|
||||
*/
|
||||
raw_spin_unlock_irq(&rq->lock);
|
||||
move_task_off_dead_cpu(dead_cpu, p);
|
||||
raw_spin_lock_irq(&rq->lock);
|
||||
|
||||
put_task_struct(p);
|
||||
}
|
||||
|
||||
/* release_task() removes task from tasklist, so we won't find dead tasks. */
|
||||
static void migrate_dead_tasks(unsigned int dead_cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(dead_cpu);
|
||||
struct task_struct *next;
|
||||
|
||||
for ( ; ; ) {
|
||||
if (!rq->nr_running)
|
||||
break;
|
||||
next = pick_next_task(rq);
|
||||
if (!next)
|
||||
break;
|
||||
next->sched_class->put_prev_task(rq, next);
|
||||
migrate_dead(dead_cpu, next);
|
||||
|
||||
}
|
||||
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
|
||||
rq_src->nr_uninterruptible = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -5869,6 +5748,56 @@ static void calc_global_load_remove(struct rq *rq)
|
|||
atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
|
||||
rq->calc_load_active = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Migrate all tasks from the rq, sleeping tasks will be migrated by
|
||||
* try_to_wake_up()->select_task_rq().
|
||||
*
|
||||
* Called with rq->lock held even though we'er in stop_machine() and
|
||||
* there's no concurrency possible, we hold the required locks anyway
|
||||
* because of lock validation efforts.
|
||||
*/
|
||||
static void migrate_tasks(unsigned int dead_cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(dead_cpu);
|
||||
struct task_struct *next, *stop = rq->stop;
|
||||
int dest_cpu;
|
||||
|
||||
/*
|
||||
* Fudge the rq selection such that the below task selection loop
|
||||
* doesn't get stuck on the currently eligible stop task.
|
||||
*
|
||||
* We're currently inside stop_machine() and the rq is either stuck
|
||||
* in the stop_machine_cpu_stop() loop, or we're executing this code,
|
||||
* either way we should never end up calling schedule() until we're
|
||||
* done here.
|
||||
*/
|
||||
rq->stop = NULL;
|
||||
|
||||
for ( ; ; ) {
|
||||
/*
|
||||
* There's this thread running, bail when that's the only
|
||||
* remaining thread.
|
||||
*/
|
||||
if (rq->nr_running == 1)
|
||||
break;
|
||||
|
||||
next = pick_next_task(rq);
|
||||
BUG_ON(!next);
|
||||
next->sched_class->put_prev_task(rq, next);
|
||||
|
||||
/* Find suitable destination for @next, with force if needed. */
|
||||
dest_cpu = select_fallback_rq(dead_cpu, next);
|
||||
raw_spin_unlock(&rq->lock);
|
||||
|
||||
__migrate_task(next, dead_cpu, dest_cpu);
|
||||
|
||||
raw_spin_lock(&rq->lock);
|
||||
}
|
||||
|
||||
rq->stop = stop;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_HOTPLUG_CPU */
|
||||
|
||||
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
|
||||
|
@ -6078,15 +6007,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
|||
unsigned long flags;
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
switch (action) {
|
||||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
|
||||
case CPU_UP_PREPARE:
|
||||
case CPU_UP_PREPARE_FROZEN:
|
||||
rq->calc_load_update = calc_load_update;
|
||||
break;
|
||||
|
||||
case CPU_ONLINE:
|
||||
case CPU_ONLINE_FROZEN:
|
||||
/* Update our root-domain */
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
if (rq->rd) {
|
||||
|
@ -6098,30 +6025,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
|||
break;
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
case CPU_DEAD:
|
||||
case CPU_DEAD_FROZEN:
|
||||
migrate_live_tasks(cpu);
|
||||
/* Idle task back to normal (off runqueue, low prio) */
|
||||
raw_spin_lock_irq(&rq->lock);
|
||||
deactivate_task(rq, rq->idle, 0);
|
||||
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
|
||||
rq->idle->sched_class = &idle_sched_class;
|
||||
migrate_dead_tasks(cpu);
|
||||
raw_spin_unlock_irq(&rq->lock);
|
||||
migrate_nr_uninterruptible(rq);
|
||||
BUG_ON(rq->nr_running != 0);
|
||||
calc_global_load_remove(rq);
|
||||
break;
|
||||
|
||||
case CPU_DYING:
|
||||
case CPU_DYING_FROZEN:
|
||||
/* Update our root-domain */
|
||||
raw_spin_lock_irqsave(&rq->lock, flags);
|
||||
if (rq->rd) {
|
||||
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
|
||||
set_rq_offline(rq);
|
||||
}
|
||||
migrate_tasks(cpu);
|
||||
BUG_ON(rq->nr_running != 1); /* the migration thread */
|
||||
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
||||
|
||||
migrate_nr_uninterruptible(rq);
|
||||
calc_global_load_remove(rq);
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue