Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The main changes in this cycle were:

   - sched/fair load tracking fixes and cleanups (Byungchul Park)

   - Make load tracking frequency scale invariant (Dietmar Eggemann)

   - sched/deadline updates (Juri Lelli)

   - stop machine fixes, cleanups and enhancements for bugs triggered by
     CPU hotplug stress testing (Oleg Nesterov)

   - scheduler preemption code rework: remove PREEMPT_ACTIVE and related
     cleanups (Peter Zijlstra)

   - Rework the sched_info::run_delay code to fix races (Peter Zijlstra)

   - Optimize per entity utilization tracking (Peter Zijlstra)

   - ... misc other fixes, cleanups and smaller updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  sched: Don't scan all-offline ->cpus_allowed twice if !CONFIG_CPUSETS
  sched: Move cpu_active() tests from stop_two_cpus() into migrate_swap_stop()
  sched: Start stopper early
  stop_machine: Kill cpu_stop_threads->setup() and cpu_stop_unpark()
  stop_machine: Kill smp_hotplug_thread->pre_unpark, introduce stop_machine_unpark()
  stop_machine: Change cpu_stop_queue_two_works() to rely on stopper->enabled
  stop_machine: Introduce __cpu_stop_queue_work() and cpu_stop_queue_two_works()
  stop_machine: Ensure that a queued callback will be called before cpu_stop_park()
  sched/x86: Fix typo in __switch_to() comments
  sched/core: Remove a parameter in the migrate_task_rq() function
  sched/core: Drop unlikely behind BUG_ON()
  sched/core: Fix task and run queue sched_info::run_delay inconsistencies
  sched/numa: Fix task_tick_fair() from disabling numa_balancing
  sched/core: Add preempt_count invariant check
  sched/core: More notrace annotations
  sched/core: Kill PREEMPT_ACTIVE
  sched/core, sched/x86: Kill thread_info::saved_preempt_count
  sched/core: Simplify preempt_count tests
  sched/core: Robustify preemption leak checks
  sched/core: Stop setting PREEMPT_ACTIVE
  ...
commit 53528695ff
26 changed files with 497 additions and 474 deletions
@@ -30,12 +30,9 @@ static __always_inline void preempt_count_set(int pc)
 /*
  * must be macros to avoid header recursion hell
  */
-#define init_task_preempt_count(p) do { \
-        task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
-} while (0)
+#define init_task_preempt_count(p) do { } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
-        task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
         per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
 } while (0)
 
@@ -57,7 +57,6 @@ struct thread_info {
         __u32 flags; /* low level flags */
         __u32 status; /* thread synchronous flags */
         __u32 cpu; /* current CPU */
-        int saved_preempt_count;
         mm_segment_t addr_limit;
         void __user *sysenter_return;
         unsigned int sig_on_uaccess_error:1;
@@ -69,7 +68,6 @@ struct thread_info {
         .task = &tsk, \
         .flags = 0, \
         .cpu = 0, \
-        .saved_preempt_count = INIT_PREEMPT_COUNT, \
         .addr_limit = KERNEL_DS, \
 }
 
@@ -279,14 +279,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                 set_iopl_mask(next->iopl);
 
-        /*
-         * If it were not for PREEMPT_ACTIVE we could guarantee that the
-         * preempt_count of all tasks was equal here and this would not be
-         * needed.
-         */
-        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
         /*
          * Now maybe handle debug registers and/or IO bitmaps
          */
@@ -332,7 +332,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         /*
          * Switch FS and GS.
          *
-         * These are even more complicated than FS and GS: they have
+         * These are even more complicated than DS and ES: they have
          * 64-bit bases are that controlled by arch_prctl. Those bases
          * only differ from the values in the GDT or LDT if the selector
          * is 0.
@@ -401,14 +401,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
          */
         this_cpu_write(current_task, next_p);
 
-        /*
-         * If it were not for PREEMPT_ACTIVE we could guarantee that the
-         * preempt_count of all tasks was equal here and this would not be
-         * needed.
-         */
-        task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
-        this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
-
         /* Reload esp0 and ss1. This changes current_thread_info(). */
         load_sp0(tss, next);
 
@@ -24,7 +24,7 @@ static __always_inline void preempt_count_set(int pc)
  * must be macros to avoid header recursion hell
  */
 #define init_task_preempt_count(p) do { \
-        task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
+        task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
 } while (0)
 
 #define init_idle_preempt_count(p, cpu) do { \
@@ -26,7 +26,6 @@
  * SOFTIRQ_MASK: 0x0000ff00
  * HARDIRQ_MASK: 0x000f0000
  * NMI_MASK: 0x00100000
- * PREEMPT_ACTIVE: 0x00200000
  * PREEMPT_NEED_RESCHED: 0x80000000
  */
 #define PREEMPT_BITS 8
@@ -53,10 +52,6 @@
 
 #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
 
-#define PREEMPT_ACTIVE_BITS 1
-#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
-#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT)
-
 /* We use the MSB mostly because its available */
 #define PREEMPT_NEED_RESCHED 0x80000000
 
@@ -126,8 +121,7 @@
  * Check whether we were atomic before we did preempt_disable():
  * (used by the scheduler)
  */
-#define in_atomic_preempt_off() \
-                ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_DISABLE_OFFSET)
+#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
 
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
 extern void preempt_count_add(int val);
@@ -146,18 +140,6 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
-#define preempt_active_enter() \
-do { \
-        preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-        barrier(); \
-} while (0)
-
-#define preempt_active_exit() \
-do { \
-        barrier(); \
-        preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
-
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
@@ -599,20 +599,26 @@ struct task_cputime_atomic {
                 .sum_exec_runtime = ATOMIC64_INIT(0), \
         }
 
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
-#else
-#define PREEMPT_DISABLED PREEMPT_ENABLED
-#endif
+#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
 /*
- * Disable preemption until the scheduler is running.
- * Reset by start_kernel()->sched_init()->init_idle().
+ * Disable preemption until the scheduler is running -- use an unconditional
+ * value so that it also works on !PREEMPT_COUNT kernels.
  *
- * We include PREEMPT_ACTIVE to avoid cond_resched() from working
- * before the scheduler is active -- see should_resched().
+ * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
  */
-#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
+#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
 
+/*
+ * Initial preempt_count value; reflects the preempt_count schedule invariant
+ * which states that during context switches:
+ *
+ *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
+ *
+ * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
+ * Note: See finish_task_switch().
+ */
+#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
+
 /**
  * struct thread_group_cputimer - thread group interval timer counts
@@ -1142,8 +1148,6 @@ struct sched_domain_topology_level {
 #endif
 };
 
-extern struct sched_domain_topology_level *sched_domain_topology;
-
 extern void set_sched_topology(struct sched_domain_topology_level *tl);
 extern void wake_up_if_idle(int cpu);
 
@@ -1192,10 +1196,10 @@ struct load_weight {
 
 /*
  * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors the amount of time that a sched_entity is
- * runnable on a rq into its weight. For cfs_rq, it is the aggregated
- * such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency scaling into the amount of time
+ * 1) load_avg factors frequency scaling into the amount of time that a
+ * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
+ * aggregated such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency and cpu scaling into the amount of time
  * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
  * For cfs_rq, it is the aggregated such times of all runnable and
  * blocked sched_entities.
@@ -21,4 +21,9 @@ static inline int dl_task(struct task_struct *p)
         return dl_prio(p->prio);
 }
 
+static inline bool dl_time_before(u64 a, u64 b)
+{
+        return (s64)(a - b) < 0;
+}
+
 #endif /* _SCHED_DEADLINE_H */
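The hunk above (together with the kernel/locking/rtmutex.c change further down) introduces dl_time_before(), a wraparound-safe comparison of 64-bit deadline timestamps: the unsigned difference is cast to a signed type, so 'a' still compares as earlier than 'b' even after the clock value wraps. A minimal stand-alone sketch of the same idiom follows; the user-space harness and the sample values are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Same comparison as the kernel helper: a negative signed difference
 * means 'a' lies before 'b', even across a u64 wraparound. */
static inline int dl_time_before(uint64_t a, uint64_t b)
{
        return (int64_t)(a - b) < 0;
}

int main(void)
{
        uint64_t near_wrap = UINT64_MAX - 5;  /* deadline just before the clock wraps */
        uint64_t after_wrap = 10;             /* deadline just after the wrap */

        /* A plain 'a < b' would give the wrong answer around the wrap point. */
        printf("%d\n", dl_time_before(near_wrap, after_wrap));  /* prints 1 */
        printf("%d\n", dl_time_before(after_wrap, near_wrap));  /* prints 0 */
        return 0;
}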
@@ -24,9 +24,6 @@ struct smpboot_thread_data;
  * parked (cpu offline)
  * @unpark: Optional unpark function, called when the thread is
  * unparked (cpu online)
- * @pre_unpark: Optional unpark function, called before the thread is
- * unparked (cpu online). This is not guaranteed to be
- * called on the target cpu of the thread. Careful!
  * @cpumask: Internal state. To update which threads are unparked,
  * call smpboot_update_cpumask_percpu_thread().
  * @selfparking: Thread is not parked by the park function.
@@ -42,7 +39,6 @@ struct smp_hotplug_thread {
         void (*cleanup)(unsigned int cpu, bool online);
         void (*park)(unsigned int cpu);
         void (*unpark)(unsigned int cpu);
-        void (*pre_unpark)(unsigned int cpu);
         cpumask_var_t cpumask;
         bool selfparking;
         const char *thread_comm;
@@ -33,6 +33,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
                          struct cpu_stop_work *work_buf);
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
+void stop_machine_park(int cpu);
+void stop_machine_unpark(int cpu);
 
 #else /* CONFIG_SMP */
 
@@ -104,22 +104,17 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
              TP_ARGS(p));
 
 #ifdef CREATE_TRACE_POINTS
-static inline long __trace_sched_switch_state(struct task_struct *p)
+static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
 {
-        long state = p->state;
-
-#ifdef CONFIG_PREEMPT
 #ifdef CONFIG_SCHED_DEBUG
         BUG_ON(p != current);
 #endif /* CONFIG_SCHED_DEBUG */
+
         /*
-         * For all intents and purposes a preempted task is a running task.
+         * Preemption ignores task state, therefore preempted tasks are always
+         * RUNNING (we will not have dequeued if state != RUNNING).
          */
-        if (preempt_count() & PREEMPT_ACTIVE)
-                state = TASK_RUNNING | TASK_STATE_MAX;
-#endif /* CONFIG_PREEMPT */
-
-        return state;
+        return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
 }
 #endif /* CREATE_TRACE_POINTS */
 
@@ -128,10 +123,11 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
  */
 TRACE_EVENT(sched_switch,
 
-        TP_PROTO(struct task_struct *prev,
+        TP_PROTO(bool preempt,
+                 struct task_struct *prev,
                  struct task_struct *next),
 
-        TP_ARGS(prev, next),
+        TP_ARGS(preempt, prev, next),
 
         TP_STRUCT__entry(
                 __array( char, prev_comm, TASK_COMM_LEN )
@@ -147,7 +143,7 @@ TRACE_EVENT(sched_switch,
                 memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                 __entry->prev_pid = prev->pid;
                 __entry->prev_prio = prev->prio;
-                __entry->prev_state = __trace_sched_switch_state(prev);
+                __entry->prev_state = __trace_sched_switch_state(preempt, prev);
                 memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                 __entry->next_pid = next->pid;
                 __entry->next_prio = next->prio;
kernel/cpu.c
@@ -291,8 +291,8 @@ static inline void check_for_tasks(int dead_cpu)
 {
         struct task_struct *g, *p;
 
-        read_lock_irq(&tasklist_lock);
-        do_each_thread(g, p) {
+        read_lock(&tasklist_lock);
+        for_each_process_thread(g, p) {
                 if (!p->on_rq)
                         continue;
                 /*
@@ -307,8 +307,8 @@ static inline void check_for_tasks(int dead_cpu)
 
                 pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
                         p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
-        } while_each_thread(g, p);
-        read_unlock_irq(&tasklist_lock);
+        }
+        read_unlock(&tasklist_lock);
 }
 
 struct take_cpu_down_param {
@@ -331,7 +331,7 @@ static int take_cpu_down(void *_param)
         /* Give up timekeeping duties */
         tick_handover_do_timer();
         /* Park the stopper thread */
-        kthread_park(current);
+        stop_machine_park((long)param->hcpu);
         return 0;
 }
 
@@ -706,10 +706,12 @@ void do_exit(long code)
         smp_mb();
         raw_spin_unlock_wait(&tsk->pi_lock);
 
-        if (unlikely(in_atomic()))
+        if (unlikely(in_atomic())) {
                 pr_info("note: %s[%d] exited with preempt_count %d\n",
                         current->comm, task_pid_nr(current),
                         preempt_count());
+                preempt_count_set(PREEMPT_ENABLED);
+        }
 
         /* sync mm's RSS info before statistics gathering */
         if (tsk->mm)
@@ -170,7 +170,8 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left,
          * then right waiter has a dl_prio() too.
          */
         if (dl_prio(left->prio))
-                return (left->task->dl.deadline < right->task->dl.deadline);
+                return dl_time_before(left->task->dl.deadline,
+                                      right->task->dl.deadline);
 
         return 0;
 }
@@ -817,7 +817,7 @@ static void set_load_weight(struct task_struct *p)
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
-        if (p->policy == SCHED_IDLE) {
+        if (idle_policy(p->policy)) {
                 load->weight = scale_load(WEIGHT_IDLEPRIO);
                 load->inv_weight = WMULT_IDLEPRIO;
                 return;
@@ -827,17 +827,19 @@ static void set_load_weight(struct task_struct *p)
         load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
         update_rq_clock(rq);
-        sched_info_queued(rq, p);
+        if (!(flags & ENQUEUE_RESTORE))
+                sched_info_queued(rq, p);
         p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
         update_rq_clock(rq);
-        sched_info_dequeued(rq, p);
+        if (!(flags & DEQUEUE_SAVE))
+                sched_info_dequeued(rq, p);
         p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
                  * holding rq->lock.
                  */
                 lockdep_assert_held(&rq->lock);
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
         }
         if (running)
                 put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued)
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 
 /*
@@ -1292,7 +1294,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
         if (task_cpu(p) != new_cpu) {
                 if (p->sched_class->migrate_task_rq)
-                        p->sched_class->migrate_task_rq(p, new_cpu);
+                        p->sched_class->migrate_task_rq(p);
                 p->se.nr_migrations++;
                 perf_event_task_migrate(p);
         }
@@ -1333,12 +1335,16 @@ static int migrate_swap_stop(void *data)
         struct rq *src_rq, *dst_rq;
         int ret = -EAGAIN;
 
+        if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
+                return -EAGAIN;
+
         src_rq = cpu_rq(arg->src_cpu);
         dst_rq = cpu_rq(arg->dst_cpu);
 
         double_raw_lock(&arg->src_task->pi_lock,
                         &arg->dst_task->pi_lock);
         double_rq_lock(src_rq, dst_rq);
+
         if (task_cpu(arg->dst_task) != arg->dst_cpu)
                 goto unlock;
 
@@ -1574,13 +1580,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                                 goto out;
                 }
 
+                /* No more Mr. Nice Guy. */
                 switch (state) {
                 case cpuset:
-                        /* No more Mr. Nice Guy. */
-                        cpuset_cpus_allowed_fallback(p);
-                        state = possible;
-                        break;
-
+                        if (IS_ENABLED(CONFIG_CPUSETS)) {
+                                cpuset_cpus_allowed_fallback(p);
+                                state = possible;
+                                break;
+                        }
+                        /* fall-through */
                 case possible:
                         do_set_cpus_allowed(p, cpu_possible_mask);
                         state = fail;
@@ -1692,7 +1700,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
         activate_task(rq, p, en_flags);
         p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2114,23 +2122,17 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
+
 #ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_SCHED_DEBUG
-void set_numabalancing_state(bool enabled)
-{
-        if (enabled)
-                sched_feat_set("NUMA");
-        else
-                sched_feat_set("NO_NUMA");
-}
-#else
-__read_mostly bool numabalancing_enabled;
 
 void set_numabalancing_state(bool enabled)
 {
-        numabalancing_enabled = enabled;
+        if (enabled)
+                static_branch_enable(&sched_numa_balancing);
+        else
+                static_branch_disable(&sched_numa_balancing);
 }
-#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_numa_balancing(struct ctl_table *table, int write,
@@ -2138,7 +2140,7 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 {
         struct ctl_table t;
         int err;
-        int state = numabalancing_enabled;
+        int state = static_branch_likely(&sched_numa_balancing);
 
         if (write && !capable(CAP_SYS_ADMIN))
                 return -EPERM;
@@ -2349,6 +2351,8 @@ void wake_up_new_task(struct task_struct *p)
         struct rq *rq;
 
         raw_spin_lock_irqsave(&p->pi_lock, flags);
+        /* Initialize new task's runnable average */
+        init_entity_runnable_average(&p->se);
 #ifdef CONFIG_SMP
         /*
          * Fork balancing, do it here and not earlier because:
@@ -2358,8 +2362,6 @@ void wake_up_new_task(struct task_struct *p)
         set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
-        /* Initialize new task's runnable average */
-        init_entity_runnable_average(&p->se);
         rq = __task_rq_lock(p);
         activate_task(rq, p, 0);
         p->on_rq = TASK_ON_RQ_QUEUED;
@@ -2483,7 +2485,6 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
                     struct task_struct *next)
 {
-        trace_sched_switch(prev, next);
         sched_info_switch(rq, prev, next);
         perf_event_task_sched_out(prev, next);
         fire_sched_out_preempt_notifiers(prev, next);
@@ -2517,6 +2518,22 @@ static struct rq *finish_task_switch(struct task_struct *prev)
         struct mm_struct *mm = rq->prev_mm;
         long prev_state;
 
+        /*
+         * The previous task will have left us with a preempt_count of 2
+         * because it left us after:
+         *
+         *        schedule()
+         *          preempt_disable();                        // 1
+         *          __schedule()
+         *            raw_spin_lock_irq(&rq->lock)        // 2
+         *
+         * Also, see FORK_PREEMPT_COUNT.
+         */
+        if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+                      "corrupted preempt_count: %s/%d/0x%x\n",
+                      current->comm, current->pid, preempt_count()))
+                preempt_count_set(FORK_PREEMPT_COUNT);
+
         rq->prev_mm = NULL;
 
         /*
@@ -2601,8 +2618,15 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 {
         struct rq *rq;
 
-        /* finish_task_switch() drops rq->lock and enables preemtion */
-        preempt_disable();
+        /*
+         * New tasks start with FORK_PREEMPT_COUNT, see there and
+         * finish_task_switch() for details.
+         *
+         * finish_task_switch() will drop rq->lock() and lower preempt_count
+         * and the preempt_enable() will end up enabling preemption (on
+         * PREEMPT_COUNT kernels).
+         */
+
         rq = finish_task_switch(prev);
         balance_callback(rq);
         preempt_enable();
@@ -2960,15 +2984,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-        BUG_ON(unlikely(task_stack_end_corrupted(prev)));
+        BUG_ON(task_stack_end_corrupted(prev));
 #endif
-        /*
-         * Test if we are atomic. Since do_exit() needs to call into
-         * schedule() atomically, we ignore that path. Otherwise whine
-         * if we are scheduling when we should not.
-         */
-        if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
+
+        if (unlikely(in_atomic_preempt_off())) {
                 __schedule_bug(prev);
+                preempt_count_set(PREEMPT_DISABLED);
+        }
         rcu_sleep_check();
 
         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -3054,7 +3076,7 @@ again:
  *
  * WARNING: must be called with preemption disabled!
  */
-static void __sched __schedule(void)
+static void __sched notrace __schedule(bool preempt)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
@@ -3066,6 +3088,17 @@ static void __sched __schedule(void)
         rcu_note_context_switch();
         prev = rq->curr;
 
+        /*
+         * do_exit() calls schedule() with preemption disabled as an exception;
+         * however we must fix that up, otherwise the next task will see an
+         * inconsistent (higher) preempt count.
+         *
+         * It also avoids the below schedule_debug() test from complaining
+         * about this.
+         */
+        if (unlikely(prev->state == TASK_DEAD))
+                preempt_enable_no_resched_notrace();
+
         schedule_debug(prev);
 
         if (sched_feat(HRTICK))
@@ -3083,7 +3116,7 @@ static void __sched __schedule(void)
         rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
         switch_count = &prev->nivcsw;
-        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+        if (!preempt && prev->state) {
                 if (unlikely(signal_pending_state(prev->state, prev))) {
                         prev->state = TASK_RUNNING;
                 } else {
@@ -3119,6 +3152,7 @@ static void __sched __schedule(void)
                 rq->curr = next;
                 ++*switch_count;
 
+                trace_sched_switch(preempt, prev, next);
                 rq = context_switch(rq, prev, next); /* unlocks the rq */
                 cpu = cpu_of(rq);
         } else {
@@ -3148,7 +3182,7 @@ asmlinkage __visible void __sched schedule(void)
         sched_submit_work(tsk);
         do {
                 preempt_disable();
-                __schedule();
+                __schedule(false);
                 sched_preempt_enable_no_resched();
         } while (need_resched());
 }
@@ -3188,9 +3222,9 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
         do {
-                preempt_active_enter();
-                __schedule();
-                preempt_active_exit();
+                preempt_disable_notrace();
+                __schedule(true);
+                preempt_enable_no_resched_notrace();
 
                 /*
                  * Check again in case we missed a preemption opportunity
@@ -3241,24 +3275,17 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 return;
 
         do {
-                /*
-                 * Use raw __prempt_count() ops that don't call function.
-                 * We can't call functions before disabling preemption which
-                 * disarm preemption tracing recursions.
-                 */
-                __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-                barrier();
+                preempt_disable_notrace();
                 /*
                  * Needs preempt disabled in case user_exit() is traced
                  * and the tracer calls preempt_enable_notrace() causing
                  * an infinite recursion.
                  */
                 prev_ctx = exception_enter();
-                __schedule();
+                __schedule(true);
                 exception_exit(prev_ctx);
 
-                barrier();
-                __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+                preempt_enable_no_resched_notrace();
         } while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3281,11 +3308,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
         prev_state = exception_enter();
 
         do {
-                preempt_active_enter();
+                preempt_disable();
                 local_irq_enable();
-                __schedule();
+                __schedule(true);
                 local_irq_disable();
-                preempt_active_exit();
+                sched_preempt_enable_no_resched();
         } while (need_resched());
 
         exception_exit(prev_state);
@@ -3313,7 +3340,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-        int oldprio, queued, running, enqueue_flag = 0;
+        int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
         struct rq *rq;
         const struct sched_class *prev_class;
 
@@ -3345,7 +3372,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
         if (running)
                 put_prev_task(rq, p);
 
@@ -3363,7 +3390,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (!dl_prio(p->normal_prio) ||
                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                         p->dl.dl_boosted = 1;
-                        enqueue_flag = ENQUEUE_REPLENISH;
+                        enqueue_flag |= ENQUEUE_REPLENISH;
                 } else
                         p->dl.dl_boosted = 0;
                 p->sched_class = &dl_sched_class;
@@ -3371,7 +3398,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                 if (dl_prio(oldprio))
                         p->dl.dl_boosted = 0;
                 if (oldprio < prio)
-                        enqueue_flag = ENQUEUE_HEAD;
+                        enqueue_flag |= ENQUEUE_HEAD;
                 p->sched_class = &rt_sched_class;
         } else {
                 if (dl_prio(oldprio))
@@ -3423,7 +3450,7 @@ void set_user_nice(struct task_struct *p, long nice)
         }
         queued = task_on_rq_queued(p);
         if (queued)
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
 
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
@@ -3432,7 +3459,7 @@ void set_user_nice(struct task_struct *p, long nice)
         delta = p->prio - old_prio;
 
         if (queued) {
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, ENQUEUE_RESTORE);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@ -3753,10 +3780,7 @@ recheck:
         } else {
                 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-                if (policy != SCHED_DEADLINE &&
-                                policy != SCHED_FIFO && policy != SCHED_RR &&
-                                policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-                                policy != SCHED_IDLE)
+                if (!valid_policy(policy))
                         return -EINVAL;
         }
 
@@ -3812,7 +3836,7 @@ recheck:
                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                  */
-                if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+                if (idle_policy(p->policy) && !idle_policy(policy)) {
                         if (!can_nice(p, task_nice(p)))
                                 return -EPERM;
                 }
@@ -3937,7 +3961,7 @@ change:
         queued = task_on_rq_queued(p);
         running = task_current(rq, p);
         if (queued)
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
         if (running)
                 put_prev_task(rq, p);
 
@@ -3947,11 +3971,15 @@ change:
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued) {
+                int enqueue_flags = ENQUEUE_RESTORE;
                 /*
                  * We enqueue to tail when the priority of a task is
                  * increased (user space view).
                  */
-                enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+                if (oldprio <= p->prio)
+                        enqueue_flags |= ENQUEUE_HEAD;
+
+                enqueue_task(rq, p, enqueue_flags);
         }
 
         check_class_changed(rq, p, prev_class, oldprio);
@@ -5101,7 +5129,7 @@ void sched_setnuma(struct task_struct *p, int nid)
         running = task_current(rq, p);
 
         if (queued)
-                dequeue_task(rq, p, 0);
+                dequeue_task(rq, p, DEQUEUE_SAVE);
         if (running)
                 put_prev_task(rq, p);
 
@@ -5110,7 +5138,7 @@ void sched_setnuma(struct task_struct *p, int nid)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (queued)
-                enqueue_task(rq, p, 0);
+                enqueue_task(rq, p, ENQUEUE_RESTORE);
         task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -5531,21 +5559,27 @@ static void set_cpu_rq_start_time(void)
 static int sched_cpu_active(struct notifier_block *nfb,
                                       unsigned long action, void *hcpu)
 {
+        int cpu = (long)hcpu;
+
         switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_STARTING:
                 set_cpu_rq_start_time();
                 return NOTIFY_OK;
 
         case CPU_ONLINE:
                 /*
                  * At this point a starting CPU has marked itself as online via
                  * set_cpu_online(). But it might not yet have marked itself
                  * as active, which is essential from here on.
-                 *
-                 * Thus, fall-through and help the starting CPU along.
                  */
-        case CPU_DOWN_FAILED:
-                set_cpu_active((long)hcpu, true);
+                set_cpu_active(cpu, true);
+                stop_machine_unpark(cpu);
+                return NOTIFY_OK;
+
+        case CPU_DOWN_FAILED:
+                set_cpu_active(cpu, true);
                 return NOTIFY_OK;
 
         default:
                 return NOTIFY_DONE;
         }
@@ -6477,7 +6511,8 @@ static struct sched_domain_topology_level default_topology[] = {
         { NULL, },
 };
 
-struct sched_domain_topology_level *sched_domain_topology = default_topology;
+static struct sched_domain_topology_level *sched_domain_topology =
+        default_topology;
 
 #define for_each_sd_topology(tl) \
         for (tl = sched_domain_topology; tl->mask; tl++)
@@ -7478,7 +7513,7 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-        int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+        int nested = preempt_count() + rcu_preempt_depth();
 
         return (nested == preempt_offset);
 }
@@ -7725,7 +7760,7 @@ void sched_move_task(struct task_struct *tsk)
         queued = task_on_rq_queued(tsk);
 
         if (queued)
-                dequeue_task(rq, tsk, 0);
+                dequeue_task(rq, tsk, DEQUEUE_SAVE);
         if (unlikely(running))
                 put_prev_task(rq, tsk);
 
@@ -7741,7 +7776,7 @@ void sched_move_task(struct task_struct *tsk)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
         if (tsk->sched_class->task_move_group)
-                tsk->sched_class->task_move_group(tsk, queued);
+                tsk->sched_class->task_move_group(tsk);
         else
 #endif
                 set_task_rq(tsk, task_cpu(tsk));
@@ -7749,7 +7784,7 @@ void sched_move_task(struct task_struct *tsk)
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
         if (queued)
-                enqueue_task(rq, tsk, 0);
+                enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 
         task_rq_unlock(rq, tsk, &flags);
 }
@@ -8213,14 +8248,6 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
                             struct cgroup_subsys_state *old_css,
                             struct task_struct *task)
 {
-        /*
-         * cgroup_exit() is called in the copy_process() failure path.
-         * Ignore this case since the task hasn't ran yet, this avoids
-         * trying to poke a half freed task state from generic code.
-         */
-        if (!(task->flags & PF_EXITING))
-                return;
-
         sched_move_task(task);
 }
 
@@ -31,11 +31,6 @@ static inline int right_child(int i)
         return (i << 1) + 2;
 }
 
-static inline int dl_time_before(u64 a, u64 b)
-{
-        return (s64)(a - b) < 0;
-}
-
 static void cpudl_exchange(struct cpudl *cp, int a, int b)
 {
         int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
@@ -2,6 +2,7 @@
 #define _LINUX_CPUDL_H
 
 #include <linux/sched.h>
+#include <linux/sched/deadline.h>
 
 #define IDX_INVALID -1
 
@@ -661,11 +661,12 @@ static unsigned long task_h_load(struct task_struct *p);
 
 /*
  * We choose a half-life close to 1 scheduling period.
- * Note: The tables below are dependent on this value.
+ * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
+ * dependent on this value.
  */
 #define LOAD_AVG_PERIOD 32
 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -682,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
         sa->load_avg = scale_load_down(se->load.weight);
         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
         sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-        sa->util_sum = LOAD_AVG_MAX;
+        sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -2069,7 +2070,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         int local = !!(flags & TNF_FAULT_LOCAL);
         int priv;
 
-        if (!numabalancing_enabled)
+        if (!static_branch_likely(&sched_numa_balancing))
                 return;
 
         /* for example, ksmd faulting in a user's mm */
@@ -2157,7 +2158,7 @@ void task_numa_work(struct callback_head *work)
         struct vm_area_struct *vma;
         unsigned long start, end;
         unsigned long nr_pte_updates = 0;
-        long pages;
+        long pages, virtpages;
 
         WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
 
@@ -2203,9 +2204,11 @@ void task_numa_work(struct callback_head *work)
         start = mm->numa_scan_offset;
         pages = sysctl_numa_balancing_scan_size;
         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+        virtpages = pages * 8; /* Scan up to this much virtual space */
         if (!pages)
                 return;
 
+
         down_read(&mm->mmap_sem);
         vma = find_vma(mm, start);
         if (!vma) {
@@ -2240,18 +2243,22 @@ void task_numa_work(struct callback_head *work)
                 start = max(start, vma->vm_start);
                 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                 end = min(end, vma->vm_end);
-                nr_pte_updates += change_prot_numa(vma, start, end);
+                nr_pte_updates = change_prot_numa(vma, start, end);
 
                 /*
-                 * Scan sysctl_numa_balancing_scan_size but ensure that
-                 * at least one PTE is updated so that unused virtual
-                 * address space is quickly skipped.
+                 * Try to scan sysctl_numa_balancing_size worth of
+                 * hpages that have at least one present PTE that
+                 * is not already pte-numa. If the VMA contains
+                 * areas that are unused or already full of prot_numa
+                 * PTEs, scan up to virtpages, to skip through those
+                 * areas faster.
                  */
                 if (nr_pte_updates)
                         pages -= (end - start) >> PAGE_SHIFT;
+                virtpages -= (end - start) >> PAGE_SHIFT;
 
                 start = end;
-                if (pages <= 0)
+                if (pages <= 0 || virtpages <= 0)
                         goto out;
 
                 cond_resched();
@@ -2515,6 +2522,12 @@ static u32 __compute_runnable_contrib(u64 n)
         return contrib + runnable_avg_yN_sum[n];
 }
 
+#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
+#error "load tracking assumes 2^10 as unit"
+#endif
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series. To do this we sub-divide our runnable
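The hunk above adds cap_scale(), the helper the following kernel/sched/fair.c hunks use to make the load and utilization sums frequency- and CPU-invariant: a time delta is multiplied by a capacity factor expressed on a 0..1024 scale (SCHED_CAPACITY_SHIFT == 10, which the #error above enforces) and the 1024 is shifted back out. The sketch below only illustrates that arithmetic; the capacity values are made-up assumptions, and the exact load_sum/util_sum bookkeeping is in the hunks themselves.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10                 /* 1024 == full capacity */
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
        unsigned long delta = 1024;             /* one full load-tracking period */
        unsigned long scale_freq = 512;         /* assumed: CPU running at half its maximum frequency */
        unsigned long scale_cpu = 768;          /* assumed: CPU with 75% of the biggest CPU's capacity */

        /* Frequency-invariant delta: accrued time "shrinks" when the CPU runs slower. */
        unsigned long scaled_delta = cap_scale(delta, scale_freq);      /* 512 */

        /* Applying the CPU-capacity factor on top scales it down further. */
        unsigned long cpu_scaled = cap_scale(scaled_delta, scale_cpu);  /* 384 */

        printf("scaled_delta=%lu cpu_scaled=%lu\n", scaled_delta, cpu_scaled);
        return 0;
}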
@@ -2547,10 +2560,10 @@ static __always_inline int
 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-        u64 delta, periods;
+        u64 delta, scaled_delta, periods;
         u32 contrib;
-        int delta_w, decayed = 0;
-        unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+        unsigned int delta_w, scaled_delta_w, decayed = 0;
+        unsigned long scale_freq, scale_cpu;
 
         delta = now - sa->last_update_time;
         /*
@@ -2571,6 +2584,9 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                 return 0;
         sa->last_update_time = now;
 
+        scale_freq = arch_scale_freq_capacity(NULL, cpu);
+        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
         /* delta_w is the amount already accumulated against our next period */
         delta_w = sa->period_contrib;
         if (delta + delta_w >= 1024) {
@@ -2585,13 +2601,16 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                  * period and accrue it.
                  */
                 delta_w = 1024 - delta_w;
+                scaled_delta_w = cap_scale(delta_w, scale_freq);
                 if (weight) {
-                        sa->load_sum += weight * delta_w;
-                        if (cfs_rq)
-                                cfs_rq->runnable_load_sum += weight * delta_w;
+                        sa->load_sum += weight * scaled_delta_w;
+                        if (cfs_rq) {
+                                cfs_rq->runnable_load_sum +=
+                                                weight * scaled_delta_w;
+                        }
                 }
                 if (running)
-                        sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
+                        sa->util_sum += scaled_delta_w * scale_cpu;
 
                 delta -= delta_w;
 
@@ -2608,23 +2627,25 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 
                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                 contrib = __compute_runnable_contrib(periods);
+                contrib = cap_scale(contrib, scale_freq);
                 if (weight) {
                         sa->load_sum += weight * contrib;
                         if (cfs_rq)
                                 cfs_rq->runnable_load_sum += weight * contrib;
                 }
                 if (running)
-                        sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
+                        sa->util_sum += contrib * scale_cpu;
         }
 
         /* Remainder of delta accrued against u_0` */
+        scaled_delta = cap_scale(delta, scale_freq);
         if (weight) {
-                sa->load_sum += weight * delta;
+                sa->load_sum += weight * scaled_delta;
                 if (cfs_rq)
-                        cfs_rq->runnable_load_sum += weight * delta;
+                        cfs_rq->runnable_load_sum += weight * scaled_delta;
         }
         if (running)
-                sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
+                sa->util_sum += scaled_delta * scale_cpu;
 
         sa->period_contrib += delta;
 
@ -2634,7 +2655,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
||||||
cfs_rq->runnable_load_avg =
|
cfs_rq->runnable_load_avg =
|
||||||
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
|
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
|
||||||
}
|
}
|
||||||
sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
|
sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
return decayed;
|
return decayed;
|
||||||
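The hunks above make the accumulated load/utilization deltas frequency- and CPU-invariant by multiplying them with scale_freq and scale_cpu before they are added to the running sums. The cap_scale() helper itself is not part of this excerpt; the sketch below assumes the usual fixed-point form value * scale >> SCHED_CAPACITY_SHIFT with a scale of 1024 meaning "full capacity/frequency", purely to illustrate the arithmetic, and is not copied from the kernel tree.

    #include <stdio.h>
    #include <stdint.h>

    /* Assumed fixed-point parameters: 1024 == "running at full capacity/frequency". */
    #define DEMO_CAPACITY_SHIFT 10

    /* Scale a time delta by a frequency or capacity factor in [0..1024]. */
    static uint64_t cap_scale_demo(uint64_t delta, unsigned long scale)
    {
        return (delta * scale) >> DEMO_CAPACITY_SHIFT;
    }

    int main(void)
    {
        uint64_t delta = 1024;            /* one full accounting period */
        unsigned long scale_freq = 512;   /* CPU running at half its maximum frequency */
        unsigned long scale_cpu = 768;    /* CPU only 75% as capable as the biggest CPU */

        uint64_t scaled = cap_scale_demo(delta, scale_freq);
        printf("delta %llu -> %llu after frequency scaling\n",
               (unsigned long long)delta, (unsigned long long)scaled);

        /* utilization is additionally weighted by CPU capacity, as in the hunk above */
        printf("utilization contribution: %llu\n",
               (unsigned long long)cap_scale_demo(scaled, scale_cpu));
        return 0;
    }

The effect is that a task running on a slow or down-clocked CPU accrues proportionally less load/utilization than the same task on a big CPU at full speed; the exact bookkeeping of util_sum in the kernel differs in detail from this toy.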
@@ -2677,8 +2698,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                 sa->util_avg = max_t(long, sa->util_avg - r, 0);
-                sa->util_sum = max_t(s32, sa->util_sum -
-                        ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
+                sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
         }
 
         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2696,33 +2716,70 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 static inline void update_load_avg(struct sched_entity *se, int update_tg)
 {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
-        int cpu = cpu_of(rq_of(cfs_rq));
         u64 now = cfs_rq_clock_task(cfs_rq);
+        int cpu = cpu_of(rq_of(cfs_rq));
 
         /*
          * Track task load average for carrying it to new CPU after migrated, and
          * track group sched_entity load average for task_h_load calc in migration
          */
         __update_load_avg(now, cpu, &se->avg,
-                se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
+                          se->on_rq * scale_load_down(se->load.weight),
+                          cfs_rq->curr == se, NULL);
 
         if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                 update_tg_load_avg(cfs_rq, 0);
 }
 
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+        if (!sched_feat(ATTACH_AGE_LOAD))
+                goto skip_aging;
+
+        /*
+         * If we got migrated (either between CPUs or between cgroups) we'll
+         * have aged the average right before clearing @last_update_time.
+         */
+        if (se->avg.last_update_time) {
+                __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+                                  &se->avg, 0, 0, NULL);
+
+                /*
+                 * XXX: we could have just aged the entire load away if we've been
+                 * absent from the fair class for too long.
+                 */
+        }
+
+skip_aging:
+        se->avg.last_update_time = cfs_rq->avg.last_update_time;
+        cfs_rq->avg.load_avg += se->avg.load_avg;
+        cfs_rq->avg.load_sum += se->avg.load_sum;
+        cfs_rq->avg.util_avg += se->avg.util_avg;
+        cfs_rq->avg.util_sum += se->avg.util_sum;
+}
+
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+        __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+                          &se->avg, se->on_rq * scale_load_down(se->load.weight),
+                          cfs_rq->curr == se, NULL);
+
+        cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+        cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+        cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+        cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+}
+
 /* Add the load generated by se into cfs_rq's load average */
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         struct sched_avg *sa = &se->avg;
         u64 now = cfs_rq_clock_task(cfs_rq);
-        int migrated = 0, decayed;
+        int migrated, decayed;
 
-        if (sa->last_update_time == 0) {
-                sa->last_update_time = now;
-                migrated = 1;
-        }
-        else {
+        migrated = !sa->last_update_time;
+        if (!migrated) {
                 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
                         se->on_rq * scale_load_down(se->load.weight),
                         cfs_rq->curr == se, NULL);
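attach_entity_load_avg() folds an entity's averages into its cfs_rq's aggregate and detach_entity_load_avg() takes them back out, clamping at zero so a stale or over-sized contribution cannot drive the aggregate negative. A minimal stand-alone sketch of that add/remove-with-clamp pattern, with simplified types that are not the kernel structures:

    #include <stdio.h>

    struct avg { long load_avg; long util_avg; };

    static void attach(struct avg *rq, const struct avg *se)
    {
        rq->load_avg += se->load_avg;
        rq->util_avg += se->util_avg;
    }

    static void detach(struct avg *rq, const struct avg *se)
    {
        /* clamp at zero, mirroring the max_t(..., 0) calls in the hunk above */
        rq->load_avg = rq->load_avg > se->load_avg ? rq->load_avg - se->load_avg : 0;
        rq->util_avg = rq->util_avg > se->util_avg ? rq->util_avg - se->util_avg : 0;
    }

    int main(void)
    {
        struct avg rq = { 300, 150 };
        struct avg task = { 120, 80 };

        attach(&rq, &task);
        printf("after attach: load=%ld util=%ld\n", rq.load_avg, rq.util_avg);
        detach(&rq, &task);
        printf("after detach: load=%ld util=%ld\n", rq.load_avg, rq.util_avg);
        return 0;
    }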
@@ -2733,12 +2790,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
         cfs_rq->runnable_load_avg += sa->load_avg;
         cfs_rq->runnable_load_sum += sa->load_sum;
 
-        if (migrated) {
-                cfs_rq->avg.load_avg += sa->load_avg;
-                cfs_rq->avg.load_sum += sa->load_sum;
-                cfs_rq->avg.util_avg += sa->util_avg;
-                cfs_rq->avg.util_sum += sa->util_sum;
-        }
+        if (migrated)
+                attach_entity_load_avg(cfs_rq, se);
 
         if (decayed || migrated)
                 update_tg_load_avg(cfs_rq, 0);
@@ -2753,7 +2806,7 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
         cfs_rq->runnable_load_avg =
                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
         cfs_rq->runnable_load_sum =
                 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
 /*
@@ -2821,6 +2874,11 @@ static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
 static inline int idle_balance(struct rq *rq)
 {
         return 0;
@@ -4817,32 +4875,39 @@ next:
 done:
         return target;
 }
 
 /*
- * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
  * tasks. The unit of the return value must be the one of capacity so we can
- * compare the usage with the capacity of the CPU that is available for CFS
- * task (ie cpu_capacity).
- * cfs.avg.util_avg is the sum of running time of runnable tasks on a
- * CPU. It represents the amount of utilization of a CPU in the range
- * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
- * capacity of the CPU because it's about the running time on this CPU.
- * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
- * because of unfortunate rounding in util_avg or just
- * after migrating tasks until the average stabilizes with the new running
- * time. So we need to check that the usage stays into the range
- * [0..cpu_capacity_orig] and cap if necessary.
- * Without capping the usage, a group could be seen as overloaded (CPU0 usage
- * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
  */
-static int get_cpu_usage(int cpu)
+static int cpu_util(int cpu)
 {
-        unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
+        unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
         unsigned long capacity = capacity_orig_of(cpu);
 
-        if (usage >= SCHED_LOAD_SCALE)
-                return capacity;
-
-        return (usage * capacity) >> SCHED_LOAD_SHIFT;
+        return (util >= capacity) ? capacity : util;
 }
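With util_avg now expressed directly in capacity units, cpu_util() only has to clamp the tracked value to capacity_orig instead of rescaling it. A stand-alone illustration of that clamp; the numbers are made up and capacity_orig_of()/the rq layout are kernel internals not reproduced here:

    #include <stdio.h>

    /* clamp utilization into [0..capacity_orig], as cpu_util() does */
    static unsigned long clamp_util(unsigned long util, unsigned long capacity_orig)
    {
        return util >= capacity_orig ? capacity_orig : util;
    }

    int main(void)
    {
        unsigned long capacity_orig = 1024;

        /* just after a migration or wakeup the average can transiently overshoot */
        printf("%lu\n", clamp_util(1240, capacity_orig));  /* -> 1024 */
        printf("%lu\n", clamp_util(300, capacity_orig));   /* -> 300  */
        return 0;
    }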
 
 /*
@@ -4945,7 +5010,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  * previous cpu. However, the caller only guarantees p->pi_lock is held; no
  * other assumptions, including the state of rq->lock, should be made.
  */
-static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+static void migrate_task_rq_fair(struct task_struct *p)
 {
         /*
          * We are supposed to update the task to "current" time, then its up to date
@@ -5525,10 +5590,10 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
         unsigned long src_faults, dst_faults;
         int src_nid, dst_nid;
 
-        if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+        if (!static_branch_likely(&sched_numa_balancing))
                 return -1;
 
-        if (!sched_feat(NUMA))
+        if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
                 return -1;
 
         src_nid = cpu_to_node(env->src_cpu);
@@ -5934,7 +5999,7 @@ struct sg_lb_stats {
         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
         unsigned long load_per_task;
         unsigned long group_capacity;
-        unsigned long group_usage; /* Total usage of the group */
+        unsigned long group_util; /* Total utilization of the group */
         unsigned int sum_nr_running; /* Nr tasks running in the group */
         unsigned int idle_cpus;
         unsigned int group_weight;
@@ -6010,19 +6075,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
         return load_idx;
 }
 
-static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-        if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
-                return sd->smt_gain / sd->span_weight;
-
-        return SCHED_CAPACITY_SCALE;
-}
-
-unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-        return default_scale_cpu_capacity(sd, cpu);
-}
-
 static unsigned long scale_rt_capacity(int cpu)
 {
         struct rq *rq = cpu_rq(cpu);
@@ -6052,16 +6104,9 @@ static unsigned long scale_rt_capacity(int cpu)
 
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-        unsigned long capacity = SCHED_CAPACITY_SCALE;
+        unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
         struct sched_group *sdg = sd->groups;
 
-        if (sched_feat(ARCH_CAPACITY))
-                capacity *= arch_scale_cpu_capacity(sd, cpu);
-        else
-                capacity *= default_scale_cpu_capacity(sd, cpu);
-
-        capacity >>= SCHED_CAPACITY_SHIFT;
-
         cpu_rq(cpu)->cpu_capacity_orig = capacity;
 
         capacity *= scale_rt_capacity(cpu);
@@ -6187,8 +6232,8 @@ static inline int sg_imbalanced(struct sched_group *group)
  * group_has_capacity returns true if the group has spare capacity that could
  * be used by some tasks.
  * We consider that a group has spare capacity if the * number of task is
- * smaller than the number of CPUs or if the usage is lower than the available
- * capacity for CFS tasks.
+ * smaller than the number of CPUs or if the utilization is lower than the
+ * available capacity for CFS tasks.
  * For the latter, we use a threshold to stabilize the state, to take into
  * account the variance of the tasks' load and to return true if the available
  * capacity in meaningful for the load balancer.
@@ -6202,7 +6247,7 @@ group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
                 return true;
 
         if ((sgs->group_capacity * 100) >
-                        (sgs->group_usage * env->sd->imbalance_pct))
+                        (sgs->group_util * env->sd->imbalance_pct))
                 return true;
 
         return false;
@@ -6223,15 +6268,15 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
                 return false;
 
         if ((sgs->group_capacity * 100) <
-                        (sgs->group_usage * env->sd->imbalance_pct))
+                        (sgs->group_util * env->sd->imbalance_pct))
                 return true;
 
         return false;
 }
 
-static enum group_type group_classify(struct lb_env *env,
-                                      struct sched_group *group,
+static inline enum
+group_type group_classify(struct sched_group *group,
                           struct sg_lb_stats *sgs)
 {
         if (sgs->group_no_capacity)
                 return group_overloaded;
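group_has_capacity() and group_is_overloaded() compare a group's capacity and utilization through the domain's imbalance_pct, so the classification only flips once utilization crosses capacity by a margin rather than oscillating on small fluctuations. A small numeric sketch of that check; the imbalance_pct value is picked arbitrarily for the demo:

    #include <stdio.h>
    #include <stdbool.h>

    /* overloaded once util * imbalance_pct exceeds capacity * 100 */
    static bool group_overloaded(unsigned long capacity, unsigned long util,
                                 unsigned int imbalance_pct)
    {
        return (capacity * 100) < (util * imbalance_pct);
    }

    int main(void)
    {
        unsigned int imbalance_pct = 117;   /* hypothetical domain setting */

        printf("%d\n", group_overloaded(1024, 900, imbalance_pct)); /* 102400 < 105300 -> 1 */
        printf("%d\n", group_overloaded(1024, 850, imbalance_pct)); /* 102400 >= 99450 -> 0 */
        return 0;
    }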
@@ -6271,7 +6316,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 load = source_load(i, load_idx);
 
                 sgs->group_load += load;
-                sgs->group_usage += get_cpu_usage(i);
+                sgs->group_util += cpu_util(i);
                 sgs->sum_nr_running += rq->cfs.h_nr_running;
 
                 if (rq->nr_running > 1)
@@ -6296,7 +6341,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         sgs->group_weight = group->group_weight;
 
         sgs->group_no_capacity = group_is_overloaded(env, sgs);
-        sgs->group_type = group_classify(env, group, sgs);
+        sgs->group_type = group_classify(group, sgs);
 }
 
 /**
@@ -6430,7 +6475,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                     group_has_capacity(env, &sds->local_stat) &&
                     (sgs->sum_nr_running > 1)) {
                         sgs->group_no_capacity = 1;
-                        sgs->group_type = group_overloaded;
+                        sgs->group_type = group_classify(sg, sgs);
                 }
 
                 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
@@ -7610,8 +7655,22 @@ out:
          * When the cpu is attached to null domain for ex, it will not be
          * updated.
          */
-        if (likely(update_next_balance))
+        if (likely(update_next_balance)) {
                 rq->next_balance = next_balance;
+
+#ifdef CONFIG_NO_HZ_COMMON
+                /*
+                 * If this CPU has been elected to perform the nohz idle
+                 * balance. Other idle CPUs have already rebalanced with
+                 * nohz_idle_balance() and nohz.next_balance has been
+                 * updated accordingly. This CPU is now running the idle load
+                 * balance for itself and we need to update the
+                 * nohz.next_balance accordingly.
+                 */
+                if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
+                        nohz.next_balance = rq->next_balance;
+#endif
+        }
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -7624,6 +7683,9 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
         int this_cpu = this_rq->cpu;
         struct rq *rq;
         int balance_cpu;
+        /* Earliest time when we have to do rebalance again */
+        unsigned long next_balance = jiffies + 60*HZ;
+        int update_next_balance = 0;
 
         if (idle != CPU_IDLE ||
             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
@@ -7655,10 +7717,19 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                         rebalance_domains(rq, CPU_IDLE);
                 }
 
-                if (time_after(this_rq->next_balance, rq->next_balance))
-                        this_rq->next_balance = rq->next_balance;
+                if (time_after(next_balance, rq->next_balance)) {
+                        next_balance = rq->next_balance;
+                        update_next_balance = 1;
+                }
         }
-        nohz.next_balance = this_rq->next_balance;
+
+        /*
+         * next_balance will be updated only when there is a need.
+         * When the CPU is attached to null domain for ex, it will not be
+         * updated.
+         */
+        if (likely(update_next_balance))
+                nohz.next_balance = next_balance;
 end:
         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
 }
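The nohz hunks track the earliest rq->next_balance seen while iterating over the idle CPUs and only then publish it to nohz.next_balance, using time_after() so that jiffies wraparound is handled. A user-space sketch of the same "keep the earliest deadline, wrap-safely" loop; time_after() is re-implemented locally for the demo with the standard signed-difference trick, it is not the kernel header:

    #include <stdio.h>

    /* wrap-safe "a is after b" for free-running counters, like time_after() */
    static int demo_time_after(unsigned long a, unsigned long b)
    {
        return (long)(b - a) < 0;
    }

    int main(void)
    {
        unsigned long next_balance = 1000 + 60000;          /* pessimistic default */
        unsigned long rq_next[3] = { 62000, 1500, 4000 };   /* per-rq deadlines */
        int update_next_balance = 0;

        for (int i = 0; i < 3; i++) {
            if (demo_time_after(next_balance, rq_next[i])) {
                next_balance = rq_next[i];  /* keep the earliest deadline */
                update_next_balance = 1;
            }
        }

        if (update_next_balance)
            printf("publish next_balance = %lu\n", next_balance);  /* 1500 */
        return 0;
    }

Publishing only when the flag is set mirrors the diff's intent: a CPU attached to a null domain never updates its own next_balance, so it must not overwrite the shared nohz.next_balance with the pessimistic default.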
@@ -7811,7 +7882,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                 entity_tick(cfs_rq, se, queued);
         }
 
-        if (numabalancing_enabled)
+        if (static_branch_unlikely(&sched_numa_balancing))
                 task_tick_numa(rq, curr);
 }
 
@@ -7887,21 +7958,39 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
                 check_preempt_curr(rq, p, 0);
 }
 
-static void switched_from_fair(struct rq *rq, struct task_struct *p)
+static inline bool vruntime_normalized(struct task_struct *p)
+{
+        struct sched_entity *se = &p->se;
+
+        /*
+         * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
+         * the dequeue_entity(.flags=0) will already have normalized the
+         * vruntime.
+         */
+        if (p->on_rq)
+                return true;
+
+        /*
+         * When !on_rq, vruntime of the task has usually NOT been normalized.
+         * But there are some cases where it has already been normalized:
+         *
+         * - A forked child which is waiting for being woken up by
+         *   wake_up_new_task().
+         * - A task which has been woken up by try_to_wake_up() and
+         *   waiting for actually being woken up by sched_ttwu_pending().
+         */
+        if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+                return true;
+
+        return false;
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
 {
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
-        /*
-         * Ensure the task's vruntime is normalized, so that when it's
-         * switched back to the fair class the enqueue_entity(.flags=0) will
-         * do the right thing.
-         *
-         * If it's queued, then the dequeue_entity(.flags=0) will already
-         * have normalized the vruntime, if it's !queued, then only when
-         * the task is sleeping will it still have non-normalized vruntime.
-         */
-        if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
+        if (!vruntime_normalized(p)) {
                 /*
                  * Fix up our vruntime so that the current sleep doesn't
                  * cause 'unlimited' sleep bonus.
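The detach path normalizes a sleeping task's vruntime by subtracting its old queue's min_vruntime, and the attach path adds the new queue's min_vruntime back, so the task keeps its relative position instead of inheriting a boost or penalty from the difference between the two queues' clocks. A toy illustration with made-up numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long src_min_vruntime = 500000;
        unsigned long long dst_min_vruntime = 2000000;
        unsigned long long vruntime = 530000;   /* 30000 ahead of its old queue */

        vruntime -= src_min_vruntime;   /* detach: make vruntime relative      */
        vruntime += dst_min_vruntime;   /* attach: rebase on the new queue     */

        printf("new vruntime %llu (still %llu ahead of min_vruntime)\n",
               vruntime, vruntime - dst_min_vruntime);
        return 0;
    }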
@@ -7910,28 +7999,14 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
                 se->vruntime -= cfs_rq->min_vruntime;
         }
 
-#ifdef CONFIG_SMP
         /* Catch up with the cfs_rq and remove our load when we leave */
-        __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
-                se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
-
-        cfs_rq->avg.load_avg =
-                max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-        cfs_rq->avg.load_sum =
-                max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-        cfs_rq->avg.util_avg =
-                max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-        cfs_rq->avg.util_sum =
-                max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
-#endif
+        detach_entity_load_avg(cfs_rq, se);
 }
 
-/*
- * We switched to the sched_fair class.
- */
-static void switched_to_fair(struct rq *rq, struct task_struct *p)
+static void attach_task_cfs_rq(struct task_struct *p)
 {
         struct sched_entity *se = &p->se;
+        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
         /*
@@ -7941,31 +8016,33 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
         se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
 
-        if (!task_on_rq_queued(p)) {
-                /*
-                 * Ensure the task has a non-normalized vruntime when it is switched
-                 * back to the fair class with !queued, so that enqueue_entity() at
-                 * wake-up time will do the right thing.
-                 *
-                 * If it's queued, then the enqueue_entity(.flags=0) makes the task
-                 * has non-normalized vruntime, if it's !queued, then it still has
-                 * normalized vruntime.
-                 */
-                if (p->state != TASK_RUNNING)
-                        se->vruntime += cfs_rq_of(se)->min_vruntime;
-                return;
-        }
+        /* Synchronize task with its cfs_rq */
+        attach_entity_load_avg(cfs_rq, se);
 
-        /*
-         * We were most likely switched from sched_rt, so
-         * kick off the schedule if running, otherwise just see
-         * if we can still preempt the current task.
-         */
-        if (rq->curr == p)
-                resched_curr(rq);
-        else
-                check_preempt_curr(rq, p, 0);
+        if (!vruntime_normalized(p))
+                se->vruntime += cfs_rq->min_vruntime;
+}
+
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+        detach_task_cfs_rq(p);
+}
+
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
+{
+        attach_task_cfs_rq(p);
+
+        if (task_on_rq_queued(p)) {
+                /*
+                 * We were most likely switched from sched_rt, so
+                 * kick off the schedule if running, otherwise just see
+                 * if we can still preempt the current task.
+                 */
+                if (rq->curr == p)
+                        resched_curr(rq);
+                else
+                        check_preempt_curr(rq, p, 0);
+        }
 }
 
 /* Account for a task changing its policy or group.
@@ -8000,56 +8077,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void task_move_group_fair(struct task_struct *p, int queued)
+static void task_move_group_fair(struct task_struct *p)
 {
-        struct sched_entity *se = &p->se;
-        struct cfs_rq *cfs_rq;
-
-        /*
-         * If the task was not on the rq at the time of this cgroup movement
-         * it must have been asleep, sleeping tasks keep their ->vruntime
-         * absolute on their old rq until wakeup (needed for the fair sleeper
-         * bonus in place_entity()).
-         *
-         * If it was on the rq, we've just 'preempted' it, which does convert
-         * ->vruntime to a relative base.
-         *
-         * Make sure both cases convert their relative position when migrating
-         * to another cgroup's rq. This does somewhat interfere with the
-         * fair sleeper stuff for the first placement, but who cares.
-         */
-        /*
-         * When !queued, vruntime of the task has usually NOT been normalized.
-         * But there are some cases where it has already been normalized:
-         *
-         * - Moving a forked child which is waiting for being woken up by
-         *   wake_up_new_task().
-         * - Moving a task which has been woken up by try_to_wake_up() and
-         *   waiting for actually being woken up by sched_ttwu_pending().
-         *
-         * To prevent boost or penalty in the new cfs_rq caused by delta
-         * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
-         */
-        if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
-                queued = 1;
-
-        if (!queued)
-                se->vruntime -= cfs_rq_of(se)->min_vruntime;
+        detach_task_cfs_rq(p);
         set_task_rq(p, task_cpu(p));
-        se->depth = se->parent ? se->parent->depth + 1 : 0;
-        if (!queued) {
-                cfs_rq = cfs_rq_of(se);
-                se->vruntime += cfs_rq->min_vruntime;
 
 #ifdef CONFIG_SMP
-        /* Virtually synchronize task with its new cfs_rq */
-        p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
-        cfs_rq->avg.load_avg += p->se.avg.load_avg;
-        cfs_rq->avg.load_sum += p->se.avg.load_sum;
-        cfs_rq->avg.util_avg += p->se.avg.util_avg;
-        cfs_rq->avg.util_sum += p->se.avg.util_sum;
+        /* Tell se's cfs_rq has been changed -- migrated */
+        p->se.avg.last_update_time = 0;
 #endif
-        }
+        attach_task_cfs_rq(p);
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -36,11 +36,6 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
  */
 SCHED_FEAT(WAKEUP_PREEMPTION, true)
 
-/*
- * Use arch dependent cpu capacity functions
- */
-SCHED_FEAT(ARCH_CAPACITY, true)
-
 SCHED_FEAT(HRTICK, false)
 SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
@@ -72,19 +67,5 @@ SCHED_FEAT(RT_PUSH_IPI, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+SCHED_FEAT(ATTACH_AGE_LOAD, true)
 
-/*
- * Apply the automatic NUMA scheduling policy. Enabled automatically
- * at runtime if running on a NUMA machine. Can be controlled via
- * numa_balancing=
- */
-#ifdef CONFIG_NUMA_BALANCING
-
-/*
- * NUMA will favor moving tasks towards nodes where a higher number of
- * hinting faults are recorded during active load balancing. It will
- * resist moving tasks towards nodes where a lower number of hinting
- * faults have been recorded.
- */
-SCHED_FEAT(NUMA, true)
-#endif
@@ -635,11 +635,11 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 /*
  * We ran out of runtime, see if we can borrow some from our neighbours.
  */
-static int do_balance_runtime(struct rt_rq *rt_rq)
+static void do_balance_runtime(struct rt_rq *rt_rq)
 {
         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
         struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
-        int i, weight, more = 0;
+        int i, weight;
         u64 rt_period;
 
         weight = cpumask_weight(rd->span);
@@ -673,7 +673,6 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
                         diff = rt_period - rt_rq->rt_runtime;
                         iter->rt_runtime -= diff;
                         rt_rq->rt_runtime += diff;
-                        more = 1;
                         if (rt_rq->rt_runtime == rt_period) {
                                 raw_spin_unlock(&iter->rt_runtime_lock);
                                 break;
@@ -683,8 +682,6 @@ next:
                 raw_spin_unlock(&iter->rt_runtime_lock);
         }
         raw_spin_unlock(&rt_b->rt_runtime_lock);
-
-        return more;
 }
 
 /*
@@ -796,26 +793,19 @@ static void __enable_runtime(struct rq *rq)
         }
 }
 
-static int balance_runtime(struct rt_rq *rt_rq)
+static void balance_runtime(struct rt_rq *rt_rq)
 {
-        int more = 0;
-
         if (!sched_feat(RT_RUNTIME_SHARE))
-                return more;
+                return;
 
         if (rt_rq->rt_time > rt_rq->rt_runtime) {
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
-                more = do_balance_runtime(rt_rq);
+                do_balance_runtime(rt_rq);
                 raw_spin_lock(&rt_rq->rt_runtime_lock);
         }
-
-        return more;
 }
 #else /* !CONFIG_SMP */
-static inline int balance_runtime(struct rt_rq *rt_rq)
-{
-        return 0;
-}
+static inline void balance_runtime(struct rt_rq *rt_rq) {}
 #endif /* CONFIG_SMP */
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
@@ -84,6 +84,10 @@ static inline void update_cpu_load_active(struct rq *this_rq) { }
  */
 #define RUNTIME_INF     ((u64)~0ULL)
 
+static inline int idle_policy(int policy)
+{
+        return policy == SCHED_IDLE;
+}
 static inline int fair_policy(int policy)
 {
         return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -98,6 +102,11 @@ static inline int dl_policy(int policy)
 {
         return policy == SCHED_DEADLINE;
 }
+static inline bool valid_policy(int policy)
+{
+        return idle_policy(policy) || fair_policy(policy) ||
+                rt_policy(policy) || dl_policy(policy);
+}
 
 static inline int task_has_rt_policy(struct task_struct *p)
 {
@@ -109,11 +118,6 @@ static inline int task_has_dl_policy(struct task_struct *p)
         return dl_policy(p->policy);
 }
 
-static inline bool dl_time_before(u64 a, u64 b)
-{
-        return (s64)(a - b) < 0;
-}
-
 /*
  * Tells if entity @a should preempt entity @b.
  */
@@ -1003,17 +1007,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
-#ifdef CONFIG_NUMA_BALANCING
-#define sched_feat_numa(x) sched_feat(x)
-#ifdef CONFIG_SCHED_DEBUG
-#define numabalancing_enabled sched_feat_numa(NUMA)
-#else
-extern bool numabalancing_enabled;
-#endif /* CONFIG_SCHED_DEBUG */
-#else
-#define sched_feat_numa(x) (0)
-#define numabalancing_enabled (0)
-#endif /* CONFIG_NUMA_BALANCING */
+extern struct static_key_false sched_numa_balancing;
 
 static inline u64 global_rt_period(void)
 {
@@ -1157,16 +1151,18 @@ static const u32 prio_to_wmult[40] = {
 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#define ENQUEUE_WAKEUP          1
-#define ENQUEUE_HEAD            2
+#define ENQUEUE_WAKEUP          0x01
+#define ENQUEUE_HEAD            0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING          4       /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING          0x04    /* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING          0
+#define ENQUEUE_WAKING          0x00
 #endif
-#define ENQUEUE_REPLENISH       8
+#define ENQUEUE_REPLENISH       0x08
+#define ENQUEUE_RESTORE         0x10
 
-#define DEQUEUE_SLEEP           1
+#define DEQUEUE_SLEEP           0x01
+#define DEQUEUE_SAVE            0x02
 
 #define RETRY_TASK              ((void *)-1UL)
 
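The ENQUEUE_*/DEQUEUE_* constants above are renumbered as explicit hex bit masks so the new ENQUEUE_RESTORE and DEQUEUE_SAVE bits can be OR-ed together with the existing ones. A small user-space demonstration of how such flag bits combine and are tested; the numeric values mirror the hunk, the helper function is purely illustrative:

    #include <stdio.h>

    #define DEMO_ENQUEUE_WAKEUP   0x01
    #define DEMO_ENQUEUE_HEAD     0x02
    #define DEMO_ENQUEUE_RESTORE  0x10

    static void enqueue_demo(int flags)
    {
        if (flags & DEMO_ENQUEUE_RESTORE)
            printf("restoring a task that was only temporarily dequeued\n");
        if (flags & DEMO_ENQUEUE_WAKEUP)
            printf("task is waking up\n");
    }

    int main(void)
    {
        enqueue_demo(DEMO_ENQUEUE_RESTORE);
        enqueue_demo(DEMO_ENQUEUE_WAKEUP | DEMO_ENQUEUE_HEAD);
        return 0;
    }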
@@ -1194,7 +1190,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
         int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
-        void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+        void (*migrate_task_rq)(struct task_struct *p);
 
         void (*task_waking) (struct task_struct *task);
         void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1227,7 +1223,7 @@ struct sched_class {
         void (*update_curr) (struct rq *rq);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-        void (*task_move_group) (struct task_struct *p, int on_rq);
+        void (*task_move_group) (struct task_struct *p);
 #endif
 };
 
@@ -1405,6 +1401,17 @@ unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+        if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
+                return sd->smt_gain / sd->span_weight;
+
+        return SCHED_CAPACITY_SCALE;
+}
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
         rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
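The new default arch_scale_cpu_capacity() divides smt_gain across the hardware threads of an SMT core and otherwise reports full capacity. The same arithmetic extracted into a stand-alone function with hypothetical numbers; the smt_gain value of 1178 is an assumption chosen for the demo, not taken from this diff:

    #include <stdio.h>

    #define DEMO_CAPACITY_SCALE 1024UL

    static unsigned long cpu_capacity(unsigned long smt_gain, unsigned int span_weight,
                                      int share_cpucapacity)
    {
        if (share_cpucapacity && span_weight > 1)
            return smt_gain / span_weight;  /* split the SMT gain across siblings */

        return DEMO_CAPACITY_SCALE;
    }

    int main(void)
    {
        /* hypothetical 2-way SMT core: each sibling gets a bit more than half */
        printf("per-sibling capacity: %lu\n", cpu_capacity(1178, 2, 1));
        /* non-SMT CPU: full scale */
        printf("full capacity: %lu\n", cpu_capacity(1178, 1, 0));
        return 0;
    }

Moving this default into the header (and dropping the ARCH_CAPACITY feature plus the __weak fallback in the hunks earlier) lets architectures override the function at compile time instead of through a runtime feature check.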
@@ -222,9 +222,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
 {
         struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
 
-        if (ht->pre_unpark)
-                ht->pre_unpark(cpu);
-        kthread_unpark(tsk);
+        if (!ht->selfparking)
+                kthread_unpark(tsk);
 }
 
 void smpboot_unpark_threads(unsigned int cpu)
@@ -73,21 +73,24 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
         }
 }
 
+static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
+                                  struct cpu_stop_work *work)
+{
+        list_add_tail(&work->list, &stopper->works);
+        wake_up_process(stopper->thread);
+}
+
 /* queue @work to @stopper. if offline, @work is completed immediately */
 static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
         unsigned long flags;
 
         spin_lock_irqsave(&stopper->lock, flags);
-        if (stopper->enabled) {
-                list_add_tail(&work->list, &stopper->works);
-                wake_up_process(stopper->thread);
-        } else
+        if (stopper->enabled)
+                __cpu_stop_queue_work(stopper, work);
+        else
                 cpu_stop_signal_done(work->done, false);
-
         spin_unlock_irqrestore(&stopper->lock, flags);
 }
 
@@ -213,6 +216,31 @@ static int multi_cpu_stop(void *data)
         return err;
 }
 
+static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
+                                    int cpu2, struct cpu_stop_work *work2)
+{
+        struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+        struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+        int err;
+
+        lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
+        spin_lock_irq(&stopper1->lock);
+        spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
+
+        err = -ENOENT;
+        if (!stopper1->enabled || !stopper2->enabled)
+                goto unlock;
+
+        err = 0;
+        __cpu_stop_queue_work(stopper1, work1);
+        __cpu_stop_queue_work(stopper2, work2);
+unlock:
+        spin_unlock(&stopper2->lock);
+        spin_unlock_irq(&stopper1->lock);
+        lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+
+        return err;
+}
 /**
  * stop_two_cpus - stops two cpus
  * @cpu1: the cpu to stop
@@ -247,24 +275,13 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
         cpu_stop_init_done(&done, 2);
         set_state(&msdata, MULTI_STOP_PREPARE);
 
-        /*
-         * If we observe both CPUs active we know _cpu_down() cannot yet have
-         * queued its stop_machine works and therefore ours will get executed
-         * first. Or its not either one of our CPUs that's getting unplugged,
-         * in which case we don't care.
-         *
-         * This relies on the stopper workqueues to be FIFO.
-         */
-        if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
+        if (cpu1 > cpu2)
+                swap(cpu1, cpu2);
+        if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) {
                 preempt_enable();
                 return -ENOENT;
         }
 
-        lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
-        cpu_stop_queue_work(cpu1, &work1);
-        cpu_stop_queue_work(cpu2, &work2);
-        lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
-
         preempt_enable();
 
         wait_for_completion(&done.completion);
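cpu_stop_queue_two_works() always takes the two stopper locks in a fixed order: stop_two_cpus() sorts cpu1/cpu2 with swap() first, and the second lock is taken with spin_lock_nested(). Acquiring a pair of locks in a globally consistent order is the standard way to avoid an ABBA deadlock when two CPUs queue work on each other. A user-space sketch of the same ordering discipline using pthreads; this is illustrative only and not the kernel locking API:

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t stopper_lock[2] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
    };

    /* queue something on two distinct "CPUs": always lock the lower index first */
    static void queue_two(int cpu1, int cpu2)
    {
        if (cpu1 > cpu2) {          /* mirrors swap(cpu1, cpu2) in the hunk above */
            int tmp = cpu1;
            cpu1 = cpu2;
            cpu2 = tmp;
        }

        pthread_mutex_lock(&stopper_lock[cpu1]);
        pthread_mutex_lock(&stopper_lock[cpu2]);

        printf("queued work on %d and %d atomically\n", cpu1, cpu2);

        pthread_mutex_unlock(&stopper_lock[cpu2]);
        pthread_mutex_unlock(&stopper_lock[cpu1]);
    }

    int main(void)
    {
        queue_two(1, 0);    /* callers may pass the CPUs in either order */
        queue_two(0, 1);
        return 0;
    }

Because both works are queued under both locks, either both land on enabled stoppers or neither does, which is what lets the old cpu_active() heuristic and its FIFO assumption be removed.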
@@ -452,6 +469,18 @@ repeat:
         }
 }
 
+void stop_machine_park(int cpu)
+{
+        struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+        /*
+         * Lockless. cpu_stopper_thread() will take stopper->lock and flush
+         * the pending works before it parks, until then it is fine to queue
+         * the new works.
+         */
+        stopper->enabled = false;
+        kthread_park(stopper->thread);
+}
+
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
 static void cpu_stop_create(unsigned int cpu)
@@ -462,26 +491,16 @@ static void cpu_stop_create(unsigned int cpu)
 static void cpu_stop_park(unsigned int cpu)
 {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-        struct cpu_stop_work *work, *tmp;
-        unsigned long flags;
 
-        /* drain remaining works */
-        spin_lock_irqsave(&stopper->lock, flags);
-        list_for_each_entry_safe(work, tmp, &stopper->works, list) {
-                list_del_init(&work->list);
-                cpu_stop_signal_done(work->done, false);
-        }
-        stopper->enabled = false;
-        spin_unlock_irqrestore(&stopper->lock, flags);
+        WARN_ON(!list_empty(&stopper->works));
 }
 
-static void cpu_stop_unpark(unsigned int cpu)
+void stop_machine_unpark(int cpu)
 {
         struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 
-        spin_lock_irq(&stopper->lock);
         stopper->enabled = true;
-        spin_unlock_irq(&stopper->lock);
+        kthread_unpark(stopper->thread);
 }
 
 static struct smp_hotplug_thread cpu_stop_threads = {
@@ -490,9 +509,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
         .thread_fn              = cpu_stopper_thread,
         .thread_comm            = "migration/%u",
         .create                 = cpu_stop_create,
-        .setup                  = cpu_stop_unpark,
         .park                   = cpu_stop_park,
-        .pre_unpark             = cpu_stop_unpark,
         .selfparking            = true,
 };
 
@@ -508,6 +525,7 @@ static int __init cpu_stop_init(void)
         }
 
         BUG_ON(smpboot_register_percpu_thread(&cpu_stop_threads));
+        stop_machine_unpark(raw_smp_processor_id());
         stop_machine_initialized = true;
         return 0;
 }
@@ -5697,7 +5697,7 @@ free:
 }
 
 static void
-ftrace_graph_probe_sched_switch(void *ignore,
+ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
                         struct task_struct *prev, struct task_struct *next)
 {
         unsigned long long timestamp;
@@ -16,7 +16,8 @@ static int sched_ref;
 static DEFINE_MUTEX(sched_register_mutex);
 
 static void
-probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
+probe_sched_switch(void *ignore, bool preempt,
+                   struct task_struct *prev, struct task_struct *next)
 {
         if (unlikely(!sched_ref))
                 return;
@@ -420,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 }
 
 static void notrace
-probe_wakeup_sched_switch(void *ignore,
+probe_wakeup_sched_switch(void *ignore, bool preempt,
                           struct task_struct *prev, struct task_struct *next)
 {
         struct trace_array_cpu *data;
 