Merge branch 'sched/balancing' into sched/core
This commit is contained in:
commit
66fef08f7d
1 changed files with 554 additions and 289 deletions
843
kernel/sched.c
843
kernel/sched.c
|
@ -3189,215 +3189,565 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
|||
|
||||
return 0;
|
||||
}
|
||||
/********** Helpers for find_busiest_group ************************/
|
||||
/**
|
||||
* sd_lb_stats - Structure to store the statistics of a sched_domain
|
||||
* during load balancing.
|
||||
*/
|
||||
struct sd_lb_stats {
|
||||
struct sched_group *busiest; /* Busiest group in this sd */
|
||||
struct sched_group *this; /* Local group in this sd */
|
||||
unsigned long total_load; /* Total load of all groups in sd */
|
||||
unsigned long total_pwr; /* Total power of all groups in sd */
|
||||
unsigned long avg_load; /* Average load across all groups in sd */
|
||||
|
||||
/*
|
||||
* find_busiest_group finds and returns the busiest CPU group within the
|
||||
* domain. It calculates and returns the amount of weighted load which
|
||||
* should be moved to restore balance via the imbalance parameter.
|
||||
/** Statistics of this group */
|
||||
unsigned long this_load;
|
||||
unsigned long this_load_per_task;
|
||||
unsigned long this_nr_running;
|
||||
|
||||
/* Statistics of the busiest group */
|
||||
unsigned long max_load;
|
||||
unsigned long busiest_load_per_task;
|
||||
unsigned long busiest_nr_running;
|
||||
|
||||
int group_imb; /* Is there imbalance in this sd */
|
||||
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
||||
int power_savings_balance; /* Is powersave balance needed for this sd */
|
||||
struct sched_group *group_min; /* Least loaded group in sd */
|
||||
struct sched_group *group_leader; /* Group which relieves group_min */
|
||||
unsigned long min_load_per_task; /* load_per_task in group_min */
|
||||
unsigned long leader_nr_running; /* Nr running of group_leader */
|
||||
unsigned long min_nr_running; /* Nr running of group_min */
|
||||
#endif
|
||||
};
|
||||
|
||||
/**
|
||||
* sg_lb_stats - stats of a sched_group required for load_balancing
|
||||
*/
|
||||
struct sg_lb_stats {
|
||||
unsigned long avg_load; /*Avg load across the CPUs of the group */
|
||||
unsigned long group_load; /* Total load over the CPUs of the group */
|
||||
unsigned long sum_nr_running; /* Nr tasks running in the group */
|
||||
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
|
||||
unsigned long group_capacity;
|
||||
int group_imb; /* Is there an imbalance in the group ? */
|
||||
};
|
||||
|
||||
/**
|
||||
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
|
||||
* @group: The group whose first cpu is to be returned.
|
||||
*/
|
||||
static inline unsigned int group_first_cpu(struct sched_group *group)
|
||||
{
|
||||
return cpumask_first(sched_group_cpus(group));
|
||||
}
|
||||
|
||||
/**
|
||||
* get_sd_load_idx - Obtain the load index for a given sched domain.
|
||||
* @sd: The sched_domain whose load_idx is to be obtained.
|
||||
* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
|
||||
*/
|
||||
static inline int get_sd_load_idx(struct sched_domain *sd,
|
||||
enum cpu_idle_type idle)
|
||||
{
|
||||
int load_idx;
|
||||
|
||||
switch (idle) {
|
||||
case CPU_NOT_IDLE:
|
||||
load_idx = sd->busy_idx;
|
||||
break;
|
||||
|
||||
case CPU_NEWLY_IDLE:
|
||||
load_idx = sd->newidle_idx;
|
||||
break;
|
||||
default:
|
||||
load_idx = sd->idle_idx;
|
||||
break;
|
||||
}
|
||||
|
||||
return load_idx;
|
||||
}
|
||||
|
||||
|
||||
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
||||
/**
|
||||
* init_sd_power_savings_stats - Initialize power savings statistics for
|
||||
* the given sched_domain, during load balancing.
|
||||
*
|
||||
* @sd: Sched domain whose power-savings statistics are to be initialized.
|
||||
* @sds: Variable containing the statistics for sd.
|
||||
* @idle: Idle status of the CPU at which we're performing load-balancing.
|
||||
*/
|
||||
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
|
||||
struct sd_lb_stats *sds, enum cpu_idle_type idle)
|
||||
{
|
||||
/*
|
||||
* Busy processors will not participate in power savings
|
||||
* balance.
|
||||
*/
|
||||
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
|
||||
sds->power_savings_balance = 0;
|
||||
else {
|
||||
sds->power_savings_balance = 1;
|
||||
sds->min_nr_running = ULONG_MAX;
|
||||
sds->leader_nr_running = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* update_sd_power_savings_stats - Update the power saving stats for a
|
||||
* sched_domain while performing load balancing.
|
||||
*
|
||||
* @group: sched_group belonging to the sched_domain under consideration.
|
||||
* @sds: Variable containing the statistics of the sched_domain
|
||||
* @local_group: Does group contain the CPU for which we're performing
|
||||
* load balancing ?
|
||||
* @sgs: Variable containing the statistics of the group.
|
||||
*/
|
||||
static inline void update_sd_power_savings_stats(struct sched_group *group,
|
||||
struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
|
||||
{
|
||||
|
||||
if (!sds->power_savings_balance)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If the local group is idle or completely loaded
|
||||
* no need to do power savings balance at this domain
|
||||
*/
|
||||
if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
|
||||
!sds->this_nr_running))
|
||||
sds->power_savings_balance = 0;
|
||||
|
||||
/*
|
||||
* If a group is already running at full capacity or idle,
|
||||
* don't include that group in power savings calculations
|
||||
*/
|
||||
if (!sds->power_savings_balance ||
|
||||
sgs->sum_nr_running >= sgs->group_capacity ||
|
||||
!sgs->sum_nr_running)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Calculate the group which has the least non-idle load.
|
||||
* This is the group from where we need to pick up the load
|
||||
* for saving power
|
||||
*/
|
||||
if ((sgs->sum_nr_running < sds->min_nr_running) ||
|
||||
(sgs->sum_nr_running == sds->min_nr_running &&
|
||||
group_first_cpu(group) > group_first_cpu(sds->group_min))) {
|
||||
sds->group_min = group;
|
||||
sds->min_nr_running = sgs->sum_nr_running;
|
||||
sds->min_load_per_task = sgs->sum_weighted_load /
|
||||
sgs->sum_nr_running;
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the group which is almost near its
|
||||
* capacity but still has some space to pick up some load
|
||||
* from other group and save more power
|
||||
*/
|
||||
if (sgs->sum_nr_running > sgs->group_capacity - 1)
|
||||
return;
|
||||
|
||||
if (sgs->sum_nr_running > sds->leader_nr_running ||
|
||||
(sgs->sum_nr_running == sds->leader_nr_running &&
|
||||
group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
|
||||
sds->group_leader = group;
|
||||
sds->leader_nr_running = sgs->sum_nr_running;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* check_power_save_busiest_group - Check if we have potential to perform
|
||||
* some power-savings balance. If yes, set the busiest group to be
|
||||
* the least loaded group in the sched_domain, so that it's CPUs can
|
||||
* be put to idle.
|
||||
*
|
||||
* @sds: Variable containing the statistics of the sched_domain
|
||||
* under consideration.
|
||||
* @this_cpu: Cpu at which we're currently performing load-balancing.
|
||||
* @imbalance: Variable to store the imbalance.
|
||||
*
|
||||
* Returns 1 if there is potential to perform power-savings balance.
|
||||
* Else returns 0.
|
||||
*/
|
||||
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
|
||||
int this_cpu, unsigned long *imbalance)
|
||||
{
|
||||
if (!sds->power_savings_balance)
|
||||
return 0;
|
||||
|
||||
if (sds->this != sds->group_leader ||
|
||||
sds->group_leader == sds->group_min)
|
||||
return 0;
|
||||
|
||||
*imbalance = sds->min_load_per_task;
|
||||
sds->busiest = sds->group_min;
|
||||
|
||||
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
|
||||
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
|
||||
group_first_cpu(sds->group_leader);
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
||||
}
|
||||
#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
|
||||
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
|
||||
struct sd_lb_stats *sds, enum cpu_idle_type idle)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
static inline void update_sd_power_savings_stats(struct sched_group *group,
|
||||
struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
|
||||
int this_cpu, unsigned long *imbalance)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
|
||||
|
||||
|
||||
/**
|
||||
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
|
||||
* @group: sched_group whose statistics are to be updated.
|
||||
* @this_cpu: Cpu for which load balance is currently performed.
|
||||
* @idle: Idle status of this_cpu
|
||||
* @load_idx: Load index of sched_domain of this_cpu for load calc.
|
||||
* @sd_idle: Idle status of the sched_domain containing group.
|
||||
* @local_group: Does group contain this_cpu.
|
||||
* @cpus: Set of cpus considered for load balancing.
|
||||
* @balance: Should we balance.
|
||||
* @sgs: variable to hold the statistics for this group.
|
||||
*/
|
||||
static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
|
||||
enum cpu_idle_type idle, int load_idx, int *sd_idle,
|
||||
int local_group, const struct cpumask *cpus,
|
||||
int *balance, struct sg_lb_stats *sgs)
|
||||
{
|
||||
unsigned long load, max_cpu_load, min_cpu_load;
|
||||
int i;
|
||||
unsigned int balance_cpu = -1, first_idle_cpu = 0;
|
||||
unsigned long sum_avg_load_per_task;
|
||||
unsigned long avg_load_per_task;
|
||||
|
||||
if (local_group)
|
||||
balance_cpu = group_first_cpu(group);
|
||||
|
||||
/* Tally up the load of all CPUs in the group */
|
||||
sum_avg_load_per_task = avg_load_per_task = 0;
|
||||
max_cpu_load = 0;
|
||||
min_cpu_load = ~0UL;
|
||||
|
||||
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
|
||||
struct rq *rq = cpu_rq(i);
|
||||
|
||||
if (*sd_idle && rq->nr_running)
|
||||
*sd_idle = 0;
|
||||
|
||||
/* Bias balancing toward cpus of our domain */
|
||||
if (local_group) {
|
||||
if (idle_cpu(i) && !first_idle_cpu) {
|
||||
first_idle_cpu = 1;
|
||||
balance_cpu = i;
|
||||
}
|
||||
|
||||
load = target_load(i, load_idx);
|
||||
} else {
|
||||
load = source_load(i, load_idx);
|
||||
if (load > max_cpu_load)
|
||||
max_cpu_load = load;
|
||||
if (min_cpu_load > load)
|
||||
min_cpu_load = load;
|
||||
}
|
||||
|
||||
sgs->group_load += load;
|
||||
sgs->sum_nr_running += rq->nr_running;
|
||||
sgs->sum_weighted_load += weighted_cpuload(i);
|
||||
|
||||
sum_avg_load_per_task += cpu_avg_load_per_task(i);
|
||||
}
|
||||
|
||||
/*
|
||||
* First idle cpu or the first cpu(busiest) in this sched group
|
||||
* is eligible for doing load balancing at this and above
|
||||
* domains. In the newly idle case, we will allow all the cpu's
|
||||
* to do the newly idle load balance.
|
||||
*/
|
||||
if (idle != CPU_NEWLY_IDLE && local_group &&
|
||||
balance_cpu != this_cpu && balance) {
|
||||
*balance = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Adjust by relative CPU power of the group */
|
||||
sgs->avg_load = sg_div_cpu_power(group,
|
||||
sgs->group_load * SCHED_LOAD_SCALE);
|
||||
|
||||
|
||||
/*
|
||||
* Consider the group unbalanced when the imbalance is larger
|
||||
* than the average weight of two tasks.
|
||||
*
|
||||
* APZ: with cgroup the avg task weight can vary wildly and
|
||||
* might not be a suitable number - should we keep a
|
||||
* normalized nr_running number somewhere that negates
|
||||
* the hierarchy?
|
||||
*/
|
||||
avg_load_per_task = sg_div_cpu_power(group,
|
||||
sum_avg_load_per_task * SCHED_LOAD_SCALE);
|
||||
|
||||
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
|
||||
sgs->group_imb = 1;
|
||||
|
||||
sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* update_sd_lb_stats - Update sched_group's statistics for load balancing.
|
||||
* @sd: sched_domain whose statistics are to be updated.
|
||||
* @this_cpu: Cpu for which load balance is currently performed.
|
||||
* @idle: Idle status of this_cpu
|
||||
* @sd_idle: Idle status of the sched_domain containing group.
|
||||
* @cpus: Set of cpus considered for load balancing.
|
||||
* @balance: Should we balance.
|
||||
* @sds: variable to hold the statistics for this sched_domain.
|
||||
*/
|
||||
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
|
||||
enum cpu_idle_type idle, int *sd_idle,
|
||||
const struct cpumask *cpus, int *balance,
|
||||
struct sd_lb_stats *sds)
|
||||
{
|
||||
struct sched_group *group = sd->groups;
|
||||
struct sg_lb_stats sgs;
|
||||
int load_idx;
|
||||
|
||||
init_sd_power_savings_stats(sd, sds, idle);
|
||||
load_idx = get_sd_load_idx(sd, idle);
|
||||
|
||||
do {
|
||||
int local_group;
|
||||
|
||||
local_group = cpumask_test_cpu(this_cpu,
|
||||
sched_group_cpus(group));
|
||||
memset(&sgs, 0, sizeof(sgs));
|
||||
update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
|
||||
local_group, cpus, balance, &sgs);
|
||||
|
||||
if (local_group && balance && !(*balance))
|
||||
return;
|
||||
|
||||
sds->total_load += sgs.group_load;
|
||||
sds->total_pwr += group->__cpu_power;
|
||||
|
||||
if (local_group) {
|
||||
sds->this_load = sgs.avg_load;
|
||||
sds->this = group;
|
||||
sds->this_nr_running = sgs.sum_nr_running;
|
||||
sds->this_load_per_task = sgs.sum_weighted_load;
|
||||
} else if (sgs.avg_load > sds->max_load &&
|
||||
(sgs.sum_nr_running > sgs.group_capacity ||
|
||||
sgs.group_imb)) {
|
||||
sds->max_load = sgs.avg_load;
|
||||
sds->busiest = group;
|
||||
sds->busiest_nr_running = sgs.sum_nr_running;
|
||||
sds->busiest_load_per_task = sgs.sum_weighted_load;
|
||||
sds->group_imb = sgs.group_imb;
|
||||
}
|
||||
|
||||
update_sd_power_savings_stats(group, sds, local_group, &sgs);
|
||||
group = group->next;
|
||||
} while (group != sd->groups);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* fix_small_imbalance - Calculate the minor imbalance that exists
|
||||
* amongst the groups of a sched_domain, during
|
||||
* load balancing.
|
||||
* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
|
||||
* @this_cpu: The cpu at whose sched_domain we're performing load-balance.
|
||||
* @imbalance: Variable to store the imbalance.
|
||||
*/
|
||||
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
|
||||
int this_cpu, unsigned long *imbalance)
|
||||
{
|
||||
unsigned long tmp, pwr_now = 0, pwr_move = 0;
|
||||
unsigned int imbn = 2;
|
||||
|
||||
if (sds->this_nr_running) {
|
||||
sds->this_load_per_task /= sds->this_nr_running;
|
||||
if (sds->busiest_load_per_task >
|
||||
sds->this_load_per_task)
|
||||
imbn = 1;
|
||||
} else
|
||||
sds->this_load_per_task =
|
||||
cpu_avg_load_per_task(this_cpu);
|
||||
|
||||
if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
|
||||
sds->busiest_load_per_task * imbn) {
|
||||
*imbalance = sds->busiest_load_per_task;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, we don't have enough imbalance to justify moving tasks,
|
||||
* however we may be able to increase total CPU power used by
|
||||
* moving them.
|
||||
*/
|
||||
|
||||
pwr_now += sds->busiest->__cpu_power *
|
||||
min(sds->busiest_load_per_task, sds->max_load);
|
||||
pwr_now += sds->this->__cpu_power *
|
||||
min(sds->this_load_per_task, sds->this_load);
|
||||
pwr_now /= SCHED_LOAD_SCALE;
|
||||
|
||||
/* Amount of load we'd subtract */
|
||||
tmp = sg_div_cpu_power(sds->busiest,
|
||||
sds->busiest_load_per_task * SCHED_LOAD_SCALE);
|
||||
if (sds->max_load > tmp)
|
||||
pwr_move += sds->busiest->__cpu_power *
|
||||
min(sds->busiest_load_per_task, sds->max_load - tmp);
|
||||
|
||||
/* Amount of load we'd add */
|
||||
if (sds->max_load * sds->busiest->__cpu_power <
|
||||
sds->busiest_load_per_task * SCHED_LOAD_SCALE)
|
||||
tmp = sg_div_cpu_power(sds->this,
|
||||
sds->max_load * sds->busiest->__cpu_power);
|
||||
else
|
||||
tmp = sg_div_cpu_power(sds->this,
|
||||
sds->busiest_load_per_task * SCHED_LOAD_SCALE);
|
||||
pwr_move += sds->this->__cpu_power *
|
||||
min(sds->this_load_per_task, sds->this_load + tmp);
|
||||
pwr_move /= SCHED_LOAD_SCALE;
|
||||
|
||||
/* Move if we gain throughput */
|
||||
if (pwr_move > pwr_now)
|
||||
*imbalance = sds->busiest_load_per_task;
|
||||
}
|
||||
|
||||
/**
|
||||
* calculate_imbalance - Calculate the amount of imbalance present within the
|
||||
* groups of a given sched_domain during load balance.
|
||||
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
|
||||
* @this_cpu: Cpu for which currently load balance is being performed.
|
||||
* @imbalance: The variable to store the imbalance.
|
||||
*/
|
||||
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
|
||||
unsigned long *imbalance)
|
||||
{
|
||||
unsigned long max_pull;
|
||||
/*
|
||||
* In the presence of smp nice balancing, certain scenarios can have
|
||||
* max load less than avg load(as we skip the groups at or below
|
||||
* its cpu_power, while calculating max_load..)
|
||||
*/
|
||||
if (sds->max_load < sds->avg_load) {
|
||||
*imbalance = 0;
|
||||
return fix_small_imbalance(sds, this_cpu, imbalance);
|
||||
}
|
||||
|
||||
/* Don't want to pull so many tasks that a group would go idle */
|
||||
max_pull = min(sds->max_load - sds->avg_load,
|
||||
sds->max_load - sds->busiest_load_per_task);
|
||||
|
||||
/* How much load to actually move to equalise the imbalance */
|
||||
*imbalance = min(max_pull * sds->busiest->__cpu_power,
|
||||
(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
|
||||
/ SCHED_LOAD_SCALE;
|
||||
|
||||
/*
|
||||
* if *imbalance is less than the average load per runnable task
|
||||
* there is no gaurantee that any tasks will be moved so we'll have
|
||||
* a think about bumping its value to force at least one task to be
|
||||
* moved
|
||||
*/
|
||||
if (*imbalance < sds->busiest_load_per_task)
|
||||
return fix_small_imbalance(sds, this_cpu, imbalance);
|
||||
|
||||
}
|
||||
/******* find_busiest_group() helpers end here *********************/
|
||||
|
||||
/**
|
||||
* find_busiest_group - Returns the busiest group within the sched_domain
|
||||
* if there is an imbalance. If there isn't an imbalance, and
|
||||
* the user has opted for power-savings, it returns a group whose
|
||||
* CPUs can be put to idle by rebalancing those tasks elsewhere, if
|
||||
* such a group exists.
|
||||
*
|
||||
* Also calculates the amount of weighted load which should be moved
|
||||
* to restore balance.
|
||||
*
|
||||
* @sd: The sched_domain whose busiest group is to be returned.
|
||||
* @this_cpu: The cpu for which load balancing is currently being performed.
|
||||
* @imbalance: Variable which stores amount of weighted load which should
|
||||
* be moved to restore balance/put a group to idle.
|
||||
* @idle: The idle status of this_cpu.
|
||||
* @sd_idle: The idleness of sd
|
||||
* @cpus: The set of CPUs under consideration for load-balancing.
|
||||
* @balance: Pointer to a variable indicating if this_cpu
|
||||
* is the appropriate cpu to perform load balancing at this_level.
|
||||
*
|
||||
* Returns: - the busiest group if imbalance exists.
|
||||
* - If no imbalance and user has opted for power-savings balance,
|
||||
* return the least loaded group whose CPUs can be
|
||||
* put to idle by rebalancing its tasks onto our group.
|
||||
*/
|
||||
static struct sched_group *
|
||||
find_busiest_group(struct sched_domain *sd, int this_cpu,
|
||||
unsigned long *imbalance, enum cpu_idle_type idle,
|
||||
int *sd_idle, const struct cpumask *cpus, int *balance)
|
||||
{
|
||||
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
|
||||
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
|
||||
unsigned long max_pull;
|
||||
unsigned long busiest_load_per_task, busiest_nr_running;
|
||||
unsigned long this_load_per_task, this_nr_running;
|
||||
int load_idx, group_imb = 0;
|
||||
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
||||
int power_savings_balance = 1;
|
||||
unsigned long leader_nr_running = 0, min_load_per_task = 0;
|
||||
unsigned long min_nr_running = ULONG_MAX;
|
||||
struct sched_group *group_min = NULL, *group_leader = NULL;
|
||||
#endif
|
||||
struct sd_lb_stats sds;
|
||||
|
||||
max_load = this_load = total_load = total_pwr = 0;
|
||||
busiest_load_per_task = busiest_nr_running = 0;
|
||||
this_load_per_task = this_nr_running = 0;
|
||||
memset(&sds, 0, sizeof(sds));
|
||||
|
||||
if (idle == CPU_NOT_IDLE)
|
||||
load_idx = sd->busy_idx;
|
||||
else if (idle == CPU_NEWLY_IDLE)
|
||||
load_idx = sd->newidle_idx;
|
||||
else
|
||||
load_idx = sd->idle_idx;
|
||||
/*
|
||||
* Compute the various statistics relavent for load balancing at
|
||||
* this level.
|
||||
*/
|
||||
update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
|
||||
balance, &sds);
|
||||
|
||||
do {
|
||||
unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
|
||||
int local_group;
|
||||
int i;
|
||||
int __group_imb = 0;
|
||||
unsigned int balance_cpu = -1, first_idle_cpu = 0;
|
||||
unsigned long sum_nr_running, sum_weighted_load;
|
||||
unsigned long sum_avg_load_per_task;
|
||||
unsigned long avg_load_per_task;
|
||||
/* Cases where imbalance does not exist from POV of this_cpu */
|
||||
/* 1) this_cpu is not the appropriate cpu to perform load balancing
|
||||
* at this level.
|
||||
* 2) There is no busy sibling group to pull from.
|
||||
* 3) This group is the busiest group.
|
||||
* 4) This group is more busy than the avg busieness at this
|
||||
* sched_domain.
|
||||
* 5) The imbalance is within the specified limit.
|
||||
* 6) Any rebalance would lead to ping-pong
|
||||
*/
|
||||
if (balance && !(*balance))
|
||||
goto ret;
|
||||
|
||||
local_group = cpumask_test_cpu(this_cpu,
|
||||
sched_group_cpus(group));
|
||||
|
||||
if (local_group)
|
||||
balance_cpu = cpumask_first(sched_group_cpus(group));
|
||||
|
||||
/* Tally up the load of all CPUs in the group */
|
||||
sum_weighted_load = sum_nr_running = avg_load = 0;
|
||||
sum_avg_load_per_task = avg_load_per_task = 0;
|
||||
|
||||
max_cpu_load = 0;
|
||||
min_cpu_load = ~0UL;
|
||||
|
||||
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
|
||||
struct rq *rq = cpu_rq(i);
|
||||
|
||||
if (*sd_idle && rq->nr_running)
|
||||
*sd_idle = 0;
|
||||
|
||||
/* Bias balancing toward cpus of our domain */
|
||||
if (local_group) {
|
||||
if (idle_cpu(i) && !first_idle_cpu) {
|
||||
first_idle_cpu = 1;
|
||||
balance_cpu = i;
|
||||
}
|
||||
|
||||
load = target_load(i, load_idx);
|
||||
} else {
|
||||
load = source_load(i, load_idx);
|
||||
if (load > max_cpu_load)
|
||||
max_cpu_load = load;
|
||||
if (min_cpu_load > load)
|
||||
min_cpu_load = load;
|
||||
}
|
||||
|
||||
avg_load += load;
|
||||
sum_nr_running += rq->nr_running;
|
||||
sum_weighted_load += weighted_cpuload(i);
|
||||
|
||||
sum_avg_load_per_task += cpu_avg_load_per_task(i);
|
||||
}
|
||||
|
||||
/*
|
||||
* First idle cpu or the first cpu(busiest) in this sched group
|
||||
* is eligible for doing load balancing at this and above
|
||||
* domains. In the newly idle case, we will allow all the cpu's
|
||||
* to do the newly idle load balance.
|
||||
*/
|
||||
if (idle != CPU_NEWLY_IDLE && local_group &&
|
||||
balance_cpu != this_cpu && balance) {
|
||||
*balance = 0;
|
||||
goto ret;
|
||||
}
|
||||
|
||||
total_load += avg_load;
|
||||
total_pwr += group->__cpu_power;
|
||||
|
||||
/* Adjust by relative CPU power of the group */
|
||||
avg_load = sg_div_cpu_power(group,
|
||||
avg_load * SCHED_LOAD_SCALE);
|
||||
|
||||
|
||||
/*
|
||||
* Consider the group unbalanced when the imbalance is larger
|
||||
* than the average weight of two tasks.
|
||||
*
|
||||
* APZ: with cgroup the avg task weight can vary wildly and
|
||||
* might not be a suitable number - should we keep a
|
||||
* normalized nr_running number somewhere that negates
|
||||
* the hierarchy?
|
||||
*/
|
||||
avg_load_per_task = sg_div_cpu_power(group,
|
||||
sum_avg_load_per_task * SCHED_LOAD_SCALE);
|
||||
|
||||
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
|
||||
__group_imb = 1;
|
||||
|
||||
group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
|
||||
|
||||
if (local_group) {
|
||||
this_load = avg_load;
|
||||
this = group;
|
||||
this_nr_running = sum_nr_running;
|
||||
this_load_per_task = sum_weighted_load;
|
||||
} else if (avg_load > max_load &&
|
||||
(sum_nr_running > group_capacity || __group_imb)) {
|
||||
max_load = avg_load;
|
||||
busiest = group;
|
||||
busiest_nr_running = sum_nr_running;
|
||||
busiest_load_per_task = sum_weighted_load;
|
||||
group_imb = __group_imb;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
||||
/*
|
||||
* Busy processors will not participate in power savings
|
||||
* balance.
|
||||
*/
|
||||
if (idle == CPU_NOT_IDLE ||
|
||||
!(sd->flags & SD_POWERSAVINGS_BALANCE))
|
||||
goto group_next;
|
||||
|
||||
/*
|
||||
* If the local group is idle or completely loaded
|
||||
* no need to do power savings balance at this domain
|
||||
*/
|
||||
if (local_group && (this_nr_running >= group_capacity ||
|
||||
!this_nr_running))
|
||||
power_savings_balance = 0;
|
||||
|
||||
/*
|
||||
* If a group is already running at full capacity or idle,
|
||||
* don't include that group in power savings calculations
|
||||
*/
|
||||
if (!power_savings_balance || sum_nr_running >= group_capacity
|
||||
|| !sum_nr_running)
|
||||
goto group_next;
|
||||
|
||||
/*
|
||||
* Calculate the group which has the least non-idle load.
|
||||
* This is the group from where we need to pick up the load
|
||||
* for saving power
|
||||
*/
|
||||
if ((sum_nr_running < min_nr_running) ||
|
||||
(sum_nr_running == min_nr_running &&
|
||||
cpumask_first(sched_group_cpus(group)) >
|
||||
cpumask_first(sched_group_cpus(group_min)))) {
|
||||
group_min = group;
|
||||
min_nr_running = sum_nr_running;
|
||||
min_load_per_task = sum_weighted_load /
|
||||
sum_nr_running;
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the group which is almost near its
|
||||
* capacity but still has some space to pick up some load
|
||||
* from other group and save more power
|
||||
*/
|
||||
if (sum_nr_running <= group_capacity - 1) {
|
||||
if (sum_nr_running > leader_nr_running ||
|
||||
(sum_nr_running == leader_nr_running &&
|
||||
cpumask_first(sched_group_cpus(group)) <
|
||||
cpumask_first(sched_group_cpus(group_leader)))) {
|
||||
group_leader = group;
|
||||
leader_nr_running = sum_nr_running;
|
||||
}
|
||||
}
|
||||
group_next:
|
||||
#endif
|
||||
group = group->next;
|
||||
} while (group != sd->groups);
|
||||
|
||||
if (!busiest || this_load >= max_load || busiest_nr_running == 0)
|
||||
if (!sds.busiest || sds.busiest_nr_running == 0)
|
||||
goto out_balanced;
|
||||
|
||||
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
|
||||
|
||||
if (this_load >= avg_load ||
|
||||
100*max_load <= sd->imbalance_pct*this_load)
|
||||
if (sds.this_load >= sds.max_load)
|
||||
goto out_balanced;
|
||||
|
||||
busiest_load_per_task /= busiest_nr_running;
|
||||
if (group_imb)
|
||||
busiest_load_per_task = min(busiest_load_per_task, avg_load);
|
||||
sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
|
||||
|
||||
if (sds.this_load >= sds.avg_load)
|
||||
goto out_balanced;
|
||||
|
||||
if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
|
||||
goto out_balanced;
|
||||
|
||||
sds.busiest_load_per_task /= sds.busiest_nr_running;
|
||||
if (sds.group_imb)
|
||||
sds.busiest_load_per_task =
|
||||
min(sds.busiest_load_per_task, sds.avg_load);
|
||||
|
||||
/*
|
||||
* We're trying to get all the cpus to the average_load, so we don't
|
||||
|
@ -3410,105 +3760,20 @@ group_next:
|
|||
* by pulling tasks to us. Be careful of negative numbers as they'll
|
||||
* appear as very large values with unsigned longs.
|
||||
*/
|
||||
if (max_load <= busiest_load_per_task)
|
||||
if (sds.max_load <= sds.busiest_load_per_task)
|
||||
goto out_balanced;
|
||||
|
||||
/*
|
||||
* In the presence of smp nice balancing, certain scenarios can have
|
||||
* max load less than avg load(as we skip the groups at or below
|
||||
* its cpu_power, while calculating max_load..)
|
||||
*/
|
||||
if (max_load < avg_load) {
|
||||
*imbalance = 0;
|
||||
goto small_imbalance;
|
||||
}
|
||||
|
||||
/* Don't want to pull so many tasks that a group would go idle */
|
||||
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
|
||||
|
||||
/* How much load to actually move to equalise the imbalance */
|
||||
*imbalance = min(max_pull * busiest->__cpu_power,
|
||||
(avg_load - this_load) * this->__cpu_power)
|
||||
/ SCHED_LOAD_SCALE;
|
||||
|
||||
/*
|
||||
* if *imbalance is less than the average load per runnable task
|
||||
* there is no gaurantee that any tasks will be moved so we'll have
|
||||
* a think about bumping its value to force at least one task to be
|
||||
* moved
|
||||
*/
|
||||
if (*imbalance < busiest_load_per_task) {
|
||||
unsigned long tmp, pwr_now, pwr_move;
|
||||
unsigned int imbn;
|
||||
|
||||
small_imbalance:
|
||||
pwr_move = pwr_now = 0;
|
||||
imbn = 2;
|
||||
if (this_nr_running) {
|
||||
this_load_per_task /= this_nr_running;
|
||||
if (busiest_load_per_task > this_load_per_task)
|
||||
imbn = 1;
|
||||
} else
|
||||
this_load_per_task = cpu_avg_load_per_task(this_cpu);
|
||||
|
||||
if (max_load - this_load + busiest_load_per_task >=
|
||||
busiest_load_per_task * imbn) {
|
||||
*imbalance = busiest_load_per_task;
|
||||
return busiest;
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, we don't have enough imbalance to justify moving tasks,
|
||||
* however we may be able to increase total CPU power used by
|
||||
* moving them.
|
||||
*/
|
||||
|
||||
pwr_now += busiest->__cpu_power *
|
||||
min(busiest_load_per_task, max_load);
|
||||
pwr_now += this->__cpu_power *
|
||||
min(this_load_per_task, this_load);
|
||||
pwr_now /= SCHED_LOAD_SCALE;
|
||||
|
||||
/* Amount of load we'd subtract */
|
||||
tmp = sg_div_cpu_power(busiest,
|
||||
busiest_load_per_task * SCHED_LOAD_SCALE);
|
||||
if (max_load > tmp)
|
||||
pwr_move += busiest->__cpu_power *
|
||||
min(busiest_load_per_task, max_load - tmp);
|
||||
|
||||
/* Amount of load we'd add */
|
||||
if (max_load * busiest->__cpu_power <
|
||||
busiest_load_per_task * SCHED_LOAD_SCALE)
|
||||
tmp = sg_div_cpu_power(this,
|
||||
max_load * busiest->__cpu_power);
|
||||
else
|
||||
tmp = sg_div_cpu_power(this,
|
||||
busiest_load_per_task * SCHED_LOAD_SCALE);
|
||||
pwr_move += this->__cpu_power *
|
||||
min(this_load_per_task, this_load + tmp);
|
||||
pwr_move /= SCHED_LOAD_SCALE;
|
||||
|
||||
/* Move if we gain throughput */
|
||||
if (pwr_move > pwr_now)
|
||||
*imbalance = busiest_load_per_task;
|
||||
}
|
||||
|
||||
return busiest;
|
||||
/* Looks like there is an imbalance. Compute it */
|
||||
calculate_imbalance(&sds, this_cpu, imbalance);
|
||||
return sds.busiest;
|
||||
|
||||
out_balanced:
|
||||
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
||||
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
|
||||
goto ret;
|
||||
|
||||
if (this == group_leader && group_leader != group_min) {
|
||||
*imbalance = min_load_per_task;
|
||||
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
|
||||
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
|
||||
cpumask_first(sched_group_cpus(group_leader));
|
||||
}
|
||||
return group_min;
|
||||
}
|
||||
#endif
|
||||
/*
|
||||
* There is no obvious imbalance. But check if we can do some balancing
|
||||
* to save power.
|
||||
*/
|
||||
if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
|
||||
return sds.busiest;
|
||||
ret:
|
||||
*imbalance = 0;
|
||||
return NULL;
|
||||
|
|
Loading…
Reference in a new issue