sched/numa: Skip some page migrations after a shared fault
Shared faults can lead to lots of unnecessary page migrations, slowing down
the system and causing private faults to hit the per-pgdat migration
ratelimit.

This patch adds the sysctl numa_balancing_migrate_deferred, which specifies
how many shared page migrations to skip unconditionally after each page
migration that is skipped because it is a shared fault.

This reduces the number of page migrations back and forth in shared fault
situations. It also gives a strong preference to the tasks that are already
running where most of the memory is, and to moving the other tasks closer
to that memory.

Testing this with a much higher scan rate than the default still seems to
result in fewer page migrations than before. Memory seems to be somewhat
better consolidated than previously, with multi-instance specjbb runs on a
4-node system.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-62-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit de1c9ce6f0 (parent 1e3646ffc6)
5 changed files with 75 additions and 3 deletions
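For intuition about what the new knob does, here is an illustrative userspace sketch with made-up names (it is not the kernel code, which follows in the diff below): once a shared fault is rejected by the two-stage cpupid filter, the next sysctl-many shared-page migrations are skipped outright, so with the default of 16 a burst of shared faults migrates far fewer pages.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for the per-task counter and the new sysctl. */
static unsigned int deferred_left;
static const unsigned int migrate_deferred = 16;	/* default in the patch */

/*
 * A shared hinting fault whose page would otherwise be migrated:
 * while the deferral counter is non-zero, skip the migration and
 * consume one unit of the counter.
 */
static bool shared_migration_deferred(void)
{
	if (deferred_left) {
		deferred_left--;
		return true;
	}
	return false;
}

/* A shared fault rejected by the two-stage filter arms the counter. */
static void arm_deferral(void)
{
	deferred_left = migrate_deferred;
}

int main(void)
{
	int skipped = 0, migrated = 0;

	/* One filter rejection, then a burst of shared faults that would migrate. */
	arm_deferral();
	for (int i = 0; i < 32; i++) {
		if (shared_migration_deferred())
			skipped++;
		else
			migrated++;
	}
	printf("skipped %d, migrated %d\n", skipped, migrated);	/* skipped 16, migrated 16 */
	return 0;
}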
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls.
+numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
+numa_balancing_migrate_deferred.
 
 ==============================================================
 
@@ -421,6 +422,13 @@ the schedule balancer stops pushing the task towards a preferred node. This
 gives the scheduler a chance to place the task on an alternative node if the
 preferred node is overloaded.
 
+numa_balancing_migrate_deferred is how many page migrations get skipped
+unconditionally, after a page migration is skipped because a page is shared
+with other tasks. This reduces page migration overhead, and determines
+how much stronger the "move task near its memory" policy scheduler becomes,
+versus the "move memory near its task" memory management policy, for workloads
+with shared memory.
+
 ==============================================================
 
 osrelease, ostype & version:
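Because the new entry lives in kern_table, it is exposed under /proc/sys/kernel/ alongside the other numa_balancing_* sysctls described above. A minimal sketch of reading and raising it from userspace (assuming the usual procfs mount and root privileges for the write; this snippet is not part of the patch):

#include <stdio.h>

#define KNOB "/proc/sys/kernel/numa_balancing_migrate_deferred"

int main(void)
{
	unsigned int val;
	FILE *f = fopen(KNOB, "r");

	if (!f || fscanf(f, "%u", &val) != 1) {
		perror(KNOB);
		return 1;
	}
	fclose(f);
	printf("numa_balancing_migrate_deferred = %u\n", val);

	/* A larger value defers more shared-page migrations (write needs root). */
	f = fopen(KNOB, "w");
	if (f) {
		fprintf(f, "%u\n", 64u);
		fclose(f);
	}
	return 0;
}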
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1342,6 +1342,8 @@ struct task_struct {
 	int numa_scan_seq;
 	unsigned int numa_scan_period;
 	unsigned int numa_scan_period_max;
+	int numa_preferred_nid;
+	int numa_migrate_deferred;
 	unsigned long numa_migrate_retry;
 	u64 node_stamp;			/* migration stamp */
 	struct callback_head numa_work;
@@ -1372,7 +1374,6 @@ struct task_struct {
 	 */
 	unsigned long numa_faults_locality[2];
 
-	int numa_preferred_nid;
 	unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -833,6 +833,14 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "numa_balancing_migrate_deferred",
+		.data		= &sysctl_numa_balancing_migrate_deferred,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
 	{
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,6 +2301,35 @@ static void sp_free(struct sp_node *n)
 	kmem_cache_free(sn_cache, n);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	/* Never defer a private fault */
+	if (cpupid_match_pid(p, last_cpupid))
+		return false;
+
+	if (p->numa_migrate_deferred) {
+		p->numa_migrate_deferred--;
+		return true;
+	}
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+	return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /**
  * mpol_misplaced - check whether current page node is valid in policy
  *
@@ -2402,7 +2431,24 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 * relation.
 		 */
 		last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
-		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid)
+		if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+			/* See sysctl_numa_balancing_migrate_deferred comment */
+			if (!cpupid_match_pid(current, last_cpupid))
+				defer_numa_migrate(current);
+
+			goto out;
+		}
+
+		/*
+		 * The quadratic filter above reduces extraneous migration
+		 * of shared pages somewhat. This code reduces it even more,
+		 * reducing the overhead of page migrations of shared pages.
+		 * This makes workloads with shared pages rely more on
+		 * "move task near its memory", and less on "move memory
+		 * towards its task", which is exactly what we want.
+		 */
+		if (numa_migrate_deferred(current, last_cpupid))
 			goto out;
 	}
 
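One way to check whether deferral is actually reducing migrations for a workload is to sample the NUMA balancing counters in /proc/vmstat before and after a run. A small sketch, assuming CONFIG_NUMA_BALANCING is enabled so that counters such as numa_hint_faults and numa_pages_migrated are present (these counter names come from the existing NUMA balancing statistics, not from this patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char name[64];
	unsigned long long value;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	/* /proc/vmstat is a list of "name value" pairs, one per line. */
	while (fscanf(f, "%63s %llu", name, &value) == 2) {
		if (strncmp(name, "numa_", 5) == 0)
			printf("%s %llu\n", name, value);
	}
	fclose(f);
	return 0;
}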