memcg: coalesce uncharge during unmap/truncate
In a massively parallel environment, res_counter can be a performance bottleneck. One strong technique to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge characteristics, - charge is done one by one via demand-paging. - uncharge is done - in chunks at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is for coalescing uncharge. To avoid scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. The reason for per-task batching is to make use of the caller's context information. We do batched uncharge (delayed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/truncate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Then, we will not coalesce too much. On an x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per cpu by taskset and seeing the sum of the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amount of improvement. (root cgroup isn't affected by this patch) Another patch for "charge" will follow this and the above will be improved more. Changelog(since 2009/10/02): - renamed fields of memcg_batch (as pages to bytes, memsw to memsw_bytes) - some clean up and commentary/description updates. - added initialize code to copy_process(). 
(possible bug fix) Changelog(old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added comments. - make ->do_batch as bool. - removed css_get() et al. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
cd9b45b78a
commit
569b846df5
6 changed files with 123 additions and 6 deletions
|
@ -54,6 +54,11 @@ extern void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru);
|
|||
extern void mem_cgroup_del_lru(struct page *page);
|
||||
extern void mem_cgroup_move_lists(struct page *page,
|
||||
enum lru_list from, enum lru_list to);
|
||||
|
||||
/* For coalescing uncharges to reduce memcg's overhead */
|
||||
extern void mem_cgroup_uncharge_start(void);
|
||||
extern void mem_cgroup_uncharge_end(void);
|
||||
|
||||
extern void mem_cgroup_uncharge_page(struct page *page);
|
||||
extern void mem_cgroup_uncharge_cache_page(struct page *page);
|
||||
extern int mem_cgroup_shmem_charge_fallback(struct page *page,
|
||||
|
@ -151,6 +156,14 @@ static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *ptr)
|
|||
{
|
||||
}
|
||||
|
||||
/*
 * Empty inline variant of mem_cgroup_uncharge_start(): opening an uncharge
 * batch does nothing here.
 * NOTE(review): presumably this sits in the branch of the header used when
 * the memory controller is compiled out - confirm against the full file.
 */
static inline void mem_cgroup_uncharge_start(void)
|
||||
{
|
||||
}
|
||||
|
||||
/*
 * Empty inline variant of mem_cgroup_uncharge_end(): closing an uncharge
 * batch does nothing here.
 * NOTE(review): presumably this sits in the branch of the header used when
 * the memory controller is compiled out - confirm against the full file.
 */
static inline void mem_cgroup_uncharge_end(void)
|
||||
{
|
||||
}
|
||||
|
||||
/*
 * Empty inline variant of mem_cgroup_uncharge_page(): @page is ignored and
 * nothing is uncharged.
 * NOTE(review): presumably this sits in the branch of the header used when
 * the memory controller is compiled out - confirm against the full file.
 */
static inline void mem_cgroup_uncharge_page(struct page *page)
|
||||
{
|
||||
}
|
||||
|
|
|
@ -1544,6 +1544,14 @@ struct task_struct {
|
|||
unsigned long trace_recursion;
|
||||
#endif /* CONFIG_TRACING */
|
||||
unsigned long stack_start;
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
|
||||
/*
 * Per-task state for coalescing memcg uncharges. It is reset by the
 * outermost mem_cgroup_uncharge_start() and flushed to the res_counters by
 * the matching outermost mem_cgroup_uncharge_end().
 */
struct memcg_batch_info {
|
||||
int do_batch; /* nesting depth: ++ in uncharge_start(), -- in uncharge_end(); 0 means no batching */
|
||||
struct mem_cgroup *memcg; /* memcg the pending uncharges belong to; NULL until the first batched uncharge */
|
||||
unsigned long bytes; /* pending uncharge of the memory counter, accumulated PAGE_SIZE at a time */
|
||||
unsigned long memsw_bytes; /* pending uncharge of the mem+swap counter, accumulated PAGE_SIZE at a time */
|
||||
} memcg_batch;
|
||||
#endif
|
||||
};
|
||||
|
||||
/* Future-safe accessor for struct task_struct's cpus_allowed. */
|
||||
|
|
|
@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||
#ifdef CONFIG_DEBUG_MUTEXES
|
||||
p->blocked_on = NULL; /* not blocked yet */
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
|
||||
p->memcg_batch.do_batch = 0;
|
||||
p->memcg_batch.memcg = NULL;
|
||||
#endif
|
||||
|
||||
p->bts = NULL;
|
||||
|
||||
|
|
|
@ -1827,6 +1827,50 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
|
|||
css_put(&mem->css);
|
||||
}
|
||||
|
||||
static void
|
||||
__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
|
||||
{
|
||||
struct memcg_batch_info *batch = NULL;
|
||||
bool uncharge_memsw = true;
|
||||
/* If swapout, usage of swap doesn't decrease */
|
||||
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
|
||||
uncharge_memsw = false;
|
||||
/*
|
||||
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
|
||||
* In those cases, all pages freed continously can be expected to be in
|
||||
* the same cgroup and we have chance to coalesce uncharges.
|
||||
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
|
||||
* because we want to do uncharge as soon as possible.
|
||||
*/
|
||||
if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
|
||||
goto direct_uncharge;
|
||||
|
||||
batch = ¤t->memcg_batch;
|
||||
/*
|
||||
* In usual, we do css_get() when we remember memcg pointer.
|
||||
* But in this case, we keep res->usage until end of a series of
|
||||
* uncharges. Then, it's ok to ignore memcg's refcnt.
|
||||
*/
|
||||
if (!batch->memcg)
|
||||
batch->memcg = mem;
|
||||
/*
|
||||
* In typical case, batch->memcg == mem. This means we can
|
||||
* merge a series of uncharges to an uncharge of res_counter.
|
||||
* If not, we uncharge res_counter ony by one.
|
||||
*/
|
||||
if (batch->memcg != mem)
|
||||
goto direct_uncharge;
|
||||
/* remember freed charge and uncharge it later */
|
||||
batch->bytes += PAGE_SIZE;
|
||||
if (uncharge_memsw)
|
||||
batch->memsw_bytes += PAGE_SIZE;
|
||||
return;
|
||||
direct_uncharge:
|
||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
if (uncharge_memsw)
|
||||
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* uncharge if !page_mapped(page)
|
||||
|
@ -1875,12 +1919,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
|
|||
break;
|
||||
}
|
||||
|
||||
if (!mem_cgroup_is_root(mem)) {
|
||||
res_counter_uncharge(&mem->res, PAGE_SIZE);
|
||||
if (do_swap_account &&
|
||||
(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
|
||||
res_counter_uncharge(&mem->memsw, PAGE_SIZE);
|
||||
}
|
||||
if (!mem_cgroup_is_root(mem))
|
||||
__do_uncharge(mem, ctype);
|
||||
if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
|
||||
mem_cgroup_swap_statistics(mem, true);
|
||||
mem_cgroup_charge_statistics(mem, pc, false);
|
||||
|
@ -1926,6 +1966,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
|
|||
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
|
||||
* In that cases, pages are freed continuously and we can expect pages
|
||||
* are in the same memcg. All these calls itself limits the number of
|
||||
* pages freed at once, then uncharge_start/end() is called properly.
|
||||
* This may be called prural(2) times in a context,
|
||||
*/
|
||||
|
||||
void mem_cgroup_uncharge_start(void)
|
||||
{
|
||||
current->memcg_batch.do_batch++;
|
||||
/* We can do nest. */
|
||||
if (current->memcg_batch.do_batch == 1) {
|
||||
current->memcg_batch.memcg = NULL;
|
||||
current->memcg_batch.bytes = 0;
|
||||
current->memcg_batch.memsw_bytes = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void mem_cgroup_uncharge_end(void)
|
||||
{
|
||||
struct memcg_batch_info *batch = ¤t->memcg_batch;
|
||||
|
||||
if (!batch->do_batch)
|
||||
return;
|
||||
|
||||
batch->do_batch--;
|
||||
if (batch->do_batch) /* If stacked, do nothing. */
|
||||
return;
|
||||
|
||||
if (!batch->memcg)
|
||||
return;
|
||||
/*
|
||||
* This "batch->memcg" is valid without any css_get/put etc...
|
||||
* bacause we hide charges behind us.
|
||||
*/
|
||||
if (batch->bytes)
|
||||
res_counter_uncharge(&batch->memcg->res, batch->bytes);
|
||||
if (batch->memsw_bytes)
|
||||
res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
|
||||
/* forget this pointer (for sanity check) */
|
||||
batch->memcg = NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
/*
|
||||
* called after __delete_from_swap_cache() and drop "page" account.
|
||||
|
|
|
@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
|
|||
details = NULL;
|
||||
|
||||
BUG_ON(addr >= end);
|
||||
mem_cgroup_uncharge_start();
|
||||
tlb_start_vma(tlb, vma);
|
||||
pgd = pgd_offset(vma->vm_mm, addr);
|
||||
do {
|
||||
|
@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
|
|||
zap_work, details);
|
||||
} while (pgd++, addr = next, (addr != end && *zap_work > 0));
|
||||
tlb_end_vma(tlb, vma);
|
||||
mem_cgroup_uncharge_end();
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
|
|
@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
|
|||
pagevec_release(&pvec);
|
||||
break;
|
||||
}
|
||||
mem_cgroup_uncharge_start();
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
|
||||
|
@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
|
|||
unlock_page(page);
|
||||
}
|
||||
pagevec_release(&pvec);
|
||||
mem_cgroup_uncharge_end();
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(truncate_inode_pages_range);
|
||||
|
@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
|
|||
pagevec_init(&pvec, 0);
|
||||
while (next <= end &&
|
||||
pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
|
||||
mem_cgroup_uncharge_start();
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
pgoff_t index;
|
||||
|
@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
|
|||
break;
|
||||
}
|
||||
pagevec_release(&pvec);
|
||||
mem_cgroup_uncharge_end();
|
||||
cond_resched();
|
||||
}
|
||||
return ret;
|
||||
|
@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
|
|||
while (next <= end && !wrapped &&
|
||||
pagevec_lookup(&pvec, mapping, next,
|
||||
min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
|
||||
mem_cgroup_uncharge_start();
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
pgoff_t page_index;
|
||||
|
@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
|
|||
unlock_page(page);
|
||||
}
|
||||
pagevec_release(&pvec);
|
||||
mem_cgroup_uncharge_end();
|
||||
cond_resched();
|
||||
}
|
||||
return ret;
|
||||
|
|
Loading…
Reference in a new issue