mm: Remove i_mmap_lock lockbreak
Hugh says:
"The only significant loser, I think, would be page reclaim (when
concurrent with truncation): could spin for a long time waiting for
the i_mmap_mutex it expects would soon be dropped? "
Counter points:
- cpu contention makes the spin stop (need_resched())
- zap pages should be freeing pages at a higher rate than reclaim
ever can
I think the simplification of the truncate code is definitely worth it.
Effectively reverts: 2aa15890f3
("mm: prevent concurrent
unmap_mapping_range() on the same inode") and takes out the code that
caused its problem.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
e4c70a6629
commit
97a894136f
8 changed files with 28 additions and 188 deletions
|
@ -331,7 +331,6 @@ void address_space_init_once(struct address_space *mapping)
|
|||
spin_lock_init(&mapping->private_lock);
|
||||
INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
|
||||
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
|
||||
mutex_init(&mapping->unmap_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(address_space_init_once);
|
||||
|
||||
|
|
|
@ -635,7 +635,6 @@ struct address_space {
|
|||
struct prio_tree_root i_mmap; /* tree of private and shared mappings */
|
||||
struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
|
||||
spinlock_t i_mmap_lock; /* protect tree, count, list */
|
||||
unsigned int truncate_count; /* Cover race condition with truncate */
|
||||
unsigned long nrpages; /* number of total pages */
|
||||
pgoff_t writeback_index;/* writeback starts here */
|
||||
const struct address_space_operations *a_ops; /* methods */
|
||||
|
@ -644,7 +643,6 @@ struct address_space {
|
|||
spinlock_t private_lock; /* for use by the address_space */
|
||||
struct list_head private_list; /* ditto */
|
||||
struct address_space *assoc_mapping; /* ditto */
|
||||
struct mutex unmap_mutex; /* to protect unmapping */
|
||||
} __attribute__((aligned(sizeof(long))));
|
||||
/*
|
||||
* On most architectures that alignment is already the case; but
|
||||
|
|
|
@ -895,8 +895,6 @@ struct zap_details {
|
|||
struct address_space *check_mapping; /* Check page->mapping if set */
|
||||
pgoff_t first_index; /* Lowest page->index to unmap */
|
||||
pgoff_t last_index; /* Highest page->index to unmap */
|
||||
spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
|
||||
unsigned long truncate_count; /* Compare vm_truncate_count */
|
||||
};
|
||||
|
||||
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
|
||||
|
|
|
@ -175,7 +175,6 @@ struct vm_area_struct {
|
|||
units, *not* PAGE_CACHE_SIZE */
|
||||
struct file * vm_file; /* File we map to (can be NULL). */
|
||||
void * vm_private_data; /* was vm_pte (shared mem) */
|
||||
unsigned long vm_truncate_count;/* truncate_count or restart_addr */
|
||||
|
||||
#ifndef CONFIG_MMU
|
||||
struct vm_region *vm_region; /* NOMMU mapping region */
|
||||
|
|
|
@ -386,7 +386,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
|
|||
spin_lock(&mapping->i_mmap_lock);
|
||||
if (tmp->vm_flags & VM_SHARED)
|
||||
mapping->i_mmap_writable++;
|
||||
tmp->vm_truncate_count = mpnt->vm_truncate_count;
|
||||
flush_dcache_mmap_lock(mapping);
|
||||
/* insert tmp into the share list, just after mpnt */
|
||||
vma_prio_tree_add(tmp, mpnt);
|
||||
|
|
195
mm/memory.c
195
mm/memory.c
|
@ -986,13 +986,13 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
|||
static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long addr, unsigned long end,
|
||||
long *zap_work, struct zap_details *details)
|
||||
struct zap_details *details)
|
||||
{
|
||||
struct mm_struct *mm = tlb->mm;
|
||||
int force_flush = 0;
|
||||
pte_t *pte;
|
||||
spinlock_t *ptl;
|
||||
int rss[NR_MM_COUNTERS];
|
||||
spinlock_t *ptl;
|
||||
pte_t *pte;
|
||||
|
||||
again:
|
||||
init_rss_vec(rss);
|
||||
|
@ -1001,12 +1001,9 @@ again:
|
|||
do {
|
||||
pte_t ptent = *pte;
|
||||
if (pte_none(ptent)) {
|
||||
(*zap_work)--;
|
||||
continue;
|
||||
}
|
||||
|
||||
(*zap_work) -= PAGE_SIZE;
|
||||
|
||||
if (pte_present(ptent)) {
|
||||
struct page *page;
|
||||
|
||||
|
@ -1075,7 +1072,7 @@ again:
|
|||
print_bad_pte(vma, addr, ptent, NULL);
|
||||
}
|
||||
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
|
||||
} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
|
||||
} while (pte++, addr += PAGE_SIZE, addr != end);
|
||||
|
||||
add_mm_rss_vec(mm, rss);
|
||||
arch_leave_lazy_mmu_mode();
|
||||
|
@ -1099,7 +1096,7 @@ again:
|
|||
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma, pud_t *pud,
|
||||
unsigned long addr, unsigned long end,
|
||||
long *zap_work, struct zap_details *details)
|
||||
struct zap_details *details)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
unsigned long next;
|
||||
|
@ -1111,19 +1108,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
|||
if (next-addr != HPAGE_PMD_SIZE) {
|
||||
VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
|
||||
split_huge_page_pmd(vma->vm_mm, pmd);
|
||||
} else if (zap_huge_pmd(tlb, vma, pmd)) {
|
||||
(*zap_work)--;
|
||||
} else if (zap_huge_pmd(tlb, vma, pmd))
|
||||
continue;
|
||||
}
|
||||
/* fall through */
|
||||
}
|
||||
if (pmd_none_or_clear_bad(pmd)) {
|
||||
(*zap_work)--;
|
||||
if (pmd_none_or_clear_bad(pmd))
|
||||
continue;
|
||||
}
|
||||
next = zap_pte_range(tlb, vma, pmd, addr, next,
|
||||
zap_work, details);
|
||||
} while (pmd++, addr = next, (addr != end && *zap_work > 0));
|
||||
next = zap_pte_range(tlb, vma, pmd, addr, next, details);
|
||||
cond_resched();
|
||||
} while (pmd++, addr = next, addr != end);
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
@ -1131,7 +1124,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
|||
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma, pgd_t *pgd,
|
||||
unsigned long addr, unsigned long end,
|
||||
long *zap_work, struct zap_details *details)
|
||||
struct zap_details *details)
|
||||
{
|
||||
pud_t *pud;
|
||||
unsigned long next;
|
||||
|
@ -1139,13 +1132,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
|
|||
pud = pud_offset(pgd, addr);
|
||||
do {
|
||||
next = pud_addr_end(addr, end);
|
||||
if (pud_none_or_clear_bad(pud)) {
|
||||
(*zap_work)--;
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
continue;
|
||||
}
|
||||
next = zap_pmd_range(tlb, vma, pud, addr, next,
|
||||
zap_work, details);
|
||||
} while (pud++, addr = next, (addr != end && *zap_work > 0));
|
||||
next = zap_pmd_range(tlb, vma, pud, addr, next, details);
|
||||
} while (pud++, addr = next, addr != end);
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
@ -1153,7 +1143,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
|
|||
static unsigned long unmap_page_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end,
|
||||
long *zap_work, struct zap_details *details)
|
||||
struct zap_details *details)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
unsigned long next;
|
||||
|
@ -1167,13 +1157,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
|
|||
pgd = pgd_offset(vma->vm_mm, addr);
|
||||
do {
|
||||
next = pgd_addr_end(addr, end);
|
||||
if (pgd_none_or_clear_bad(pgd)) {
|
||||
(*zap_work)--;
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
continue;
|
||||
}
|
||||
next = zap_pud_range(tlb, vma, pgd, addr, next,
|
||||
zap_work, details);
|
||||
} while (pgd++, addr = next, (addr != end && *zap_work > 0));
|
||||
next = zap_pud_range(tlb, vma, pgd, addr, next, details);
|
||||
} while (pgd++, addr = next, addr != end);
|
||||
tlb_end_vma(tlb, vma);
|
||||
mem_cgroup_uncharge_end();
|
||||
|
||||
|
@ -1218,9 +1205,7 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
|
|||
unsigned long end_addr, unsigned long *nr_accounted,
|
||||
struct zap_details *details)
|
||||
{
|
||||
long zap_work = ZAP_BLOCK_SIZE;
|
||||
unsigned long start = start_addr;
|
||||
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
|
||||
mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
|
||||
|
@ -1253,33 +1238,15 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
|
|||
* Since no pte has actually been setup, it is
|
||||
* safe to do nothing in this case.
|
||||
*/
|
||||
if (vma->vm_file) {
|
||||
if (vma->vm_file)
|
||||
unmap_hugepage_range(vma, start, end, NULL);
|
||||
zap_work -= (end - start) /
|
||||
pages_per_huge_page(hstate_vma(vma));
|
||||
}
|
||||
|
||||
start = end;
|
||||
} else
|
||||
start = unmap_page_range(tlb, vma,
|
||||
start, end, &zap_work, details);
|
||||
|
||||
if (zap_work > 0) {
|
||||
BUG_ON(start != end);
|
||||
break;
|
||||
}
|
||||
|
||||
if (need_resched() ||
|
||||
(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
|
||||
if (i_mmap_lock)
|
||||
goto out;
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
zap_work = ZAP_BLOCK_SIZE;
|
||||
start = unmap_page_range(tlb, vma, start, end, details);
|
||||
}
|
||||
}
|
||||
out:
|
||||
|
||||
mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
|
||||
return start; /* which is now the end (or restart) address */
|
||||
}
|
||||
|
@ -2612,96 +2579,11 @@ unwritable_page:
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper functions for unmap_mapping_range().
|
||||
*
|
||||
* __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
|
||||
*
|
||||
* We have to restart searching the prio_tree whenever we drop the lock,
|
||||
* since the iterator is only valid while the lock is held, and anyway
|
||||
* a later vma might be split and reinserted earlier while lock dropped.
|
||||
*
|
||||
* The list of nonlinear vmas could be handled more efficiently, using
|
||||
* a placeholder, but handle it in the same way until a need is shown.
|
||||
* It is important to search the prio_tree before nonlinear list: a vma
|
||||
* may become nonlinear and be shifted from prio_tree to nonlinear list
|
||||
* while the lock is dropped; but never shifted from list to prio_tree.
|
||||
*
|
||||
* In order to make forward progress despite restarting the search,
|
||||
* vm_truncate_count is used to mark a vma as now dealt with, so we can
|
||||
* quickly skip it next time around. Since the prio_tree search only
|
||||
* shows us those vmas affected by unmapping the range in question, we
|
||||
* can't efficiently keep all vmas in step with mapping->truncate_count:
|
||||
* so instead reset them all whenever it wraps back to 0 (then go to 1).
|
||||
* mapping->truncate_count and vma->vm_truncate_count are protected by
|
||||
* i_mmap_lock.
|
||||
*
|
||||
* In order to make forward progress despite repeatedly restarting some
|
||||
* large vma, note the restart_addr from unmap_vmas when it breaks out:
|
||||
* and restart from that address when we reach that vma again. It might
|
||||
* have been split or merged, shrunk or extended, but never shifted: so
|
||||
* restart_addr remains valid so long as it remains in the vma's range.
|
||||
* unmap_mapping_range forces truncate_count to leap over page-aligned
|
||||
* values so we can save vma's restart_addr in its truncate_count field.
|
||||
*/
|
||||
#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
|
||||
|
||||
static void reset_vma_truncate_counts(struct address_space *mapping)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
struct prio_tree_iter iter;
|
||||
|
||||
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
|
||||
vma->vm_truncate_count = 0;
|
||||
list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
|
||||
vma->vm_truncate_count = 0;
|
||||
}
|
||||
|
||||
static int unmap_mapping_range_vma(struct vm_area_struct *vma,
|
||||
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
|
||||
unsigned long start_addr, unsigned long end_addr,
|
||||
struct zap_details *details)
|
||||
{
|
||||
unsigned long restart_addr;
|
||||
int need_break;
|
||||
|
||||
/*
|
||||
* files that support invalidating or truncating portions of the
|
||||
* file from under mmaped areas must have their ->fault function
|
||||
* return a locked page (and set VM_FAULT_LOCKED in the return).
|
||||
* This provides synchronisation against concurrent unmapping here.
|
||||
*/
|
||||
|
||||
again:
|
||||
restart_addr = vma->vm_truncate_count;
|
||||
if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
|
||||
start_addr = restart_addr;
|
||||
if (start_addr >= end_addr) {
|
||||
/* Top of vma has been split off since last time */
|
||||
vma->vm_truncate_count = details->truncate_count;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
restart_addr = zap_page_range(vma, start_addr,
|
||||
end_addr - start_addr, details);
|
||||
need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
|
||||
|
||||
if (restart_addr >= end_addr) {
|
||||
/* We have now completed this vma: mark it so */
|
||||
vma->vm_truncate_count = details->truncate_count;
|
||||
if (!need_break)
|
||||
return 0;
|
||||
} else {
|
||||
/* Note restart_addr in vma's truncate_count field */
|
||||
vma->vm_truncate_count = restart_addr;
|
||||
if (!need_break)
|
||||
goto again;
|
||||
}
|
||||
|
||||
spin_unlock(details->i_mmap_lock);
|
||||
cond_resched();
|
||||
spin_lock(details->i_mmap_lock);
|
||||
return -EINTR;
|
||||
zap_page_range(vma, start_addr, end_addr - start_addr, details);
|
||||
}
|
||||
|
||||
static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
|
||||
|
@ -2711,12 +2593,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
|
|||
struct prio_tree_iter iter;
|
||||
pgoff_t vba, vea, zba, zea;
|
||||
|
||||
restart:
|
||||
vma_prio_tree_foreach(vma, &iter, root,
|
||||
details->first_index, details->last_index) {
|
||||
/* Skip quickly over those we have already dealt with */
|
||||
if (vma->vm_truncate_count == details->truncate_count)
|
||||
continue;
|
||||
|
||||
vba = vma->vm_pgoff;
|
||||
vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
|
||||
|
@ -2728,11 +2606,10 @@ restart:
|
|||
if (zea > vea)
|
||||
zea = vea;
|
||||
|
||||
if (unmap_mapping_range_vma(vma,
|
||||
unmap_mapping_range_vma(vma,
|
||||
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
|
||||
((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
|
||||
details) < 0)
|
||||
goto restart;
|
||||
details);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2747,15 +2624,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
|
|||
* across *all* the pages in each nonlinear VMA, not just the pages
|
||||
* whose virtual address lies outside the file truncation point.
|
||||
*/
|
||||
restart:
|
||||
list_for_each_entry(vma, head, shared.vm_set.list) {
|
||||
/* Skip quickly over those we have already dealt with */
|
||||
if (vma->vm_truncate_count == details->truncate_count)
|
||||
continue;
|
||||
details->nonlinear_vma = vma;
|
||||
if (unmap_mapping_range_vma(vma, vma->vm_start,
|
||||
vma->vm_end, details) < 0)
|
||||
goto restart;
|
||||
unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2794,26 +2665,14 @@ void unmap_mapping_range(struct address_space *mapping,
|
|||
details.last_index = hba + hlen - 1;
|
||||
if (details.last_index < details.first_index)
|
||||
details.last_index = ULONG_MAX;
|
||||
details.i_mmap_lock = &mapping->i_mmap_lock;
|
||||
|
||||
mutex_lock(&mapping->unmap_mutex);
|
||||
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
|
||||
/* Protect against endless unmapping loops */
|
||||
mapping->truncate_count++;
|
||||
if (unlikely(is_restart_addr(mapping->truncate_count))) {
|
||||
if (mapping->truncate_count == 0)
|
||||
reset_vma_truncate_counts(mapping);
|
||||
mapping->truncate_count++;
|
||||
}
|
||||
details.truncate_count = mapping->truncate_count;
|
||||
|
||||
if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
|
||||
unmap_mapping_range_tree(&mapping->i_mmap, &details);
|
||||
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
|
||||
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
|
||||
spin_unlock(&mapping->i_mmap_lock);
|
||||
mutex_unlock(&mapping->unmap_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(unmap_mapping_range);
|
||||
|
||||
|
|
13
mm/mmap.c
13
mm/mmap.c
|
@ -445,10 +445,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
if (vma->vm_file)
|
||||
mapping = vma->vm_file->f_mapping;
|
||||
|
||||
if (mapping) {
|
||||
if (mapping)
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
vma->vm_truncate_count = mapping->truncate_count;
|
||||
}
|
||||
|
||||
__vma_link(mm, vma, prev, rb_link, rb_parent);
|
||||
__vma_link_file(vma);
|
||||
|
@ -558,16 +556,7 @@ again: remove_next = 1 + (end > next->vm_end);
|
|||
if (!(vma->vm_flags & VM_NONLINEAR))
|
||||
root = &mapping->i_mmap;
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
if (importer &&
|
||||
vma->vm_truncate_count != next->vm_truncate_count) {
|
||||
/*
|
||||
* unmap_mapping_range might be in progress:
|
||||
* ensure that the expanding vma is rescanned.
|
||||
*/
|
||||
importer->vm_truncate_count = 0;
|
||||
}
|
||||
if (insert) {
|
||||
insert->vm_truncate_count = vma->vm_truncate_count;
|
||||
/*
|
||||
* Put into prio_tree now, so instantiated pages
|
||||
* are visible to arm/parisc __flush_dcache_page
|
||||
|
|
|
@ -94,7 +94,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
|
|||
*/
|
||||
mapping = vma->vm_file->f_mapping;
|
||||
spin_lock(&mapping->i_mmap_lock);
|
||||
new_vma->vm_truncate_count = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
Loading…
Reference in a new issue