Merge branch 'akpm' (more incoming from Andrew)
Merge second patch-bomb from Andrew Morton: - A little DM fix - the MM queue * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (154 commits) ksm: allocate roots when needed mm: cleanup "swapcache" in do_swap_page mm,ksm: swapoff might need to copy mm,ksm: FOLL_MIGRATION do migration_entry_wait ksm: shrink 32-bit rmap_item back to 32 bytes ksm: treat unstable nid like in stable tree ksm: add some comments tmpfs: fix mempolicy object leaks tmpfs: fix use-after-free of mempolicy object mm/fadvise.c: drain all pagevecs if POSIX_FADV_DONTNEED fails to discard all pages mm: export mmu notifier invalidates mm: accelerate mm_populate() treatment of THP pages mm: use long type for page counts in mm_populate() and get_user_pages() mm: accurately document nr_free_*_pages functions with code comments HWPOISON: change order of error_states[]'s elements HWPOISON: fix misjudgement of page_action() for errors on mlocked pages memcg: stop warning on memcg_propagate_kmem net: change type of virtio_chan->p9_max_pages vmscan: change type of vm_total_pages to unsigned long fs/nfsd: change type of max_delegations, nfsd_drc_max_mem and nfsd_drc_mem_used ...
This commit is contained in:
commit
5ce1a70e2f
113 changed files with 4443 additions and 1667 deletions
52
Documentation/ABI/testing/sysfs-kernel-mm-ksm
Normal file
52
Documentation/ABI/testing/sysfs-kernel-mm-ksm
Normal file
|
@ -0,0 +1,52 @@
|
|||
What: /sys/kernel/mm/ksm
|
||||
Date: September 2009
|
||||
KernelVersion: 2.6.32
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Interface for Kernel Samepage Merging (KSM)
|
||||
|
||||
What: /sys/kernel/mm/ksm/full_scans
|
||||
What: /sys/kernel/mm/ksm/pages_shared
|
||||
What: /sys/kernel/mm/ksm/pages_sharing
|
||||
What: /sys/kernel/mm/ksm/pages_to_scan
|
||||
What: /sys/kernel/mm/ksm/pages_unshared
|
||||
What: /sys/kernel/mm/ksm/pages_volatile
|
||||
What: /sys/kernel/mm/ksm/run
|
||||
What: /sys/kernel/mm/ksm/sleep_millisecs
|
||||
Date: September 2009
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Kernel Samepage Merging daemon sysfs interface
|
||||
|
||||
full_scans: how many times all mergeable areas have been
|
||||
scanned.
|
||||
|
||||
pages_shared: how many shared pages are being used.
|
||||
|
||||
pages_sharing: how many more sites are sharing them i.e. how
|
||||
much saved.
|
||||
|
||||
pages_to_scan: how many present pages to scan before ksmd goes
|
||||
to sleep.
|
||||
|
||||
pages_unshared: how many pages unique but repeatedly checked
|
||||
for merging.
|
||||
|
||||
pages_volatile: how many pages changing too fast to be placed
|
||||
in a tree.
|
||||
|
||||
run: write 0 to disable ksm, read 0 while ksm is disabled.
|
||||
write 1 to run ksm, read 1 while ksm is running.
|
||||
write 2 to disable ksm and unmerge all its pages.
|
||||
|
||||
sleep_millisecs: how many milliseconds ksm should sleep between
|
||||
scans.
|
||||
|
||||
See Documentation/vm/ksm.txt for more information.
|
||||
|
||||
What: /sys/kernel/mm/ksm/merge_across_nodes
|
||||
Date: January 2013
|
||||
KernelVersion: 3.9
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Control merging pages across different NUMA nodes.
|
||||
|
||||
When it is set to 0 only pages from the same node are merged,
|
||||
otherwise pages from all nodes can be merged together (default).
|
|
@ -1640,6 +1640,42 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
that the amount of memory usable for all allocations
|
||||
is not too small.
|
||||
|
||||
movablemem_map=acpi
|
||||
[KNL,X86,IA-64,PPC] This parameter is similar to
|
||||
memmap except it specifies the memory map of
|
||||
ZONE_MOVABLE.
|
||||
This option informs the kernel to use the Hot Pluggable bit
|
||||
in flags from SRAT from ACPI BIOS to determine which
|
||||
memory devices could be hotplugged. The corresponding
|
||||
memory ranges will be set as ZONE_MOVABLE.
|
||||
NOTE: Whatever node the kernel resides in will always
|
||||
be un-hotpluggable.
|
||||
|
||||
movablemem_map=nn[KMG]@ss[KMG]
|
||||
[KNL,X86,IA-64,PPC] This parameter is similar to
|
||||
memmap except it specifies the memory map of
|
||||
ZONE_MOVABLE.
|
||||
If user specifies memory ranges, the info in SRAT will
|
||||
be ignored. And it works like the following:
|
||||
- If more ranges are all within one node, then from
|
||||
lowest ss to the end of the node will be ZONE_MOVABLE.
|
||||
- If a range is within a node, then from ss to the end
|
||||
of the node will be ZONE_MOVABLE.
|
||||
- If a range covers two or more nodes, then from ss to
|
||||
the end of the 1st node will be ZONE_MOVABLE, and all
|
||||
the remaining nodes will only have ZONE_MOVABLE.
|
||||
If memmap is specified at the same time, the
|
||||
movablemem_map will be limited within the memmap
|
||||
areas. If kernelcore or movablecore is also specified,
|
||||
movablemem_map will have higher priority to be
|
||||
satisfied. So the administrator should be careful that
|
||||
the amount of movablemem_map areas is not too large.
|
||||
Otherwise kernel won't have enough memory to start.
|
||||
NOTE: We don't stop users specifying the node the
|
||||
kernel resides in as hotpluggable so that this
|
||||
option can be used as a workaround of firmware
|
||||
bugs.
|
||||
|
||||
MTD_Partition= [MTD]
|
||||
Format: <name>,<region-number>,<size>,<offset>
|
||||
|
||||
|
|
|
@ -58,6 +58,21 @@ sleep_millisecs - how many milliseconds ksmd should sleep before next scan
|
|||
e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
|
||||
Default: 20 (chosen for demonstration purposes)
|
||||
|
||||
merge_across_nodes - specifies if pages from different NUMA nodes can be merged.
|
||||
When set to 0, ksm merges only pages which physically
|
||||
reside in the memory area of same NUMA node. That brings
|
||||
lower latency to access of shared pages. Systems with more
|
||||
nodes, at significant NUMA distances, are likely to benefit
|
||||
from the lower latency of setting 0. Smaller systems, which
|
||||
need to minimize memory usage, are likely to benefit from
|
||||
the greater sharing of setting 1 (default). You may wish to
|
||||
compare how your system performs under each setting, before
|
||||
deciding on which to use. merge_across_nodes setting can be
|
||||
changed only when there are no ksm shared pages in system:
|
||||
set run 2 to unmerge pages first, then to 1 after changing
|
||||
merge_across_nodes, to remerge according to the new setting.
|
||||
Default: 1 (merging across nodes as in earlier releases)
|
||||
|
||||
run - set 0 to stop ksmd from running but keep merged pages,
|
||||
set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
|
||||
set 2 to stop ksmd and unmerge all pages currently merged,
|
||||
|
|
|
@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page,
|
|||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_ARM64_64K_PAGES */
|
||||
/*
 * vmemmap_free - placeholder for releasing a vmemmap range.
 * @memmap:   first struct page of the range
 * @nr_pages: number of struct pages in the range
 *
 * This implementation performs no work.
 */
void vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
	/* nothing to do */
}
|
||||
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
|
||||
|
|
|
@ -93,7 +93,7 @@ void show_mem(unsigned int filter)
|
|||
printk(KERN_INFO "%d pages swap cached\n", total_cached);
|
||||
printk(KERN_INFO "Total of %ld pages in page table cache\n",
|
||||
quicklist_total_size());
|
||||
printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages());
|
||||
printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -666,7 +666,7 @@ void show_mem(unsigned int filter)
|
|||
printk(KERN_INFO "%d pages swap cached\n", total_cached);
|
||||
printk(KERN_INFO "Total of %ld pages in page table cache\n",
|
||||
quicklist_total_size());
|
||||
printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages());
|
||||
printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -822,4 +822,8 @@ int __meminit vmemmap_populate(struct page *start_page,
|
|||
{
|
||||
return vmemmap_populate_basepages(start_page, size, node);
|
||||
}
|
||||
|
||||
/*
 * vmemmap_free - placeholder for releasing a vmemmap range.
 * @memmap:   first struct page of the range
 * @nr_pages: number of struct pages in the range
 *
 * This implementation performs no work.
 */
void vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
	/* nothing to do */
}
|
||||
#endif
|
||||
|
|
|
@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size)
|
|||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
 * arch_remove_memory - remove the pages backing a physical range.
 * @start: physical start address of the range
 * @size:  length of the range in bytes
 *
 * Converts the byte range to a PFN range, looks up the zone of the
 * first page and hands the range to __remove_pages().  A warning is
 * logged when __remove_pages() reports an error.
 *
 * Returns the value returned by __remove_pages().
 */
int arch_remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long pfn_count = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(first_pfn));
	int err = __remove_pages(zone, first_pfn, pfn_count);

	if (!err)
		return 0;

	pr_warn("%s: Problem encountered in __remove_pages() as"
		" ret=%d\n", __func__, err);
	return err;
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
|
|
@ -297,5 +297,10 @@ int __meminit vmemmap_populate(struct page *start_page,
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * vmemmap_free - placeholder for releasing a vmemmap range.
 * @memmap:   first struct page of the range
 * @nr_pages: number of struct pages in the range
 *
 * This implementation performs no work.
 */
void vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
	/* nothing to do */
}
|
||||
|
||||
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
|
||||
|
||||
|
|
|
@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
|
|||
|
||||
return __add_pages(nid, zone, start_pfn, nr_pages);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
 * arch_remove_memory - remove the pages backing a physical range.
 * @start: physical start address of the range
 * @size:  length of the range in bytes
 *
 * The zone is taken from the first page of the range.
 *
 * Returns the value returned by __remove_pages().
 */
int arch_remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long pfn_count = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(first_pfn));

	return __remove_pages(zone, first_pfn, pfn_count);
}
|
||||
#endif
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG */
|
||||
|
||||
/*
|
||||
|
|
|
@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
|
|||
vmem_remove_mapping(start, size);
|
||||
return rc;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
 * arch_remove_memory - memory hot-remove entry point.
 * @start: physical start address of the range (unused)
 * @size:  length of the range in bytes (unused)
 *
 * s390 has no hardware or firmware interface that could trigger a hot
 * memory remove, so nothing needs to be implemented here.
 *
 * Always returns -EBUSY.
 */
int arch_remove_memory(u64 start, u64 size)
{
	return -EBUSY;
}
|
||||
#endif
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG */
|
||||
|
|
|
@ -268,6 +268,10 @@ out:
|
|||
return ret;
|
||||
}
|
||||
|
||||
/*
 * vmemmap_free - placeholder for releasing a vmemmap range.
 * @memmap:   first struct page of the range
 * @nr_pages: number of struct pages in the range
 *
 * This implementation performs no work.
 */
void vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
	/* nothing to do */
}
|
||||
|
||||
/*
|
||||
* Add memory segment to the segment list if it doesn't overlap with
|
||||
* an already present segment.
|
||||
|
|
|
@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr)
|
|||
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
 * arch_remove_memory - remove the pages backing a physical range.
 * @start: physical start address of the range
 * @size:  length of the range in bytes
 *
 * The zone is taken from the first page of the range.  A warning is
 * logged when __remove_pages() reports an error.
 *
 * Returns the value returned by __remove_pages().
 */
int arch_remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long pfn_count = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(first_pfn));
	int err = __remove_pages(zone, first_pfn, pfn_count);

	if (unlikely(err)) {
		pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
			err);
	}

	return err;
}
|
||||
#endif
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG */
|
||||
|
|
|
@ -57,7 +57,7 @@ void show_mem(unsigned int filter)
|
|||
printk("Mem-info:\n");
|
||||
show_free_areas(filter);
|
||||
printk("Free swap: %6ldkB\n",
|
||||
nr_swap_pages << (PAGE_SHIFT-10));
|
||||
get_nr_swap_pages() << (PAGE_SHIFT-10));
|
||||
printk("%ld pages of RAM\n", totalram_pages);
|
||||
printk("%ld free pages\n", nr_free_pages());
|
||||
}
|
||||
|
|
|
@ -2235,6 +2235,11 @@ void __meminit vmemmap_populate_print_last(void)
|
|||
node_start = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * vmemmap_free - placeholder for releasing a vmemmap range.
 * @memmap:   first struct page of the range
 * @nr_pages: number of struct pages in the range
 *
 * This implementation performs no work.
 */
void vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
	/* nothing to do */
}
|
||||
|
||||
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
|
||||
|
||||
static void prot_init_common(unsigned long page_none,
|
||||
|
|
|
@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
|
|||
if (!retval) {
|
||||
unsigned long addr = MEM_USER_INTRPT;
|
||||
addr = mmap_region(NULL, addr, INTRPT_SIZE,
|
||||
MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
|
||||
VM_READ|VM_EXEC|
|
||||
VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
|
||||
if (addr > (unsigned long) -PAGE_SIZE)
|
||||
|
|
|
@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
|
|||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
 * arch_remove_memory - memory hot-remove entry point.
 * @start: physical start address of the range (unused)
 * @size:  length of the range in bytes (unused)
 *
 * TODO: not implemented yet; always returns -EBUSY.
 */
int arch_remove_memory(u64 start, u64 size)
{
	return -EBUSY;
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
struct kmem_cache *pgd_cache;
|
||||
|
|
|
@ -61,7 +61,7 @@ void show_mem(unsigned int filter)
|
|||
global_page_state(NR_PAGETABLE),
|
||||
global_page_state(NR_BOUNCE),
|
||||
global_page_state(NR_FILE_PAGES),
|
||||
nr_swap_pages);
|
||||
get_nr_swap_pages());
|
||||
|
||||
for_each_zone(zone) {
|
||||
unsigned long flags, order, total = 0, largest_order = -1;
|
||||
|
|
|
@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu)
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
extern void __cpuinit numa_set_node(int cpu, int node);
|
||||
extern void __cpuinit numa_clear_node(int cpu);
|
||||
extern void numa_set_node(int cpu, int node);
|
||||
extern void numa_clear_node(int cpu);
|
||||
extern void __init init_cpu_to_node(void);
|
||||
extern void __cpuinit numa_add_cpu(int cpu);
|
||||
extern void __cpuinit numa_remove_cpu(int cpu);
|
||||
|
|
|
@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
|
|||
* as a pte too.
|
||||
*/
|
||||
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
|
||||
extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
|
||||
extern phys_addr_t slow_virt_to_phys(void *__address);
|
||||
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
|
|
|
@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic);
|
|||
|
||||
int acpi_unmap_lsapic(int cpu)
|
||||
{
|
||||
#ifdef CONFIG_ACPI_NUMA
|
||||
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
|
||||
#endif
|
||||
|
||||
per_cpu(x86_cpu_to_apicid, cpu) = -1;
|
||||
set_cpu_present(cpu, false);
|
||||
num_processors--;
|
||||
|
|
|
@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p)
|
|||
setup_bios_corruption_check();
|
||||
#endif
|
||||
|
||||
/*
|
||||
* In the memory hotplug case, the kernel needs info from SRAT to
|
||||
* determine which memory is hotpluggable before allocating memory
|
||||
* using memblock.
|
||||
*/
|
||||
acpi_boot_table_init();
|
||||
early_acpi_boot_init();
|
||||
early_parse_srat();
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
|
||||
(max_pfn_mapped<<PAGE_SHIFT) - 1);
|
||||
|
@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p)
|
|||
/*
|
||||
* Parse the ACPI tables for possible boot-time SMP configuration.
|
||||
*/
|
||||
acpi_boot_table_init();
|
||||
|
||||
early_acpi_boot_init();
|
||||
|
||||
initmem_init();
|
||||
memblock_find_dma_reserve();
|
||||
|
||||
|
|
|
@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
|
|||
|
||||
return __add_pages(nid, zone, start_pfn, nr_pages);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
 * arch_remove_memory - remove the pages backing a physical range.
 * @start: physical start address of the range
 * @size:  length of the range in bytes
 *
 * The zone is taken from the first page of the range.
 *
 * Returns the value returned by __remove_pages().
 */
int arch_remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long pfn_count = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(first_pfn));

	return __remove_pages(zone, first_pfn, pfn_count);
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
|
|
@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(arch_add_memory);
|
||||
|
||||
#define PAGE_INUSE 0xFD
|
||||
|
||||
/*
 * free_pagetable - release 2^order pages starting at @page.
 * @page:  first page to release
 * @order: log2 of the number of pages to release
 *
 * Pages without the reserved flag are handed to free_pages().  Pages
 * with the reserved flag (bootmem pages) are released either one by one
 * through put_page_bootmem(), when they carry SECTION_INFO or
 * MIX_SECTION_INFO in page->lru.next, or through __free_pages_bootmem().
 * For bootmem pages the zone's present_pages and the global
 * totalram_pages are increased by the number of pages released.
 *
 * The first page and the page count are kept untouched while the pages
 * are released, so the accounting below uses the real zone and the real
 * count.  (Previously the release loop consumed both: the counter
 * wrapped around and the page pointer ended up past the range.)
 */
static void __meminit free_pagetable(struct page *page, int order)
{
	struct zone *zone;
	unsigned long magic;
	unsigned int nr_pages = 1 << order;
	unsigned int i;

	/* Only bootmem pages have the reserved flag. */
	if (!PageReserved(page)) {
		free_pages((unsigned long)page_address(page), order);
		return;
	}

	__ClearPageReserved(page);

	/* Look the zone up from the first page, before it is released. */
	zone = page_zone(page);

	magic = (unsigned long)page->lru.next;
	if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
		for (i = 0; i < nr_pages; i++)
			put_page_bootmem(page + i);
	} else {
		__free_pages_bootmem(page, order);
	}

	/*
	 * SECTION_INFO pages and MIX_SECTION_INFO pages
	 * are all allocated by bootmem.
	 */
	zone_span_writelock(zone);
	zone->present_pages += nr_pages;
	zone_span_writeunlock(zone);
	totalram_pages += nr_pages;
}
|
||||
|
||||
/*
 * free_pte_table - free a PTE table once it holds no entries.
 * @pte_start: first entry of the PTE table
 * @pmd:       PMD entry that points at the table
 *
 * Returns without doing anything if any entry has a non-zero value.
 * Otherwise the table page is released and the PMD entry is cleared
 * under init_mm.page_table_lock.
 */
static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *entry;

	for (entry = pte_start; entry < pte_start + PTRS_PER_PTE; entry++) {
		if (pte_val(*entry))
			return;
	}

	/* every slot is empty: free the pte table */
	free_pagetable(pmd_page(*pmd), 0);

	spin_lock(&init_mm.page_table_lock);
	pmd_clear(pmd);
	spin_unlock(&init_mm.page_table_lock);
}
|
||||
|
||||
/*
 * free_pmd_table - free a PMD table once it holds no entries.
 * @pmd_start: first entry of the PMD table
 * @pud:       PUD entry that points at the table
 *
 * Returns without doing anything if any entry has a non-zero value.
 * Otherwise the table page is released and the PUD entry is cleared
 * under init_mm.page_table_lock.
 */
static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *entry;

	for (entry = pmd_start; entry < pmd_start + PTRS_PER_PMD; entry++) {
		if (pmd_val(*entry))
			return;
	}

	/* every slot is empty: free the pmd table */
	free_pagetable(pud_page(*pud), 0);

	spin_lock(&init_mm.page_table_lock);
	pud_clear(pud);
	spin_unlock(&init_mm.page_table_lock);
}
|
||||
|
||||
/* Return true if pgd is changed, otherwise return false. */
|
||||
/*
 * free_pud_table - free a PUD table once it holds no entries.
 * @pud_start: first entry of the PUD table
 * @pgd:       PGD entry that points at the table
 *
 * If any entry has a non-zero value nothing is done and false is
 * returned.  Otherwise the table page is released, the PGD entry is
 * cleared under init_mm.page_table_lock and true is returned.
 */
static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
{
	pud_t *entry;

	for (entry = pud_start; entry < pud_start + PTRS_PER_PUD; entry++) {
		if (pud_val(*entry))
			return false;
	}

	/* every slot is empty: free the pud table */
	free_pagetable(pgd_page(*pgd), 0);

	spin_lock(&init_mm.page_table_lock);
	pgd_clear(pgd);
	spin_unlock(&init_mm.page_table_lock);

	return true;
}
|
||||
|
||||
static void __meminit
|
||||
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
|
||||
bool direct)
|
||||
{
|
||||
unsigned long next, pages = 0;
|
||||
pte_t *pte;
|
||||
void *page_addr;
|
||||
phys_addr_t phys_addr;
|
||||
|
||||
pte = pte_start + pte_index(addr);
|
||||
for (; addr < end; addr = next, pte++) {
|
||||
next = (addr + PAGE_SIZE) & PAGE_MASK;
|
||||
if (next > end)
|
||||
next = end;
|
||||
|
||||
if (!pte_present(*pte))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* We mapped [0,1G) memory as identity mapping when
|
||||
* initializing, in arch/x86/kernel/head_64.S. These
|
||||
* pagetables cannot be removed.
|
||||
*/
|
||||
phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
|
||||
if (phys_addr < (phys_addr_t)0x40000000)
|
||||
return;
|
||||
|
||||
if (IS_ALIGNED(addr, PAGE_SIZE) &&
|
||||
IS_ALIGNED(next, PAGE_SIZE)) {
|
||||
/*
|
||||
* Do not free direct mapping pages since they were
|
||||
* freed when offlining, or simplely not in use.
|
||||
*/
|
||||
if (!direct)
|
||||
free_pagetable(pte_page(*pte), 0);
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
pte_clear(&init_mm, addr, pte);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
|
||||
/* For non-direct mapping, pages means nothing. */
|
||||
pages++;
|
||||
} else {
|
||||
/*
|
||||
* If we are here, we are freeing vmemmap pages since
|
||||
* direct mapped memory ranges to be freed are aligned.
|
||||
*
|
||||
* If we are not removing the whole page, it means
|
||||
* other page structs in this page are being used and
|
||||
* we canot remove them. So fill the unused page_structs
|
||||
* with 0xFD, and remove the page when it is wholly
|
||||
* filled with 0xFD.
|
||||
*/
|
||||
memset((void *)addr, PAGE_INUSE, next - addr);
|
||||
|
||||
page_addr = page_address(pte_page(*pte));
|
||||
if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
|
||||
free_pagetable(pte_page(*pte), 0);
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
pte_clear(&init_mm, addr, pte);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Call free_pte_table() in remove_pmd_table(). */
|
||||
flush_tlb_all();
|
||||
if (direct)
|
||||
update_page_count(PG_LEVEL_4K, -pages);
|
||||
}
|
||||
|
||||
static void __meminit
|
||||
remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
|
||||
bool direct)
|
||||
{
|
||||
unsigned long next, pages = 0;
|
||||
pte_t *pte_base;
|
||||
pmd_t *pmd;
|
||||
void *page_addr;
|
||||
|
||||
pmd = pmd_start + pmd_index(addr);
|
||||
for (; addr < end; addr = next, pmd++) {
|
||||
next = pmd_addr_end(addr, end);
|
||||
|
||||
if (!pmd_present(*pmd))
|
||||
continue;
|
||||
|
||||
if (pmd_large(*pmd)) {
|
||||
if (IS_ALIGNED(addr, PMD_SIZE) &&
|
||||
IS_ALIGNED(next, PMD_SIZE)) {
|
||||
if (!direct)
|
||||
free_pagetable(pmd_page(*pmd),
|
||||
get_order(PMD_SIZE));
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
pmd_clear(pmd);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
pages++;
|
||||
} else {
|
||||
/* If here, we are freeing vmemmap pages. */
|
||||
memset((void *)addr, PAGE_INUSE, next - addr);
|
||||
|
||||
page_addr = page_address(pmd_page(*pmd));
|
||||
if (!memchr_inv(page_addr, PAGE_INUSE,
|
||||
PMD_SIZE)) {
|
||||
free_pagetable(pmd_page(*pmd),
|
||||
get_order(PMD_SIZE));
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
pmd_clear(pmd);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
pte_base = (pte_t *)pmd_page_vaddr(*pmd);
|
||||
remove_pte_table(pte_base, addr, next, direct);
|
||||
free_pte_table(pte_base, pmd);
|
||||
}
|
||||
|
||||
/* Call free_pmd_table() in remove_pud_table(). */
|
||||
if (direct)
|
||||
update_page_count(PG_LEVEL_2M, -pages);
|
||||
}
|
||||
|
||||
static void __meminit
|
||||
remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
|
||||
bool direct)
|
||||
{
|
||||
unsigned long next, pages = 0;
|
||||
pmd_t *pmd_base;
|
||||
pud_t *pud;
|
||||
void *page_addr;
|
||||
|
||||
pud = pud_start + pud_index(addr);
|
||||
for (; addr < end; addr = next, pud++) {
|
||||
next = pud_addr_end(addr, end);
|
||||
|
||||
if (!pud_present(*pud))
|
||||
continue;
|
||||
|
||||
if (pud_large(*pud)) {
|
||||
if (IS_ALIGNED(addr, PUD_SIZE) &&
|
||||
IS_ALIGNED(next, PUD_SIZE)) {
|
||||
if (!direct)
|
||||
free_pagetable(pud_page(*pud),
|
||||
get_order(PUD_SIZE));
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
pud_clear(pud);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
pages++;
|
||||
} else {
|
||||
/* If here, we are freeing vmemmap pages. */
|
||||
memset((void *)addr, PAGE_INUSE, next - addr);
|
||||
|
||||
page_addr = page_address(pud_page(*pud));
|
||||
if (!memchr_inv(page_addr, PAGE_INUSE,
|
||||
PUD_SIZE)) {
|
||||
free_pagetable(pud_page(*pud),
|
||||
get_order(PUD_SIZE));
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
pud_clear(pud);
|
||||
spin_unlock(&init_mm.page_table_lock);
|
||||
}
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
pmd_base = (pmd_t *)pud_page_vaddr(*pud);
|
||||
remove_pmd_table(pmd_base, addr, next, direct);
|
||||
free_pmd_table(pmd_base, pud);
|
||||
}
|
||||
|
||||
if (direct)
|
||||
update_page_count(PG_LEVEL_1G, -pages);
|
||||
}
|
||||
|
||||
/* start and end are both virtual address. */
|
||||
static void __meminit
|
||||
remove_pagetable(unsigned long start, unsigned long end, bool direct)
|
||||
{
|
||||
unsigned long next;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
bool pgd_changed = false;
|
||||
|
||||
for (; start < end; start = next) {
|
||||
next = pgd_addr_end(start, end);
|
||||
|
||||
pgd = pgd_offset_k(start);
|
||||
if (!pgd_present(*pgd))
|
||||
continue;
|
||||
|
||||
pud = (pud_t *)pgd_page_vaddr(*pgd);
|
||||
remove_pud_table(pud, start, next, direct);
|
||||
if (free_pud_table(pud, pgd))
|
||||
pgd_changed = true;
|
||||
}
|
||||
|
||||
if (pgd_changed)
|
||||
sync_global_pgds(start, end - 1);
|
||||
|
||||
flush_tlb_all();
|
||||
}
|
||||
|
||||
/*
 * vmemmap_free - remove the page tables backing a vmemmap range.
 * @memmap:   first struct page of the range
 * @nr_pages: number of struct pages in the range
 *
 * The range [memmap, memmap + nr_pages) is treated as virtual addresses
 * and removed as a non-direct mapping.
 */
void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
{
	struct page *last = memmap + nr_pages;

	remove_pagetable((unsigned long)memmap, (unsigned long)last, false);
}
|
||||
|
||||
static void __meminit
|
||||
kernel_physical_mapping_remove(unsigned long start, unsigned long end)
|
||||
{
|
||||
start = (unsigned long)__va(start);
|
||||
end = (unsigned long)__va(end);
|
||||
|
||||
remove_pagetable(start, end, true);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
/*
 * arch_remove_memory - remove the pages backing a physical range.
 * @start: physical start address of the range
 * @size:  length of the range in bytes
 *
 * The zone is looked up from the first page before the direct mapping
 * of the range is removed; the pages are then handed to
 * __remove_pages().  A one-time warning is raised if that call fails.
 *
 * Returns the value returned by __remove_pages().
 */
int __ref arch_remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = start >> PAGE_SHIFT;
	unsigned long pfn_count = size >> PAGE_SHIFT;
	struct zone *zone = page_zone(pfn_to_page(first_pfn));
	int err;

	kernel_physical_mapping_remove(start, start + size);

	err = __remove_pages(zone, first_pfn, pfn_count);
	WARN_ON_ONCE(err);

	return err;
}
|
||||
#endif
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG */
|
||||
|
||||
static struct kcore_list kcore_vsyscall;
|
||||
|
@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
|
||||
void register_page_bootmem_memmap(unsigned long section_nr,
|
||||
struct page *start_page, unsigned long size)
|
||||
{
|
||||
unsigned long addr = (unsigned long)start_page;
|
||||
unsigned long end = (unsigned long)(start_page + size);
|
||||
unsigned long next;
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
unsigned int nr_pages;
|
||||
struct page *page;
|
||||
|
||||
for (; addr < end; addr = next) {
|
||||
pte_t *pte = NULL;
|
||||
|
||||
pgd = pgd_offset_k(addr);
|
||||
if (pgd_none(*pgd)) {
|
||||
next = (addr + PAGE_SIZE) & PAGE_MASK;
|
||||
continue;
|
||||
}
|
||||
get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
|
||||
|
||||
pud = pud_offset(pgd, addr);
|
||||
if (pud_none(*pud)) {
|
||||
next = (addr + PAGE_SIZE) & PAGE_MASK;
|
||||
continue;
|
||||
}
|
||||
get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
|
||||
|
||||
if (!cpu_has_pse) {
|
||||
next = (addr + PAGE_SIZE) & PAGE_MASK;
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_none(*pmd))
|
||||
continue;
|
||||
get_page_bootmem(section_nr, pmd_page(*pmd),
|
||||
MIX_SECTION_INFO);
|
||||
|
||||
pte = pte_offset_kernel(pmd, addr);
|
||||
if (pte_none(*pte))
|
||||
continue;
|
||||
get_page_bootmem(section_nr, pte_page(*pte),
|
||||
SECTION_INFO);
|
||||
} else {
|
||||
next = pmd_addr_end(addr, end);
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (pmd_none(*pmd))
|
||||
continue;
|
||||
|
||||
nr_pages = 1 << (get_order(PMD_SIZE));
|
||||
page = pmd_page(*pmd);
|
||||
while (nr_pages--)
|
||||
get_page_bootmem(section_nr, page++,
|
||||
SECTION_INFO);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void __meminit vmemmap_populate_print_last(void)
|
||||
{
|
||||
if (p_start) {
|
||||
|
|
|
@ -56,7 +56,7 @@ early_param("numa", numa_setup);
|
|||
/*
|
||||
* apicid, cpu, node mappings
|
||||
*/
|
||||
s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
|
||||
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
|
||||
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
|
||||
};
|
||||
|
||||
|
@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
|
|||
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
|
||||
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
|
||||
|
||||
void __cpuinit numa_set_node(int cpu, int node)
|
||||
void numa_set_node(int cpu, int node)
|
||||
{
|
||||
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
|
||||
|
||||
|
@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node)
|
|||
set_cpu_numa_node(cpu, node);
|
||||
}
|
||||
|
||||
void __cpuinit numa_clear_node(int cpu)
|
||||
void numa_clear_node(int cpu)
|
||||
{
|
||||
numa_set_node(cpu, NUMA_NO_NODE);
|
||||
}
|
||||
|
@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
|
|||
* Allocate node data. Try node-local memory and then any node.
|
||||
* Never allocate in DMA zone.
|
||||
*/
|
||||
nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
|
||||
nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
|
||||
if (!nd_pa) {
|
||||
pr_err("Cannot find %zu bytes in node %d\n",
|
||||
nd_size, nid);
|
||||
pr_err("Cannot find %zu bytes in any node\n", nd_size);
|
||||
return;
|
||||
}
|
||||
nd = __va(nd_pa);
|
||||
|
@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void))
|
|||
for (i = 0; i < MAX_LOCAL_APIC; i++)
|
||||
set_apicid_to_node(i, NUMA_NO_NODE);
|
||||
|
||||
nodes_clear(numa_nodes_parsed);
|
||||
/*
|
||||
* Do not clear numa_nodes_parsed or zero numa_meminfo here, because
|
||||
* SRAT was parsed earlier in early_parse_srat().
|
||||
*/
|
||||
nodes_clear(node_possible_map);
|
||||
nodes_clear(node_online_map);
|
||||
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
|
||||
WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
|
||||
numa_reset_distance();
|
||||
|
||||
|
|
|
@ -529,21 +529,13 @@ out_unlock:
|
|||
return do_split;
|
||||
}
|
||||
|
||||
static int split_large_page(pte_t *kpte, unsigned long address)
|
||||
int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
|
||||
{
|
||||
unsigned long pfn, pfninc = 1;
|
||||
unsigned int i, level;
|
||||
pte_t *pbase, *tmp;
|
||||
pte_t *tmp;
|
||||
pgprot_t ref_prot;
|
||||
struct page *base;
|
||||
|
||||
if (!debug_pagealloc)
|
||||
spin_unlock(&cpa_lock);
|
||||
base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
|
||||
if (!debug_pagealloc)
|
||||
spin_lock(&cpa_lock);
|
||||
if (!base)
|
||||
return -ENOMEM;
|
||||
struct page *base = virt_to_page(pbase);
|
||||
|
||||
spin_lock(&pgd_lock);
|
||||
/*
|
||||
|
@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
|
|||
* up for us already:
|
||||
*/
|
||||
tmp = lookup_address(address, &level);
|
||||
if (tmp != kpte)
|
||||
goto out_unlock;
|
||||
if (tmp != kpte) {
|
||||
spin_unlock(&pgd_lock);
|
||||
return 1;
|
||||
}
|
||||
|
||||
pbase = (pte_t *)page_address(base);
|
||||
paravirt_alloc_pte(&init_mm, page_to_pfn(base));
|
||||
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
|
||||
/*
|
||||
|
@ -601,21 +594,31 @@ static int split_large_page(pte_t *kpte, unsigned long address)
|
|||
* going on.
|
||||
*/
|
||||
__flush_tlb_all();
|
||||
|
||||
base = NULL;
|
||||
|
||||
out_unlock:
|
||||
/*
|
||||
* If we dropped out via the lookup_address check under
|
||||
* pgd_lock then stick the page back into the pool:
|
||||
*/
|
||||
if (base)
|
||||
__free_page(base);
|
||||
spin_unlock(&pgd_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * split_large_page - allocate a page and split a large mapping with it.
 * @kpte:    entry of the large mapping
 * @address: virtual address covered by the mapping
 *
 * cpa_lock is dropped around the allocation unless debug_pagealloc is
 * set.  The new page is passed to __split_large_page(); when that call
 * returns non-zero the page is freed again.
 *
 * Returns -ENOMEM if the allocation fails, 0 otherwise.
 */
static int split_large_page(pte_t *kpte, unsigned long address)
{
	struct page *new_page;

	if (!debug_pagealloc)
		spin_unlock(&cpa_lock);
	new_page = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
	if (!debug_pagealloc)
		spin_lock(&cpa_lock);

	if (!new_page)
		return -ENOMEM;

	if (__split_large_page(kpte, address,
			       (pte_t *)page_address(new_page)))
		__free_page(new_page);

	return 0;
}
|
||||
|
||||
static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
|
||||
int primary)
|
||||
{
|
||||
|
|
|
@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;}
|
|||
static inline int save_add_info(void) {return 0;}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
static void __init
|
||||
handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
|
||||
{
|
||||
int overlap, i;
|
||||
unsigned long start_pfn, end_pfn;
|
||||
|
||||
start_pfn = PFN_DOWN(start);
|
||||
end_pfn = PFN_UP(end);
|
||||
|
||||
/*
|
||||
* For movablemem_map=acpi:
|
||||
*
|
||||
* SRAT: |_____| |_____| |_________| |_________| ......
|
||||
* node id: 0 1 1 2
|
||||
* hotpluggable: n y y n
|
||||
* movablemem_map: |_____| |_________|
|
||||
*
|
||||
* Using movablemem_map, we can prevent memblock from allocating memory
|
||||
* on ZONE_MOVABLE at boot time.
|
||||
*
|
||||
* Before parsing SRAT, memblock has already reserve some memory ranges
|
||||
* for other purposes, such as for kernel image. We cannot prevent
|
||||
* kernel from using these memory, so we need to exclude these memory
|
||||
* even if it is hotpluggable.
|
||||
* Furthermore, to ensure the kernel has enough memory to boot, we make
|
||||
* all the memory on the node which the kernel resides in
|
||||
* un-hotpluggable.
|
||||
*/
|
||||
if (hotpluggable && movablemem_map.acpi) {
|
||||
/* Exclude ranges reserved by memblock. */
|
||||
struct memblock_type *rgn = &memblock.reserved;
|
||||
|
||||
for (i = 0; i < rgn->cnt; i++) {
|
||||
if (end <= rgn->regions[i].base ||
|
||||
start >= rgn->regions[i].base +
|
||||
rgn->regions[i].size)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If the memory range overlaps the memory reserved by
|
||||
* memblock, then the kernel resides in this node.
|
||||
*/
|
||||
node_set(node, movablemem_map.numa_nodes_kernel);
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the kernel resides in this node, then the whole node
|
||||
* should not be hotpluggable.
|
||||
*/
|
||||
if (node_isset(node, movablemem_map.numa_nodes_kernel))
|
||||
goto out;
|
||||
|
||||
insert_movablemem_map(start_pfn, end_pfn);
|
||||
|
||||
/*
|
||||
* numa_nodes_hotplug nodemask represents which nodes are put
|
||||
* into movablemem_map.map[].
|
||||
*/
|
||||
node_set(node, movablemem_map.numa_nodes_hotplug);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* For movablemem_map=nn[KMG]@ss[KMG]:
|
||||
*
|
||||
* SRAT: |_____| |_____| |_________| |_________| ......
|
||||
* node id: 0 1 1 2
|
||||
* user specified: |__| |___|
|
||||
* movablemem_map: |___| |_________| |______| ......
|
||||
*
|
||||
* Using movablemem_map, we can prevent memblock from allocating memory
|
||||
* on ZONE_MOVABLE at boot time.
|
||||
*
|
||||
* NOTE: In this case, SRAT info will be ingored.
|
||||
*/
|
||||
overlap = movablemem_map_overlap(start_pfn, end_pfn);
|
||||
if (overlap >= 0) {
|
||||
/*
|
||||
* If part of this range is in movablemem_map, we need to
|
||||
* add the range after it to extend the range to the end
|
||||
* of the node, because from the min address specified to
|
||||
* the end of the node will be ZONE_MOVABLE.
|
||||
*/
|
||||
start_pfn = max(start_pfn,
|
||||
movablemem_map.map[overlap].start_pfn);
|
||||
insert_movablemem_map(start_pfn, end_pfn);
|
||||
|
||||
/*
|
||||
* Set the nodemask, so that if the address range on one node
|
||||
* is not continuse, we can add the subsequent ranges on the
|
||||
* same node into movablemem_map.
|
||||
*/
|
||||
node_set(node, movablemem_map.numa_nodes_hotplug);
|
||||
} else {
|
||||
if (node_isset(node, movablemem_map.numa_nodes_hotplug))
|
||||
/*
|
||||
* Insert the range if we already have movable ranges
|
||||
* on the same node.
|
||||
*/
|
||||
insert_movablemem_map(start_pfn, end_pfn);
|
||||
}
|
||||
out:
|
||||
return;
|
||||
}
|
||||
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
static inline void
|
||||
handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
|
||||
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
|
||||
int __init
|
||||
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
||||
{
|
||||
u64 start, end;
|
||||
u32 hotpluggable;
|
||||
int node, pxm;
|
||||
|
||||
if (srat_disabled())
|
||||
|
@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
|||
goto out_err_bad_srat;
|
||||
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
|
||||
goto out_err;
|
||||
if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
|
||||
hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
|
||||
if (hotpluggable && !save_add_info())
|
||||
goto out_err;
|
||||
|
||||
start = ma->base_address;
|
||||
|
@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
|
|||
|
||||
node_set(node, numa_nodes_parsed);
|
||||
|
||||
printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
|
||||
printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
|
||||
node, pxm,
|
||||
(unsigned long long) start, (unsigned long long) end - 1);
|
||||
(unsigned long long) start, (unsigned long long) end - 1,
|
||||
hotpluggable ? "Hot Pluggable": "");
|
||||
|
||||
handle_movablemem(node, start, end, hotpluggable);
|
||||
|
||||
return 0;
|
||||
out_err_bad_srat:
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <linux/mutex.h>
|
||||
#include <linux/idr.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/pm_runtime.h>
|
||||
|
||||
#include "blk.h"
|
||||
|
||||
|
@ -534,6 +535,14 @@ static void register_disk(struct gendisk *disk)
|
|||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* avoid probable deadlock caused by allocating memory with
|
||||
* GFP_KERNEL in runtime_resume callback of all its ancestor
|
||||
* devices
|
||||
*/
|
||||
pm_runtime_set_memalloc_noio(ddev, true);
|
||||
|
||||
disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
|
||||
disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
|
||||
|
||||
|
@ -663,6 +672,7 @@ void del_gendisk(struct gendisk *disk)
|
|||
disk->driverfs_dev = NULL;
|
||||
if (!sysfs_deprecated)
|
||||
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
|
||||
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
|
||||
device_del(disk_to_dev(disk));
|
||||
}
|
||||
EXPORT_SYMBOL(del_gendisk);
|
||||
|
|
|
@ -280,9 +280,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
|
|||
|
||||
static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
|
||||
{
|
||||
int result = 0;
|
||||
int result = 0, nid;
|
||||
struct acpi_memory_info *info, *n;
|
||||
|
||||
nid = acpi_get_node(mem_device->device->handle);
|
||||
|
||||
list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
|
||||
if (info->failed)
|
||||
/* The kernel does not use this memory block */
|
||||
|
@ -295,7 +297,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
|
|||
*/
|
||||
return -EBUSY;
|
||||
|
||||
result = remove_memory(info->start_addr, info->length);
|
||||
if (nid < 0)
|
||||
nid = memory_add_physaddr_to_nid(info->start_addr);
|
||||
result = remove_memory(nid, info->start_addr, info->length);
|
||||
if (result)
|
||||
return result;
|
||||
|
||||
|
|
|
@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id,
|
|||
handler, max_entries);
|
||||
}
|
||||
|
||||
int __init acpi_numa_init(void)
|
||||
{
|
||||
int cnt = 0;
|
||||
static int srat_mem_cnt;
|
||||
|
||||
void __init early_parse_srat(void)
|
||||
{
|
||||
/*
|
||||
* Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
|
||||
* SRAT cpu entries could have different order with that in MADT.
|
||||
|
@ -295,21 +295,24 @@ int __init acpi_numa_init(void)
|
|||
/* SRAT: Static Resource Affinity Table */
|
||||
if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
|
||||
acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
|
||||
acpi_parse_x2apic_affinity, 0);
|
||||
acpi_parse_x2apic_affinity, 0);
|
||||
acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
|
||||
acpi_parse_processor_affinity, 0);
|
||||
cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
|
||||
acpi_parse_memory_affinity,
|
||||
NR_NODE_MEMBLKS);
|
||||
acpi_parse_processor_affinity, 0);
|
||||
srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
|
||||
acpi_parse_memory_affinity,
|
||||
NR_NODE_MEMBLKS);
|
||||
}
|
||||
}
|
||||
|
||||
int __init acpi_numa_init(void)
|
||||
{
|
||||
/* SLIT: System Locality Information Table */
|
||||
acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
|
||||
|
||||
acpi_numa_arch_fixup();
|
||||
|
||||
if (cnt < 0)
|
||||
return cnt;
|
||||
if (srat_mem_cnt < 0)
|
||||
return srat_mem_cnt;
|
||||
else if (!parsed_numa_memblks)
|
||||
return -ENOENT;
|
||||
return 0;
|
||||
|
|
|
@ -45,6 +45,7 @@
|
|||
#include <linux/cpuidle.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/acpi.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
|
||||
#include <asm/io.h>
|
||||
#include <asm/cpu.h>
|
||||
|
@ -641,6 +642,7 @@ static int acpi_processor_remove(struct acpi_device *device)
|
|||
|
||||
per_cpu(processors, pr->id) = NULL;
|
||||
per_cpu(processor_device_array, pr->id) = NULL;
|
||||
try_offline_node(cpu_to_node(pr->id));
|
||||
|
||||
free:
|
||||
free_cpumask_var(pr->throttling.shared_cpu_map);
|
||||
|
|
|
@ -693,6 +693,12 @@ int offline_memory_block(struct memory_block *mem)
|
|||
return ret;
|
||||
}
|
||||
|
||||
/* return true if the memory block is offlined, otherwise, return false */
|
||||
bool is_memblock_offlined(struct memory_block *mem)
|
||||
{
|
||||
return mem->state == MEM_OFFLINE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize the sysfs support for memory devices...
|
||||
*/
|
||||
|
|
|
@ -124,6 +124,76 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
|
||||
|
||||
static int dev_memalloc_noio(struct device *dev, void *data)
|
||||
{
|
||||
return dev->power.memalloc_noio;
|
||||
}
|
||||
|
||||
/*
|
||||
* pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
|
||||
* @dev: Device to handle.
|
||||
* @enable: True for setting the flag and False for clearing the flag.
|
||||
*
|
||||
* Set the flag for all devices in the path from the device to the
|
||||
* root device in the device tree if @enable is true, otherwise clear
|
||||
* the flag for devices in the path whose siblings don't set the flag.
|
||||
*
|
||||
* The function should only be called by block device, or network
|
||||
* device driver for solving the deadlock problem during runtime
|
||||
* resume/suspend:
|
||||
*
|
||||
* If memory allocation with GFP_KERNEL is called inside runtime
|
||||
* resume/suspend callback of any one of its ancestors(or the
|
||||
* block device itself), the deadlock may be triggered inside the
|
||||
* memory allocation since it might not complete until the block
|
||||
* device becomes active and the involed page I/O finishes. The
|
||||
* situation is pointed out first by Alan Stern. Network device
|
||||
* are involved in iSCSI kind of situation.
|
||||
*
|
||||
* The lock of dev_hotplug_mutex is held in the function for handling
|
||||
* hotplug race because pm_runtime_set_memalloc_noio() may be called
|
||||
* in async probe().
|
||||
*
|
||||
* The function should be called between device_add() and device_del()
|
||||
* on the affected device(block/network device).
|
||||
*/
|
||||
void pm_runtime_set_memalloc_noio(struct device *dev, bool enable)
|
||||
{
|
||||
static DEFINE_MUTEX(dev_hotplug_mutex);
|
||||
|
||||
mutex_lock(&dev_hotplug_mutex);
|
||||
for (;;) {
|
||||
bool enabled;
|
||||
|
||||
/* hold power lock since bitfield is not SMP-safe. */
|
||||
spin_lock_irq(&dev->power.lock);
|
||||
enabled = dev->power.memalloc_noio;
|
||||
dev->power.memalloc_noio = enable;
|
||||
spin_unlock_irq(&dev->power.lock);
|
||||
|
||||
/*
|
||||
* not need to enable ancestors any more if the device
|
||||
* has been enabled.
|
||||
*/
|
||||
if (enabled && enable)
|
||||
break;
|
||||
|
||||
dev = dev->parent;
|
||||
|
||||
/*
|
||||
* clear flag of the parent device only if all the
|
||||
* children don't set the flag because ancestor's
|
||||
* flag was set by any one of the descendants.
|
||||
*/
|
||||
if (!dev || (!enable &&
|
||||
device_for_each_child(dev, NULL,
|
||||
dev_memalloc_noio)))
|
||||
break;
|
||||
}
|
||||
mutex_unlock(&dev_hotplug_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio);
|
||||
|
||||
/**
|
||||
* rpm_check_suspend_allowed - Test whether a device may be suspended.
|
||||
* @dev: Device to test.
|
||||
|
@ -278,7 +348,24 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev)
|
|||
if (!cb)
|
||||
return -ENOSYS;
|
||||
|
||||
retval = __rpm_callback(cb, dev);
|
||||
if (dev->power.memalloc_noio) {
|
||||
unsigned int noio_flag;
|
||||
|
||||
/*
|
||||
* Deadlock might be caused if memory allocation with
|
||||
* GFP_KERNEL happens inside runtime_suspend and
|
||||
* runtime_resume callbacks of one block device's
|
||||
* ancestor or the block device itself. Network
|
||||
* device might be thought of as part of iSCSI block
|
||||
* device, so network device and its ancestor should
|
||||
* be marked as memalloc_noio too.
|
||||
*/
|
||||
noio_flag = memalloc_noio_save();
|
||||
retval = __rpm_callback(cb, dev);
|
||||
memalloc_noio_restore(noio_flag);
|
||||
} else {
|
||||
retval = __rpm_callback(cb, dev);
|
||||
}
|
||||
|
||||
dev->power.runtime_error = retval;
|
||||
return retval != -EACCES ? retval : -EIO;
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include <linux/types.h>
|
||||
#include <linux/bootmem.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/mm.h>
|
||||
|
||||
/*
|
||||
* Data types ------------------------------------------------------------------
|
||||
|
@ -52,6 +53,9 @@ static ssize_t start_show(struct firmware_map_entry *entry, char *buf);
|
|||
static ssize_t end_show(struct firmware_map_entry *entry, char *buf);
|
||||
static ssize_t type_show(struct firmware_map_entry *entry, char *buf);
|
||||
|
||||
static struct firmware_map_entry * __meminit
|
||||
firmware_map_find_entry(u64 start, u64 end, const char *type);
|
||||
|
||||
/*
|
||||
* Static data -----------------------------------------------------------------
|
||||
*/
|
||||
|
@ -79,7 +83,52 @@ static const struct sysfs_ops memmap_attr_ops = {
|
|||
.show = memmap_attr_show,
|
||||
};
|
||||
|
||||
static struct kobj_type memmap_ktype = {
|
||||
/* Firmware memory map entries. */
|
||||
static LIST_HEAD(map_entries);
|
||||
static DEFINE_SPINLOCK(map_entries_lock);
|
||||
|
||||
/*
|
||||
* For memory hotplug, there is no way to free memory map entries allocated
|
||||
* by boot mem after the system is up. So when we hot-remove memory whose
|
||||
* map entry is allocated by bootmem, we need to remember the storage and
|
||||
* reuse it when the memory is hot-added again.
|
||||
*/
|
||||
static LIST_HEAD(map_entries_bootmem);
|
||||
static DEFINE_SPINLOCK(map_entries_bootmem_lock);
|
||||
|
||||
|
||||
static inline struct firmware_map_entry *
|
||||
to_memmap_entry(struct kobject *kobj)
|
||||
{
|
||||
return container_of(kobj, struct firmware_map_entry, kobj);
|
||||
}
|
||||
|
||||
/*
 * kobject release callback for firmware memmap entries (installed as
 * memmap_ktype.release).
 *
 * An entry whose backing page is PageReserved() was allocated by bootmem
 * and cannot be freed once the system is up.  Such an entry is parked on
 * map_entries_bootmem so firmware_map_add_hotplug() can reuse the storage
 * when the memory is hot-added again.  Any other entry is kfree()d.
 */
static void __meminit release_firmware_map_entry(struct kobject *kobj)
{
	struct firmware_map_entry *entry = to_memmap_entry(kobj);

	if (PageReserved(virt_to_page(entry))) {
		/*
		 * firmware_map_remove() unlinks the entry from map_entries
		 * (under map_entries_lock) before it drops the kobject
		 * reference, so the entry is no longer on that list when we
		 * get here.  Do not gate the list_add() on finding the entry
		 * in map_entries: that lookup cannot find this entry, so the
		 * bootmem storage would never be remembered, and it would
		 * walk map_entries without holding map_entries_lock.
		 */
		spin_lock(&map_entries_bootmem_lock);
		list_add(&entry->list, &map_entries_bootmem);
		spin_unlock(&map_entries_bootmem_lock);

		return;
	}

	kfree(entry);
}
|
||||
|
||||
static struct kobj_type __refdata memmap_ktype = {
|
||||
.release = release_firmware_map_entry,
|
||||
.sysfs_ops = &memmap_attr_ops,
|
||||
.default_attrs = def_attrs,
|
||||
};
|
||||
|
@ -88,13 +137,6 @@ static struct kobj_type memmap_ktype = {
|
|||
* Registration functions ------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
|
||||
* Firmware memory map entries. No locking is needed because the
|
||||
* firmware_map_add() and firmware_map_add_early() functions are called
|
||||
* in firmware initialisation code in one single thread of execution.
|
||||
*/
|
||||
static LIST_HEAD(map_entries);
|
||||
|
||||
/**
|
||||
* firmware_map_add_entry() - Does the real work to add a firmware memmap entry.
|
||||
* @start: Start of the memory range.
|
||||
|
@ -118,11 +160,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
|
|||
INIT_LIST_HEAD(&entry->list);
|
||||
kobject_init(&entry->kobj, &memmap_ktype);
|
||||
|
||||
spin_lock(&map_entries_lock);
|
||||
list_add_tail(&entry->list, &map_entries);
|
||||
spin_unlock(&map_entries_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* firmware_map_remove_entry() - Does the real work to remove a firmware
|
||||
* memmap entry.
|
||||
* @entry: removed entry.
|
||||
*
|
||||
* The caller must hold map_entries_lock, and release it properly.
|
||||
**/
|
||||
static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
|
||||
{
|
||||
list_del(&entry->list);
|
||||
}
|
||||
|
||||
/*
|
||||
* Add memmap entry on sysfs
|
||||
*/
|
||||
|
@ -144,6 +200,78 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove memmap entry on sysfs
|
||||
*/
|
||||
static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
|
||||
{
|
||||
kobject_put(&entry->kobj);
|
||||
}
|
||||
|
||||
/*
|
||||
* firmware_map_find_entry_in_list() - Search memmap entry in a given list.
|
||||
* @start: Start of the memory range.
|
||||
* @end: End of the memory range (exclusive).
|
||||
* @type: Type of the memory range.
|
||||
* @list: In which to find the entry.
|
||||
*
|
||||
* This function is to find the memmap entey of a given memory range in a
|
||||
* given list. The caller must hold map_entries_lock, and must not release
|
||||
* the lock until the processing of the returned entry has completed.
|
||||
*
|
||||
* Return: Pointer to the entry to be found on success, or NULL on failure.
|
||||
*/
|
||||
static struct firmware_map_entry * __meminit
|
||||
firmware_map_find_entry_in_list(u64 start, u64 end, const char *type,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct firmware_map_entry *entry;
|
||||
|
||||
list_for_each_entry(entry, list, list)
|
||||
if ((entry->start == start) && (entry->end == end) &&
|
||||
(!strcmp(entry->type, type))) {
|
||||
return entry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* firmware_map_find_entry() - Search memmap entry in map_entries.
|
||||
* @start: Start of the memory range.
|
||||
* @end: End of the memory range (exclusive).
|
||||
* @type: Type of the memory range.
|
||||
*
|
||||
* This function is to find the memmap entey of a given memory range.
|
||||
* The caller must hold map_entries_lock, and must not release the lock
|
||||
* until the processing of the returned entry has completed.
|
||||
*
|
||||
* Return: Pointer to the entry to be found on success, or NULL on failure.
|
||||
*/
|
||||
static struct firmware_map_entry * __meminit
|
||||
firmware_map_find_entry(u64 start, u64 end, const char *type)
|
||||
{
|
||||
return firmware_map_find_entry_in_list(start, end, type, &map_entries);
|
||||
}
|
||||
|
||||
/*
|
||||
* firmware_map_find_entry_bootmem() - Search memmap entry in map_entries_bootmem.
|
||||
* @start: Start of the memory range.
|
||||
* @end: End of the memory range (exclusive).
|
||||
* @type: Type of the memory range.
|
||||
*
|
||||
* This function is similar to firmware_map_find_entry except that it find the
|
||||
* given entry in map_entries_bootmem.
|
||||
*
|
||||
* Return: Pointer to the entry to be found on success, or NULL on failure.
|
||||
*/
|
||||
static struct firmware_map_entry * __meminit
|
||||
firmware_map_find_entry_bootmem(u64 start, u64 end, const char *type)
|
||||
{
|
||||
return firmware_map_find_entry_in_list(start, end, type,
|
||||
&map_entries_bootmem);
|
||||
}
|
||||
|
||||
/**
|
||||
* firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
|
||||
* memory hotplug.
|
||||
|
@ -161,9 +289,19 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type)
|
|||
{
|
||||
struct firmware_map_entry *entry;
|
||||
|
||||
entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC);
|
||||
if (!entry)
|
||||
return -ENOMEM;
|
||||
entry = firmware_map_find_entry_bootmem(start, end, type);
|
||||
if (!entry) {
|
||||
entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC);
|
||||
if (!entry)
|
||||
return -ENOMEM;
|
||||
} else {
|
||||
/* Reuse storage allocated by bootmem. */
|
||||
spin_lock(&map_entries_bootmem_lock);
|
||||
list_del(&entry->list);
|
||||
spin_unlock(&map_entries_bootmem_lock);
|
||||
|
||||
memset(entry, 0, sizeof(*entry));
|
||||
}
|
||||
|
||||
firmware_map_add_entry(start, end, type, entry);
|
||||
/* create the memmap entry */
|
||||
|
@ -196,6 +334,36 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
|
|||
return firmware_map_add_entry(start, end, type, entry);
|
||||
}
|
||||
|
||||
/**
|
||||
* firmware_map_remove() - remove a firmware mapping entry
|
||||
* @start: Start of the memory range.
|
||||
* @end: End of the memory range.
|
||||
* @type: Type of the memory range.
|
||||
*
|
||||
* removes a firmware mapping entry.
|
||||
*
|
||||
* Returns 0 on success, or -EINVAL if no entry.
|
||||
**/
|
||||
int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
|
||||
{
|
||||
struct firmware_map_entry *entry;
|
||||
|
||||
spin_lock(&map_entries_lock);
|
||||
entry = firmware_map_find_entry(start, end - 1, type);
|
||||
if (!entry) {
|
||||
spin_unlock(&map_entries_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
firmware_map_remove_entry(entry);
|
||||
spin_unlock(&map_entries_lock);
|
||||
|
||||
/* remove the memmap entry */
|
||||
remove_sysfs_fw_map_entry(entry);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sysfs functions -------------------------------------------------------------
|
||||
*/
|
||||
|
@ -217,8 +385,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf)
|
|||
return snprintf(buf, PAGE_SIZE, "%s\n", entry->type);
|
||||
}
|
||||
|
||||
#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr)
|
||||
#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj)
|
||||
static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr)
|
||||
{
|
||||
return container_of(attr, struct memmap_attribute, attr);
|
||||
}
|
||||
|
||||
static ssize_t memmap_attr_show(struct kobject *kobj,
|
||||
struct attribute *attr, char *buf)
|
||||
|
|
|
@ -25,8 +25,8 @@ struct shadow_info {
|
|||
/*
|
||||
* It would be nice if we scaled with the size of transaction.
|
||||
*/
|
||||
#define HASH_SIZE 256
|
||||
#define HASH_MASK (HASH_SIZE - 1)
|
||||
#define DM_HASH_SIZE 256
|
||||
#define DM_HASH_MASK (DM_HASH_SIZE - 1)
|
||||
|
||||
struct dm_transaction_manager {
|
||||
int is_clone;
|
||||
|
@ -36,7 +36,7 @@ struct dm_transaction_manager {
|
|||
struct dm_space_map *sm;
|
||||
|
||||
spinlock_t lock;
|
||||
struct hlist_head buckets[HASH_SIZE];
|
||||
struct hlist_head buckets[DM_HASH_SIZE];
|
||||
};
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
@ -44,7 +44,7 @@ struct dm_transaction_manager {
|
|||
static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
|
||||
{
|
||||
int r = 0;
|
||||
unsigned bucket = dm_hash_block(b, HASH_MASK);
|
||||
unsigned bucket = dm_hash_block(b, DM_HASH_MASK);
|
||||
struct shadow_info *si;
|
||||
struct hlist_node *n;
|
||||
|
||||
|
@ -71,7 +71,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
|
|||
si = kmalloc(sizeof(*si), GFP_NOIO);
|
||||
if (si) {
|
||||
si->where = b;
|
||||
bucket = dm_hash_block(b, HASH_MASK);
|
||||
bucket = dm_hash_block(b, DM_HASH_MASK);
|
||||
spin_lock(&tm->lock);
|
||||
hlist_add_head(&si->hlist, tm->buckets + bucket);
|
||||
spin_unlock(&tm->lock);
|
||||
|
@ -86,7 +86,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm)
|
|||
int i;
|
||||
|
||||
spin_lock(&tm->lock);
|
||||
for (i = 0; i < HASH_SIZE; i++) {
|
||||
for (i = 0; i < DM_HASH_SIZE; i++) {
|
||||
bucket = tm->buckets + i;
|
||||
hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
|
||||
kfree(si);
|
||||
|
@ -115,7 +115,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
|
|||
tm->sm = sm;
|
||||
|
||||
spin_lock_init(&tm->lock);
|
||||
for (i = 0; i < HASH_SIZE; i++)
|
||||
for (i = 0; i < DM_HASH_SIZE; i++)
|
||||
INIT_HLIST_HEAD(tm->buckets + i);
|
||||
|
||||
return tm;
|
||||
|
|
|
@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
|
|||
else
|
||||
zbud_pers_pageframes--;
|
||||
zbudpage_spin_unlock(zbudpage);
|
||||
reset_page_mapcount(page);
|
||||
page_mapcount_reset(page);
|
||||
init_page_count(page);
|
||||
page->index = 0;
|
||||
return page;
|
||||
|
|
|
@ -472,7 +472,7 @@ static void reset_page(struct page *page)
|
|||
set_page_private(page, 0);
|
||||
page->mapping = NULL;
|
||||
page->freelist = NULL;
|
||||
reset_page_mapcount(page);
|
||||
page_mapcount_reset(page);
|
||||
}
|
||||
|
||||
static void free_zspage(struct page *first_page)
|
||||
|
|
|
@ -5177,6 +5177,7 @@ int usb_reset_device(struct usb_device *udev)
|
|||
{
|
||||
int ret;
|
||||
int i;
|
||||
unsigned int noio_flag;
|
||||
struct usb_host_config *config = udev->actconfig;
|
||||
|
||||
if (udev->state == USB_STATE_NOTATTACHED ||
|
||||
|
@ -5186,6 +5187,17 @@ int usb_reset_device(struct usb_device *udev)
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Don't allocate memory with GFP_KERNEL in current
|
||||
* context to avoid possible deadlock if usb mass
|
||||
* storage interface or usbnet interface(iSCSI case)
|
||||
* is included in current configuration. The easiest
|
||||
* approach is to do it for every device reset,
|
||||
* because the device 'memalloc_noio' flag may have
|
||||
* not been set before resetting the usb device.
|
||||
*/
|
||||
noio_flag = memalloc_noio_save();
|
||||
|
||||
/* Prevent autosuspend during the reset */
|
||||
usb_autoresume_device(udev);
|
||||
|
||||
|
@ -5230,6 +5242,7 @@ int usb_reset_device(struct usb_device *udev)
|
|||
}
|
||||
|
||||
usb_autosuspend_device(udev);
|
||||
memalloc_noio_restore(noio_flag);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(usb_reset_device);
|
||||
|
|
7
fs/aio.c
7
fs/aio.c
|
@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx)
|
|||
struct aio_ring *ring;
|
||||
struct aio_ring_info *info = &ctx->ring_info;
|
||||
unsigned nr_events = ctx->max_reqs;
|
||||
unsigned long size;
|
||||
unsigned long size, populate;
|
||||
int nr_pages;
|
||||
|
||||
/* Compensate for the ring buffer's head/tail overlap entry */
|
||||
|
@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx)
|
|||
down_write(&ctx->mm->mmap_sem);
|
||||
info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
|
||||
PROT_READ|PROT_WRITE,
|
||||
MAP_ANONYMOUS|MAP_PRIVATE, 0);
|
||||
MAP_ANONYMOUS|MAP_PRIVATE, 0,
|
||||
&populate);
|
||||
if (IS_ERR((void *)info->mmap_base)) {
|
||||
up_write(&ctx->mm->mmap_sem);
|
||||
info->mmap_size = 0;
|
||||
|
@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx)
|
|||
aio_free_ring(ctx);
|
||||
return -EAGAIN;
|
||||
}
|
||||
if (populate)
|
||||
mm_populate(info->mmap_base, populate);
|
||||
|
||||
ctx->user_id = info->mmap_base;
|
||||
|
||||
|
|
|
@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
|
|||
* Once the number of bh's in the machine exceeds this level, we start
|
||||
* stripping them in writeback.
|
||||
*/
|
||||
static int max_buffer_heads;
|
||||
static unsigned long max_buffer_heads;
|
||||
|
||||
int buffer_heads_over_limit;
|
||||
|
||||
|
@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read);
|
|||
|
||||
void __init buffer_init(void)
|
||||
{
|
||||
int nrpages;
|
||||
unsigned long nrpages;
|
||||
|
||||
bh_cachep = kmem_cache_create("buffer_head",
|
||||
sizeof(struct buffer_head), 0,
|
||||
|
|
|
@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
|
|||
}
|
||||
|
||||
static int num_delegations;
|
||||
unsigned int max_delegations;
|
||||
unsigned long max_delegations;
|
||||
|
||||
/*
|
||||
* Open owner state (share locks)
|
||||
|
@ -700,8 +700,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
|
|||
num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
|
||||
|
||||
spin_lock(&nfsd_drc_lock);
|
||||
avail = min_t(int, NFSD_MAX_MEM_PER_SESSION,
|
||||
nfsd_drc_max_mem - nfsd_drc_mem_used);
|
||||
avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
|
||||
nfsd_drc_max_mem - nfsd_drc_mem_used);
|
||||
num = min_t(int, num, avail / slotsize);
|
||||
nfsd_drc_mem_used += num * slotsize;
|
||||
spin_unlock(&nfsd_drc_lock);
|
||||
|
|
|
@ -56,8 +56,8 @@ extern struct svc_version nfsd_version2, nfsd_version3,
|
|||
extern u32 nfsd_supported_minorversion;
|
||||
extern struct mutex nfsd_mutex;
|
||||
extern spinlock_t nfsd_drc_lock;
|
||||
extern unsigned int nfsd_drc_max_mem;
|
||||
extern unsigned int nfsd_drc_mem_used;
|
||||
extern unsigned long nfsd_drc_max_mem;
|
||||
extern unsigned long nfsd_drc_mem_used;
|
||||
|
||||
extern const struct seq_operations nfs_exports_op;
|
||||
|
||||
|
@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
|
|||
* NFSv4 State
|
||||
*/
|
||||
#ifdef CONFIG_NFSD_V4
|
||||
extern unsigned int max_delegations;
|
||||
extern unsigned long max_delegations;
|
||||
void nfs4_state_init(void);
|
||||
int nfsd4_init_slabs(void);
|
||||
void nfsd4_free_slabs(void);
|
||||
|
|
|
@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex);
|
|||
* nfsd_drc_mem_used tracks the current version 4.1 DRC memory usage.
|
||||
*/
|
||||
spinlock_t nfsd_drc_lock;
|
||||
unsigned int nfsd_drc_max_mem;
|
||||
unsigned int nfsd_drc_mem_used;
|
||||
unsigned long nfsd_drc_max_mem;
|
||||
unsigned long nfsd_drc_mem_used;
|
||||
|
||||
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
|
||||
static struct svc_stat nfsd_acl_svcstats;
|
||||
|
@ -342,7 +342,7 @@ static void set_max_drc(void)
|
|||
>> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
|
||||
nfsd_drc_mem_used = 0;
|
||||
spin_lock_init(&nfsd_drc_lock);
|
||||
dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
|
||||
dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
|
||||
}
|
||||
|
||||
static int nfsd_get_default_max_blksize(void)
|
||||
|
|
|
@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
|||
* sysctl_overcommit_ratio / 100) + total_swap_pages;
|
||||
|
||||
cached = global_page_state(NR_FILE_PAGES) -
|
||||
total_swapcache_pages - i.bufferram;
|
||||
total_swapcache_pages() - i.bufferram;
|
||||
if (cached < 0)
|
||||
cached = 0;
|
||||
|
||||
|
@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
|||
K(i.freeram),
|
||||
K(i.bufferram),
|
||||
K(cached),
|
||||
K(total_swapcache_pages),
|
||||
K(total_swapcache_pages()),
|
||||
K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
|
||||
K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
|
||||
K(pages[LRU_ACTIVE_ANON]),
|
||||
|
@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
|||
vmi.used >> 10,
|
||||
vmi.largest_chunk >> 10
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
|
||||
,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
|
||||
#endif
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
|
||||
|
|
|
@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev,
|
|||
|
||||
#endif /* !CONFIG_ACPI */
|
||||
|
||||
#ifdef CONFIG_ACPI_NUMA
|
||||
void __init early_parse_srat(void);
|
||||
#else
|
||||
static inline void early_parse_srat(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ACPI
|
||||
void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
|
||||
u32 pm1a_ctrl, u32 pm1b_ctrl));
|
||||
|
|
|
@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
|
|||
unsigned long size);
|
||||
extern void free_bootmem(unsigned long physaddr, unsigned long size);
|
||||
extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
|
||||
extern void __free_pages_bootmem(struct page *page, unsigned int order);
|
||||
|
||||
/*
|
||||
* Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
|
||||
|
|
|
@ -23,7 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
|
|||
extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
||||
int order, gfp_t gfp_mask, nodemask_t *mask,
|
||||
bool sync, bool *contended);
|
||||
extern int compact_pgdat(pg_data_t *pgdat, int order);
|
||||
extern void compact_pgdat(pg_data_t *pgdat, int order);
|
||||
extern void reset_isolation_suitable(pg_data_t *pgdat);
|
||||
extern unsigned long compaction_suitable(struct zone *zone, int order);
|
||||
|
||||
|
@ -80,9 +80,8 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
|||
return COMPACT_CONTINUE;
|
||||
}
|
||||
|
||||
static inline int compact_pgdat(pg_data_t *pgdat, int order)
|
||||
static inline void compact_pgdat(pg_data_t *pgdat, int order)
|
||||
{
|
||||
return COMPACT_CONTINUE;
|
||||
}
|
||||
|
||||
static inline void reset_isolation_suitable(pg_data_t *pgdat)
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
int firmware_map_add_early(u64 start, u64 end, const char *type);
|
||||
int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
|
||||
int firmware_map_remove(u64 start, u64 end, const char *type);
|
||||
|
||||
#else /* CONFIG_FIRMWARE_MEMMAP */
|
||||
|
||||
|
@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline int firmware_map_remove(u64 start, u64 end, const char *type)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_FIRMWARE_MEMMAP */
|
||||
|
||||
#endif /* _LINUX_FIRMWARE_MAP_H */
|
||||
|
|
|
@ -219,12 +219,6 @@ static inline void zero_user(struct page *page,
|
|||
zero_user_segments(page, start, start + size, 0, 0);
|
||||
}
|
||||
|
||||
static inline void __deprecated memclear_highpage_flush(struct page *page,
|
||||
unsigned int offset, unsigned int size)
|
||||
{
|
||||
zero_user(page, offset, size);
|
||||
}
|
||||
|
||||
#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
|
||||
|
||||
static inline void copy_user_highpage(struct page *to, struct page *from,
|
||||
|
|
|
@ -113,7 +113,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
|
|||
do { \
|
||||
pmd_t *____pmd = (__pmd); \
|
||||
anon_vma_lock_write(__anon_vma); \
|
||||
anon_vma_unlock(__anon_vma); \
|
||||
anon_vma_unlock_write(__anon_vma); \
|
||||
BUG_ON(pmd_trans_splitting(*____pmd) || \
|
||||
pmd_trans_huge(*____pmd)); \
|
||||
} while (0)
|
||||
|
|
|
@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
|
|||
#endif
|
||||
|
||||
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
|
||||
int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
|
||||
struct page **, struct vm_area_struct **,
|
||||
unsigned long *, int *, int, unsigned int flags);
|
||||
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
|
||||
struct page **, struct vm_area_struct **,
|
||||
unsigned long *, unsigned long *, long, unsigned int);
|
||||
void unmap_hugepage_range(struct vm_area_struct *,
|
||||
unsigned long, unsigned long, struct page *);
|
||||
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
|
||||
|
|
|
@ -16,9 +16,6 @@
|
|||
struct stable_node;
|
||||
struct mem_cgroup;
|
||||
|
||||
struct page *ksm_does_need_to_copy(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address);
|
||||
|
||||
#ifdef CONFIG_KSM
|
||||
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, int advice, unsigned long *vm_flags);
|
||||
|
@ -73,15 +70,8 @@ static inline void set_page_stable_node(struct page *page,
|
|||
* We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
|
||||
* but what if the vma was unmerged while the page was swapped out?
|
||||
*/
|
||||
static inline int ksm_might_need_to_copy(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
{
|
||||
struct anon_vma *anon_vma = page_anon_vma(page);
|
||||
|
||||
return anon_vma &&
|
||||
(anon_vma->root != vma->anon_vma->root ||
|
||||
page->index != linear_page_index(vma, address));
|
||||
}
|
||||
struct page *ksm_might_need_to_copy(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address);
|
||||
|
||||
int page_referenced_ksm(struct page *page,
|
||||
struct mem_cgroup *memcg, unsigned long *vm_flags);
|
||||
|
@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline int ksm_might_need_to_copy(struct page *page,
|
||||
static inline struct page *ksm_might_need_to_copy(struct page *page,
|
||||
struct vm_area_struct *vma, unsigned long address)
|
||||
{
|
||||
return 0;
|
||||
return page;
|
||||
}
|
||||
|
||||
static inline int page_referenced_ksm(struct page *page,
|
||||
|
|
|
@ -42,6 +42,7 @@ struct memblock {
|
|||
|
||||
extern struct memblock memblock;
|
||||
extern int memblock_debug;
|
||||
extern struct movablemem_map movablemem_map;
|
||||
|
||||
#define memblock_dbg(fmt, ...) \
|
||||
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
|
||||
|
@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
|
|||
void memblock_trim_memory(phys_addr_t align);
|
||||
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
|
||||
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
|
||||
unsigned long *out_end_pfn, int *out_nid);
|
||||
|
||||
|
|
|
@ -116,7 +116,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
|
|||
* For memory reclaim.
|
||||
*/
|
||||
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
|
||||
int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec);
|
||||
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
|
||||
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
|
||||
void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
|
||||
|
@ -321,12 +320,6 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
|||
return 1;
|
||||
}
|
||||
|
||||
static inline int
|
||||
mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline unsigned long
|
||||
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
|
||||
{
|
||||
|
|
|
@ -96,6 +96,7 @@ extern void __online_page_free(struct page *page);
|
|||
|
||||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
extern bool is_pageblock_removable_nolock(struct page *page);
|
||||
extern int arch_remove_memory(u64 start, u64 size);
|
||||
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
||||
|
||||
/* reasonably generic interface to expand the physical pages in a zone */
|
||||
|
@ -173,17 +174,16 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
|
|||
#endif /* CONFIG_NUMA */
|
||||
#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
|
||||
|
||||
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
||||
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
|
||||
extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
|
||||
#else
|
||||
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
|
||||
{
|
||||
}
|
||||
static inline void put_page_bootmem(struct page *page)
|
||||
{
|
||||
}
|
||||
#else
|
||||
extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
|
||||
extern void put_page_bootmem(struct page *page);
|
||||
#endif
|
||||
extern void put_page_bootmem(struct page *page);
|
||||
/*
 * Declaration only; the definition is outside this header.
 * NOTE(review): the first parameter was spelled "ingo", which looks like a
 * typo for "info" -- confirm against the definition.
 */
extern void get_page_bootmem(unsigned long info, struct page *page,
			     unsigned long type);
|
||||
|
||||
/*
|
||||
* Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
|
||||
|
@ -233,6 +233,7 @@ static inline void unlock_memory_hotplug(void) {}
|
|||
#ifdef CONFIG_MEMORY_HOTREMOVE
|
||||
|
||||
extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
|
||||
extern void try_offline_node(int nid);
|
||||
|
||||
#else
|
||||
static inline int is_mem_section_removable(unsigned long pfn,
|
||||
|
@ -240,6 +241,8 @@ static inline int is_mem_section_removable(unsigned long pfn,
|
|||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void try_offline_node(int nid) {}
|
||||
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
||||
|
||||
extern int mem_online_node(int nid);
|
||||
|
@ -247,7 +250,8 @@ extern int add_memory(int nid, u64 start, u64 size);
|
|||
extern int arch_add_memory(int nid, u64 start, u64 size);
|
||||
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
|
||||
extern int offline_memory_block(struct memory_block *mem);
|
||||
extern int remove_memory(u64 start, u64 size);
|
||||
extern bool is_memblock_offlined(struct memory_block *mem);
|
||||
extern int remove_memory(int nid, u64 start, u64 size);
|
||||
extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
|
||||
int nr_pages);
|
||||
extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);
|
||||
|
|
|
@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l);
|
|||
extern int migrate_page(struct address_space *,
|
||||
struct page *, struct page *, enum migrate_mode);
|
||||
extern int migrate_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode, int reason);
|
||||
unsigned long private, enum migrate_mode mode, int reason);
|
||||
extern int migrate_huge_page(struct page *, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode);
|
||||
unsigned long private, enum migrate_mode mode);
|
||||
|
||||
extern int fail_migrate_page(struct address_space *,
|
||||
struct page *, struct page *);
|
||||
|
@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
|
|||
static inline void putback_lru_pages(struct list_head *l) {}
|
||||
static inline void putback_movable_pages(struct list_head *l) {}
|
||||
static inline int migrate_pages(struct list_head *l, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode, int reason) { return -ENOSYS; }
|
||||
unsigned long private, enum migrate_mode mode, int reason)
|
||||
{ return -ENOSYS; }
|
||||
static inline int migrate_huge_page(struct page *page, new_page_t x,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode) { return -ENOSYS; }
|
||||
unsigned long private, enum migrate_mode mode)
|
||||
{ return -ENOSYS; }
|
||||
|
||||
static inline int migrate_prep(void) { return -ENOSYS; }
|
||||
static inline int migrate_prep_local(void) { return -ENOSYS; }
|
||||
|
|
|
@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp);
|
|||
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
|
||||
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
|
||||
|
||||
#define VM_POPULATE 0x00001000
|
||||
#define VM_LOCKED 0x00002000
|
||||
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
|
||||
|
||||
|
@ -366,7 +367,7 @@ static inline struct page *compound_head(struct page *page)
|
|||
* both from it and to it can be tracked, using atomic_inc_and_test
|
||||
* and atomic_add_negative(-1).
|
||||
*/
|
||||
static inline void reset_page_mapcount(struct page *page)
|
||||
static inline void page_mapcount_reset(struct page *page)
|
||||
{
|
||||
atomic_set(&(page)->_mapcount, -1);
|
||||
}
|
||||
|
@ -580,50 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
|
|||
* sets it, so none of the operations on it need to be atomic.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* page->flags layout:
|
||||
*
|
||||
* There are three possibilities for how page->flags get
|
||||
* laid out. The first is for the normal case, without
|
||||
* sparsemem. The second is for sparsemem when there is
|
||||
* plenty of space for node and section. The last is when
|
||||
* we have run out of space and have to fall back to an
|
||||
* alternate (slower) way of determining the node.
|
||||
*
|
||||
* No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
|
||||
* classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
|
||||
* classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
|
||||
*/
|
||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#define SECTIONS_WIDTH SECTIONS_SHIFT
|
||||
#else
|
||||
#define SECTIONS_WIDTH 0
|
||||
#endif
|
||||
|
||||
#define ZONES_WIDTH ZONES_SHIFT
|
||||
|
||||
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define NODES_WIDTH NODES_SHIFT
|
||||
#else
|
||||
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
||||
#error "Vmemmap: No space for nodes field in page flags"
|
||||
#endif
|
||||
#define NODES_WIDTH 0
|
||||
#endif
|
||||
|
||||
/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
|
||||
/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
|
||||
#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
|
||||
#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
|
||||
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
||||
|
||||
/*
|
||||
* We are going to use the flags for the page to node mapping if its in
|
||||
* there. This includes the case where there is no node, so it is implicit.
|
||||
*/
|
||||
#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
|
||||
#define NODE_NOT_IN_PAGE_FLAGS
|
||||
#endif
|
||||
#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH)
|
||||
|
||||
/*
|
||||
* Define the bit shifts to access each section. For non-existent
|
||||
|
@ -633,6 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
|
|||
#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
|
||||
#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
|
||||
#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
|
||||
#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
|
||||
|
||||
/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
|
||||
#ifdef NODE_NOT_IN_PAGE_FLAGS
|
||||
|
@ -654,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
|
|||
#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
|
||||
#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
|
||||
#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
|
||||
#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1)
|
||||
#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
|
||||
|
||||
static inline enum zone_type page_zonenum(const struct page *page)
|
||||
|
@ -661,6 +625,10 @@ static inline enum zone_type page_zonenum(const struct page *page)
|
|||
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#define SECTION_IN_PAGE_FLAGS
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The identification function is only used by the buddy allocator for
|
||||
* determining if two pages could be buddies. We are not really
|
||||
|
@ -693,31 +661,48 @@ static inline int page_to_nid(const struct page *page)
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
static inline int page_xchg_last_nid(struct page *page, int nid)
|
||||
#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
|
||||
static inline int page_nid_xchg_last(struct page *page, int nid)
|
||||
{
|
||||
return xchg(&page->_last_nid, nid);
|
||||
}
|
||||
|
||||
static inline int page_last_nid(struct page *page)
|
||||
static inline int page_nid_last(struct page *page)
|
||||
{
|
||||
return page->_last_nid;
|
||||
}
|
||||
static inline void reset_page_last_nid(struct page *page)
|
||||
static inline void page_nid_reset_last(struct page *page)
|
||||
{
|
||||
page->_last_nid = -1;
|
||||
}
|
||||
#else
|
||||
static inline int page_xchg_last_nid(struct page *page, int nid)
|
||||
static inline int page_nid_last(struct page *page)
|
||||
{
|
||||
return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
|
||||
}
|
||||
|
||||
extern int page_nid_xchg_last(struct page *page, int nid);
|
||||
|
||||
static inline void page_nid_reset_last(struct page *page)
|
||||
{
|
||||
int nid = (1 << LAST_NID_SHIFT) - 1;
|
||||
|
||||
page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
|
||||
page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
|
||||
}
|
||||
#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
|
||||
#else
|
||||
static inline int page_nid_xchg_last(struct page *page, int nid)
|
||||
{
|
||||
return page_to_nid(page);
|
||||
}
|
||||
|
||||
static inline int page_last_nid(struct page *page)
|
||||
static inline int page_nid_last(struct page *page)
|
||||
{
|
||||
return page_to_nid(page);
|
||||
}
|
||||
|
||||
static inline void reset_page_last_nid(struct page *page)
|
||||
static inline void page_nid_reset_last(struct page *page)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
@ -727,7 +712,7 @@ static inline struct zone *page_zone(const struct page *page)
|
|||
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
|
||||
}
|
||||
|
||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#ifdef SECTION_IN_PAGE_FLAGS
|
||||
static inline void set_page_section(struct page *page, unsigned long section)
|
||||
{
|
||||
page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
|
||||
|
@ -757,7 +742,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
|
|||
{
|
||||
set_page_zone(page, zone);
|
||||
set_page_node(page, node);
|
||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#ifdef SECTION_IN_PAGE_FLAGS
|
||||
set_page_section(page, pfn_to_section_nr(pfn));
|
||||
#endif
|
||||
}
|
||||
|
@ -817,18 +802,7 @@ void page_address_init(void);
|
|||
#define PAGE_MAPPING_KSM 2
|
||||
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
|
||||
|
||||
extern struct address_space swapper_space;
|
||||
static inline struct address_space *page_mapping(struct page *page)
|
||||
{
|
||||
struct address_space *mapping = page->mapping;
|
||||
|
||||
VM_BUG_ON(PageSlab(page));
|
||||
if (unlikely(PageSwapCache(page)))
|
||||
mapping = &swapper_space;
|
||||
else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
|
||||
mapping = NULL;
|
||||
return mapping;
|
||||
}
|
||||
extern struct address_space *page_mapping(struct page *page);
|
||||
|
||||
/* Neutral page->mapping pointer to address_space or anon_vma or other */
|
||||
static inline void *page_rmapping(struct page *page)
|
||||
|
@ -1035,18 +1009,18 @@ static inline int fixup_user_fault(struct task_struct *tsk,
|
|||
}
|
||||
#endif
|
||||
|
||||
extern int make_pages_present(unsigned long addr, unsigned long end);
|
||||
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
|
||||
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
|
||||
void *buf, int len, int write);
|
||||
|
||||
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int len, unsigned int foll_flags,
|
||||
struct page **pages, struct vm_area_struct **vmas,
|
||||
int *nonblocking);
|
||||
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int nr_pages, int write, int force,
|
||||
struct page **pages, struct vm_area_struct **vmas);
|
||||
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int foll_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *nonblocking);
|
||||
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
int write, int force, struct page **pages,
|
||||
struct vm_area_struct **vmas);
|
||||
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
|
||||
struct page **pages);
|
||||
struct kvec;
|
||||
|
@ -1359,6 +1333,24 @@ extern void free_bootmem_with_active_regions(int nid,
|
|||
unsigned long max_low_pfn);
|
||||
extern void sparse_memory_present_with_active_regions(int nid);
|
||||
|
||||
#define MOVABLEMEM_MAP_MAX MAX_NUMNODES
|
||||
struct movablemem_entry {
|
||||
unsigned long start_pfn; /* start pfn of memory segment */
|
||||
unsigned long end_pfn; /* end pfn of memory segment (exclusive) */
|
||||
};
|
||||
|
||||
struct movablemem_map {
|
||||
bool acpi; /* true if using SRAT info */
|
||||
int nr_map;
|
||||
struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
|
||||
nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */
|
||||
nodemask_t numa_nodes_kernel; /* on which nodes kernel resides in */
|
||||
};
|
||||
|
||||
extern void __init insert_movablemem_map(unsigned long start_pfn,
|
||||
unsigned long end_pfn);
|
||||
extern int __init movablemem_map_overlap(unsigned long start_pfn,
|
||||
unsigned long end_pfn);
|
||||
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
|
||||
#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
|
||||
|
@ -1395,6 +1387,9 @@ extern void setup_per_cpu_pageset(void);
|
|||
extern void zone_pcp_update(struct zone *zone);
|
||||
extern void zone_pcp_reset(struct zone *zone);
|
||||
|
||||
/* page_alloc.c */
|
||||
extern int min_free_kbytes;
|
||||
|
||||
/* nommu.c */
|
||||
extern atomic_long_t mmap_pages_allocated;
|
||||
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
|
||||
|
@ -1472,13 +1467,24 @@ extern int install_special_mapping(struct mm_struct *mm,
|
|||
extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
|
||||
|
||||
extern unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
unsigned long len, unsigned long flags,
|
||||
vm_flags_t vm_flags, unsigned long pgoff);
|
||||
extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
|
||||
unsigned long, unsigned long,
|
||||
unsigned long, unsigned long);
|
||||
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
|
||||
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
|
||||
unsigned long len, unsigned long prot, unsigned long flags,
|
||||
unsigned long pgoff, unsigned long *populate);
|
||||
extern int do_munmap(struct mm_struct *, unsigned long, size_t);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
extern int __mm_populate(unsigned long addr, unsigned long len,
|
||||
int ignore_errors);
|
||||
/*
 * mm_populate - call __mm_populate() on [addr, addr + len) and ignore failure.
 *
 * Passes ignore_errors = 1 and discards the return value, so callers never
 * see an error from this wrapper.
 */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
	const int ignore_errors = 1;

	/* The result is dropped on purpose. */
	(void) __mm_populate(addr, len, ignore_errors);
}
|
||||
#else
|
||||
static inline void mm_populate(unsigned long addr, unsigned long len) {}
|
||||
#endif
|
||||
|
||||
/* These take the mm semaphore themselves */
|
||||
extern unsigned long vm_brk(unsigned long, unsigned long);
|
||||
extern int vm_munmap(unsigned long, size_t);
|
||||
|
@ -1623,8 +1629,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
|
|||
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
|
||||
unsigned long pfn);
|
||||
|
||||
struct page *follow_page(struct vm_area_struct *, unsigned long address,
|
||||
unsigned int foll_flags);
|
||||
struct page *follow_page_mask(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned int foll_flags,
|
||||
unsigned int *page_mask);
|
||||
|
||||
/*
 * follow_page - follow_page_mask() for callers that do not want the page mask.
 *
 * Forwards @vma, @address and @foll_flags unchanged and supplies a throwaway
 * local for the page_mask argument of follow_page_mask().
 */
static inline struct page *follow_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int foll_flags)
{
	unsigned int discarded_mask;
	struct page *page;

	page = follow_page_mask(vma, address, foll_flags, &discarded_mask);
	return page;
}
|
||||
|
||||
#define FOLL_WRITE 0x01 /* check pte is writable */
|
||||
#define FOLL_TOUCH 0x02 /* mark page accessed */
|
||||
#define FOLL_GET 0x04 /* do get_page on page */
|
||||
|
@ -1636,6 +1651,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
|
|||
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
|
||||
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
|
||||
#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
|
||||
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
|
||||
|
||||
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
|
||||
void *data);
|
||||
|
@ -1707,7 +1723,11 @@ int vmemmap_populate_basepages(struct page *start_page,
|
|||
unsigned long pages, int node);
|
||||
int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
|
||||
void vmemmap_populate_print_last(void);
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
void vmemmap_free(struct page *memmap, unsigned long nr_pages);
|
||||
#endif
|
||||
void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
|
||||
unsigned long size);
|
||||
|
||||
enum mf_flags {
|
||||
MF_COUNT_INCREASED = 1 << 0,
|
||||
|
@ -1720,7 +1740,7 @@ extern int unpoison_memory(unsigned long pfn);
|
|||
extern int sysctl_memory_failure_early_kill;
|
||||
extern int sysctl_memory_failure_recovery;
|
||||
extern void shake_page(struct page *p, int access);
|
||||
extern atomic_long_t mce_bad_pages;
|
||||
extern atomic_long_t num_poisoned_pages;
|
||||
extern int soft_offline_page(struct page *page, int flags);
|
||||
|
||||
extern void dump_page(struct page *page);
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include <linux/cpumask.h>
|
||||
#include <linux/page-debug-flags.h>
|
||||
#include <linux/uprobes.h>
|
||||
#include <linux/page-flags-layout.h>
|
||||
#include <asm/page.h>
|
||||
#include <asm/mmu.h>
|
||||
|
||||
|
@ -173,7 +174,7 @@ struct page {
|
|||
void *shadow;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
|
||||
int _last_nid;
|
||||
#endif
|
||||
}
|
||||
|
@ -414,9 +415,9 @@ struct mm_struct {
|
|||
#endif
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
/*
|
||||
* numa_next_scan is the next time when the PTEs will me marked
|
||||
* pte_numa to gather statistics and migrate pages to new nodes
|
||||
* if necessary
|
||||
* numa_next_scan is the next time that the PTEs will be marked
|
||||
* pte_numa. NUMA hinting faults will gather statistics and migrate
|
||||
* pages to new nodes if necessary.
|
||||
*/
|
||||
unsigned long numa_next_scan;
|
||||
|
||||
|
|
|
@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags)
|
|||
{
|
||||
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
|
||||
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
|
||||
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
|
||||
((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) |
|
||||
(((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ?
|
||||
VM_POPULATE : 0);
|
||||
}
|
||||
#endif /* _LINUX_MMAN_H */
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
#include <linux/seqlock.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/pageblock-flags.h>
|
||||
#include <generated/bounds.h>
|
||||
#include <linux/page-flags-layout.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <asm/page.h>
|
||||
|
||||
|
@ -57,7 +57,9 @@ enum {
|
|||
*/
|
||||
MIGRATE_CMA,
|
||||
#endif
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
MIGRATE_ISOLATE, /* can't allocate from here */
|
||||
#endif
|
||||
MIGRATE_TYPES
|
||||
};
|
||||
|
||||
|
@ -308,24 +310,6 @@ enum zone_type {
|
|||
|
||||
#ifndef __GENERATING_BOUNDS_H
|
||||
|
||||
/*
|
||||
* When a memory allocation must conform to specific limitations (such
|
||||
* as being suitable for DMA) the caller will pass in hints to the
|
||||
* allocator in the gfp_mask, in the zone modifier bits. These bits
|
||||
* are used to select a priority ordered list of memory zones which
|
||||
* match the requested limits. See gfp_zone() in include/linux/gfp.h
|
||||
*/
|
||||
|
||||
#if MAX_NR_ZONES < 2
|
||||
#define ZONES_SHIFT 0
|
||||
#elif MAX_NR_ZONES <= 2
|
||||
#define ZONES_SHIFT 1
|
||||
#elif MAX_NR_ZONES <= 4
|
||||
#define ZONES_SHIFT 2
|
||||
#else
|
||||
#error ZONES_SHIFT -- too many zones configured adjust calculation
|
||||
#endif
|
||||
|
||||
struct zone {
|
||||
/* Fields commonly accessed by the page allocator */
|
||||
|
||||
|
@ -543,6 +527,26 @@ static inline int zone_is_oom_locked(const struct zone *zone)
|
|||
return test_bit(ZONE_OOM_LOCKED, &zone->flags);
|
||||
}
|
||||
|
||||
static inline unsigned zone_end_pfn(const struct zone *zone)
|
||||
{
|
||||
return zone->zone_start_pfn + zone->spanned_pages;
|
||||
}
|
||||
|
||||
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
|
||||
{
|
||||
return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
|
||||
}
|
||||
|
||||
static inline bool zone_is_initialized(struct zone *zone)
|
||||
{
|
||||
return !!zone->wait_table;
|
||||
}
|
||||
|
||||
static inline bool zone_is_empty(struct zone *zone)
|
||||
{
|
||||
return zone->spanned_pages == 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* The "priority" of VM scanning is how much of the queues we will scan in one
|
||||
* go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
|
||||
|
@ -752,11 +756,17 @@ typedef struct pglist_data {
|
|||
#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
|
||||
|
||||
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
|
||||
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
|
||||
|
||||
#define node_end_pfn(nid) ({\
|
||||
pg_data_t *__pgdat = NODE_DATA(nid);\
|
||||
__pgdat->node_start_pfn + __pgdat->node_spanned_pages;\
|
||||
})
|
||||
/*
 * pgdat_end_pfn - first page frame number past the end of node @pgdat.
 *
 * Computed as node_start_pfn + node_spanned_pages, i.e. an exclusive bound.
 */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
	unsigned long first_pfn = pgdat->node_start_pfn;
	unsigned long nr_spanned = pgdat->node_spanned_pages;

	return first_pfn + nr_spanned;
}
|
||||
|
||||
/*
 * pgdat_is_empty - report whether node @pgdat has no start pfn and no span.
 *
 * Returns true only when both node_start_pfn and node_spanned_pages are zero.
 */
static inline bool pgdat_is_empty(pg_data_t *pgdat)
{
	return !(pgdat->node_start_pfn || pgdat->node_spanned_pages);
}
|
||||
|
||||
#include <linux/memory_hotplug.h>
|
||||
|
||||
|
@ -1053,8 +1063,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
|
|||
* PA_SECTION_SHIFT physical address to/from section number
|
||||
* PFN_SECTION_SHIFT pfn to/from section number
|
||||
*/
|
||||
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
|
||||
|
||||
#define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
|
||||
#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
|
||||
|
||||
|
|
88
include/linux/page-flags-layout.h
Normal file
88
include/linux/page-flags-layout.h
Normal file
|
@ -0,0 +1,88 @@
|
|||
#ifndef PAGE_FLAGS_LAYOUT_H
|
||||
#define PAGE_FLAGS_LAYOUT_H
|
||||
|
||||
#include <linux/numa.h>
|
||||
#include <generated/bounds.h>
|
||||
|
||||
/*
|
||||
* When a memory allocation must conform to specific limitations (such
|
||||
* as being suitable for DMA) the caller will pass in hints to the
|
||||
* allocator in the gfp_mask, in the zone modifier bits. These bits
|
||||
* are used to select a priority ordered list of memory zones which
|
||||
* match the requested limits. See gfp_zone() in include/linux/gfp.h
|
||||
*/
|
||||
#if MAX_NR_ZONES < 2
|
||||
#define ZONES_SHIFT 0
|
||||
#elif MAX_NR_ZONES <= 2
|
||||
#define ZONES_SHIFT 1
|
||||
#elif MAX_NR_ZONES <= 4
|
||||
#define ZONES_SHIFT 2
|
||||
#else
|
||||
#error ZONES_SHIFT -- too many zones configured adjust calculation
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_SPARSEMEM
|
||||
#include <asm/sparsemem.h>
|
||||
|
||||
/* SECTIONS_SHIFT: number of bits required to store a section number */
|
||||
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
|
||||
|
||||
#endif /* CONFIG_SPARSEMEM */
|
||||
|
||||
/*
|
||||
* page->flags layout:
|
||||
*
|
||||
* There are five possibilities for how page->flags get laid out. The first
|
||||
* pair is for the normal case without sparsemem. The second pair is for
|
||||
* sparsemem when there is plenty of space for node and section information.
|
||||
* The last is when there is insufficient space in page->flags and a separate
|
||||
* lookup is necessary.
|
||||
*
|
||||
* No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
|
||||
* " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS |
|
||||
* classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
|
||||
* " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
|
||||
* classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
|
||||
*/
|
||||
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#define SECTIONS_WIDTH SECTIONS_SHIFT
|
||||
#else
|
||||
#define SECTIONS_WIDTH 0
|
||||
#endif
|
||||
|
||||
#define ZONES_WIDTH ZONES_SHIFT
|
||||
|
||||
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define NODES_WIDTH NODES_SHIFT
|
||||
#else
|
||||
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
||||
#error "Vmemmap: No space for nodes field in page flags"
|
||||
#endif
|
||||
#define NODES_WIDTH 0
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
#define LAST_NID_SHIFT NODES_SHIFT
|
||||
#else
|
||||
#define LAST_NID_SHIFT 0
|
||||
#endif
|
||||
|
||||
#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define LAST_NID_WIDTH LAST_NID_SHIFT
|
||||
#else
|
||||
#define LAST_NID_WIDTH 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We are going to use the flags for the page to node mapping if it's in
|
||||
* there. This includes the case where there is no node, so it is implicit.
|
||||
*/
|
||||
#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
|
||||
#define NODE_NOT_IN_PAGE_FLAGS
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
|
||||
#define LAST_NID_NOT_IN_PAGE_FLAGS
|
||||
#endif
|
||||
|
||||
#endif /* PAGE_FLAGS_LAYOUT_H */
|
|
@ -1,6 +1,25 @@
|
|||
#ifndef __LINUX_PAGEISOLATION_H
|
||||
#define __LINUX_PAGEISOLATION_H
|
||||
|
||||
/*
 * Helpers for testing the MIGRATE_ISOLATE migratetype. With
 * CONFIG_MEMORY_ISOLATION the real comparisons are used; otherwise both
 * helpers are stubs that always return false.
 */
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
/* True when the pageblock migratetype of @page is MIGRATE_ISOLATE. */
static inline bool is_migrate_isolate_page(struct page *page)
|
||||
{
|
||||
return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
|
||||
}
|
||||
/* True when @migratetype equals MIGRATE_ISOLATE. */
static inline bool is_migrate_isolate(int migratetype)
|
||||
{
|
||||
return migratetype == MIGRATE_ISOLATE;
|
||||
}
|
||||
#else
|
||||
/* Stub for !CONFIG_MEMORY_ISOLATION: always false. */
static inline bool is_migrate_isolate_page(struct page *page)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
/* Stub for !CONFIG_MEMORY_ISOLATION: always false. */
static inline bool is_migrate_isolate(int migratetype)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
|
||||
bool skip_hwpoisoned_pages);
|
||||
|
|
|
@ -537,6 +537,7 @@ struct dev_pm_info {
|
|||
unsigned int irq_safe:1;
|
||||
unsigned int use_autosuspend:1;
|
||||
unsigned int timer_autosuspends:1;
|
||||
unsigned int memalloc_noio:1;
|
||||
enum rpm_request request;
|
||||
enum rpm_status runtime_status;
|
||||
int runtime_error;
|
||||
|
|
|
@ -47,6 +47,7 @@ extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
|
|||
extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
|
||||
extern void pm_runtime_update_max_time_suspended(struct device *dev,
|
||||
s64 delta_ns);
|
||||
extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
|
||||
|
||||
static inline bool pm_children_suspended(struct device *dev)
|
||||
{
|
||||
|
@ -156,6 +157,8 @@ static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
|
|||
int delay) {}
|
||||
static inline unsigned long pm_runtime_autosuspend_expiration(
|
||||
struct device *dev) { return 0; }
|
||||
static inline void pm_runtime_set_memalloc_noio(struct device *dev,
|
||||
bool enable){}
|
||||
|
||||
#endif /* !CONFIG_PM_RUNTIME */
|
||||
|
||||
|
|
|
@ -123,7 +123,7 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
|
|||
down_write(&anon_vma->root->rwsem);
|
||||
}
|
||||
|
||||
static inline void anon_vma_unlock(struct anon_vma *anon_vma)
|
||||
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
|
||||
{
|
||||
up_write(&anon_vma->root->rwsem);
|
||||
}
|
||||
|
|
|
@ -51,6 +51,7 @@ struct sched_param {
|
|||
#include <linux/cred.h>
|
||||
#include <linux/llist.h>
|
||||
#include <linux/uidgid.h>
|
||||
#include <linux/gfp.h>
|
||||
|
||||
#include <asm/processor.h>
|
||||
|
||||
|
@ -1791,6 +1792,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
|
|||
#define PF_FROZEN 0x00010000 /* frozen for system suspend */
|
||||
#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
|
||||
#define PF_KSWAPD 0x00040000 /* I am kswapd */
|
||||
#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
|
||||
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
|
||||
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
|
||||
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
|
||||
|
@ -1828,6 +1830,26 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
|
|||
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
|
||||
#define used_math() tsk_used_math(current)
|
||||
|
||||
/*
 * __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags.
 *
 * Returns @flags with __GFP_IO cleared when the current task has
 * PF_MEMALLOC_NOIO set; otherwise returns @flags unchanged.
 */
|
||||
static inline gfp_t memalloc_noio_flags(gfp_t flags)
|
||||
{
|
||||
if (unlikely(current->flags & PF_MEMALLOC_NOIO))
|
||||
flags &= ~__GFP_IO;
|
||||
return flags;
|
||||
}
|
||||
|
||||
/*
 * Set PF_MEMALLOC_NOIO in current->flags and return the previous state of
 * that bit (either 0 or PF_MEMALLOC_NOIO).
 */
static inline unsigned int memalloc_noio_save(void)
|
||||
{
|
||||
/* Remember only the PF_MEMALLOC_NOIO bit as it was before we set it. */
unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
|
||||
current->flags |= PF_MEMALLOC_NOIO;
|
||||
return flags;
|
||||
}
|
||||
|
||||
/*
 * Clear PF_MEMALLOC_NOIO in current->flags and then OR in @flags.
 * NOTE(review): @flags is presumably the value returned by a matching
 * memalloc_noio_save() call; any other bits set in @flags would also be
 * OR-ed into current->flags - confirm callers only pass the saved value.
 */
static inline void memalloc_noio_restore(unsigned int flags)
|
||||
{
|
||||
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
|
||||
}
|
||||
|
||||
/*
|
||||
* task->jobctl flags
|
||||
*/
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
#include <linux/memcontrol.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/node.h>
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <asm/page.h>
|
||||
|
||||
|
@ -156,7 +156,7 @@ enum {
|
|||
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
|
||||
};
|
||||
|
||||
#define SWAP_CLUSTER_MAX 32
|
||||
#define SWAP_CLUSTER_MAX 32UL
|
||||
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
|
||||
|
||||
/*
|
||||
|
@ -202,6 +202,18 @@ struct swap_info_struct {
|
|||
unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
|
||||
atomic_t frontswap_pages; /* frontswap pages in-use counter */
|
||||
#endif
|
||||
spinlock_t lock; /*
|
||||
* protect map scan related fields like
|
||||
* swap_map, lowest_bit, highest_bit,
|
||||
* inuse_pages, cluster_next,
|
||||
* cluster_nr, lowest_alloc and
|
||||
* highest_alloc. other fields are only
|
||||
* changed at swapon/swapoff, so are
|
||||
* protected by swap_lock. changing
|
||||
* flags requires holding this lock and
|
||||
* swap_lock. If both locks must be held,
|
||||
* hold swap_lock first.
|
||||
*/
|
||||
};
|
||||
|
||||
struct swap_list_t {
|
||||
|
@ -209,15 +221,12 @@ struct swap_list_t {
|
|||
int next; /* swapfile to be used next */
|
||||
};
|
||||
|
||||
/* Swap 50% full? Release swapcache more aggressively.. */
|
||||
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
|
||||
|
||||
/* linux/mm/page_alloc.c */
|
||||
extern unsigned long totalram_pages;
|
||||
extern unsigned long totalreserve_pages;
|
||||
extern unsigned long dirty_balance_reserve;
|
||||
extern unsigned int nr_free_buffer_pages(void);
|
||||
extern unsigned int nr_free_pagecache_pages(void);
|
||||
extern unsigned long nr_free_buffer_pages(void);
|
||||
extern unsigned long nr_free_pagecache_pages(void);
|
||||
|
||||
/* Definition of global_page_state not available yet */
|
||||
#define nr_free_pages() global_page_state(NR_FREE_PAGES)
|
||||
|
@ -266,7 +275,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
|
|||
extern unsigned long shrink_all_memory(unsigned long nr_pages);
|
||||
extern int vm_swappiness;
|
||||
extern int remove_mapping(struct address_space *mapping, struct page *page);
|
||||
extern long vm_total_pages;
|
||||
extern unsigned long vm_total_pages;
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
extern int zone_reclaim_mode;
|
||||
|
@ -330,8 +339,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
|
|||
sector_t *);
|
||||
|
||||
/* linux/mm/swap_state.c */
|
||||
extern struct address_space swapper_space;
|
||||
#define total_swapcache_pages swapper_space.nrpages
|
||||
extern struct address_space swapper_spaces[];
|
||||
#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
|
||||
extern unsigned long total_swapcache_pages(void);
|
||||
extern void show_swap_cache_info(void);
|
||||
extern int add_to_swap(struct page *);
|
||||
extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
|
||||
|
@ -346,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
|
|||
struct vm_area_struct *vma, unsigned long addr);
|
||||
|
||||
/* linux/mm/swapfile.c */
|
||||
extern long nr_swap_pages;
|
||||
extern atomic_long_t nr_swap_pages;
|
||||
extern long total_swap_pages;
|
||||
|
||||
/*
 * Swap 50% full? Release swapcache more aggressively..
 *
 * Returns true when twice the current nr_swap_pages counter is still less
 * than total_swap_pages.
 */
|
||||
static inline bool vm_swap_full(void)
|
||||
{
|
||||
return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
|
||||
}
|
||||
|
||||
/* Return the current value of the nr_swap_pages atomic counter. */
static inline long get_nr_swap_pages(void)
|
||||
{
|
||||
return atomic_long_read(&nr_swap_pages);
|
||||
}
|
||||
|
||||
extern void si_swapinfo(struct sysinfo *);
|
||||
extern swp_entry_t get_swap_page(void);
|
||||
extern swp_entry_t get_swap_page_of_type(int);
|
||||
|
@ -380,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
|
|||
|
||||
#else /* CONFIG_SWAP */
|
||||
|
||||
#define nr_swap_pages 0L
|
||||
#define get_nr_swap_pages() 0L
|
||||
#define total_swap_pages 0L
|
||||
#define total_swapcache_pages 0UL
|
||||
#define total_swapcache_pages() 0UL
|
||||
#define vm_swap_full() 0
|
||||
|
||||
#define si_swapinfo(val) \
|
||||
do { (val)->freeswap = (val)->totalswap = 0; } while (0)
|
||||
|
|
|
@ -36,7 +36,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
|||
#endif
|
||||
PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
|
||||
KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
|
||||
KSWAPD_SKIP_CONGESTION_WAIT,
|
||||
PAGEOUTRUN, ALLOCSTALL, PGROTATED,
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
NUMA_PTE_UPDATES,
|
||||
|
|
|
@ -85,7 +85,7 @@ static inline void vm_events_fold_cpu(int cpu)
|
|||
#define count_vm_numa_events(x, y) count_vm_events(x, y)
|
||||
#else
|
||||
#define count_vm_numa_event(x) do {} while (0)
|
||||
#define count_vm_numa_events(x, y) do {} while (0)
|
||||
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
#define __count_zone_vm_events(item, zone, delta) \
|
||||
|
|
12
ipc/shm.c
12
ipc/shm.c
|
@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
|
|||
unsigned long flags;
|
||||
unsigned long prot;
|
||||
int acc_mode;
|
||||
unsigned long user_addr;
|
||||
struct ipc_namespace *ns;
|
||||
struct shm_file_data *sfd;
|
||||
struct path path;
|
||||
fmode_t f_mode;
|
||||
unsigned long populate = 0;
|
||||
|
||||
err = -EINVAL;
|
||||
if (shmid < 0)
|
||||
|
@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
|
|||
goto invalid;
|
||||
}
|
||||
|
||||
user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
|
||||
*raddr = user_addr;
|
||||
addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
|
||||
*raddr = addr;
|
||||
err = 0;
|
||||
if (IS_ERR_VALUE(user_addr))
|
||||
err = (long)user_addr;
|
||||
if (IS_ERR_VALUE(addr))
|
||||
err = (long)addr;
|
||||
invalid:
|
||||
up_write(&current->mm->mmap_sem);
|
||||
if (populate)
|
||||
mm_populate(addr, populate);
|
||||
|
||||
out_fput:
|
||||
fput(file);
|
||||
|
|
|
@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
|
|||
*/
|
||||
static int select_fallback_rq(int cpu, struct task_struct *p)
|
||||
{
|
||||
const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
|
||||
int nid = cpu_to_node(cpu);
|
||||
const struct cpumask *nodemask = NULL;
|
||||
enum { cpuset, possible, fail } state = cpuset;
|
||||
int dest_cpu;
|
||||
|
||||
/* Look for allowed, online CPU in same node. */
|
||||
for_each_cpu(dest_cpu, nodemask) {
|
||||
if (!cpu_online(dest_cpu))
|
||||
continue;
|
||||
if (!cpu_active(dest_cpu))
|
||||
continue;
|
||||
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
|
||||
return dest_cpu;
|
||||
/*
|
||||
* If the node that the cpu is on has been offlined, cpu_to_node()
|
||||
* will return -1. There is no cpu on the node, and we should
|
||||
* select the cpu on the other node.
|
||||
*/
|
||||
if (nid != -1) {
|
||||
nodemask = cpumask_of_node(nid);
|
||||
|
||||
/* Look for allowed, online CPU in same node. */
|
||||
for_each_cpu(dest_cpu, nodemask) {
|
||||
if (!cpu_online(dest_cpu))
|
||||
continue;
|
||||
if (!cpu_active(dest_cpu))
|
||||
continue;
|
||||
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
|
||||
return dest_cpu;
|
||||
}
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
|
|
|
@ -105,7 +105,6 @@ extern char core_pattern[];
|
|||
extern unsigned int core_pipe_limit;
|
||||
#endif
|
||||
extern int pid_max;
|
||||
extern int min_free_kbytes;
|
||||
extern int pid_max_min, pid_max_max;
|
||||
extern int sysctl_drop_caches;
|
||||
extern int percpu_pagelist_fraction;
|
||||
|
|
10
mm/Kconfig
10
mm/Kconfig
|
@ -162,10 +162,16 @@ config MOVABLE_NODE
|
|||
Say Y here if you want to hotplug a whole node.
|
||||
Say N here if you want kernel to use memory on all nodes evenly.
|
||||
|
||||
#
|
||||
# Only set this on architectures that have completely implemented the memory hotplug
|
||||
# feature. If you are not sure, don't touch it.
|
||||
#
|
||||
config HAVE_BOOTMEM_INFO_NODE
|
||||
def_bool n
|
||||
|
||||
# eventually, we can have this option just 'select SPARSEMEM'
|
||||
config MEMORY_HOTPLUG
|
||||
bool "Allow for memory hot-add"
|
||||
select MEMORY_ISOLATION
|
||||
depends on SPARSEMEM || X86_64_ACPI_NUMA
|
||||
depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
|
||||
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
|
||||
|
@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE
|
|||
|
||||
config MEMORY_HOTREMOVE
|
||||
bool "Allow for memory hot remove"
|
||||
select MEMORY_ISOLATION
|
||||
select HAVE_BOOTMEM_INFO_NODE if X86_64
|
||||
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
|
||||
depends on MIGRATION
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include <linux/sysctl.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/balloon_compaction.h>
|
||||
#include <linux/page-isolation.h>
|
||||
#include "internal.h"
|
||||
|
||||
#ifdef CONFIG_COMPACTION
|
||||
|
@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
|
|||
static void __reset_isolation_suitable(struct zone *zone)
|
||||
{
|
||||
unsigned long start_pfn = zone->zone_start_pfn;
|
||||
unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
unsigned long end_pfn = zone_end_pfn(zone);
|
||||
unsigned long pfn;
|
||||
|
||||
zone->compact_cached_migrate_pfn = start_pfn;
|
||||
|
@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page)
|
|||
int migratetype = get_pageblock_migratetype(page);
|
||||
|
||||
/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
|
||||
if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
|
||||
if (migratetype == MIGRATE_RESERVE)
|
||||
return false;
|
||||
|
||||
if (is_migrate_isolate(migratetype))
|
||||
return false;
|
||||
|
||||
/* If the page is a large free page, then allow migration */
|
||||
|
@ -611,8 +615,7 @@ check_compact_cluster:
|
|||
continue;
|
||||
|
||||
next_pageblock:
|
||||
low_pfn += pageblock_nr_pages;
|
||||
low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
|
||||
low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
|
||||
last_pageblock_nr = pageblock_nr;
|
||||
}
|
||||
|
||||
|
@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone,
|
|||
struct compact_control *cc)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
|
||||
unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
|
||||
int nr_freepages = cc->nr_freepages;
|
||||
struct list_head *freelist = &cc->freepages;
|
||||
|
||||
|
@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone,
|
|||
*/
|
||||
high_pfn = min(low_pfn, pfn);
|
||||
|
||||
zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
z_end_pfn = zone_end_pfn(zone);
|
||||
|
||||
/*
|
||||
* Isolate free pages until enough are available to migrate the
|
||||
|
@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone,
|
|||
* only scans within a pageblock
|
||||
*/
|
||||
end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
|
||||
end_pfn = min(end_pfn, zone_end_pfn);
|
||||
end_pfn = min(end_pfn, z_end_pfn);
|
||||
isolated = isolate_freepages_block(cc, pfn, end_pfn,
|
||||
freelist, false);
|
||||
nr_freepages += isolated;
|
||||
|
@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
|
|||
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
|
||||
|
||||
/* Only scan within a pageblock boundary */
|
||||
end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
|
||||
end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
|
||||
|
||||
/* Do not cross the free scanner or scan within a memory hole */
|
||||
if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
|
||||
|
@ -920,7 +923,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
|||
{
|
||||
int ret;
|
||||
unsigned long start_pfn = zone->zone_start_pfn;
|
||||
unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
unsigned long end_pfn = zone_end_pfn(zone);
|
||||
|
||||
ret = compaction_suitable(zone, cc->order);
|
||||
switch (ret) {
|
||||
|
@ -977,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
|
|||
|
||||
nr_migrate = cc->nr_migratepages;
|
||||
err = migrate_pages(&cc->migratepages, compaction_alloc,
|
||||
(unsigned long)cc, false,
|
||||
(unsigned long)cc,
|
||||
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
|
||||
MR_COMPACTION);
|
||||
update_nr_listpages(cc);
|
||||
|
@ -1086,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
|
|||
|
||||
|
||||
/* Compact all zones within a node */
|
||||
static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
|
||||
static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
|
||||
{
|
||||
int zoneid;
|
||||
struct zone *zone;
|
||||
|
@ -1119,28 +1122,26 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
|
|||
VM_BUG_ON(!list_empty(&cc->freepages));
|
||||
VM_BUG_ON(!list_empty(&cc->migratepages));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int compact_pgdat(pg_data_t *pgdat, int order)
|
||||
void compact_pgdat(pg_data_t *pgdat, int order)
|
||||
{
|
||||
struct compact_control cc = {
|
||||
.order = order,
|
||||
.sync = false,
|
||||
};
|
||||
|
||||
return __compact_pgdat(pgdat, &cc);
|
||||
__compact_pgdat(pgdat, &cc);
|
||||
}
|
||||
|
||||
static int compact_node(int nid)
|
||||
static void compact_node(int nid)
|
||||
{
|
||||
struct compact_control cc = {
|
||||
.order = -1,
|
||||
.sync = true,
|
||||
};
|
||||
|
||||
return __compact_pgdat(NODE_DATA(nid), &cc);
|
||||
__compact_pgdat(NODE_DATA(nid), &cc);
|
||||
}
|
||||
|
||||
/* Compact all nodes in the system */
|
||||
|
|
18
mm/fadvise.c
18
mm/fadvise.c
|
@ -17,6 +17,7 @@
|
|||
#include <linux/fadvise.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/swap.h>
|
||||
|
||||
#include <asm/unistd.h>
|
||||
|
||||
|
@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
|
|||
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
|
||||
end_index = (endbyte >> PAGE_CACHE_SHIFT);
|
||||
|
||||
if (end_index >= start_index)
|
||||
invalidate_mapping_pages(mapping, start_index,
|
||||
if (end_index >= start_index) {
|
||||
unsigned long count = invalidate_mapping_pages(mapping,
|
||||
start_index, end_index);
|
||||
|
||||
/*
|
||||
* If fewer pages were invalidated than expected then
|
||||
* it is possible that some of the pages were on
|
||||
* a per-cpu pagevec for a remote CPU. Drain all
|
||||
* pagevecs and try again.
|
||||
*/
|
||||
if (count < (end_index - start_index + 1)) {
|
||||
lru_add_drain_all();
|
||||
invalidate_mapping_pages(mapping, start_index,
|
||||
end_index);
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
|
|
51
mm/fremap.c
51
mm/fremap.c
|
@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
struct vm_area_struct *vma;
|
||||
int err = -EINVAL;
|
||||
int has_write_lock = 0;
|
||||
vm_flags_t vm_flags;
|
||||
|
||||
if (prot)
|
||||
return err;
|
||||
|
@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
/*
|
||||
* Make sure the vma is shared, that it supports prefaulting,
|
||||
* and that the remapped range is valid and fully within
|
||||
* the single existing vma. vm_private_data is used as a
|
||||
* swapout cursor in a VM_NONLINEAR vma.
|
||||
* the single existing vma.
|
||||
*/
|
||||
if (!vma || !(vma->vm_flags & VM_SHARED))
|
||||
goto out;
|
||||
|
||||
if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
|
||||
goto out;
|
||||
|
||||
if (!vma->vm_ops || !vma->vm_ops->remap_pages)
|
||||
goto out;
|
||||
|
||||
|
@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
|
||||
/* Must set VM_NONLINEAR before any pages are populated. */
|
||||
if (!(vma->vm_flags & VM_NONLINEAR)) {
|
||||
/*
|
||||
* vm_private_data is used as a swapout cursor
|
||||
* in a VM_NONLINEAR vma.
|
||||
*/
|
||||
if (vma->vm_private_data)
|
||||
goto out;
|
||||
|
||||
/* Don't need a nonlinear mapping, exit success */
|
||||
if (pgoff == linear_page_index(vma, start)) {
|
||||
err = 0;
|
||||
|
@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
}
|
||||
|
||||
if (!has_write_lock) {
|
||||
get_write_lock:
|
||||
up_read(&mm->mmap_sem);
|
||||
down_write(&mm->mmap_sem);
|
||||
has_write_lock = 1;
|
||||
|
@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
unsigned long addr;
|
||||
struct file *file = get_file(vma->vm_file);
|
||||
|
||||
flags &= MAP_NONBLOCK;
|
||||
addr = mmap_region(file, start, size,
|
||||
flags, vma->vm_flags, pgoff);
|
||||
vm_flags = vma->vm_flags;
|
||||
if (!(flags & MAP_NONBLOCK))
|
||||
vm_flags |= VM_POPULATE;
|
||||
addr = mmap_region(file, start, size, vm_flags, pgoff);
|
||||
fput(file);
|
||||
if (IS_ERR_VALUE(addr)) {
|
||||
err = addr;
|
||||
|
@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
mutex_unlock(&mapping->i_mmap_mutex);
|
||||
}
|
||||
|
||||
if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
|
||||
if (!has_write_lock)
|
||||
goto get_write_lock;
|
||||
vma->vm_flags |= VM_POPULATE;
|
||||
}
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
/*
|
||||
* drop PG_Mlocked flag for over-mapped range
|
||||
*/
|
||||
vm_flags_t saved_flags = vma->vm_flags;
|
||||
if (!has_write_lock)
|
||||
goto get_write_lock;
|
||||
vm_flags = vma->vm_flags;
|
||||
munlock_vma_pages_range(vma, start, start + size);
|
||||
vma->vm_flags = saved_flags;
|
||||
vma->vm_flags = vm_flags;
|
||||
}
|
||||
|
||||
mmu_notifier_invalidate_range_start(mm, start, start + size);
|
||||
err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
|
||||
mmu_notifier_invalidate_range_end(mm, start, start + size);
|
||||
if (!err && !(flags & MAP_NONBLOCK)) {
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
/*
|
||||
* might be mapping previously unmapped range of file
|
||||
*/
|
||||
mlock_vma_pages_range(vma, start, start + size);
|
||||
} else {
|
||||
if (unlikely(has_write_lock)) {
|
||||
downgrade_write(&mm->mmap_sem);
|
||||
has_write_lock = 0;
|
||||
}
|
||||
make_pages_present(start, start+size);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We can't clear VM_NONLINEAR because we'd have to do
|
||||
|
@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
|
|||
*/
|
||||
|
||||
out:
|
||||
vm_flags = vma->vm_flags;
|
||||
if (likely(!has_write_lock))
|
||||
up_read(&mm->mmap_sem);
|
||||
else
|
||||
up_write(&mm->mmap_sem);
|
||||
if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
|
||||
mm_populate(start, size);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
#include <linux/mman.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/hashtable.h>
|
||||
|
||||
#include <asm/tlb.h>
|
||||
#include <asm/pgalloc.h>
|
||||
|
@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
|
|||
static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
|
||||
|
||||
static int khugepaged(void *none);
|
||||
static int mm_slots_hash_init(void);
|
||||
static int khugepaged_slab_init(void);
|
||||
static void khugepaged_slab_free(void);
|
||||
|
||||
#define MM_SLOTS_HASH_HEADS 1024
|
||||
static struct hlist_head *mm_slots_hash __read_mostly;
|
||||
#define MM_SLOTS_HASH_BITS 10
|
||||
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
|
||||
|
||||
static struct kmem_cache *mm_slot_cache __read_mostly;
|
||||
|
||||
/**
|
||||
|
@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void)
|
|||
struct zone *zone;
|
||||
int nr_zones = 0;
|
||||
unsigned long recommended_min;
|
||||
extern int min_free_kbytes;
|
||||
|
||||
if (!khugepaged_enabled())
|
||||
return 0;
|
||||
|
@ -634,12 +633,6 @@ static int __init hugepage_init(void)
|
|||
if (err)
|
||||
goto out;
|
||||
|
||||
err = mm_slots_hash_init();
|
||||
if (err) {
|
||||
khugepaged_slab_free();
|
||||
goto out;
|
||||
}
|
||||
|
||||
register_shrinker(&huge_zero_page_shrinker);
|
||||
|
||||
/*
|
||||
|
@ -1302,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
int target_nid;
|
||||
int current_nid = -1;
|
||||
bool migrated;
|
||||
bool page_locked = false;
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (unlikely(!pmd_same(pmd, *pmdp)))
|
||||
|
@ -1324,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
/* Acquire the page lock to serialise THP migrations */
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
lock_page(page);
|
||||
page_locked = true;
|
||||
|
||||
/* Confirm the PTE did not change while locked */
|
||||
spin_lock(&mm->page_table_lock);
|
||||
|
@ -1337,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
|
||||
/* Migrate the THP to the requested node */
|
||||
migrated = migrate_misplaced_transhuge_page(mm, vma,
|
||||
pmdp, pmd, addr,
|
||||
page, target_nid);
|
||||
if (migrated)
|
||||
current_nid = target_nid;
|
||||
else {
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (unlikely(!pmd_same(pmd, *pmdp))) {
|
||||
unlock_page(page);
|
||||
goto out_unlock;
|
||||
}
|
||||
goto clear_pmdnuma;
|
||||
}
|
||||
pmdp, pmd, addr, page, target_nid);
|
||||
if (!migrated)
|
||||
goto check_same;
|
||||
|
||||
task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
|
||||
task_numa_fault(target_nid, HPAGE_PMD_NR, true);
|
||||
return 0;
|
||||
|
||||
check_same:
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (unlikely(!pmd_same(pmd, *pmdp)))
|
||||
goto out_unlock;
|
||||
clear_pmdnuma:
|
||||
pmd = pmd_mknonnuma(pmd);
|
||||
set_pmd_at(mm, haddr, pmdp, pmd);
|
||||
VM_BUG_ON(pmd_numa(*pmdp));
|
||||
update_mmu_cache_pmd(vma, addr, pmdp);
|
||||
if (page_locked)
|
||||
unlock_page(page);
|
||||
|
||||
out_unlock:
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
if (current_nid != -1)
|
||||
task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
|
||||
task_numa_fault(current_nid, HPAGE_PMD_NR, false);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1656,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
|
|||
page_tail->mapping = page->mapping;
|
||||
|
||||
page_tail->index = page->index + i;
|
||||
page_xchg_last_nid(page_tail, page_last_nid(page));
|
||||
page_nid_xchg_last(page_tail, page_nid_last(page));
|
||||
|
||||
BUG_ON(!PageAnon(page_tail));
|
||||
BUG_ON(!PageUptodate(page_tail));
|
||||
|
@ -1846,7 +1829,7 @@ int split_huge_page(struct page *page)
|
|||
|
||||
BUG_ON(PageCompound(page));
|
||||
out_unlock:
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
put_anon_vma(anon_vma);
|
||||
out:
|
||||
return ret;
|
||||
|
@ -1908,12 +1891,6 @@ static int __init khugepaged_slab_init(void)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void __init khugepaged_slab_free(void)
|
||||
{
|
||||
kmem_cache_destroy(mm_slot_cache);
|
||||
mm_slot_cache = NULL;
|
||||
}
|
||||
|
||||
static inline struct mm_slot *alloc_mm_slot(void)
|
||||
{
|
||||
if (!mm_slot_cache) /* initialization failed */
|
||||
|
@ -1926,47 +1903,23 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
|
|||
kmem_cache_free(mm_slot_cache, mm_slot);
|
||||
}
|
||||
|
||||
static int __init mm_slots_hash_init(void)
|
||||
{
|
||||
mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
|
||||
GFP_KERNEL);
|
||||
if (!mm_slots_hash)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static void __init mm_slots_hash_free(void)
|
||||
{
|
||||
kfree(mm_slots_hash);
|
||||
mm_slots_hash = NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct mm_slot *get_mm_slot(struct mm_struct *mm)
|
||||
{
|
||||
struct mm_slot *mm_slot;
|
||||
struct hlist_head *bucket;
|
||||
struct hlist_node *node;
|
||||
|
||||
bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
|
||||
% MM_SLOTS_HASH_HEADS];
|
||||
hlist_for_each_entry(mm_slot, node, bucket, hash) {
|
||||
hash_for_each_possible(mm_slots_hash, mm_slot, node, hash, (unsigned long)mm)
|
||||
if (mm == mm_slot->mm)
|
||||
return mm_slot;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void insert_to_mm_slots_hash(struct mm_struct *mm,
|
||||
struct mm_slot *mm_slot)
|
||||
{
|
||||
struct hlist_head *bucket;
|
||||
|
||||
bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
|
||||
% MM_SLOTS_HASH_HEADS];
|
||||
mm_slot->mm = mm;
|
||||
hlist_add_head(&mm_slot->hash, bucket);
|
||||
hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
|
||||
}
|
||||
|
||||
static inline int khugepaged_test_exit(struct mm_struct *mm)
|
||||
|
@ -2035,7 +1988,7 @@ void __khugepaged_exit(struct mm_struct *mm)
|
|||
spin_lock(&khugepaged_mm_lock);
|
||||
mm_slot = get_mm_slot(mm);
|
||||
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
|
||||
hlist_del(&mm_slot->hash);
|
||||
hash_del(&mm_slot->hash);
|
||||
list_del(&mm_slot->mm_node);
|
||||
free = 1;
|
||||
}
|
||||
|
@ -2368,7 +2321,7 @@ static void collapse_huge_page(struct mm_struct *mm,
|
|||
BUG_ON(!pmd_none(*pmd));
|
||||
set_pmd_at(mm, address, pmd, _pmd);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
anon_vma_unlock(vma->anon_vma);
|
||||
anon_vma_unlock_write(vma->anon_vma);
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -2376,7 +2329,7 @@ static void collapse_huge_page(struct mm_struct *mm,
|
|||
* All pages are isolated and locked so anon_vma rmap
|
||||
* can't run anymore.
|
||||
*/
|
||||
anon_vma_unlock(vma->anon_vma);
|
||||
anon_vma_unlock_write(vma->anon_vma);
|
||||
|
||||
__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
|
||||
pte_unmap(pte);
|
||||
|
@ -2423,7 +2376,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
|||
struct page *page;
|
||||
unsigned long _address;
|
||||
spinlock_t *ptl;
|
||||
int node = -1;
|
||||
int node = NUMA_NO_NODE;
|
||||
|
||||
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
|
||||
|
||||
|
@ -2453,7 +2406,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
|||
* be more sophisticated and look at more pages,
|
||||
* but isn't for now.
|
||||
*/
|
||||
if (node == -1)
|
||||
if (node == NUMA_NO_NODE)
|
||||
node = page_to_nid(page);
|
||||
VM_BUG_ON(PageCompound(page));
|
||||
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
|
||||
|
@ -2484,7 +2437,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
|
|||
|
||||
if (khugepaged_test_exit(mm)) {
|
||||
/* free mm_slot */
|
||||
hlist_del(&mm_slot->hash);
|
||||
hash_del(&mm_slot->hash);
|
||||
list_del(&mm_slot->mm_node);
|
||||
|
||||
/*
|
||||
|
|
34
mm/hugetlb.c
34
mm/hugetlb.c
|
@ -1293,8 +1293,7 @@ static void __init report_hugepages(void)
|
|||
|
||||
for_each_hstate(h) {
|
||||
char buf[32];
|
||||
printk(KERN_INFO "HugeTLB registered %s page size, "
|
||||
"pre-allocated %ld pages\n",
|
||||
pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
|
||||
memfmt(buf, huge_page_size(h)),
|
||||
h->free_huge_pages);
|
||||
}
|
||||
|
@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void)
|
|||
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
|
||||
hstate_kobjs, &hstate_attr_group);
|
||||
if (err)
|
||||
printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
|
||||
h->name);
|
||||
pr_err("Hugetlb: Unable to add hstate %s", h->name);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node)
|
|||
nhs->hstate_kobjs,
|
||||
&per_node_hstate_attr_group);
|
||||
if (err) {
|
||||
printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
|
||||
" for node %d\n",
|
||||
h->name, node->dev.id);
|
||||
pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
|
||||
h->name, node->dev.id);
|
||||
hugetlb_unregister_node(node);
|
||||
break;
|
||||
}
|
||||
|
@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order)
|
|||
unsigned long i;
|
||||
|
||||
if (size_to_hstate(PAGE_SIZE << order)) {
|
||||
printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
|
||||
pr_warning("hugepagesz= specified twice, ignoring\n");
|
||||
return;
|
||||
}
|
||||
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
|
||||
|
@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s)
|
|||
mhp = &parsed_hstate->max_huge_pages;
|
||||
|
||||
if (mhp == last_mhp) {
|
||||
printk(KERN_WARNING "hugepages= specified twice without "
|
||||
"interleaving hugepagesz=, ignoring\n");
|
||||
pr_warning("hugepages= specified twice without "
|
||||
"interleaving hugepagesz=, ignoring\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
* COW. Warn that such a situation has occurred as it may not be obvious
|
||||
*/
|
||||
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
|
||||
printk(KERN_WARNING
|
||||
"PID %d killed due to inadequate hugepage pool\n",
|
||||
current->pid);
|
||||
pr_warning("PID %d killed due to inadequate hugepage pool\n",
|
||||
current->pid);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct page **pages, struct vm_area_struct **vmas,
|
||||
unsigned long *position, int *length, int i,
|
||||
unsigned int flags)
|
||||
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
struct page **pages, struct vm_area_struct **vmas,
|
||||
unsigned long *position, unsigned long *nr_pages,
|
||||
long i, unsigned int flags)
|
||||
{
|
||||
unsigned long pfn_offset;
|
||||
unsigned long vaddr = *position;
|
||||
int remainder = *length;
|
||||
unsigned long remainder = *nr_pages;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
|
@ -3001,7 +2997,7 @@ same_page:
|
|||
}
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
*length = remainder;
|
||||
*nr_pages = remainder;
|
||||
*position = vaddr;
|
||||
|
||||
return i ? i : -EFAULT;
|
||||
|
|
|
@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
struct vm_area_struct *prev, struct rb_node *rb_parent);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
extern long mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end);
|
||||
extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end, int *nonblocking);
|
||||
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end);
|
||||
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
|
||||
|
|
|
@ -1300,9 +1300,8 @@ static void kmemleak_scan(void)
|
|||
*/
|
||||
lock_memory_hotplug();
|
||||
for_each_online_node(i) {
|
||||
pg_data_t *pgdat = NODE_DATA(i);
|
||||
unsigned long start_pfn = pgdat->node_start_pfn;
|
||||
unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
|
||||
unsigned long start_pfn = node_start_pfn(i);
|
||||
unsigned long end_pfn = node_end_pfn(i);
|
||||
unsigned long pfn;
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
|
||||
|
|
105
mm/madvise.c
105
mm/madvise.c
|
@ -16,6 +16,9 @@
|
|||
#include <linux/ksm.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
|
||||
/*
|
||||
* Any behaviour which results in changes to the vma->vm_flags needs to
|
||||
|
@ -131,6 +134,84 @@ out:
|
|||
return error;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
/*
 * pmd_entry callback installed by force_swapin_readahead().
 *
 * Steps through [start, end) one page at a time.  For every pte that is
 * neither present, empty nor a file pte, and that decodes to a real swap
 * entry, read_swap_cache_async() is called for that entry and the page
 * reference it hands back (if any) is dropped again straight away.
 *
 * Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
	unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	unsigned long addr;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	/* NOTE(review): the "!=" bound presumes start/end are page aligned */
	for (addr = start; addr != end; addr += PAGE_SIZE) {
		spinlock_t *ptl;
		pte_t *ptep;
		pte_t pteval;
		swp_entry_t swp;
		struct page *page;

		/*
		 * Take a copy of the pte while holding the pte lock, then
		 * drop the lock before calling read_swap_cache_async().
		 */
		ptep = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pteval = ptep[(addr - start) / PAGE_SIZE];
		pte_unmap_unlock(ptep, ptl);

		if (pte_present(pteval) || pte_none(pteval) || pte_file(pteval))
			continue;

		swp = pte_to_swp_entry(pteval);
		if (unlikely(non_swap_entry(swp)))
			continue;

		page = read_swap_cache_async(swp, GFP_HIGHUSER_MOVABLE,
					     vma, addr);
		if (page)
			page_cache_release(page);
	}

	return 0;
}
|
||||
|
||||
static void force_swapin_readahead(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
struct mm_walk walk = {
|
||||
.mm = vma->vm_mm,
|
||||
.pmd_entry = swapin_walk_pmd_entry,
|
||||
.private = vma,
|
||||
};
|
||||
|
||||
walk_page_range(start, end, &walk);
|
||||
|
||||
lru_add_drain(); /* Push any new pages onto the LRU now */
|
||||
}
|
||||
|
||||
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
struct address_space *mapping)
|
||||
{
|
||||
pgoff_t index;
|
||||
struct page *page;
|
||||
swp_entry_t swap;
|
||||
|
||||
for (; start < end; start += PAGE_SIZE) {
|
||||
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
|
||||
|
||||
page = find_get_page(mapping, index);
|
||||
if (!radix_tree_exceptional_entry(page)) {
|
||||
if (page)
|
||||
page_cache_release(page);
|
||||
continue;
|
||||
}
|
||||
swap = radix_to_swp_entry(page);
|
||||
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
|
||||
NULL, 0);
|
||||
if (page)
|
||||
page_cache_release(page);
|
||||
}
|
||||
|
||||
lru_add_drain(); /* Push any new pages onto the LRU now */
|
||||
}
|
||||
#endif /* CONFIG_SWAP */
|
||||
|
||||
/*
|
||||
* Schedule all required I/O operations. Do not wait for completion.
|
||||
*/
|
||||
|
@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
|
|||
{
|
||||
struct file *file = vma->vm_file;
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
if (!file || mapping_cap_swap_backed(file->f_mapping)) {
|
||||
*prev = vma;
|
||||
if (!file)
|
||||
force_swapin_readahead(vma, start, end);
|
||||
else
|
||||
force_shm_swapin_readahead(vma, start, end,
|
||||
file->f_mapping);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!file)
|
||||
return -EBADF;
|
||||
|
||||
|
@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
|
|||
int error = -EINVAL;
|
||||
int write;
|
||||
size_t len;
|
||||
struct blk_plug plug;
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
|
||||
|
@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
|
|||
if (vma && start > vma->vm_start)
|
||||
prev = vma;
|
||||
|
||||
blk_start_plug(&plug);
|
||||
for (;;) {
|
||||
/* Still start < end. */
|
||||
error = -ENOMEM;
|
||||
if (!vma)
|
||||
goto out;
|
||||
goto out_plug;
|
||||
|
||||
/* Here start < (end|vma->vm_end). */
|
||||
if (start < vma->vm_start) {
|
||||
unmapped_error = -ENOMEM;
|
||||
start = vma->vm_start;
|
||||
if (start >= end)
|
||||
goto out;
|
||||
goto out_plug;
|
||||
}
|
||||
|
||||
/* Here vma->vm_start <= start < (end|vma->vm_end) */
|
||||
|
@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
|
|||
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
|
||||
error = madvise_vma(vma, &prev, start, tmp, behavior);
|
||||
if (error)
|
||||
goto out;
|
||||
goto out_plug;
|
||||
start = tmp;
|
||||
if (prev && start < prev->vm_end)
|
||||
start = prev->vm_end;
|
||||
error = unmapped_error;
|
||||
if (start >= end)
|
||||
goto out;
|
||||
goto out_plug;
|
||||
if (prev)
|
||||
vma = prev->vm_next;
|
||||
else /* madvise_remove dropped mmap_sem */
|
||||
vma = find_vma(current->mm, start);
|
||||
}
|
||||
out_plug:
|
||||
blk_finish_plug(&plug);
|
||||
out:
|
||||
if (write)
|
||||
up_write(¤t->mm->mmap_sem);
|
||||
|
|
|
@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
|
|||
*
|
||||
* Find @size free area aligned to @align in the specified range and node.
|
||||
*
|
||||
* If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check if the
|
||||
* memory we found is not in hotpluggable ranges.
|
||||
*
|
||||
* RETURNS:
|
||||
* Found address on success, %0 on failure.
|
||||
*/
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
					phys_addr_t end, phys_addr_t size,
					phys_addr_t align, int nid)
{
	phys_addr_t this_start, this_end, cand;
	u64 i;
	/* index of the movablemem_map entry currently being compared against */
	int curr = movablemem_map.nr_map - 1;

	/* pump up @end */
	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
		end = memblock.current_limit;

	/* avoid allocating the first page */
	start = max_t(phys_addr_t, start, PAGE_SIZE);
	end = max(start, end);

	for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
		this_start = clamp(this_start, start, end);
		this_end = clamp(this_end, start, end);

		/*
		 * Keep lowering this_end below overlapping movablemem_map
		 * entries until either a candidate fits or the free range
		 * is used up.
		 */
		while (this_end > this_start && this_end >= size) {
			/* step down to the first entry starting below this_end */
			while (curr >= 0 &&
			       (movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
			       >= this_end)
				curr--;

			cand = round_down(this_end - size, align);

			if (curr < 0 ||
			    cand >= movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
				/* candidate is clear of that entry */
				if (cand >= this_start)
					return cand;
				break;
			}

			/* candidate overlaps the entry: retry below its start */
			this_end = movablemem_map.map[curr].start_pfn
				   << PAGE_SHIFT;
		}
	}

	return 0;
}
|
||||
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
|
||||
phys_addr_t end, phys_addr_t size,
|
||||
phys_addr_t align, int nid)
|
||||
|
@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
|
||||
/**
|
||||
* memblock_find_in_range - find free area in given range
|
||||
|
|
477
mm/memcontrol.c
477
mm/memcontrol.c
|
@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
|
|||
"pgmajfault",
|
||||
};
|
||||
|
||||
/*
 * Printable names of the LRU lists.  The table is indexed by LRU list
 * number (0 .. NR_LRU_LISTS - 1), so it needs one entry per list.
 */
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};
|
||||
|
||||
/*
|
||||
* Per memcg event counter is incremented at every pagein/pageout. With THP,
|
||||
* it will be incremented by the number of pages. This counter is used for
|
||||
|
@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
|
|||
};
|
||||
|
||||
struct mem_cgroup_lru_info {
|
||||
struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
|
||||
struct mem_cgroup_per_node *nodeinfo[0];
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -275,17 +283,6 @@ struct mem_cgroup {
|
|||
* the counter to account for kernel memory usage.
|
||||
*/
|
||||
struct res_counter kmem;
|
||||
/*
|
||||
* Per cgroup active and inactive list, similar to the
|
||||
* per zone LRU lists.
|
||||
*/
|
||||
struct mem_cgroup_lru_info info;
|
||||
int last_scanned_node;
|
||||
#if MAX_NUMNODES > 1
|
||||
nodemask_t scan_nodes;
|
||||
atomic_t numainfo_events;
|
||||
atomic_t numainfo_updating;
|
||||
#endif
|
||||
/*
|
||||
* Should the accounting and control be hierarchical, per subtree?
|
||||
*/
|
||||
|
@ -349,8 +346,29 @@ struct mem_cgroup {
|
|||
/* Index in the kmem_cache->memcg_params->memcg_caches array */
|
||||
int kmemcg_id;
|
||||
#endif
|
||||
|
||||
int last_scanned_node;
|
||||
#if MAX_NUMNODES > 1
|
||||
nodemask_t scan_nodes;
|
||||
atomic_t numainfo_events;
|
||||
atomic_t numainfo_updating;
|
||||
#endif
|
||||
/*
|
||||
* Per cgroup active and inactive list, similar to the
|
||||
* per zone LRU lists.
|
||||
*
|
||||
* WARNING: This has to be the last element of the struct. Don't
|
||||
* add new fields after this point.
|
||||
*/
|
||||
struct mem_cgroup_lru_info info;
|
||||
};
|
||||
|
||||
static size_t memcg_size(void)
|
||||
{
|
||||
return sizeof(struct mem_cgroup) +
|
||||
nr_node_ids * sizeof(struct mem_cgroup_per_node);
|
||||
}
|
||||
|
||||
/* internal only representation about the status of kmem accounting. */
|
||||
enum {
|
||||
KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
|
||||
|
@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
|
|||
|
||||
/* Stuffs for move charges at task migration. */
|
||||
/*
|
||||
* Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
|
||||
* left-shifted bitmap of these types.
|
||||
* Types of charges to be moved. "move_charge_at_immigrate" and
|
||||
* "immigrate_flags" are treated as a left-shifted bitmap of these types.
|
||||
*/
|
||||
enum move_type {
|
||||
MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
|
||||
|
@ -412,6 +430,7 @@ static struct move_charge_struct {
|
|||
spinlock_t lock; /* for from, to */
|
||||
struct mem_cgroup *from;
|
||||
struct mem_cgroup *to;
|
||||
unsigned long immigrate_flags;
|
||||
unsigned long precharge;
|
||||
unsigned long moved_charge;
|
||||
unsigned long moved_swap;
|
||||
|
@ -424,14 +443,12 @@ static struct move_charge_struct {
|
|||
|
||||
static bool move_anon(void)
|
||||
{
|
||||
return test_bit(MOVE_CHARGE_TYPE_ANON,
|
||||
&mc.to->move_charge_at_immigrate);
|
||||
return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
|
||||
}
|
||||
|
||||
static bool move_file(void)
|
||||
{
|
||||
return test_bit(MOVE_CHARGE_TYPE_FILE,
|
||||
&mc.to->move_charge_at_immigrate);
|
||||
return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -471,6 +488,13 @@ enum res_type {
|
|||
#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
|
||||
#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
|
||||
|
||||
/*
|
||||
* The memcg_create_mutex will be held whenever a new cgroup is created.
|
||||
* As a consequence, any change that needs to protect against new child cgroups
|
||||
* appearing has to hold it as well.
|
||||
*/
|
||||
static DEFINE_MUTEX(memcg_create_mutex);
|
||||
|
||||
static void mem_cgroup_get(struct mem_cgroup *memcg);
|
||||
static void mem_cgroup_put(struct mem_cgroup *memcg);
|
||||
|
||||
|
@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
|
|||
static struct mem_cgroup_per_zone *
|
||||
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
|
||||
{
|
||||
VM_BUG_ON((unsigned)nid >= nr_node_ids);
|
||||
return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
|
||||
}
|
||||
|
||||
|
@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
|
|||
return inactive * inactive_ratio < active;
|
||||
}
|
||||
|
||||
int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
|
||||
{
|
||||
unsigned long active;
|
||||
unsigned long inactive;
|
||||
|
||||
inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
|
||||
active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
|
||||
|
||||
return (active > inactive);
|
||||
}
|
||||
|
||||
#define mem_cgroup_from_res_counter(counter, member) \
|
||||
container_of(counter, struct mem_cgroup, member)
|
||||
|
||||
|
@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
|
|||
spin_unlock_irqrestore(&memcg->move_lock, *flags);
|
||||
}
|
||||
|
||||
#define K(x) ((x) << (PAGE_SHIFT-10))
|
||||
/**
|
||||
* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
|
||||
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
|
||||
* @memcg: The memory cgroup that went over limit
|
||||
* @p: Task that is going to be killed
|
||||
*
|
||||
|
@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
|
|||
*/
|
||||
static char memcg_name[PATH_MAX];
|
||||
int ret;
|
||||
struct mem_cgroup *iter;
|
||||
unsigned int i;
|
||||
|
||||
if (!memcg || !p)
|
||||
if (!p)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
|
@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
|
|||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
printk(KERN_INFO "Task in %s killed", memcg_name);
|
||||
pr_info("Task in %s killed", memcg_name);
|
||||
|
||||
rcu_read_lock();
|
||||
ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
|
||||
|
@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
|
|||
/*
|
||||
* Continues from above, so we don't need a KERN_ level
|
||||
*/
|
||||
printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
|
||||
pr_cont(" as a result of limit of %s\n", memcg_name);
|
||||
done:
|
||||
|
||||
printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
|
||||
pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
|
||||
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
|
||||
res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
|
||||
res_counter_read_u64(&memcg->res, RES_FAILCNT));
|
||||
printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
|
||||
"failcnt %llu\n",
|
||||
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
|
||||
res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
|
||||
res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
|
||||
res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
|
||||
printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n",
|
||||
pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
|
||||
res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
|
||||
res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
|
||||
res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
|
||||
|
||||
for_each_mem_cgroup_tree(iter, memcg) {
|
||||
pr_info("Memory cgroup stats");
|
||||
|
||||
rcu_read_lock();
|
||||
ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
|
||||
if (!ret)
|
||||
pr_cont(" for %s", memcg_name);
|
||||
rcu_read_unlock();
|
||||
pr_cont(":");
|
||||
|
||||
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
|
||||
if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
|
||||
continue;
|
||||
pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
|
||||
K(mem_cgroup_read_stat(iter, i)));
|
||||
}
|
||||
|
||||
for (i = 0; i < NR_LRU_LISTS; i++)
|
||||
pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
|
||||
K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
|
||||
|
||||
pr_cont("\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
|
|||
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
|
||||
}
|
||||
|
||||
/*
 * Set up the work item of every possible CPU's memcg_stock so that it runs
 * drain_local_stock().
 */
static void __init memcg_stock_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu(memcg_stock, cpu).work, drain_local_stock);
}
|
||||
|
||||
/*
|
||||
* Cache charges(val) which is from res_counter, to local per_cpu area.
|
||||
* This will be consumed by consume_stock() function, later.
|
||||
|
@ -4391,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
|
|||
|
||||
pc = lookup_page_cgroup_used(page);
|
||||
if (pc) {
|
||||
printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
|
||||
pc, pc->flags, pc->mem_cgroup);
|
||||
pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
|
||||
pc, pc->flags, pc->mem_cgroup);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -4718,6 +4769,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
|
|||
} while (usage > 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* This mainly exists for tests during the setting of use_hierarchy.
|
||||
* Since this is the very setting we are changing, the current hierarchy value
|
||||
* is meaningless
|
||||
*/
|
||||
static inline bool __memcg_has_children(struct mem_cgroup *memcg)
|
||||
{
|
||||
struct cgroup *pos;
|
||||
|
||||
/* bounce at first found */
|
||||
cgroup_for_each_child(pos, memcg->css.cgroup)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
|
||||
* to be already dead (as in mem_cgroup_force_empty, for instance). This is
|
||||
* different from mem_cgroup_count_children(), in the sense that we don't really care how
|
||||
* many children we have; we only need to know if we have any. It also counts
|
||||
* any memcg without hierarchy as infertile.
|
||||
*/
|
||||
static inline bool memcg_has_children(struct mem_cgroup *memcg)
|
||||
{
|
||||
return memcg->use_hierarchy && __memcg_has_children(memcg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Reclaims as many pages from the given memcg as possible and moves
|
||||
* the rest to the parent.
|
||||
|
@ -4788,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
|
|||
if (parent)
|
||||
parent_memcg = mem_cgroup_from_cont(parent);
|
||||
|
||||
cgroup_lock();
|
||||
mutex_lock(&memcg_create_mutex);
|
||||
|
||||
if (memcg->use_hierarchy == val)
|
||||
goto out;
|
||||
|
@ -4803,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
|
|||
*/
|
||||
if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
|
||||
(val == 1 || val == 0)) {
|
||||
if (list_empty(&cont->children))
|
||||
if (!__memcg_has_children(memcg))
|
||||
memcg->use_hierarchy = val;
|
||||
else
|
||||
retval = -EBUSY;
|
||||
|
@ -4811,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
|
|||
retval = -EINVAL;
|
||||
|
||||
out:
|
||||
cgroup_unlock();
|
||||
mutex_unlock(&memcg_create_mutex);
|
||||
|
||||
return retval;
|
||||
}
|
||||
|
@ -4896,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
|
|||
{
|
||||
int ret = -EINVAL;
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
bool must_inc_static_branch = false;
|
||||
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
|
||||
/*
|
||||
* For simplicity, we won't allow this to be disabled. It also can't
|
||||
|
@ -4910,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
|
|||
*
|
||||
* After it first became limited, changes in the value of the limit are
|
||||
* of course permitted.
|
||||
*
|
||||
* Taking the cgroup_lock is really offensive, but it is so far the only
|
||||
* way to guarantee that no children will appear. There are plenty of
|
||||
* other offenders, and they should all go away. Fine grained locking
|
||||
* is probably the way to go here. When we are fully hierarchical, we
|
||||
* can also get rid of the use_hierarchy check.
|
||||
*/
|
||||
cgroup_lock();
|
||||
mutex_lock(&memcg_create_mutex);
|
||||
mutex_lock(&set_limit_mutex);
|
||||
if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
|
||||
if (cgroup_task_count(cont) || (memcg->use_hierarchy &&
|
||||
!list_empty(&cont->children))) {
|
||||
if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
@ -4933,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
|
|||
res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
|
||||
goto out;
|
||||
}
|
||||
must_inc_static_branch = true;
|
||||
static_key_slow_inc(&memcg_kmem_enabled_key);
|
||||
/*
|
||||
* setting the active bit after the inc will guarantee no one
|
||||
* starts accounting before all call sites are patched
|
||||
*/
|
||||
memcg_kmem_set_active(memcg);
|
||||
|
||||
/*
|
||||
* kmem charges can outlive the cgroup. In the case of slab
|
||||
* pages, for instance, a page contain objects from various
|
||||
|
@ -4945,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
|
|||
ret = res_counter_set_limit(&memcg->kmem, val);
|
||||
out:
|
||||
mutex_unlock(&set_limit_mutex);
|
||||
cgroup_unlock();
|
||||
|
||||
/*
|
||||
* We are by now familiar with the fact that we can't inc the static
|
||||
* branch inside cgroup_lock. See disarm functions for details. A
|
||||
* worker here is overkill, but also wrong: After the limit is set, we
|
||||
* must start accounting right away. Since this operation can't fail,
|
||||
* we can safely defer it to here - no rollback will be needed.
|
||||
*
|
||||
* The boolean used to control this is also safe, because
|
||||
* KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
|
||||
* able to set it to true;
|
||||
*/
|
||||
if (must_inc_static_branch) {
|
||||
static_key_slow_inc(&memcg_kmem_enabled_key);
|
||||
/*
|
||||
* setting the active bit after the inc will guarantee no one
|
||||
* starts accounting before all call sites are patched
|
||||
*/
|
||||
memcg_kmem_set_active(memcg);
|
||||
}
|
||||
|
||||
mutex_unlock(&memcg_create_mutex);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
static int memcg_propagate_kmem(struct mem_cgroup *memcg)
|
||||
{
|
||||
int ret = 0;
|
||||
|
@ -4979,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
|
|||
goto out;
|
||||
|
||||
memcg->kmem_account_flags = parent->kmem_account_flags;
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
/*
|
||||
* When that happen, we need to disable the static branch only on those
|
||||
* memcgs that enabled it. To achieve this, we would be forced to
|
||||
|
@ -5005,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
|
|||
mutex_lock(&set_limit_mutex);
|
||||
ret = memcg_update_cache_sizes(memcg);
|
||||
mutex_unlock(&set_limit_mutex);
|
||||
#endif
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
|
||||
/*
|
||||
* The user of this function is...
|
||||
|
@ -5148,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
|
|||
|
||||
if (val >= (1 << NR_MOVE_TYPE))
|
||||
return -EINVAL;
|
||||
/*
|
||||
* We check this value several times in both in can_attach() and
|
||||
* attach(), so we need cgroup lock to prevent this value from being
|
||||
* inconsistent.
|
||||
*/
|
||||
cgroup_lock();
|
||||
memcg->move_charge_at_immigrate = val;
|
||||
cgroup_unlock();
|
||||
|
||||
/*
|
||||
* No kind of locking is needed in here, because ->can_attach() will
|
||||
* check this value once in the beginning of the process, and then carry
|
||||
* on with stale data. This means that changes to this value will only
|
||||
* affect task migrations starting after the change.
|
||||
*/
|
||||
memcg->move_charge_at_immigrate = val;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
|
@ -5214,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
|
|||
}
|
||||
#endif /* CONFIG_NUMA */
|
||||
|
||||
static const char * const mem_cgroup_lru_names[] = {
|
||||
"inactive_anon",
|
||||
"active_anon",
|
||||
"inactive_file",
|
||||
"active_file",
|
||||
"unevictable",
|
||||
};
|
||||
|
||||
static inline void mem_cgroup_lru_names_not_uptodate(void)
|
||||
{
|
||||
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
|
||||
|
@ -5335,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
|
|||
|
||||
parent = mem_cgroup_from_cont(cgrp->parent);
|
||||
|
||||
cgroup_lock();
|
||||
mutex_lock(&memcg_create_mutex);
|
||||
|
||||
/* If under hierarchy, only empty-root can set this value */
|
||||
if ((parent->use_hierarchy) ||
|
||||
(memcg->use_hierarchy && !list_empty(&cgrp->children))) {
|
||||
cgroup_unlock();
|
||||
if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
|
||||
mutex_unlock(&memcg_create_mutex);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
memcg->swappiness = val;
|
||||
|
||||
cgroup_unlock();
|
||||
mutex_unlock(&memcg_create_mutex);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -5672,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
|
|||
|
||||
parent = mem_cgroup_from_cont(cgrp->parent);
|
||||
|
||||
cgroup_lock();
|
||||
mutex_lock(&memcg_create_mutex);
|
||||
/* oom-kill-disable is a flag for subhierarchy. */
|
||||
if ((parent->use_hierarchy) ||
|
||||
(memcg->use_hierarchy && !list_empty(&cgrp->children))) {
|
||||
cgroup_unlock();
|
||||
if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
|
||||
mutex_unlock(&memcg_create_mutex);
|
||||
return -EINVAL;
|
||||
}
|
||||
memcg->oom_kill_disable = val;
|
||||
if (!val)
|
||||
memcg_oom_recover(memcg);
|
||||
cgroup_unlock();
|
||||
mutex_unlock(&memcg_create_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -5797,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
|
|||
.read_seq_string = memcg_numa_stat_show,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
{
|
||||
.name = "memsw.usage_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
|
||||
.read = mem_cgroup_read,
|
||||
.register_event = mem_cgroup_usage_register_event,
|
||||
.unregister_event = mem_cgroup_usage_unregister_event,
|
||||
},
|
||||
{
|
||||
.name = "memsw.max_usage_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
|
||||
.trigger = mem_cgroup_reset,
|
||||
.read = mem_cgroup_read,
|
||||
},
|
||||
{
|
||||
.name = "memsw.limit_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
|
||||
.write_string = mem_cgroup_write,
|
||||
.read = mem_cgroup_read,
|
||||
},
|
||||
{
|
||||
.name = "memsw.failcnt",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
|
||||
.trigger = mem_cgroup_reset,
|
||||
.read = mem_cgroup_read,
|
||||
},
|
||||
#endif
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
{
|
||||
.name = "kmem.limit_in_bytes",
|
||||
|
@ -5858,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
|
|||
{ }, /* terminate */
|
||||
};
|
||||
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
static struct cftype memsw_cgroup_files[] = {
|
||||
{
|
||||
.name = "memsw.usage_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
|
||||
.read = mem_cgroup_read,
|
||||
.register_event = mem_cgroup_usage_register_event,
|
||||
.unregister_event = mem_cgroup_usage_unregister_event,
|
||||
},
|
||||
{
|
||||
.name = "memsw.max_usage_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
|
||||
.trigger = mem_cgroup_reset,
|
||||
.read = mem_cgroup_read,
|
||||
},
|
||||
{
|
||||
.name = "memsw.limit_in_bytes",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
|
||||
.write_string = mem_cgroup_write,
|
||||
.read = mem_cgroup_read,
|
||||
},
|
||||
{
|
||||
.name = "memsw.failcnt",
|
||||
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
|
||||
.trigger = mem_cgroup_reset,
|
||||
.read = mem_cgroup_read,
|
||||
},
|
||||
{ }, /* terminate */
|
||||
};
|
||||
#endif
|
||||
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
|
||||
{
|
||||
struct mem_cgroup_per_node *pn;
|
||||
|
@ -5896,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
|
|||
static struct mem_cgroup *mem_cgroup_alloc(void)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
int size = sizeof(struct mem_cgroup);
|
||||
size_t size = memcg_size();
|
||||
|
||||
/* Can be very big if MAX_NUMNODES is very big */
|
||||
/* Can be very big if nr_node_ids is very big */
|
||||
if (size < PAGE_SIZE)
|
||||
memcg = kzalloc(size, GFP_KERNEL);
|
||||
else
|
||||
|
@ -5935,7 +5981,7 @@ out_free:
|
|||
static void __mem_cgroup_free(struct mem_cgroup *memcg)
|
||||
{
|
||||
int node;
|
||||
int size = sizeof(struct mem_cgroup);
|
||||
size_t size = memcg_size();
|
||||
|
||||
mem_cgroup_remove_from_trees(memcg);
|
||||
free_css_id(&mem_cgroup_subsys, &memcg->css);
|
||||
|
@ -6017,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
|
|||
}
|
||||
EXPORT_SYMBOL(parent_mem_cgroup);
|
||||
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
static void __init enable_swap_cgroup(void)
|
||||
{
|
||||
if (!mem_cgroup_disabled() && really_do_swap_account)
|
||||
do_swap_account = 1;
|
||||
}
|
||||
#else
|
||||
static void __init enable_swap_cgroup(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
static int mem_cgroup_soft_limit_tree_init(void)
|
||||
static void __init mem_cgroup_soft_limit_tree_init(void)
|
||||
{
|
||||
struct mem_cgroup_tree_per_node *rtpn;
|
||||
struct mem_cgroup_tree_per_zone *rtpz;
|
||||
|
@ -6040,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
|
|||
if (!node_state(node, N_NORMAL_MEMORY))
|
||||
tmp = -1;
|
||||
rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
|
||||
if (!rtpn)
|
||||
goto err_cleanup;
|
||||
BUG_ON(!rtpn);
|
||||
|
||||
soft_limit_tree.rb_tree_per_node[node] = rtpn;
|
||||
|
||||
|
@ -6051,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
|
|||
spin_lock_init(&rtpz->lock);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
err_cleanup:
|
||||
for_each_node(node) {
|
||||
if (!soft_limit_tree.rb_tree_per_node[node])
|
||||
break;
|
||||
kfree(soft_limit_tree.rb_tree_per_node[node]);
|
||||
soft_limit_tree.rb_tree_per_node[node] = NULL;
|
||||
}
|
||||
return 1;
|
||||
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state * __ref
|
||||
mem_cgroup_css_alloc(struct cgroup *cont)
|
||||
{
|
||||
struct mem_cgroup *memcg, *parent;
|
||||
struct mem_cgroup *memcg;
|
||||
long error = -ENOMEM;
|
||||
int node;
|
||||
|
||||
|
@ -6081,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
|
|||
|
||||
/* root ? */
|
||||
if (cont->parent == NULL) {
|
||||
int cpu;
|
||||
enable_swap_cgroup();
|
||||
parent = NULL;
|
||||
if (mem_cgroup_soft_limit_tree_init())
|
||||
goto free_out;
|
||||
root_mem_cgroup = memcg;
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct memcg_stock_pcp *stock =
|
||||
&per_cpu(memcg_stock, cpu);
|
||||
INIT_WORK(&stock->work, drain_local_stock);
|
||||
}
|
||||
} else {
|
||||
parent = mem_cgroup_from_cont(cont->parent);
|
||||
memcg->use_hierarchy = parent->use_hierarchy;
|
||||
memcg->oom_kill_disable = parent->oom_kill_disable;
|
||||
res_counter_init(&memcg->res, NULL);
|
||||
res_counter_init(&memcg->memsw, NULL);
|
||||
res_counter_init(&memcg->kmem, NULL);
|
||||
}
|
||||
|
||||
if (parent && parent->use_hierarchy) {
|
||||
memcg->last_scanned_node = MAX_NUMNODES;
|
||||
INIT_LIST_HEAD(&memcg->oom_notify);
|
||||
atomic_set(&memcg->refcnt, 1);
|
||||
memcg->move_charge_at_immigrate = 0;
|
||||
mutex_init(&memcg->thresholds_lock);
|
||||
spin_lock_init(&memcg->move_lock);
|
||||
|
||||
return &memcg->css;
|
||||
|
||||
free_out:
|
||||
__mem_cgroup_free(memcg);
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
|
||||
static int
|
||||
mem_cgroup_css_online(struct cgroup *cont)
|
||||
{
|
||||
struct mem_cgroup *memcg, *parent;
|
||||
int error = 0;
|
||||
|
||||
if (!cont->parent)
|
||||
return 0;
|
||||
|
||||
mutex_lock(&memcg_create_mutex);
|
||||
memcg = mem_cgroup_from_cont(cont);
|
||||
parent = mem_cgroup_from_cont(cont->parent);
|
||||
|
||||
memcg->use_hierarchy = parent->use_hierarchy;
|
||||
memcg->oom_kill_disable = parent->oom_kill_disable;
|
||||
memcg->swappiness = mem_cgroup_swappiness(parent);
|
||||
|
||||
if (parent->use_hierarchy) {
|
||||
res_counter_init(&memcg->res, &parent->res);
|
||||
res_counter_init(&memcg->memsw, &parent->memsw);
|
||||
res_counter_init(&memcg->kmem, &parent->kmem);
|
||||
|
@ -6119,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
|
|||
* much sense so let cgroup subsystem know about this
|
||||
* unfortunate state in our controller.
|
||||
*/
|
||||
if (parent && parent != root_mem_cgroup)
|
||||
if (parent != root_mem_cgroup)
|
||||
mem_cgroup_subsys.broken_hierarchy = true;
|
||||
}
|
||||
memcg->last_scanned_node = MAX_NUMNODES;
|
||||
INIT_LIST_HEAD(&memcg->oom_notify);
|
||||
|
||||
if (parent)
|
||||
memcg->swappiness = mem_cgroup_swappiness(parent);
|
||||
atomic_set(&memcg->refcnt, 1);
|
||||
memcg->move_charge_at_immigrate = 0;
|
||||
mutex_init(&memcg->thresholds_lock);
|
||||
spin_lock_init(&memcg->move_lock);
|
||||
|
||||
error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
|
||||
mutex_unlock(&memcg_create_mutex);
|
||||
if (error) {
|
||||
/*
|
||||
* We call put now because our (and parent's) refcnts
|
||||
|
@ -6140,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
|
|||
* call __mem_cgroup_free, so return directly
|
||||
*/
|
||||
mem_cgroup_put(memcg);
|
||||
return ERR_PTR(error);
|
||||
if (parent->use_hierarchy)
|
||||
mem_cgroup_put(parent);
|
||||
}
|
||||
return &memcg->css;
|
||||
free_out:
|
||||
__mem_cgroup_free(memcg);
|
||||
return ERR_PTR(error);
|
||||
return error;
|
||||
}
|
||||
|
||||
static void mem_cgroup_css_offline(struct cgroup *cont)
|
||||
|
@ -6281,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
|
|||
* Because lookup_swap_cache() updates some statistics counter,
|
||||
* we call find_get_page() with swapper_space directly.
|
||||
*/
|
||||
page = find_get_page(&swapper_space, ent.val);
|
||||
page = find_get_page(swap_address_space(ent), ent.val);
|
||||
if (do_swap_account)
|
||||
entry->val = ent.val;
|
||||
|
||||
|
@ -6322,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
|
|||
swp_entry_t swap = radix_to_swp_entry(page);
|
||||
if (do_swap_account)
|
||||
*entry = swap;
|
||||
page = find_get_page(&swapper_space, swap.val);
|
||||
page = find_get_page(swap_address_space(swap), swap.val);
|
||||
}
|
||||
#endif
|
||||
return page;
|
||||
|
@ -6532,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
|
|||
struct task_struct *p = cgroup_taskset_first(tset);
|
||||
int ret = 0;
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
|
||||
unsigned long move_charge_at_immigrate;
|
||||
|
||||
if (memcg->move_charge_at_immigrate) {
|
||||
/*
|
||||
* We are now committed to this value whatever it is. Changes in this
|
||||
* tunable will only affect upcoming migrations, not the current one.
|
||||
* So we need to save it, and keep it going.
|
||||
*/
|
||||
move_charge_at_immigrate = memcg->move_charge_at_immigrate;
|
||||
if (move_charge_at_immigrate) {
|
||||
struct mm_struct *mm;
|
||||
struct mem_cgroup *from = mem_cgroup_from_task(p);
|
||||
|
||||
|
@ -6553,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
|
|||
spin_lock(&mc.lock);
|
||||
mc.from = from;
|
||||
mc.to = memcg;
|
||||
mc.immigrate_flags = move_charge_at_immigrate;
|
||||
spin_unlock(&mc.lock);
|
||||
/* We set mc.moving_task later */
|
||||
|
||||
|
@ -6747,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
|
|||
.name = "memory",
|
||||
.subsys_id = mem_cgroup_subsys_id,
|
||||
.css_alloc = mem_cgroup_css_alloc,
|
||||
.css_online = mem_cgroup_css_online,
|
||||
.css_offline = mem_cgroup_css_offline,
|
||||
.css_free = mem_cgroup_css_free,
|
||||
.can_attach = mem_cgroup_can_attach,
|
||||
|
@ -6757,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
|
|||
.use_id = 1,
|
||||
};
|
||||
|
||||
/*
|
||||
* The rest of init is performed during ->css_alloc() for root css which
|
||||
* happens before initcalls. hotcpu_notifier() can't be done together as
|
||||
* it would introduce circular locking by adding cgroup_lock -> cpu hotplug
|
||||
* dependency. Do it from a subsys_initcall().
|
||||
*/
|
||||
static int __init mem_cgroup_init(void)
|
||||
{
|
||||
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(mem_cgroup_init);
|
||||
|
||||
#ifdef CONFIG_MEMCG_SWAP
|
||||
static int __init enable_swap_account(char *s)
|
||||
{
|
||||
|
@ -6782,4 +6810,39 @@ static int __init enable_swap_account(char *s)
|
|||
}
|
||||
__setup("swapaccount=", enable_swap_account);
|
||||
|
||||
static void __init memsw_file_init(void)
|
||||
{
|
||||
WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
|
||||
}
|
||||
|
||||
static void __init enable_swap_cgroup(void)
|
||||
{
|
||||
if (!mem_cgroup_disabled() && really_do_swap_account) {
|
||||
do_swap_account = 1;
|
||||
memsw_file_init();
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
static void __init enable_swap_cgroup(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* subsys_initcall() for memory controller.
|
||||
*
|
||||
* Some parts like hotcpu_notifier() have to be initialized from this context
|
||||
* because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
|
||||
* everything that doesn't depend on a specific mem_cgroup structure should
|
||||
* be initialized from here.
|
||||
*/
|
||||
static int __init mem_cgroup_init(void)
|
||||
{
|
||||
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
|
||||
enable_swap_cgroup();
|
||||
mem_cgroup_soft_limit_tree_init();
|
||||
memcg_stock_init();
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(mem_cgroup_init);
|
||||
|
|
|
@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
|
|||
|
||||
int sysctl_memory_failure_recovery __read_mostly = 1;
|
||||
|
||||
atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
|
||||
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
|
||||
|
||||
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
|
||||
|
||||
|
@ -784,12 +784,12 @@ static struct page_state {
|
|||
{ sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
|
||||
{ sc|dirty, sc, "clean swapcache", me_swapcache_clean },
|
||||
|
||||
{ unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
|
||||
{ unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
|
||||
|
||||
{ mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
|
||||
{ mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
|
||||
|
||||
{ unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
|
||||
{ unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
|
||||
|
||||
{ lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
|
||||
{ lru|dirty, lru, "clean LRU", me_pagecache_clean },
|
||||
|
||||
|
@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
struct page *hpage;
|
||||
int res;
|
||||
unsigned int nr_pages;
|
||||
unsigned long page_flags;
|
||||
|
||||
if (!sysctl_memory_failure_recovery)
|
||||
panic("Memory failure from trap %d on page %lx", trapno, pfn);
|
||||
|
@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
return 0;
|
||||
}
|
||||
|
||||
nr_pages = 1 << compound_trans_order(hpage);
|
||||
atomic_long_add(nr_pages, &mce_bad_pages);
|
||||
/*
|
||||
* Currently errors on hugetlbfs pages are measured in hugepage units,
|
||||
* so nr_pages should be 1 << compound_order. OTOH when errors are on
|
||||
* transparent hugepages, they are supposed to be split and error
|
||||
* measurement is done in normal page units. So nr_pages should be one
|
||||
* in this case.
|
||||
*/
|
||||
if (PageHuge(p))
|
||||
nr_pages = 1 << compound_order(hpage);
|
||||
else /* normal page or thp */
|
||||
nr_pages = 1;
|
||||
atomic_long_add(nr_pages, &num_poisoned_pages);
|
||||
|
||||
/*
|
||||
* We need/can do nothing about count=0 pages.
|
||||
|
@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
if (!PageHWPoison(hpage)
|
||||
|| (hwpoison_filter(p) && TestClearPageHWPoison(p))
|
||||
|| (p != hpage && TestSetPageHWPoison(hpage))) {
|
||||
atomic_long_sub(nr_pages, &mce_bad_pages);
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
return 0;
|
||||
}
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
|
@ -1118,6 +1129,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
*/
|
||||
lock_page(hpage);
|
||||
|
||||
/*
|
||||
* We use page flags to determine what action should be taken, but
|
||||
* the flags can be modified by the error containment action. One
|
||||
* example is an mlocked page, where PG_mlocked is cleared by
|
||||
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
|
||||
* correctly, we save a copy of the page flags at this time.
|
||||
*/
|
||||
page_flags = p->flags;
|
||||
|
||||
/*
|
||||
* unpoison always clears PG_hwpoison inside page lock
|
||||
*/
|
||||
|
@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
}
|
||||
if (hwpoison_filter(p)) {
|
||||
if (TestClearPageHWPoison(p))
|
||||
atomic_long_sub(nr_pages, &mce_bad_pages);
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
unlock_page(hpage);
|
||||
put_page(hpage);
|
||||
return 0;
|
||||
|
@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
}
|
||||
|
||||
res = -EBUSY;
|
||||
for (ps = error_states;; ps++) {
|
||||
if ((p->flags & ps->mask) == ps->res) {
|
||||
res = page_action(ps, p, pfn);
|
||||
/*
|
||||
* The first check uses the current page flags which may not have any
|
||||
* relevant information. The second check with the saved page flags is
|
||||
* carried out only if the first check can't determine the page status.
|
||||
*/
|
||||
for (ps = error_states;; ps++)
|
||||
if ((p->flags & ps->mask) == ps->res)
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ps->mask)
|
||||
for (ps = error_states;; ps++)
|
||||
if ((page_flags & ps->mask) == ps->res)
|
||||
break;
|
||||
res = page_action(ps, p, pfn);
|
||||
out:
|
||||
unlock_page(hpage);
|
||||
return res;
|
||||
|
@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn)
|
|||
return 0;
|
||||
}
|
||||
if (TestClearPageHWPoison(p))
|
||||
atomic_long_sub(nr_pages, &mce_bad_pages);
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
|
||||
return 0;
|
||||
}
|
||||
|
@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn)
|
|||
*/
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
|
||||
atomic_long_sub(nr_pages, &mce_bad_pages);
|
||||
atomic_long_sub(nr_pages, &num_poisoned_pages);
|
||||
freeit = 1;
|
||||
if (PageHuge(page))
|
||||
clear_page_hwpoison_huge_page(page);
|
||||
|
@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
|
|||
* that is not free, and 1 for any other page type.
|
||||
* For 1 the page is returned with increased page count, otherwise not.
|
||||
*/
|
||||
static int get_any_page(struct page *p, unsigned long pfn, int flags)
|
||||
static int __get_any_page(struct page *p, unsigned long pfn, int flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
|
@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
|
|||
if (!get_page_unless_zero(compound_head(p))) {
|
||||
if (PageHuge(p)) {
|
||||
pr_info("%s: %#lx free huge page\n", __func__, pfn);
|
||||
ret = dequeue_hwpoisoned_huge_page(compound_head(p));
|
||||
ret = 0;
|
||||
} else if (is_free_buddy_page(p)) {
|
||||
pr_info("%s: %#lx free buddy page\n", __func__, pfn);
|
||||
/* Set hwpoison bit while page is still isolated */
|
||||
SetPageHWPoison(p);
|
||||
ret = 0;
|
||||
} else {
|
||||
pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
|
||||
|
@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int get_any_page(struct page *page, unsigned long pfn, int flags)
|
||||
{
|
||||
int ret = __get_any_page(page, pfn, flags);
|
||||
|
||||
if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
|
||||
/*
|
||||
* Try to free it.
|
||||
*/
|
||||
put_page(page);
|
||||
shake_page(page, 1);
|
||||
|
||||
/*
|
||||
* Did it turn free?
|
||||
*/
|
||||
ret = __get_any_page(page, pfn, 0);
|
||||
if (!PageLRU(page)) {
|
||||
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
||||
pfn, page->flags);
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int soft_offline_huge_page(struct page *page, int flags)
|
||||
{
|
||||
int ret;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
struct page *hpage = compound_head(page);
|
||||
|
||||
ret = get_any_page(page, pfn, flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret == 0)
|
||||
goto done;
|
||||
|
||||
/*
|
||||
* This double-check of PageHWPoison is to avoid the race with
|
||||
* memory_failure(). See also comment in __soft_offline_page().
|
||||
*/
|
||||
lock_page(hpage);
|
||||
if (PageHWPoison(hpage)) {
|
||||
unlock_page(hpage);
|
||||
put_page(hpage);
|
||||
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
unlock_page(hpage);
|
||||
|
||||
/* Keep page count to indicate a given hugepage is isolated. */
|
||||
ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
|
||||
ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
|
||||
MIGRATE_SYNC);
|
||||
put_page(hpage);
|
||||
if (ret) {
|
||||
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
|
||||
pfn, ret, page->flags);
|
||||
return ret;
|
||||
}
|
||||
done:
|
||||
if (!PageHWPoison(hpage))
|
||||
} else {
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
dequeue_hwpoisoned_huge_page(hpage);
|
||||
atomic_long_add(1 << compound_trans_order(hpage),
|
||||
&mce_bad_pages);
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
dequeue_hwpoisoned_huge_page(hpage);
|
||||
&num_poisoned_pages);
|
||||
}
|
||||
/* keep elevated page count for bad page */
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __soft_offline_page(struct page *page, int flags);
|
||||
|
||||
/**
|
||||
* soft_offline_page - Soft offline a page.
|
||||
* @page: page to offline
|
||||
|
@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags)
|
|||
unsigned long pfn = page_to_pfn(page);
|
||||
struct page *hpage = compound_trans_head(page);
|
||||
|
||||
if (PageHuge(page))
|
||||
return soft_offline_huge_page(page, flags);
|
||||
if (PageTransHuge(hpage)) {
|
||||
if (PageHWPoison(page)) {
|
||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
if (!PageHuge(page) && PageTransHuge(hpage)) {
|
||||
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
|
||||
pr_info("soft offline: %#lx: failed to split THP\n",
|
||||
pfn);
|
||||
|
@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags)
|
|||
ret = get_any_page(page, pfn, flags);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret == 0)
|
||||
goto done;
|
||||
if (ret) { /* for in-use pages */
|
||||
if (PageHuge(page))
|
||||
ret = soft_offline_huge_page(page, flags);
|
||||
else
|
||||
ret = __soft_offline_page(page, flags);
|
||||
} else { /* for free pages */
|
||||
if (PageHuge(page)) {
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
dequeue_hwpoisoned_huge_page(hpage);
|
||||
atomic_long_add(1 << compound_trans_order(hpage),
|
||||
&num_poisoned_pages);
|
||||
} else {
|
||||
SetPageHWPoison(page);
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
}
|
||||
}
|
||||
/* keep elevated page count for bad page */
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __soft_offline_page(struct page *page, int flags)
|
||||
{
|
||||
int ret;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
|
||||
/*
|
||||
* Page cache page we can handle?
|
||||
* Check PageHWPoison again inside page lock because PageHWPoison
|
||||
* is set by memory_failure() outside page lock. Note that
|
||||
* memory_failure() also double-checks PageHWPoison inside page lock,
|
||||
* so there's no race between soft_offline_page() and memory_failure().
|
||||
*/
|
||||
if (!PageLRU(page)) {
|
||||
/*
|
||||
* Try to free it.
|
||||
*/
|
||||
put_page(page);
|
||||
shake_page(page, 1);
|
||||
|
||||
/*
|
||||
* Did it turn free?
|
||||
*/
|
||||
ret = get_any_page(page, pfn, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret == 0)
|
||||
goto done;
|
||||
}
|
||||
if (!PageLRU(page)) {
|
||||
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
|
||||
pfn, page->flags);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
lock_page(page);
|
||||
wait_on_page_writeback(page);
|
||||
|
||||
/*
|
||||
* Synchronized using the page lock with memory_failure()
|
||||
*/
|
||||
if (PageHWPoison(page)) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
pr_info("soft offline: %#lx page already poisoned\n", pfn);
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to invalidate first. This should work for
|
||||
* non dirty unmapped page cache pages.
|
||||
|
@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags)
|
|||
*/
|
||||
if (ret == 1) {
|
||||
put_page(page);
|
||||
ret = 0;
|
||||
pr_info("soft_offline: %#lx: invalidated\n", pfn);
|
||||
goto done;
|
||||
SetPageHWPoison(page);
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags)
|
|||
if (!ret) {
|
||||
LIST_HEAD(pagelist);
|
||||
inc_zone_page_state(page, NR_ISOLATED_ANON +
|
||||
page_is_file_cache(page));
|
||||
page_is_file_cache(page));
|
||||
list_add(&page->lru, &pagelist);
|
||||
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
|
||||
false, MIGRATE_SYNC,
|
||||
MR_MEMORY_FAILURE);
|
||||
MIGRATE_SYNC, MR_MEMORY_FAILURE);
|
||||
if (ret) {
|
||||
putback_lru_pages(&pagelist);
|
||||
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
|
||||
pfn, ret, page->flags);
|
||||
if (ret > 0)
|
||||
ret = -EIO;
|
||||
} else {
|
||||
SetPageHWPoison(page);
|
||||
atomic_long_inc(&num_poisoned_pages);
|
||||
}
|
||||
} else {
|
||||
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
|
||||
pfn, ret, page_count(page), page->flags);
|
||||
}
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
done:
|
||||
atomic_long_add(1, &mce_bad_pages);
|
||||
SetPageHWPoison(page);
|
||||
/* keep elevated page count for bad page */
|
||||
return ret;
|
||||
}
|
||||
|
|
125
mm/memory.c
125
mm/memory.c
|
@ -69,6 +69,10 @@
|
|||
|
||||
#include "internal.h"
|
||||
|
||||
#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
|
||||
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG_NEED_MULTIPLE_NODES
|
||||
/* use the per-pgdat data instead for discontigmem - mbligh */
|
||||
unsigned long max_mapnr;
|
||||
|
@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
|
|||
EXPORT_SYMBOL_GPL(zap_vma_ptes);
|
||||
|
||||
/**
|
||||
* follow_page - look up a page descriptor from a user-virtual address
|
||||
* follow_page_mask - look up a page descriptor from a user-virtual address
|
||||
* @vma: vm_area_struct mapping @address
|
||||
* @address: virtual address to look up
|
||||
* @flags: flags modifying lookup behaviour
|
||||
* @page_mask: on output, *page_mask is set according to the size of the page
|
||||
*
|
||||
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
|
||||
*
|
||||
|
@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
|
|||
* an error pointer if there is a mapping to something not represented
|
||||
* by a page descriptor (see also vm_normal_page()).
|
||||
*/
|
||||
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
||||
unsigned int flags)
|
||||
struct page *follow_page_mask(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned int flags,
|
||||
unsigned int *page_mask)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
pud_t *pud;
|
||||
|
@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
|||
struct page *page;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
|
||||
*page_mask = 0;
|
||||
|
||||
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
|
||||
if (!IS_ERR(page)) {
|
||||
BUG_ON(flags & FOLL_GET);
|
||||
|
@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
|||
page = follow_trans_huge_pmd(vma, address,
|
||||
pmd, flags);
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
*page_mask = HPAGE_PMD_NR - 1;
|
||||
goto out;
|
||||
}
|
||||
} else
|
||||
|
@ -1539,8 +1548,24 @@ split_fallthrough:
|
|||
ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
|
||||
|
||||
pte = *ptep;
|
||||
if (!pte_present(pte))
|
||||
goto no_page;
|
||||
if (!pte_present(pte)) {
|
||||
swp_entry_t entry;
|
||||
/*
|
||||
* KSM's break_ksm() relies upon recognizing a ksm page
|
||||
* even while it is being migrated, so for that case we
|
||||
* need migration_entry_wait().
|
||||
*/
|
||||
if (likely(!(flags & FOLL_MIGRATION)))
|
||||
goto no_page;
|
||||
if (pte_none(pte) || pte_file(pte))
|
||||
goto no_page;
|
||||
entry = pte_to_swp_entry(pte);
|
||||
if (!is_migration_entry(entry))
|
||||
goto no_page;
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
migration_entry_wait(mm, pmd, address);
|
||||
goto split_fallthrough;
|
||||
}
|
||||
if ((flags & FOLL_NUMA) && pte_numa(pte))
|
||||
goto no_page;
|
||||
if ((flags & FOLL_WRITE) && !pte_write(pte))
|
||||
|
@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
|
|||
* instead of __get_user_pages. __get_user_pages should be used only if
|
||||
* you need some special @gup_flags.
|
||||
*/
|
||||
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int nr_pages, unsigned int gup_flags,
|
||||
struct page **pages, struct vm_area_struct **vmas,
|
||||
int *nonblocking)
|
||||
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int gup_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *nonblocking)
|
||||
{
|
||||
int i;
|
||||
long i;
|
||||
unsigned long vm_flags;
|
||||
unsigned int page_mask;
|
||||
|
||||
if (nr_pages <= 0)
|
||||
if (!nr_pages)
|
||||
return 0;
|
||||
|
||||
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
|
||||
|
@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
|||
get_page(page);
|
||||
}
|
||||
pte_unmap(pte);
|
||||
page_mask = 0;
|
||||
goto next_page;
|
||||
}
|
||||
|
||||
|
@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
|||
do {
|
||||
struct page *page;
|
||||
unsigned int foll_flags = gup_flags;
|
||||
unsigned int page_increm;
|
||||
|
||||
/*
|
||||
* If we have a pending SIGKILL, don't keep faulting
|
||||
|
@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
|||
return i ? i : -ERESTARTSYS;
|
||||
|
||||
cond_resched();
|
||||
while (!(page = follow_page(vma, start, foll_flags))) {
|
||||
while (!(page = follow_page_mask(vma, start,
|
||||
foll_flags, &page_mask))) {
|
||||
int ret;
|
||||
unsigned int fault_flags = 0;
|
||||
|
||||
|
@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
|||
|
||||
flush_anon_page(vma, page, start);
|
||||
flush_dcache_page(page);
|
||||
page_mask = 0;
|
||||
}
|
||||
next_page:
|
||||
if (vmas)
|
||||
if (vmas) {
|
||||
vmas[i] = vma;
|
||||
i++;
|
||||
start += PAGE_SIZE;
|
||||
nr_pages--;
|
||||
page_mask = 0;
|
||||
}
|
||||
page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
|
||||
if (page_increm > nr_pages)
|
||||
page_increm = nr_pages;
|
||||
i += page_increm;
|
||||
start += page_increm * PAGE_SIZE;
|
||||
nr_pages -= page_increm;
|
||||
} while (nr_pages && start < vma->vm_end);
|
||||
} while (nr_pages);
|
||||
return i;
|
||||
|
@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
|
|||
*
|
||||
* See also get_user_pages_fast, for performance critical applications.
|
||||
*/
|
||||
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int nr_pages, int write, int force,
|
||||
struct page **pages, struct vm_area_struct **vmas)
|
||||
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages, int write,
|
||||
int force, struct page **pages, struct vm_area_struct **vmas)
|
||||
{
|
||||
int flags = FOLL_TOUCH;
|
||||
|
||||
|
@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
unsigned int flags, pte_t orig_pte)
|
||||
{
|
||||
spinlock_t *ptl;
|
||||
struct page *page, *swapcache = NULL;
|
||||
struct page *page, *swapcache;
|
||||
swp_entry_t entry;
|
||||
pte_t pte;
|
||||
int locked;
|
||||
|
@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
*/
|
||||
ret = VM_FAULT_HWPOISON;
|
||||
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
|
||||
swapcache = page;
|
||||
goto out_release;
|
||||
}
|
||||
|
||||
swapcache = page;
|
||||
locked = lock_page_or_retry(page, mm, flags);
|
||||
|
||||
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
|
||||
|
@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
|
||||
goto out_page;
|
||||
|
||||
if (ksm_might_need_to_copy(page, vma, address)) {
|
||||
swapcache = page;
|
||||
page = ksm_does_need_to_copy(page, vma, address);
|
||||
|
||||
if (unlikely(!page)) {
|
||||
ret = VM_FAULT_OOM;
|
||||
page = swapcache;
|
||||
swapcache = NULL;
|
||||
goto out_page;
|
||||
}
|
||||
page = ksm_might_need_to_copy(page, vma, address);
|
||||
if (unlikely(!page)) {
|
||||
ret = VM_FAULT_OOM;
|
||||
page = swapcache;
|
||||
goto out_page;
|
||||
}
|
||||
|
||||
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
|
||||
|
@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
}
|
||||
flush_icache_page(vma, page);
|
||||
set_pte_at(mm, address, page_table, pte);
|
||||
do_page_add_anon_rmap(page, vma, address, exclusive);
|
||||
if (page == swapcache)
|
||||
do_page_add_anon_rmap(page, vma, address, exclusive);
|
||||
else /* ksm created a completely new copy */
|
||||
page_add_new_anon_rmap(page, vma, address);
|
||||
/* It's better to call commit-charge after rmap is established */
|
||||
mem_cgroup_commit_charge_swapin(page, ptr);
|
||||
|
||||
|
@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
|
||||
try_to_free_swap(page);
|
||||
unlock_page(page);
|
||||
if (swapcache) {
|
||||
if (page != swapcache) {
|
||||
/*
|
||||
* Hold the lock to avoid the swap entry to be reused
|
||||
* until we take the PT lock for the pte_same() check
|
||||
|
@ -3085,7 +3120,7 @@ out_page:
|
|||
unlock_page(page);
|
||||
out_release:
|
||||
page_cache_release(page);
|
||||
if (swapcache) {
|
||||
if (page != swapcache) {
|
||||
unlock_page(swapcache);
|
||||
page_cache_release(swapcache);
|
||||
}
|
||||
|
@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
|
|||
}
|
||||
#endif /* __PAGETABLE_PMD_FOLDED */
|
||||
|
||||
int make_pages_present(unsigned long addr, unsigned long end)
|
||||
{
|
||||
int ret, len, write;
|
||||
struct vm_area_struct * vma;
|
||||
|
||||
vma = find_vma(current->mm, addr);
|
||||
if (!vma)
|
||||
return -ENOMEM;
|
||||
/*
|
||||
* We want to touch writable mappings with a write fault in order
|
||||
* to break COW, except for shared mappings because these don't COW
|
||||
* and we would not want to dirty them for nothing.
|
||||
*/
|
||||
write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
|
||||
BUG_ON(addr >= end);
|
||||
BUG_ON(end > vma->vm_end);
|
||||
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
|
||||
ret = get_user_pages(current, current->mm, addr,
|
||||
len, write, 0, NULL, NULL);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
return ret == len ? 0 : -EFAULT;
|
||||
}
|
||||
|
||||
#if !defined(__HAVE_ARCH_GATE_AREA)
|
||||
|
||||
#if defined(AT_SYSINFO_EHDR)
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include <linux/suspend.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/firmware-map.h>
|
||||
#include <linux/stop_machine.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
|
@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
#ifndef CONFIG_SPARSEMEM_VMEMMAP
|
||||
static void get_page_bootmem(unsigned long info, struct page *page,
|
||||
unsigned long type)
|
||||
void get_page_bootmem(unsigned long info, struct page *page,
|
||||
unsigned long type)
|
||||
{
|
||||
page->lru.next = (struct list_head *) type;
|
||||
SetPagePrivate(page);
|
||||
|
@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
|
|||
mutex_lock(&ppb_lock);
|
||||
__free_pages_bootmem(page, 0);
|
||||
mutex_unlock(&ppb_lock);
|
||||
totalram_pages++;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
|
||||
#ifndef CONFIG_SPARSEMEM_VMEMMAP
|
||||
static void register_page_bootmem_info_section(unsigned long start_pfn)
|
||||
{
|
||||
unsigned long *usemap, mapsize, section_nr, i;
|
||||
|
@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
|
|||
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
|
||||
|
||||
}
|
||||
#else /* CONFIG_SPARSEMEM_VMEMMAP */
|
||||
static void register_page_bootmem_info_section(unsigned long start_pfn)
|
||||
{
|
||||
unsigned long *usemap, mapsize, section_nr, i;
|
||||
struct mem_section *ms;
|
||||
struct page *page, *memmap;
|
||||
|
||||
if (!pfn_valid(start_pfn))
|
||||
return;
|
||||
|
||||
section_nr = pfn_to_section_nr(start_pfn);
|
||||
ms = __nr_to_section(section_nr);
|
||||
|
||||
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
|
||||
|
||||
register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
|
||||
|
||||
usemap = __nr_to_section(section_nr)->pageblock_flags;
|
||||
page = virt_to_page(usemap);
|
||||
|
||||
mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
|
||||
|
||||
for (i = 0; i < mapsize; i++, page++)
|
||||
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
|
||||
}
|
||||
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
|
||||
|
||||
void register_page_bootmem_info_node(struct pglist_data *pgdat)
|
||||
{
|
||||
|
@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
|
|||
}
|
||||
|
||||
pfn = pgdat->node_start_pfn;
|
||||
end_pfn = pfn + pgdat->node_spanned_pages;
|
||||
end_pfn = pgdat_end_pfn(pgdat);
|
||||
|
||||
/* register_section info */
|
||||
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
|
||||
|
@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
|
|||
register_page_bootmem_info_section(pfn);
|
||||
}
|
||||
}
|
||||
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
|
||||
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
|
||||
|
||||
static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
|
@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
|
|||
set_page_links(pfn_to_page(pfn), zid, nid, pfn);
|
||||
}
|
||||
|
||||
/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
|
||||
* alloc_bootmem_node_nopanic() */
|
||||
static int __ref ensure_zone_is_initialized(struct zone *zone,
|
||||
unsigned long start_pfn, unsigned long num_pages)
|
||||
{
|
||||
if (!zone_is_initialized(zone))
|
||||
return init_currently_empty_zone(zone, start_pfn, num_pages,
|
||||
MEMMAP_HOTPLUG);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
|
||||
unsigned long start_pfn, unsigned long end_pfn)
|
||||
{
|
||||
|
@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
|
|||
unsigned long flags;
|
||||
unsigned long z1_start_pfn;
|
||||
|
||||
if (!z1->wait_table) {
|
||||
ret = init_currently_empty_zone(z1, start_pfn,
|
||||
end_pfn - start_pfn, MEMMAP_HOTPLUG);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
pgdat_resize_lock(z1->zone_pgdat, &flags);
|
||||
|
||||
/* can't move pfns which are higher than @z2 */
|
||||
if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
|
||||
if (end_pfn > zone_end_pfn(z2))
|
||||
goto out_fail;
|
||||
/* the moved-out part must be at the leftmost end of @z2 */
|
||||
if (start_pfn > z2->zone_start_pfn)
|
||||
|
@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
|
|||
z1_start_pfn = start_pfn;
|
||||
|
||||
resize_zone(z1, z1_start_pfn, end_pfn);
|
||||
resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
|
||||
resize_zone(z2, end_pfn, zone_end_pfn(z2));
|
||||
|
||||
pgdat_resize_unlock(z1->zone_pgdat, &flags);
|
||||
|
||||
|
@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
|
|||
unsigned long flags;
|
||||
unsigned long z2_end_pfn;
|
||||
|
||||
if (!z2->wait_table) {
|
||||
ret = init_currently_empty_zone(z2, start_pfn,
|
||||
end_pfn - start_pfn, MEMMAP_HOTPLUG);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
pgdat_resize_lock(z1->zone_pgdat, &flags);
|
||||
|
||||
|
@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
|
|||
if (z1->zone_start_pfn > start_pfn)
|
||||
goto out_fail;
|
||||
/* the moved-out part must be at the rightmost end of @z1 */
|
||||
if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
|
||||
if (zone_end_pfn(z1) > end_pfn)
|
||||
goto out_fail;
|
||||
/* must included/overlap */
|
||||
if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
|
||||
if (start_pfn >= zone_end_pfn(z1))
|
||||
goto out_fail;
|
||||
|
||||
/* use end_pfn for z2's end_pfn if z2 is empty */
|
||||
if (z2->spanned_pages)
|
||||
z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
|
||||
z2_end_pfn = zone_end_pfn(z2);
|
||||
else
|
||||
z2_end_pfn = end_pfn;
|
||||
|
||||
|
@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
|
|||
int nid = pgdat->node_id;
|
||||
int zone_type;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
zone_type = zone - pgdat->node_zones;
|
||||
if (!zone->wait_table) {
|
||||
int ret;
|
||||
ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = init_currently_empty_zone(zone, phys_start_pfn,
|
||||
nr_pages, MEMMAP_HOTPLUG);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
pgdat_resize_lock(zone->zone_pgdat, &flags);
|
||||
grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
|
||||
grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
|
||||
|
@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
|
|||
return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
||||
static int __remove_section(struct zone *zone, struct mem_section *ms)
|
||||
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
|
||||
static int find_smallest_section_pfn(int nid, struct zone *zone,
|
||||
unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
/*
|
||||
* XXX: Freeing memmap with vmemmap is not implement yet.
|
||||
* This should be removed later.
|
||||
*/
|
||||
return -EBUSY;
|
||||
struct mem_section *ms;
|
||||
|
||||
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
|
||||
ms = __pfn_to_section(start_pfn);
|
||||
|
||||
if (unlikely(!valid_section(ms)))
|
||||
continue;
|
||||
|
||||
if (unlikely(pfn_to_nid(start_pfn) != nid))
|
||||
continue;
|
||||
|
||||
if (zone && zone != page_zone(pfn_to_page(start_pfn)))
|
||||
continue;
|
||||
|
||||
return start_pfn;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
|
||||
/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
|
||||
static int find_biggest_section_pfn(int nid, struct zone *zone,
|
||||
unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
struct mem_section *ms;
|
||||
unsigned long pfn;
|
||||
|
||||
/* pfn is the end pfn of a memory section. */
|
||||
pfn = end_pfn - 1;
|
||||
for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
|
||||
ms = __pfn_to_section(pfn);
|
||||
|
||||
if (unlikely(!valid_section(ms)))
|
||||
continue;
|
||||
|
||||
if (unlikely(pfn_to_nid(pfn) != nid))
|
||||
continue;
|
||||
|
||||
if (zone && zone != page_zone(pfn_to_page(pfn)))
|
||||
continue;
|
||||
|
||||
return pfn;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
unsigned long zone_start_pfn = zone->zone_start_pfn;
|
||||
unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
unsigned long pfn;
|
||||
struct mem_section *ms;
|
||||
int nid = zone_to_nid(zone);
|
||||
|
||||
zone_span_writelock(zone);
|
||||
if (zone_start_pfn == start_pfn) {
|
||||
/*
|
||||
* If the section is smallest section in the zone, it need
|
||||
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
|
||||
* In this case, we find second smallest valid mem_section
|
||||
* for shrinking zone.
|
||||
*/
|
||||
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
|
||||
zone_end_pfn);
|
||||
if (pfn) {
|
||||
zone->zone_start_pfn = pfn;
|
||||
zone->spanned_pages = zone_end_pfn - pfn;
|
||||
}
|
||||
} else if (zone_end_pfn == end_pfn) {
|
||||
/*
|
||||
* If the section is biggest section in the zone, it need
|
||||
* shrink zone->spanned_pages.
|
||||
* In this case, we find second biggest valid mem_section for
|
||||
* shrinking zone.
|
||||
*/
|
||||
pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
|
||||
start_pfn);
|
||||
if (pfn)
|
||||
zone->spanned_pages = pfn - zone_start_pfn + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* The section is not biggest or smallest mem_section in the zone, it
|
||||
* only creates a hole in the zone. So in this case, we need not
|
||||
* change the zone. But perhaps, the zone has only hole data. Thus
|
||||
* it check the zone has only hole or not.
|
||||
*/
|
||||
pfn = zone_start_pfn;
|
||||
for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
|
||||
ms = __pfn_to_section(pfn);
|
||||
|
||||
if (unlikely(!valid_section(ms)))
|
||||
continue;
|
||||
|
||||
if (page_zone(pfn_to_page(pfn)) != zone)
|
||||
continue;
|
||||
|
||||
/* If the section is current section, it continues the loop */
|
||||
if (start_pfn == pfn)
|
||||
continue;
|
||||
|
||||
/* If we find valid section, we have nothing to do */
|
||||
zone_span_writeunlock(zone);
|
||||
return;
|
||||
}
|
||||
|
||||
/* The zone has no valid section */
|
||||
zone->zone_start_pfn = 0;
|
||||
zone->spanned_pages = 0;
|
||||
zone_span_writeunlock(zone);
|
||||
}
|
||||
|
||||
static void shrink_pgdat_span(struct pglist_data *pgdat,
|
||||
unsigned long start_pfn, unsigned long end_pfn)
|
||||
{
|
||||
unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
|
||||
unsigned long pgdat_end_pfn =
|
||||
pgdat->node_start_pfn + pgdat->node_spanned_pages;
|
||||
unsigned long pfn;
|
||||
struct mem_section *ms;
|
||||
int nid = pgdat->node_id;
|
||||
|
||||
if (pgdat_start_pfn == start_pfn) {
|
||||
/*
|
||||
* If the section is smallest section in the pgdat, it need
|
||||
* shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
|
||||
* In this case, we find second smallest valid mem_section
|
||||
* for shrinking zone.
|
||||
*/
|
||||
pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
|
||||
pgdat_end_pfn);
|
||||
if (pfn) {
|
||||
pgdat->node_start_pfn = pfn;
|
||||
pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
|
||||
}
|
||||
} else if (pgdat_end_pfn == end_pfn) {
|
||||
/*
|
||||
* If the section is biggest section in the pgdat, it need
|
||||
* shrink pgdat->node_spanned_pages.
|
||||
* In this case, we find second biggest valid mem_section for
|
||||
* shrinking zone.
|
||||
*/
|
||||
pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
|
||||
start_pfn);
|
||||
if (pfn)
|
||||
pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the section is not biggest or smallest mem_section in the pgdat,
|
||||
* it only creates a hole in the pgdat. So in this case, we need not
|
||||
* change the pgdat.
|
||||
* But perhaps, the pgdat has only hole data. Thus it check the pgdat
|
||||
* has only hole or not.
|
||||
*/
|
||||
pfn = pgdat_start_pfn;
|
||||
for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
|
||||
ms = __pfn_to_section(pfn);
|
||||
|
||||
if (unlikely(!valid_section(ms)))
|
||||
continue;
|
||||
|
||||
if (pfn_to_nid(pfn) != nid)
|
||||
continue;
|
||||
|
||||
/* If the section is current section, it continues the loop */
|
||||
if (start_pfn == pfn)
|
||||
continue;
|
||||
|
||||
/* If we find valid section, we have nothing to do */
|
||||
return;
|
||||
}
|
||||
|
||||
/* The pgdat has no valid section */
|
||||
pgdat->node_start_pfn = 0;
|
||||
pgdat->node_spanned_pages = 0;
|
||||
}
|
||||
|
||||
static void __remove_zone(struct zone *zone, unsigned long start_pfn)
|
||||
{
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
int nr_pages = PAGES_PER_SECTION;
|
||||
int zone_type;
|
||||
unsigned long flags;
|
||||
|
||||
zone_type = zone - pgdat->node_zones;
|
||||
|
||||
pgdat_resize_lock(zone->zone_pgdat, &flags);
|
||||
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
|
||||
shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
|
||||
pgdat_resize_unlock(zone->zone_pgdat, &flags);
|
||||
}
|
||||
|
||||
static int __remove_section(struct zone *zone, struct mem_section *ms)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
unsigned long start_pfn;
|
||||
int scn_nr;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!valid_section(ms))
|
||||
|
@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
pgdat_resize_lock(pgdat, &flags);
|
||||
scn_nr = __section_nr(ms);
|
||||
start_pfn = section_nr_to_pfn(scn_nr);
|
||||
__remove_zone(zone, start_pfn);
|
||||
|
||||
sparse_remove_one_section(zone, ms);
|
||||
pgdat_resize_unlock(pgdat, &flags);
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Reasonably generic function for adding memory. It is
|
||||
|
@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
|
|||
unsigned long zholes_size[MAX_NR_ZONES] = {0};
|
||||
unsigned long start_pfn = start >> PAGE_SHIFT;
|
||||
|
||||
pgdat = arch_alloc_nodedata(nid);
|
||||
if (!pgdat)
|
||||
return NULL;
|
||||
pgdat = NODE_DATA(nid);
|
||||
if (!pgdat) {
|
||||
pgdat = arch_alloc_nodedata(nid);
|
||||
if (!pgdat)
|
||||
return NULL;
|
||||
|
||||
arch_refresh_nodedata(nid, pgdat);
|
||||
arch_refresh_nodedata(nid, pgdat);
|
||||
}
|
||||
|
||||
/* we can use NODE_DATA(nid) from here */
|
||||
|
||||
|
@ -854,7 +1080,8 @@ out:
|
|||
int __ref add_memory(int nid, u64 start, u64 size)
|
||||
{
|
||||
pg_data_t *pgdat = NULL;
|
||||
int new_pgdat = 0;
|
||||
bool new_pgdat;
|
||||
bool new_node;
|
||||
struct resource *res;
|
||||
int ret;
|
||||
|
||||
|
@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
|
|||
if (!res)
|
||||
goto out;
|
||||
|
||||
if (!node_online(nid)) {
|
||||
{ /* Stupid hack to suppress address-never-null warning */
|
||||
void *p = NODE_DATA(nid);
|
||||
new_pgdat = !p;
|
||||
}
|
||||
new_node = !node_online(nid);
|
||||
if (new_node) {
|
||||
pgdat = hotadd_new_pgdat(nid, start);
|
||||
ret = -ENOMEM;
|
||||
if (!pgdat)
|
||||
goto error;
|
||||
new_pgdat = 1;
|
||||
}
|
||||
|
||||
/* call arch's memory hotadd */
|
||||
|
@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
|
|||
/* we online node here. we can't roll back from here. */
|
||||
node_set_online(nid);
|
||||
|
||||
if (new_pgdat) {
|
||||
if (new_node) {
|
||||
ret = register_one_node(nid);
|
||||
/*
|
||||
* If sysfs file of new node can't create, cpu on the node
|
||||
|
@ -901,8 +1132,7 @@ error:
|
|||
/* rollback pgdat allocation and others */
|
||||
if (new_pgdat)
|
||||
rollback_node_hotadd(nid, pgdat);
|
||||
if (res)
|
||||
release_memory_resource(res);
|
||||
release_memory_resource(res);
|
||||
|
||||
out:
|
||||
unlock_memory_hotplug();
|
||||
|
@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
|
|||
* migrate_pages returns # of failed pages.
|
||||
*/
|
||||
ret = migrate_pages(&source, alloc_migrate_target, 0,
|
||||
true, MIGRATE_SYNC,
|
||||
MR_MEMORY_HOTPLUG);
|
||||
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
|
||||
if (ret)
|
||||
putback_lru_pages(&source);
|
||||
}
|
||||
|
@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
|
|||
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
|
||||
}
|
||||
|
||||
int remove_memory(u64 start, u64 size)
|
||||
/**
|
||||
* walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
|
||||
* @start_pfn: start pfn of the memory range
|
||||
 * @end_pfn: end pfn of the memory range
|
||||
* @arg: argument passed to func
|
||||
* @func: callback for each memory section walked
|
||||
*
|
||||
* This function walks through all present mem sections in range
|
||||
* [start_pfn, end_pfn) and call func on each mem section.
|
||||
*
|
||||
* Returns the return value of func.
|
||||
*/
|
||||
static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
void *arg, int (*func)(struct memory_block *, void *))
|
||||
{
|
||||
struct memory_block *mem = NULL;
|
||||
struct mem_section *section;
|
||||
unsigned long start_pfn, end_pfn;
|
||||
unsigned long pfn, section_nr;
|
||||
int ret;
|
||||
|
||||
start_pfn = PFN_DOWN(start);
|
||||
end_pfn = start_pfn + PFN_DOWN(size);
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
|
||||
section_nr = pfn_to_section_nr(pfn);
|
||||
if (!present_section_nr(section_nr))
|
||||
|
@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
|
|||
if (!mem)
|
||||
continue;
|
||||
|
||||
ret = offline_memory_block(mem);
|
||||
ret = func(mem, arg);
|
||||
if (ret) {
|
||||
kobject_put(&mem->dev.kobj);
|
||||
return ret;
|
||||
|
@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * offline_memory_block_cb - callback function for offlining memory block
 * @mem: the memory block to be offlined
 * @arg: pointer to an int that receives the first non-zero error code
 *
 * Calls offline_memory_block() on @mem.  A failure is recorded in *@arg
 * only while *@arg is still 0, so the first error seen is the one kept.
 *
 * Always returns 0; errors are reported solely through @arg.
 */
static int offline_memory_block_cb(struct memory_block *mem, void *arg)
{
	int *first_err = arg;
	int rc;

	rc = offline_memory_block(mem);
	if (rc && !*first_err)
		*first_err = rc;

	return 0;
}
|
||||
|
||||
static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
|
||||
{
|
||||
int ret = !is_memblock_offlined(mem);
|
||||
|
||||
if (unlikely(ret))
|
||||
pr_warn("removing memory fails, because memory "
|
||||
"[%#010llx-%#010llx] is onlined\n",
|
||||
PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
|
||||
PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int check_cpu_on_node(void *data)
|
||||
{
|
||||
struct pglist_data *pgdat = data;
|
||||
int cpu;
|
||||
|
||||
for_each_present_cpu(cpu) {
|
||||
if (cpu_to_node(cpu) == pgdat->node_id)
|
||||
/*
|
||||
* the cpu on this node isn't removed, and we can't
|
||||
* offline this node.
|
||||
*/
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Clear the cpu-to-node mapping of every possible cpu that maps to the
 * node described by @data (a struct pglist_data pointer).
 *
 * Compiles to an empty function unless CONFIG_ACPI_NUMA is set.
 */
static void unmap_cpu_on_node(void *data)
{
#ifdef CONFIG_ACPI_NUMA
	struct pglist_data *node = data;
	int cpu;

	for_each_possible_cpu(cpu) {
		if (cpu_to_node(cpu) != node->node_id)
			continue;

		numa_clear_node(cpu);
	}
#endif
}
|
||||
|
||||
/*
 * Combined check-and-unmap step for the node described by @data.
 *
 * If check_cpu_on_node() reports a cpu still on the node, its error is
 * returned and nothing is changed.  Otherwise the node is about to be
 * offlined, so the cpu_to_node() mappings are cleared right away and 0 is
 * returned.
 */
static int check_and_unmap_cpu_on_node(void *data)
{
	int busy = check_cpu_on_node(data);

	if (!busy)
		unmap_cpu_on_node(data);

	return busy;
}
|
||||
|
||||
/* offline the node if all memory sections of this node are removed */
|
||||
void try_offline_node(int nid)
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
unsigned long start_pfn = pgdat->node_start_pfn;
|
||||
unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
|
||||
unsigned long pfn;
|
||||
struct page *pgdat_page = virt_to_page(pgdat);
|
||||
int i;
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
|
||||
unsigned long section_nr = pfn_to_section_nr(pfn);
|
||||
|
||||
if (!present_section_nr(section_nr))
|
||||
continue;
|
||||
|
||||
if (pfn_to_nid(pfn) != nid)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* some memory sections of this node are not removed, and we
|
||||
* can't offline node now.
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
|
||||
return;
|
||||
|
||||
/*
|
||||
* all memory/cpu of this node are removed, we can offline this
|
||||
* node now.
|
||||
*/
|
||||
node_set_offline(nid);
|
||||
unregister_one_node(nid);
|
||||
|
||||
if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
|
||||
/* node data is allocated from boot memory */
|
||||
return;
|
||||
|
||||
/* free waittable in each zone */
|
||||
for (i = 0; i < MAX_NR_ZONES; i++) {
|
||||
struct zone *zone = pgdat->node_zones + i;
|
||||
|
||||
if (zone->wait_table)
|
||||
vfree(zone->wait_table);
|
||||
}
|
||||
|
||||
/*
|
||||
* Since there is no way to guarentee the address of pgdat/zone is not
|
||||
* on stack of any kernel threads or used by other kernel objects
|
||||
* without reference counting or other symchronizing method, do not
|
||||
* reset node_data and free pgdat here. Just reset it to 0 and reuse
|
||||
* the memory when the node is online again.
|
||||
*/
|
||||
memset(pgdat, 0, sizeof(*pgdat));
|
||||
}
|
||||
EXPORT_SYMBOL(try_offline_node);
|
||||
|
||||
/*
 * Remove the memory range [start, start + size) that belongs to node @nid.
 *
 * All memory blocks in the range are offlined first, then, under the memory
 * hotplug lock, the range is verified to be fully offline before its
 * firmware map entry and arch mappings are removed and the node is offlined
 * if nothing is left on it.
 *
 * Returns 0 on success, the first offlining error if the second offlining
 * pass still fails, or the non-zero result of the "is everything offline"
 * walk.
 */
int __ref remove_memory(int nid, u64 start, u64 size)
{
	unsigned long first_pfn = PFN_DOWN(start);
	unsigned long last_pfn = first_pfn + PFN_DOWN(size);
	int err = 0;
	int pass;

	/*
	 * When CONFIG_MEMCG is on, one memory block may be used by other
	 * blocks to store page cgroup when onlining pages. But we don't know
	 * in what order pages are onlined. So we iterate twice to offline
	 * memory:
	 * 1st iterate: offline every non primary memory block.
	 * 2nd iterate: offline primary (i.e. first added) memory block.
	 */
	for (pass = 0; pass < 2; pass++) {
		err = 0;
		walk_memory_range(first_pfn, last_pfn, &err,
				  offline_memory_block_cb);
		if (!err)
			break;
	}
	if (err)
		return err;

	lock_memory_hotplug();

	/*
	 * we have offlined all memory blocks like this:
	 *   1. lock memory hotplug
	 *   2. offline a memory block
	 *   3. unlock memory hotplug
	 *
	 * repeat step1-3 to offline the memory block. All memory blocks
	 * must be offlined before removing memory. But we don't hold the
	 * lock in the whole operation. So we should check whether all
	 * memory blocks are offlined.
	 */
	err = walk_memory_range(first_pfn, last_pfn, NULL,
				is_memblock_offlined_cb);
	if (err) {
		unlock_memory_hotplug();
		return err;
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	arch_remove_memory(start, size);

	try_offline_node(nid);

	unlock_memory_hotplug();

	return 0;
}
|
||||
#else
|
||||
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
int remove_memory(u64 start, u64 size)
|
||||
int remove_memory(int nid, u64 start, u64 size)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
|
|
@ -26,7 +26,7 @@
|
|||
* the allocation to memory nodes instead
|
||||
*
|
||||
* preferred Try a specific node first before normal fallback.
|
||||
* As a special case node -1 here means do the allocation
|
||||
* As a special case NUMA_NO_NODE here means do the allocation
|
||||
* on the local CPU. This is normally identical to default,
|
||||
* but useful to set in a VMA when you have a non default
|
||||
* process policy.
|
||||
|
@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
|
|||
|
||||
if (!pol) {
|
||||
node = numa_node_id();
|
||||
if (node != -1)
|
||||
if (node != NUMA_NO_NODE)
|
||||
pol = &preferred_node_policy[node];
|
||||
|
||||
/* preferred_node_policy is not initialised early in boot */
|
||||
|
@ -161,19 +161,7 @@ static const struct mempolicy_operations {
|
|||
/* Check that the nodemask contains at least one populated zone */
|
||||
static int is_valid_nodemask(const nodemask_t *nodemask)
|
||||
{
|
||||
int nd, k;
|
||||
|
||||
for_each_node_mask(nd, *nodemask) {
|
||||
struct zone *z;
|
||||
|
||||
for (k = 0; k <= policy_zone; k++) {
|
||||
z = &NODE_DATA(nd)->node_zones[k];
|
||||
if (z->present_pages > 0)
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
return nodes_intersects(*nodemask, node_states[N_MEMORY]);
|
||||
}
|
||||
|
||||
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
|
||||
|
@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
|
|||
struct mempolicy *policy;
|
||||
|
||||
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
|
||||
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
|
||||
mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
|
||||
|
||||
if (mode == MPOL_DEFAULT) {
|
||||
if (nodes && !nodes_empty(*nodes))
|
||||
|
@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
|||
/*
|
||||
* vm_normal_page() filters out zero pages, but there might
|
||||
* still be PageReserved pages to skip, perhaps in a VDSO.
|
||||
* And we cannot move PageKsm pages sensibly or safely yet.
|
||||
*/
|
||||
if (PageReserved(page) || PageKsm(page))
|
||||
if (PageReserved(page))
|
||||
continue;
|
||||
nid = page_to_nid(page);
|
||||
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
|
||||
|
@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
|
|||
|
||||
if (!list_empty(&pagelist)) {
|
||||
err = migrate_pages(&pagelist, new_node_page, dest,
|
||||
false, MIGRATE_SYNC,
|
||||
MR_SYSCALL);
|
||||
MIGRATE_SYNC, MR_SYSCALL);
|
||||
if (err)
|
||||
putback_lru_pages(&pagelist);
|
||||
}
|
||||
|
@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
|
|||
|
||||
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
|
||||
start, start + len, mode, mode_flags,
|
||||
nmask ? nodes_addr(*nmask)[0] : -1);
|
||||
nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
|
||||
|
||||
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
|
||||
|
||||
|
@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
|
|||
if (!list_empty(&pagelist)) {
|
||||
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
|
||||
nr_failed = migrate_pages(&pagelist, new_vma_page,
|
||||
(unsigned long)vma,
|
||||
false, MIGRATE_SYNC,
|
||||
MR_MEMPOLICY_MBIND);
|
||||
(unsigned long)vma,
|
||||
MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
|
||||
if (nr_failed)
|
||||
putback_lru_pages(&pagelist);
|
||||
}
|
||||
|
@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
|
|||
return pol;
|
||||
}
|
||||
|
||||
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
|
||||
{
|
||||
enum zone_type dynamic_policy_zone = policy_zone;
|
||||
|
||||
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
|
||||
|
||||
/*
|
||||
* if policy->v.nodes has movable memory only,
|
||||
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
|
||||
*
|
||||
* policy->v.nodes is intersected with node_states[N_MEMORY].
|
||||
* so if the following test fails, it implies
|
||||
* policy->v.nodes has movable memory only.
|
||||
*/
|
||||
if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
|
||||
dynamic_policy_zone = ZONE_MOVABLE;
|
||||
|
||||
return zone >= dynamic_policy_zone;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a nodemask representing a mempolicy for filtering nodes for
|
||||
* page allocation
|
||||
|
@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
|
|||
{
|
||||
/* Lower zones don't get a nodemask applied for MPOL_BIND */
|
||||
if (unlikely(policy->mode == MPOL_BIND) &&
|
||||
gfp_zone(gfp) >= policy_zone &&
|
||||
apply_policy_zone(policy, gfp_zone(gfp)) &&
|
||||
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
|
||||
return &policy->v.nodes;
|
||||
|
||||
|
@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
|
|||
* it less likely we act on an unlikely task<->page
|
||||
* relation.
|
||||
*/
|
||||
last_nid = page_xchg_last_nid(page, polnid);
|
||||
last_nid = page_nid_xchg_last(page, polnid);
|
||||
if (last_nid != polnid)
|
||||
goto out;
|
||||
}
|
||||
|
@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
|
|||
vma->vm_pgoff,
|
||||
sz, npol ? npol->mode : -1,
|
||||
npol ? npol->flags : -1,
|
||||
npol ? nodes_addr(npol->v.nodes)[0] : -1);
|
||||
npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
|
||||
|
||||
if (npol) {
|
||||
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
|
||||
|
|
168
mm/migrate.c
168
mm/migrate.c
|
@ -464,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
|
|||
|
||||
mlock_migrate_page(newpage, page);
|
||||
ksm_migrate_page(newpage, page);
|
||||
|
||||
/*
|
||||
* Please do not reorder this without considering how mm/ksm.c's
|
||||
* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
|
||||
*/
|
||||
ClearPageSwapCache(page);
|
||||
ClearPagePrivate(page);
|
||||
set_page_private(page, 0);
|
||||
|
@ -698,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
|
|||
}
|
||||
|
||||
static int __unmap_and_move(struct page *page, struct page *newpage,
|
||||
int force, bool offlining, enum migrate_mode mode)
|
||||
int force, enum migrate_mode mode)
|
||||
{
|
||||
int rc = -EAGAIN;
|
||||
int remap_swapcache = 1;
|
||||
|
@ -728,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
|||
lock_page(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Only memory hotplug's offline_pages() caller has locked out KSM,
|
||||
* and can safely migrate a KSM page. The other cases have skipped
|
||||
* PageKsm along with PageReserved - but it is only now when we have
|
||||
* the page lock that we can be certain it will not go KSM beneath us
|
||||
* (KSM will not upgrade a page from PageAnon to PageKsm when it sees
|
||||
* its pagecount raised, but only here do we take the page lock which
|
||||
* serializes that).
|
||||
*/
|
||||
if (PageKsm(page) && !offlining) {
|
||||
rc = -EBUSY;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* charge against new page */
|
||||
mem_cgroup_prepare_migration(page, newpage, &mem);
|
||||
|
||||
|
@ -768,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
|
|||
* File Caches may use write_page() or lock_page() in migration, then,
|
||||
* just care Anon page here.
|
||||
*/
|
||||
if (PageAnon(page)) {
|
||||
if (PageAnon(page) && !PageKsm(page)) {
|
||||
/*
|
||||
* Only page_lock_anon_vma_read() understands the subtleties of
|
||||
* getting a hold on an anon_vma from outside one of its mms.
|
||||
|
@ -848,7 +837,6 @@ uncharge:
|
|||
mem_cgroup_end_migration(mem, page, newpage,
|
||||
(rc == MIGRATEPAGE_SUCCESS ||
|
||||
rc == MIGRATEPAGE_BALLOON_SUCCESS));
|
||||
unlock:
|
||||
unlock_page(page);
|
||||
out:
|
||||
return rc;
|
||||
|
@ -859,8 +847,7 @@ out:
|
|||
* to the newly allocated page in newpage.
|
||||
*/
|
||||
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
|
||||
struct page *page, int force, bool offlining,
|
||||
enum migrate_mode mode)
|
||||
struct page *page, int force, enum migrate_mode mode)
|
||||
{
|
||||
int rc = 0;
|
||||
int *result = NULL;
|
||||
|
@ -878,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
|
|||
if (unlikely(split_huge_page(page)))
|
||||
goto out;
|
||||
|
||||
rc = __unmap_and_move(page, newpage, force, offlining, mode);
|
||||
rc = __unmap_and_move(page, newpage, force, mode);
|
||||
|
||||
if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
|
||||
/*
|
||||
|
@ -938,8 +925,7 @@ out:
|
|||
*/
|
||||
static int unmap_and_move_huge_page(new_page_t get_new_page,
|
||||
unsigned long private, struct page *hpage,
|
||||
int force, bool offlining,
|
||||
enum migrate_mode mode)
|
||||
int force, enum migrate_mode mode)
|
||||
{
|
||||
int rc = 0;
|
||||
int *result = NULL;
|
||||
|
@ -1001,9 +987,8 @@ out:
|
|||
*
|
||||
* Return: Number of pages not migrated or error code.
|
||||
*/
|
||||
int migrate_pages(struct list_head *from,
|
||||
new_page_t get_new_page, unsigned long private, bool offlining,
|
||||
enum migrate_mode mode, int reason)
|
||||
int migrate_pages(struct list_head *from, new_page_t get_new_page,
|
||||
unsigned long private, enum migrate_mode mode, int reason)
|
||||
{
|
||||
int retry = 1;
|
||||
int nr_failed = 0;
|
||||
|
@ -1024,8 +1009,7 @@ int migrate_pages(struct list_head *from,
|
|||
cond_resched();
|
||||
|
||||
rc = unmap_and_move(get_new_page, private,
|
||||
page, pass > 2, offlining,
|
||||
mode);
|
||||
page, pass > 2, mode);
|
||||
|
||||
switch(rc) {
|
||||
case -ENOMEM:
|
||||
|
@ -1058,15 +1042,13 @@ out:
|
|||
}
|
||||
|
||||
int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
|
||||
unsigned long private, bool offlining,
|
||||
enum migrate_mode mode)
|
||||
unsigned long private, enum migrate_mode mode)
|
||||
{
|
||||
int pass, rc;
|
||||
|
||||
for (pass = 0; pass < 10; pass++) {
|
||||
rc = unmap_and_move_huge_page(get_new_page,
|
||||
private, hpage, pass > 2, offlining,
|
||||
mode);
|
||||
rc = unmap_and_move_huge_page(get_new_page, private,
|
||||
hpage, pass > 2, mode);
|
||||
switch (rc) {
|
||||
case -ENOMEM:
|
||||
goto out;
|
||||
|
@ -1152,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
|
|||
goto set_status;
|
||||
|
||||
/* Use PageReserved to check for zero page */
|
||||
if (PageReserved(page) || PageKsm(page))
|
||||
if (PageReserved(page))
|
||||
goto put_and_set;
|
||||
|
||||
pp->page = page;
|
||||
|
@ -1189,8 +1171,7 @@ set_status:
|
|||
err = 0;
|
||||
if (!list_empty(&pagelist)) {
|
||||
err = migrate_pages(&pagelist, new_page_node,
|
||||
(unsigned long)pm, 0, MIGRATE_SYNC,
|
||||
MR_SYSCALL);
|
||||
(unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
|
||||
if (err)
|
||||
putback_lru_pages(&pagelist);
|
||||
}
|
||||
|
@ -1314,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
|
|||
|
||||
err = -ENOENT;
|
||||
/* Use PageReserved to check for zero page */
|
||||
if (!page || PageReserved(page) || PageKsm(page))
|
||||
if (!page || PageReserved(page))
|
||||
goto set_status;
|
||||
|
||||
err = page_to_nid(page);
|
||||
|
@ -1461,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
|
|||
* pages. Currently it only checks the watermarks, which is crude
|
||||
*/
|
||||
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
|
||||
int nr_migrate_pages)
|
||||
unsigned long nr_migrate_pages)
|
||||
{
|
||||
int z;
|
||||
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
|
||||
|
@ -1497,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
|
|||
__GFP_NOWARN) &
|
||||
~GFP_IOFS, 0);
|
||||
if (newpage)
|
||||
page_xchg_last_nid(newpage, page_last_nid(page));
|
||||
page_nid_xchg_last(newpage, page_nid_last(page));
|
||||
|
||||
return newpage;
|
||||
}
|
||||
|
@ -1557,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
|
|||
|
||||
int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
|
||||
{
|
||||
int ret = 0;
|
||||
int page_lru;
|
||||
|
||||
VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
|
||||
|
||||
/* Avoid migrating to a node that is nearly full */
|
||||
if (migrate_balanced_pgdat(pgdat, 1)) {
|
||||
int page_lru;
|
||||
if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
|
||||
return 0;
|
||||
|
||||
if (isolate_lru_page(page)) {
|
||||
put_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Page is isolated */
|
||||
ret = 1;
|
||||
page_lru = page_is_file_cache(page);
|
||||
if (!PageTransHuge(page))
|
||||
inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
|
||||
else
|
||||
mod_zone_page_state(page_zone(page),
|
||||
NR_ISOLATED_ANON + page_lru,
|
||||
HPAGE_PMD_NR);
|
||||
}
|
||||
if (isolate_lru_page(page))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Page is either isolated or there is not enough space on the target
|
||||
* node. If isolated, then it has taken a reference count and the
|
||||
* callers reference can be safely dropped without the page
|
||||
* disappearing underneath us during migration. Otherwise the page is
|
||||
* not to be migrated but the callers reference should still be
|
||||
* dropped so it does not leak.
|
||||
* migrate_misplaced_transhuge_page() skips page migration's usual
|
||||
* check on page_count(), so we must do it here, now that the page
|
||||
* has been isolated: a GUP pin, or any other pin, prevents migration.
|
||||
* The expected page count is 3: 1 for page's mapcount and 1 for the
|
||||
* caller's pin and 1 for the reference taken by isolate_lru_page().
|
||||
*/
|
||||
if (PageTransHuge(page) && page_count(page) != 3) {
|
||||
putback_lru_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
page_lru = page_is_file_cache(page);
|
||||
mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
|
||||
hpage_nr_pages(page));
|
||||
|
||||
/*
|
||||
* Isolating the page has taken another reference, so the
|
||||
* caller's reference can be safely dropped without the page
|
||||
* disappearing underneath us during migration.
|
||||
*/
|
||||
put_page(page);
|
||||
|
||||
return ret;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1600,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
|
|||
int migrate_misplaced_page(struct page *page, int node)
|
||||
{
|
||||
pg_data_t *pgdat = NODE_DATA(node);
|
||||
int isolated = 0;
|
||||
int isolated;
|
||||
int nr_remaining;
|
||||
LIST_HEAD(migratepages);
|
||||
|
||||
|
@ -1608,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node)
|
|||
* Don't migrate pages that are mapped in multiple processes.
|
||||
* TODO: Handle false sharing detection instead of this hammer
|
||||
*/
|
||||
if (page_mapcount(page) != 1) {
|
||||
put_page(page);
|
||||
if (page_mapcount(page) != 1)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Rate-limit the amount of data that is being migrated to a node.
|
||||
* Optimal placement is no good if the memory bus is saturated and
|
||||
* all the time is being spent migrating!
|
||||
*/
|
||||
if (numamigrate_update_ratelimit(pgdat, 1)) {
|
||||
put_page(page);
|
||||
if (numamigrate_update_ratelimit(pgdat, 1))
|
||||
goto out;
|
||||
}
|
||||
|
||||
isolated = numamigrate_isolate_page(pgdat, page);
|
||||
if (!isolated)
|
||||
goto out;
|
||||
|
||||
list_add(&page->lru, &migratepages);
|
||||
nr_remaining = migrate_pages(&migratepages,
|
||||
alloc_misplaced_dst_page,
|
||||
node, false, MIGRATE_ASYNC,
|
||||
MR_NUMA_MISPLACED);
|
||||
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
|
||||
node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
|
||||
if (nr_remaining) {
|
||||
putback_lru_pages(&migratepages);
|
||||
isolated = 0;
|
||||
} else
|
||||
count_vm_numa_event(NUMA_PAGE_MIGRATE);
|
||||
BUG_ON(!list_empty(&migratepages));
|
||||
out:
|
||||
return isolated;
|
||||
|
||||
out:
|
||||
put_page(page);
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
|
||||
/*
|
||||
* Migrates a THP to a given target node. page must be locked and is unlocked
|
||||
* before returning.
|
||||
*/
|
||||
int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
||||
struct vm_area_struct *vma,
|
||||
pmd_t *pmd, pmd_t entry,
|
||||
|
@ -1674,29 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
|||
|
||||
new_page = alloc_pages_node(node,
|
||||
(GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
|
||||
if (!new_page) {
|
||||
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
|
||||
goto out_dropref;
|
||||
}
|
||||
page_xchg_last_nid(new_page, page_last_nid(page));
|
||||
if (!new_page)
|
||||
goto out_fail;
|
||||
|
||||
page_nid_xchg_last(new_page, page_nid_last(page));
|
||||
|
||||
isolated = numamigrate_isolate_page(pgdat, page);
|
||||
|
||||
/*
|
||||
* Failing to isolate or a GUP pin prevents migration. The expected
|
||||
* page count is 2. 1 for anonymous pages without a mapping and 1
|
||||
* for the callers pin. If the page was isolated, the page will
|
||||
* need to be put back on the LRU.
|
||||
*/
|
||||
if (!isolated || page_count(page) != 2) {
|
||||
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
|
||||
if (!isolated) {
|
||||
put_page(new_page);
|
||||
if (isolated) {
|
||||
putback_lru_page(page);
|
||||
isolated = 0;
|
||||
goto out;
|
||||
}
|
||||
goto out_keep_locked;
|
||||
goto out_fail;
|
||||
}
|
||||
|
||||
/* Prepare a page as a migration target */
|
||||
|
@ -1728,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
|||
putback_lru_page(page);
|
||||
|
||||
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
|
||||
isolated = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -1772,9 +1742,11 @@ out:
|
|||
-HPAGE_PMD_NR);
|
||||
return isolated;
|
||||
|
||||
out_fail:
|
||||
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
|
||||
out_dropref:
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
out_keep_locked:
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
|
|
@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
|
|||
/* shmem/tmpfs may return swap: account for swapcache page too. */
|
||||
if (radix_tree_exceptional_entry(page)) {
|
||||
swp_entry_t swap = radix_to_swp_entry(page);
|
||||
page = find_get_page(&swapper_space, swap.val);
|
||||
page = find_get_page(swap_address_space(swap), swap.val);
|
||||
}
|
||||
#endif
|
||||
if (page) {
|
||||
|
@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
|||
} else {
|
||||
#ifdef CONFIG_SWAP
|
||||
pgoff = entry.val;
|
||||
*vec = mincore_page(&swapper_space, pgoff);
|
||||
*vec = mincore_page(swap_address_space(entry),
|
||||
pgoff);
|
||||
#else
|
||||
WARN_ON(1);
|
||||
*vec = 1;
|
||||
|
|
101
mm/mlock.c
101
mm/mlock.c
|
@ -155,13 +155,12 @@ void munlock_vma_page(struct page *page)
|
|||
*
|
||||
* vma->vm_mm->mmap_sem must be held for at least read.
|
||||
*/
|
||||
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end,
|
||||
int *nonblocking)
|
||||
long __mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end, int *nonblocking)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
unsigned long addr = start;
|
||||
int nr_pages = (end - start) / PAGE_SIZE;
|
||||
unsigned long nr_pages = (end - start) / PAGE_SIZE;
|
||||
int gup_flags;
|
||||
|
||||
VM_BUG_ON(start & ~PAGE_MASK);
|
||||
|
@ -186,6 +185,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
|
|||
if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
|
||||
gup_flags |= FOLL_FORCE;
|
||||
|
||||
/*
|
||||
* We made sure addr is within a VMA, so the following will
|
||||
* not result in a stack expansion that recurses back here.
|
||||
*/
|
||||
return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
|
||||
NULL, NULL, nonblocking);
|
||||
}
|
||||
|
@ -202,56 +205,6 @@ static int __mlock_posix_error_return(long retval)
|
|||
return retval;
|
||||
}
|
||||
|
||||
/**
|
||||
* mlock_vma_pages_range() - mlock pages in specified vma range.
|
||||
* @vma - the vma containing the specified address range
|
||||
* @start - starting address in @vma to mlock
|
||||
* @end - end address [+1] in @vma to mlock
|
||||
*
|
||||
* For mmap()/mremap()/expansion of mlocked vma.
|
||||
*
|
||||
* return 0 on success for "normal" vmas.
|
||||
*
|
||||
* return number of pages [> 0] to be removed from locked_vm on success
|
||||
* of "special" vmas.
|
||||
*/
|
||||
long mlock_vma_pages_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
int nr_pages = (end - start) / PAGE_SIZE;
|
||||
BUG_ON(!(vma->vm_flags & VM_LOCKED));
|
||||
|
||||
/*
|
||||
* filter unlockable vmas
|
||||
*/
|
||||
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
|
||||
goto no_mlock;
|
||||
|
||||
if (!((vma->vm_flags & VM_DONTEXPAND) ||
|
||||
is_vm_hugetlb_page(vma) ||
|
||||
vma == get_gate_vma(current->mm))) {
|
||||
|
||||
__mlock_vma_pages_range(vma, start, end, NULL);
|
||||
|
||||
/* Hide errors from mmap() and other callers */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* User mapped kernel pages or huge pages:
|
||||
* make these pages present to populate the ptes, but
|
||||
* fall thru' to reset VM_LOCKED--no need to unlock, and
|
||||
* return nr_pages so these don't get counted against task's
|
||||
* locked limit. huge pages are already counted against
|
||||
* locked vm limit.
|
||||
*/
|
||||
make_pages_present(start, end);
|
||||
|
||||
no_mlock:
|
||||
vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
|
||||
return nr_pages; /* error or pages NOT mlocked */
|
||||
}
|
||||
|
||||
/*
|
||||
* munlock_vma_pages_range() - munlock all pages in the vma range.
|
||||
* @vma - vma containing range to be munlock()ed.
|
||||
|
@ -303,7 +256,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
|
|||
*
|
||||
* Filters out "special" vmas -- VM_LOCKED never gets set for these, and
|
||||
* munlock is a no-op. However, for some special vmas, we go ahead and
|
||||
* populate the ptes via make_pages_present().
|
||||
* populate the ptes.
|
||||
*
|
||||
* For vmas that pass the filters, merge/split as appropriate.
|
||||
*/
|
||||
|
@ -391,9 +344,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
|
|||
|
||||
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
||||
|
||||
newflags = vma->vm_flags | VM_LOCKED;
|
||||
if (!on)
|
||||
newflags &= ~VM_LOCKED;
|
||||
newflags = vma->vm_flags & ~VM_LOCKED;
|
||||
if (on)
|
||||
newflags |= VM_LOCKED | VM_POPULATE;
|
||||
|
||||
tmp = vma->vm_end;
|
||||
if (tmp > end)
|
||||
|
@ -416,13 +369,20 @@ static int do_mlock(unsigned long start, size_t len, int on)
|
|||
return error;
|
||||
}
|
||||
|
||||
static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
|
||||
/*
|
||||
* __mm_populate - populate and/or mlock pages within a range of address space.
|
||||
*
|
||||
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
|
||||
* flags. VMAs must be already marked with the desired vm_flags, and
|
||||
* mmap_sem must not be held.
|
||||
*/
|
||||
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
unsigned long end, nstart, nend;
|
||||
struct vm_area_struct *vma = NULL;
|
||||
int locked = 0;
|
||||
int ret = 0;
|
||||
long ret = 0;
|
||||
|
||||
VM_BUG_ON(start & ~PAGE_MASK);
|
||||
VM_BUG_ON(len != PAGE_ALIGN(len));
|
||||
|
@ -446,7 +406,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
|
|||
* range with the first VMA. Also, skip undesirable VMA types.
|
||||
*/
|
||||
nend = min(end, vma->vm_end);
|
||||
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
|
||||
if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
|
||||
VM_POPULATE)
|
||||
continue;
|
||||
if (nstart < vma->vm_start)
|
||||
nstart = vma->vm_start;
|
||||
|
@ -498,7 +459,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
|
|||
error = do_mlock(start, len, 1);
|
||||
up_write(&current->mm->mmap_sem);
|
||||
if (!error)
|
||||
error = do_mlock_pages(start, len, 0);
|
||||
error = __mm_populate(start, len, 0);
|
||||
return error;
|
||||
}
|
||||
|
||||
|
@ -519,18 +480,18 @@ static int do_mlockall(int flags)
|
|||
struct vm_area_struct * vma, * prev = NULL;
|
||||
|
||||
if (flags & MCL_FUTURE)
|
||||
current->mm->def_flags |= VM_LOCKED;
|
||||
current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
|
||||
else
|
||||
current->mm->def_flags &= ~VM_LOCKED;
|
||||
current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
|
||||
if (flags == MCL_FUTURE)
|
||||
goto out;
|
||||
|
||||
for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
|
||||
vm_flags_t newflags;
|
||||
|
||||
newflags = vma->vm_flags | VM_LOCKED;
|
||||
if (!(flags & MCL_CURRENT))
|
||||
newflags &= ~VM_LOCKED;
|
||||
newflags = vma->vm_flags & ~VM_LOCKED;
|
||||
if (flags & MCL_CURRENT)
|
||||
newflags |= VM_LOCKED | VM_POPULATE;
|
||||
|
||||
/* Ignore errors */
|
||||
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
|
||||
|
@ -564,10 +525,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
|
|||
capable(CAP_IPC_LOCK))
|
||||
ret = do_mlockall(flags);
|
||||
up_write(&current->mm->mmap_sem);
|
||||
if (!ret && (flags & MCL_CURRENT)) {
|
||||
/* Ignore errors */
|
||||
do_mlock_pages(0, TASK_SIZE, 1);
|
||||
}
|
||||
if (!ret && (flags & MCL_CURRENT))
|
||||
mm_populate(0, TASK_SIZE);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
|
31
mm/mm_init.c
31
mm/mm_init.c
|
@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void)
|
|||
unsigned long or_mask, add_mask;
|
||||
|
||||
shift = 8 * sizeof(unsigned long);
|
||||
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
|
||||
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
|
||||
"Section %d Node %d Zone %d Flags %d\n",
|
||||
"Section %d Node %d Zone %d Lastnid %d Flags %d\n",
|
||||
SECTIONS_WIDTH,
|
||||
NODES_WIDTH,
|
||||
ZONES_WIDTH,
|
||||
LAST_NID_WIDTH,
|
||||
NR_PAGEFLAGS);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
|
||||
"Section %d Node %d Zone %d\n",
|
||||
"Section %d Node %d Zone %d Lastnid %d\n",
|
||||
SECTIONS_SHIFT,
|
||||
NODES_SHIFT,
|
||||
ZONES_SHIFT);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
|
||||
"Section %lu Node %lu Zone %lu\n",
|
||||
ZONES_SHIFT,
|
||||
LAST_NID_SHIFT);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
|
||||
"Section %lu Node %lu Zone %lu Lastnid %lu\n",
|
||||
(unsigned long)SECTIONS_PGSHIFT,
|
||||
(unsigned long)NODES_PGSHIFT,
|
||||
(unsigned long)ZONES_PGSHIFT);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
|
||||
"Zone ID: %lu -> %lu\n",
|
||||
(unsigned long)ZONEID_PGOFF,
|
||||
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
|
||||
(unsigned long)ZONES_PGSHIFT,
|
||||
(unsigned long)LAST_NID_PGSHIFT);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
|
||||
"Node/Zone ID: %lu -> %lu\n",
|
||||
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
|
||||
(unsigned long)ZONEID_PGOFF);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
|
||||
"location: %d -> %d unused %d -> %d flags %d -> %d\n",
|
||||
"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
|
||||
shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
|
||||
#ifdef NODE_NOT_IN_PAGE_FLAGS
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
|
||||
"Node not in page flags");
|
||||
#endif
|
||||
#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
|
||||
"Last nid not in page flags");
|
||||
#endif
|
||||
|
||||
if (SECTIONS_WIDTH) {
|
||||
shift -= SECTIONS_WIDTH;
|
||||
|
|
83
mm/mmap.c
83
mm/mmap.c
|
@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
|
|||
*/
|
||||
free -= global_page_state(NR_SHMEM);
|
||||
|
||||
free += nr_swap_pages;
|
||||
free += get_nr_swap_pages();
|
||||
|
||||
/*
|
||||
* Any slabs which are created with the
|
||||
|
@ -256,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
|
|||
unsigned long newbrk, oldbrk;
|
||||
struct mm_struct *mm = current->mm;
|
||||
unsigned long min_brk;
|
||||
bool populate;
|
||||
|
||||
down_write(&mm->mmap_sem);
|
||||
|
||||
|
@ -305,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
|
|||
/* Ok, looks good - let it rip. */
|
||||
if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
|
||||
goto out;
|
||||
|
||||
set_brk:
|
||||
mm->brk = brk;
|
||||
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
|
||||
up_write(&mm->mmap_sem);
|
||||
if (populate)
|
||||
mm_populate(oldbrk, newbrk - oldbrk);
|
||||
return brk;
|
||||
|
||||
out:
|
||||
retval = mm->brk;
|
||||
up_write(&mm->mmap_sem);
|
||||
|
@ -801,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end);
|
|||
anon_vma_interval_tree_post_update_vma(vma);
|
||||
if (adjust_next)
|
||||
anon_vma_interval_tree_post_update_vma(next);
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
}
|
||||
if (mapping)
|
||||
mutex_unlock(&mapping->i_mmap_mutex);
|
||||
|
@ -1154,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
|
|||
|
||||
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
|
||||
unsigned long len, unsigned long prot,
|
||||
unsigned long flags, unsigned long pgoff)
|
||||
unsigned long flags, unsigned long pgoff,
|
||||
unsigned long *populate)
|
||||
{
|
||||
struct mm_struct * mm = current->mm;
|
||||
struct inode *inode;
|
||||
vm_flags_t vm_flags;
|
||||
|
||||
*populate = 0;
|
||||
|
||||
/*
|
||||
* Does the application expect PROT_READ to imply PROT_EXEC?
|
||||
*
|
||||
|
@ -1280,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
|
|||
}
|
||||
}
|
||||
|
||||
return mmap_region(file, addr, len, flags, vm_flags, pgoff);
|
||||
/*
|
||||
* Set 'VM_NORESERVE' if we should not account for the
|
||||
* memory use of this mapping.
|
||||
*/
|
||||
if (flags & MAP_NORESERVE) {
|
||||
/* We honor MAP_NORESERVE if allowed to overcommit */
|
||||
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
|
||||
vm_flags |= VM_NORESERVE;
|
||||
|
||||
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
|
||||
if (file && is_file_hugepages(file))
|
||||
vm_flags |= VM_NORESERVE;
|
||||
}
|
||||
|
||||
addr = mmap_region(file, addr, len, vm_flags, pgoff);
|
||||
if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
|
||||
*populate = len;
|
||||
return addr;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
|
||||
|
@ -1395,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
|
|||
}
|
||||
|
||||
unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
unsigned long len, unsigned long flags,
|
||||
vm_flags_t vm_flags, unsigned long pgoff)
|
||||
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma, *prev;
|
||||
|
@ -1419,20 +1446,6 @@ munmap_back:
|
|||
if (!may_expand_vm(mm, len >> PAGE_SHIFT))
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* Set 'VM_NORESERVE' if we should not account for the
|
||||
* memory use of this mapping.
|
||||
*/
|
||||
if ((flags & MAP_NORESERVE)) {
|
||||
/* We honor MAP_NORESERVE if allowed to overcommit */
|
||||
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
|
||||
vm_flags |= VM_NORESERVE;
|
||||
|
||||
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
|
||||
if (file && is_file_hugepages(file))
|
||||
vm_flags |= VM_NORESERVE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Private writable mapping: check memory availability
|
||||
*/
|
||||
|
@ -1531,10 +1544,12 @@ out:
|
|||
|
||||
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
if (!mlock_vma_pages_range(vma, addr, addr + len))
|
||||
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
|
||||
vma == get_gate_vma(current->mm)))
|
||||
mm->locked_vm += (len >> PAGE_SHIFT);
|
||||
} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
|
||||
make_pages_present(addr, addr + len);
|
||||
else
|
||||
vma->vm_flags &= ~VM_LOCKED;
|
||||
}
|
||||
|
||||
if (file)
|
||||
uprobe_mmap(vma);
|
||||
|
@ -2187,9 +2202,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
|
|||
return vma;
|
||||
if (!prev || expand_stack(prev, addr))
|
||||
return NULL;
|
||||
if (prev->vm_flags & VM_LOCKED) {
|
||||
mlock_vma_pages_range(prev, addr, prev->vm_end);
|
||||
}
|
||||
if (prev->vm_flags & VM_LOCKED)
|
||||
__mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
|
||||
return prev;
|
||||
}
|
||||
#else
|
||||
|
@ -2215,9 +2229,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
|
|||
start = vma->vm_start;
|
||||
if (expand_stack(vma, addr))
|
||||
return NULL;
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
mlock_vma_pages_range(vma, addr, start);
|
||||
}
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
__mlock_vma_pages_range(vma, addr, start, NULL);
|
||||
return vma;
|
||||
}
|
||||
#endif
|
||||
|
@ -2590,10 +2603,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
|
|||
out:
|
||||
perf_event_mmap(vma);
|
||||
mm->total_vm += len >> PAGE_SHIFT;
|
||||
if (flags & VM_LOCKED) {
|
||||
if (!mlock_vma_pages_range(vma, addr, addr + len))
|
||||
mm->locked_vm += (len >> PAGE_SHIFT);
|
||||
}
|
||||
if (flags & VM_LOCKED)
|
||||
mm->locked_vm += (len >> PAGE_SHIFT);
|
||||
return addr;
|
||||
}
|
||||
|
||||
|
@ -2601,10 +2612,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
|
|||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
unsigned long ret;
|
||||
bool populate;
|
||||
|
||||
down_write(&mm->mmap_sem);
|
||||
ret = do_brk(addr, len);
|
||||
populate = ((mm->def_flags & VM_LOCKED) != 0);
|
||||
up_write(&mm->mmap_sem);
|
||||
if (populate)
|
||||
mm_populate(addr, len);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(vm_brk);
|
||||
|
@ -3002,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
|
|||
if (!__test_and_clear_bit(0, (unsigned long *)
|
||||
&anon_vma->root->rb_root.rb_node))
|
||||
BUG();
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -37,49 +37,51 @@ static struct srcu_struct srcu;
|
|||
void __mmu_notifier_release(struct mm_struct *mm)
|
||||
{
|
||||
struct mmu_notifier *mn;
|
||||
struct hlist_node *n;
|
||||
int id;
|
||||
|
||||
/*
|
||||
* SRCU here will block mmu_notifier_unregister until
|
||||
* ->release returns.
|
||||
* srcu_read_lock() here will block synchronize_srcu() in
|
||||
* mmu_notifier_unregister() until all registered
|
||||
* ->release() callouts this function makes have
|
||||
* returned.
|
||||
*/
|
||||
id = srcu_read_lock(&srcu);
|
||||
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
|
||||
/*
|
||||
* if ->release runs before mmu_notifier_unregister it
|
||||
* must be handled as it's the only way for the driver
|
||||
* to flush all existing sptes and stop the driver
|
||||
* from establishing any more sptes before all the
|
||||
* pages in the mm are freed.
|
||||
*/
|
||||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
srcu_read_unlock(&srcu, id);
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
|
||||
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
|
||||
struct mmu_notifier,
|
||||
hlist);
|
||||
|
||||
/*
|
||||
* We arrived before mmu_notifier_unregister so
|
||||
* mmu_notifier_unregister will do nothing other than
|
||||
* to wait ->release to finish and
|
||||
* mmu_notifier_unregister to return.
|
||||
* Unlink. This will prevent mmu_notifier_unregister()
|
||||
* from also making the ->release() callout.
|
||||
*/
|
||||
hlist_del_init_rcu(&mn->hlist);
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
|
||||
/*
|
||||
* Clear sptes. (see 'release' description in mmu_notifier.h)
|
||||
*/
|
||||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
}
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
|
||||
/*
|
||||
* synchronize_srcu here prevents mmu_notifier_release to
|
||||
* return to exit_mmap (which would proceed freeing all pages
|
||||
* in the mm) until the ->release method returns, if it was
|
||||
* invoked by mmu_notifier_unregister.
|
||||
*
|
||||
* The mmu_notifier_mm can't go away from under us because one
|
||||
* mm_count is hold by exit_mmap.
|
||||
* All callouts to ->release() which we have done are complete.
|
||||
* Allow synchronize_srcu() in mmu_notifier_unregister() to complete
|
||||
*/
|
||||
srcu_read_unlock(&srcu, id);
|
||||
|
||||
/*
|
||||
* mmu_notifier_unregister() may have unlinked a notifier and may
|
||||
* still be calling out to it. Additionally, other notifiers
|
||||
* may have been active via vmtruncate() et al. Block here
|
||||
* to ensure that all notifier callouts for this mm have been
|
||||
* completed and the sptes are really cleaned up before returning
|
||||
* to exit_mmap().
|
||||
*/
|
||||
synchronize_srcu(&srcu);
|
||||
}
|
||||
|
@ -170,6 +172,7 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
|
|||
}
|
||||
srcu_read_unlock(&srcu, id);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
|
||||
|
||||
void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
|
||||
unsigned long start, unsigned long end)
|
||||
|
@ -185,6 +188,7 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
|
|||
}
|
||||
srcu_read_unlock(&srcu, id);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
|
||||
|
||||
static int do_mmu_notifier_register(struct mmu_notifier *mn,
|
||||
struct mm_struct *mm,
|
||||
|
@ -294,31 +298,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
|
|||
{
|
||||
BUG_ON(atomic_read(&mm->mm_count) <= 0);
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
if (!hlist_unhashed(&mn->hlist)) {
|
||||
/*
|
||||
* SRCU here will force exit_mmap to wait ->release to finish
|
||||
* before freeing the pages.
|
||||
*/
|
||||
int id;
|
||||
|
||||
id = srcu_read_lock(&srcu);
|
||||
/*
|
||||
* exit_mmap will block in mmu_notifier_release to
|
||||
* guarantee ->release is called before freeing the
|
||||
* pages.
|
||||
* Ensure we synchronize up with __mmu_notifier_release().
|
||||
*/
|
||||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
srcu_read_unlock(&srcu, id);
|
||||
id = srcu_read_lock(&srcu);
|
||||
|
||||
spin_lock(&mm->mmu_notifier_mm->lock);
|
||||
hlist_del_rcu(&mn->hlist);
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
}
|
||||
|
||||
if (mn->ops->release)
|
||||
mn->ops->release(mn, mm);
|
||||
|
||||
/*
|
||||
* Allow __mmu_notifier_release() to complete.
|
||||
*/
|
||||
srcu_read_unlock(&srcu, id);
|
||||
} else
|
||||
spin_unlock(&mm->mmu_notifier_mm->lock);
|
||||
|
||||
/*
|
||||
* Wait any running method to finish, of course including
|
||||
* ->release if it was run by mmu_notifier_relase instead of us.
|
||||
* Wait for any running method to finish, including ->release() if it
|
||||
* was run by __mmu_notifier_release() instead of us.
|
||||
*/
|
||||
synchronize_srcu(&srcu);
|
||||
|
||||
|
|
20
mm/mmzone.c
20
mm/mmzone.c
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* linux/mm/mmzone.c
|
||||
*
|
||||
* management codes for pgdats and zones.
|
||||
* management codes for pgdats, zones and page flags
|
||||
*/
|
||||
|
||||
|
||||
|
@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec)
|
|||
for_each_lru(lru)
|
||||
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
|
||||
int page_nid_xchg_last(struct page *page, int nid)
|
||||
{
|
||||
unsigned long old_flags, flags;
|
||||
int last_nid;
|
||||
|
||||
do {
|
||||
old_flags = flags = page->flags;
|
||||
last_nid = page_nid_last(page);
|
||||
|
||||
flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
|
||||
flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
|
||||
} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
|
||||
|
||||
return last_nid;
|
||||
}
|
||||
#endif
|
||||
|
|
27
mm/mremap.c
27
mm/mremap.c
|
@ -135,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
|
|||
pte_unmap(new_pte - 1);
|
||||
pte_unmap_unlock(old_pte - 1, old_ptl);
|
||||
if (anon_vma)
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
if (mapping)
|
||||
mutex_unlock(&mapping->i_mmap_mutex);
|
||||
}
|
||||
|
@ -209,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
|
|||
|
||||
static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
unsigned long old_addr, unsigned long old_len,
|
||||
unsigned long new_len, unsigned long new_addr)
|
||||
unsigned long new_len, unsigned long new_addr, bool *locked)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct vm_area_struct *new_vma;
|
||||
|
@ -300,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
|||
|
||||
if (vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += new_len >> PAGE_SHIFT;
|
||||
if (new_len > old_len)
|
||||
mlock_vma_pages_range(new_vma, new_addr + old_len,
|
||||
new_addr + new_len);
|
||||
*locked = true;
|
||||
}
|
||||
|
||||
return new_addr;
|
||||
|
@ -367,9 +365,8 @@ Eagain:
|
|||
return ERR_PTR(-EAGAIN);
|
||||
}
|
||||
|
||||
static unsigned long mremap_to(unsigned long addr,
|
||||
unsigned long old_len, unsigned long new_addr,
|
||||
unsigned long new_len)
|
||||
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
|
||||
unsigned long new_addr, unsigned long new_len, bool *locked)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
|
@ -419,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr,
|
|||
if (ret & ~PAGE_MASK)
|
||||
goto out1;
|
||||
|
||||
ret = move_vma(vma, addr, old_len, new_len, new_addr);
|
||||
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
|
||||
if (!(ret & ~PAGE_MASK))
|
||||
goto out;
|
||||
out1:
|
||||
|
@ -457,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
|||
struct vm_area_struct *vma;
|
||||
unsigned long ret = -EINVAL;
|
||||
unsigned long charged = 0;
|
||||
bool locked = false;
|
||||
|
||||
down_write(&current->mm->mmap_sem);
|
||||
|
||||
|
@ -479,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
|||
|
||||
if (flags & MREMAP_FIXED) {
|
||||
if (flags & MREMAP_MAYMOVE)
|
||||
ret = mremap_to(addr, old_len, new_addr, new_len);
|
||||
ret = mremap_to(addr, old_len, new_addr, new_len,
|
||||
&locked);
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -521,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
|||
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
|
||||
if (vma->vm_flags & VM_LOCKED) {
|
||||
mm->locked_vm += pages;
|
||||
mlock_vma_pages_range(vma, addr + old_len,
|
||||
addr + new_len);
|
||||
locked = true;
|
||||
new_addr = addr;
|
||||
}
|
||||
ret = addr;
|
||||
goto out;
|
||||
|
@ -548,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
|||
goto out;
|
||||
}
|
||||
|
||||
ret = move_vma(vma, addr, old_len, new_len, new_addr);
|
||||
ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
|
||||
}
|
||||
out:
|
||||
if (ret & ~PAGE_MASK)
|
||||
vm_unacct_memory(charged);
|
||||
up_write(&current->mm->mmap_sem);
|
||||
if (locked && new_len > old_len)
|
||||
mm_populate(new_addr + old_len, new_len - old_len);
|
||||
return ret;
|
||||
}
|
||||
|
|
28
mm/nommu.c
28
mm/nommu.c
|
@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp)
|
|||
return PAGE_SIZE << compound_order(page);
|
||||
}
|
||||
|
||||
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int nr_pages, unsigned int foll_flags,
|
||||
struct page **pages, struct vm_area_struct **vmas,
|
||||
int *retry)
|
||||
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
unsigned int foll_flags, struct page **pages,
|
||||
struct vm_area_struct **vmas, int *nonblocking)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long vm_flags;
|
||||
|
@ -190,9 +190,10 @@ finish_or_fault:
|
|||
* slab page or a secondary page from a compound page
|
||||
* - don't permit access to VMAs that don't support it, such as I/O mappings
|
||||
*/
|
||||
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, int nr_pages, int write, int force,
|
||||
struct page **pages, struct vm_area_struct **vmas)
|
||||
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
|
||||
unsigned long start, unsigned long nr_pages,
|
||||
int write, int force, struct page **pages,
|
||||
struct vm_area_struct **vmas)
|
||||
{
|
||||
int flags = 0;
|
||||
|
||||
|
@ -1250,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file,
|
|||
unsigned long len,
|
||||
unsigned long prot,
|
||||
unsigned long flags,
|
||||
unsigned long pgoff)
|
||||
unsigned long pgoff,
|
||||
unsigned long *populate)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
struct vm_region *region;
|
||||
|
@ -1260,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file,
|
|||
|
||||
kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
|
||||
|
||||
*populate = 0;
|
||||
|
||||
/* decide whether we should attempt the mapping, and if so what sort of
|
||||
* mapping */
|
||||
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
|
||||
|
@ -1815,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
|||
return ret;
|
||||
}
|
||||
|
||||
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
|
||||
unsigned int foll_flags)
|
||||
struct page *follow_page_mask(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned int flags,
|
||||
unsigned int *page_mask)
|
||||
{
|
||||
*page_mask = 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -1904,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
|
|||
*/
|
||||
free -= global_page_state(NR_SHMEM);
|
||||
|
||||
free += nr_swap_pages;
|
||||
free += get_nr_swap_pages();
|
||||
|
||||
/*
|
||||
* Any slabs which are created with the
|
||||
|
|
|
@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
|
|||
cpuset_print_task_mems_allowed(current);
|
||||
task_unlock(current);
|
||||
dump_stack();
|
||||
mem_cgroup_print_oom_info(memcg, p);
|
||||
show_mem(SHOW_MEM_FILTER_NODES);
|
||||
if (memcg)
|
||||
mem_cgroup_print_oom_info(memcg, p);
|
||||
else
|
||||
show_mem(SHOW_MEM_FILTER_NODES);
|
||||
if (sysctl_oom_dump_tasks)
|
||||
dump_tasks(memcg, nodemask);
|
||||
}
|
||||
|
|
|
@ -241,6 +241,9 @@ static unsigned long global_dirtyable_memory(void)
|
|||
if (!vm_highmem_is_dirtyable)
|
||||
x -= highmem_dirtyable_memory(x);
|
||||
|
||||
/* Subtract min_free_kbytes */
|
||||
x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
|
||||
|
||||
return x + 1; /* Ensure that we never return 0 */
|
||||
}
|
||||
|
||||
|
|
439
mm/page_alloc.c
439
mm/page_alloc.c
|
@ -202,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
|
|||
static unsigned long __meminitdata dma_reserve;
|
||||
|
||||
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
|
||||
/* Movable memory ranges, will also be used by memblock subsystem. */
|
||||
struct movablemem_map movablemem_map = {
|
||||
.acpi = false,
|
||||
.nr_map = 0,
|
||||
};
|
||||
|
||||
static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
|
||||
static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
|
||||
static unsigned long __initdata required_kernelcore;
|
||||
static unsigned long __initdata required_movablecore;
|
||||
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
|
||||
static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
|
||||
|
||||
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
|
||||
int movable_zone;
|
||||
|
@ -240,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
|
|||
int ret = 0;
|
||||
unsigned seq;
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
unsigned long sp, start_pfn;
|
||||
|
||||
do {
|
||||
seq = zone_span_seqbegin(zone);
|
||||
if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
|
||||
ret = 1;
|
||||
else if (pfn < zone->zone_start_pfn)
|
||||
start_pfn = zone->zone_start_pfn;
|
||||
sp = zone->spanned_pages;
|
||||
if (!zone_spans_pfn(zone, pfn))
|
||||
ret = 1;
|
||||
} while (zone_span_seqretry(zone, seq));
|
||||
|
||||
if (ret)
|
||||
pr_err("page %lu outside zone [ %lu - %lu ]\n",
|
||||
pfn, start_pfn, start_pfn + sp);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -288,7 +300,7 @@ static void bad_page(struct page *page)
|
|||
|
||||
/* Don't complain about poisoned pages */
|
||||
if (PageHWPoison(page)) {
|
||||
reset_page_mapcount(page); /* remove PageBuddy */
|
||||
page_mapcount_reset(page); /* remove PageBuddy */
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -320,7 +332,7 @@ static void bad_page(struct page *page)
|
|||
dump_stack();
|
||||
out:
|
||||
/* Leave bad fields for debug, except PageBuddy could make trouble */
|
||||
reset_page_mapcount(page); /* remove PageBuddy */
|
||||
page_mapcount_reset(page); /* remove PageBuddy */
|
||||
add_taint(TAINT_BAD_PAGE);
|
||||
}
|
||||
|
||||
|
@ -533,6 +545,8 @@ static inline void __free_one_page(struct page *page,
|
|||
unsigned long uninitialized_var(buddy_idx);
|
||||
struct page *buddy;
|
||||
|
||||
VM_BUG_ON(!zone_is_initialized(zone));
|
||||
|
||||
if (unlikely(PageCompound(page)))
|
||||
if (unlikely(destroy_compound_page(page, order)))
|
||||
return;
|
||||
|
@ -606,7 +620,7 @@ static inline int free_pages_check(struct page *page)
|
|||
bad_page(page);
|
||||
return 1;
|
||||
}
|
||||
reset_page_last_nid(page);
|
||||
page_nid_reset_last(page);
|
||||
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
|
||||
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
return 0;
|
||||
|
@ -666,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
|
|||
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
|
||||
__free_one_page(page, zone, 0, mt);
|
||||
trace_mm_page_pcpu_drain(page, 0, mt);
|
||||
if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
|
||||
if (likely(!is_migrate_isolate_page(page))) {
|
||||
__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
|
||||
if (is_migrate_cma(mt))
|
||||
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
|
||||
|
@ -684,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
|
|||
zone->pages_scanned = 0;
|
||||
|
||||
__free_one_page(page, zone, order, migratetype);
|
||||
if (unlikely(migratetype != MIGRATE_ISOLATE))
|
||||
if (unlikely(!is_migrate_isolate(migratetype)))
|
||||
__mod_zone_freepage_state(zone, 1 << order, migratetype);
|
||||
spin_unlock(&zone->lock);
|
||||
}
|
||||
|
@ -916,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = {
|
|||
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
|
||||
#endif
|
||||
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
[MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -981,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
|
|||
end_pfn = start_pfn + pageblock_nr_pages - 1;
|
||||
|
||||
/* Do not cross zone boundaries */
|
||||
if (start_pfn < zone->zone_start_pfn)
|
||||
if (!zone_spans_pfn(zone, start_pfn))
|
||||
start_page = page;
|
||||
if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
|
||||
if (!zone_spans_pfn(zone, end_pfn))
|
||||
return 0;
|
||||
|
||||
return move_freepages(zone, start_page, end_page, migratetype);
|
||||
|
@ -1142,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
|
|||
list_add_tail(&page->lru, list);
|
||||
if (IS_ENABLED(CONFIG_CMA)) {
|
||||
mt = get_pageblock_migratetype(page);
|
||||
if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
|
||||
if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
|
||||
mt = migratetype;
|
||||
}
|
||||
set_freepage_migratetype(page, mt);
|
||||
|
@ -1277,7 +1293,7 @@ void mark_free_pages(struct zone *zone)
|
|||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
|
||||
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
||||
max_zone_pfn = zone_end_pfn(zone);
|
||||
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
|
||||
if (pfn_valid(pfn)) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
@ -1326,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold)
|
|||
* excessively into the page allocator
|
||||
*/
|
||||
if (migratetype >= MIGRATE_PCPTYPES) {
|
||||
if (unlikely(migratetype == MIGRATE_ISOLATE)) {
|
||||
if (unlikely(is_migrate_isolate(migratetype))) {
|
||||
free_one_page(zone, page, 0, migratetype);
|
||||
goto out;
|
||||
}
|
||||
|
@ -1400,7 +1416,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
|
|||
zone = page_zone(page);
|
||||
mt = get_pageblock_migratetype(page);
|
||||
|
||||
if (mt != MIGRATE_ISOLATE) {
|
||||
if (!is_migrate_isolate(mt)) {
|
||||
/* Obey watermarks as if the page was being allocated */
|
||||
watermark = low_wmark_pages(zone) + (1 << order);
|
||||
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
|
||||
|
@ -1419,7 +1435,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
|
|||
struct page *endpage = page + (1 << order) - 1;
|
||||
for (; page < endpage; page += pageblock_nr_pages) {
|
||||
int mt = get_pageblock_migratetype(page);
|
||||
if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
|
||||
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
|
||||
set_pageblock_migratetype(page,
|
||||
MIGRATE_MOVABLE);
|
||||
}
|
||||
|
@ -2615,10 +2631,17 @@ retry_cpuset:
|
|||
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
|
||||
zonelist, high_zoneidx, alloc_flags,
|
||||
preferred_zone, migratetype);
|
||||
if (unlikely(!page))
|
||||
if (unlikely(!page)) {
|
||||
/*
|
||||
* Runtime PM, block IO and its error handling path
|
||||
* can deadlock because I/O on the device might not
|
||||
* complete.
|
||||
*/
|
||||
gfp_mask = memalloc_noio_flags(gfp_mask);
|
||||
page = __alloc_pages_slowpath(gfp_mask, order,
|
||||
zonelist, high_zoneidx, nodemask,
|
||||
preferred_zone, migratetype);
|
||||
}
|
||||
|
||||
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
|
||||
|
||||
|
@ -2790,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size)
|
|||
}
|
||||
EXPORT_SYMBOL(free_pages_exact);
|
||||
|
||||
static unsigned int nr_free_zone_pages(int offset)
|
||||
/**
|
||||
* nr_free_zone_pages - count number of pages beyond high watermark
|
||||
* @offset: The zone index of the highest zone
|
||||
*
|
||||
* nr_free_zone_pages() counts the number of pages which are beyond the
|
||||
* high watermark within all zones at or below a given zone index. For each
|
||||
* zone, the number of pages is calculated as:
|
||||
* managed_pages - high_pages
|
||||
*/
|
||||
static unsigned long nr_free_zone_pages(int offset)
|
||||
{
|
||||
struct zoneref *z;
|
||||
struct zone *zone;
|
||||
|
||||
/* Just pick one node, since fallback list is circular */
|
||||
unsigned int sum = 0;
|
||||
unsigned long sum = 0;
|
||||
|
||||
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
|
||||
|
||||
for_each_zone_zonelist(zone, z, zonelist, offset) {
|
||||
unsigned long size = zone->present_pages;
|
||||
unsigned long size = zone->managed_pages;
|
||||
unsigned long high = high_wmark_pages(zone);
|
||||
if (size > high)
|
||||
sum += size - high;
|
||||
|
@ -2810,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset)
|
|||
return sum;
|
||||
}
|
||||
|
||||
/*
|
||||
* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
|
||||
/**
|
||||
* nr_free_buffer_pages - count number of pages beyond high watermark
|
||||
*
|
||||
* nr_free_buffer_pages() counts the number of pages which are beyond the high
|
||||
* watermark within ZONE_DMA and ZONE_NORMAL.
|
||||
*/
|
||||
unsigned int nr_free_buffer_pages(void)
|
||||
unsigned long nr_free_buffer_pages(void)
|
||||
{
|
||||
return nr_free_zone_pages(gfp_zone(GFP_USER));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
|
||||
|
||||
/*
|
||||
* Amount of free RAM allocatable within all zones
|
||||
/**
|
||||
* nr_free_pagecache_pages - count number of pages beyond high watermark
|
||||
*
|
||||
* nr_free_pagecache_pages() counts the number of pages which are beyond the
|
||||
* high watermark within all zones.
|
||||
*/
|
||||
unsigned int nr_free_pagecache_pages(void)
|
||||
unsigned long nr_free_pagecache_pages(void)
|
||||
{
|
||||
return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
|
||||
}
|
||||
|
@ -2854,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
|
|||
val->totalram = pgdat->node_present_pages;
|
||||
val->freeram = node_page_state(nid, NR_FREE_PAGES);
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
|
||||
val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
|
||||
val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
|
||||
NR_FREE_PAGES);
|
||||
#else
|
||||
|
@ -2897,7 +2935,9 @@ static void show_migration_types(unsigned char type)
|
|||
#ifdef CONFIG_CMA
|
||||
[MIGRATE_CMA] = 'C',
|
||||
#endif
|
||||
#ifdef CONFIG_MEMORY_ISOLATION
|
||||
[MIGRATE_ISOLATE] = 'I',
|
||||
#endif
|
||||
};
|
||||
char tmp[MIGRATE_TYPES + 1];
|
||||
char *p = tmp;
|
||||
|
@ -3236,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
|
|||
{
|
||||
int n, val;
|
||||
int min_val = INT_MAX;
|
||||
int best_node = -1;
|
||||
int best_node = NUMA_NO_NODE;
|
||||
const struct cpumask *tmp = cpumask_of_node(0);
|
||||
|
||||
/* Use the local node if we haven't already */
|
||||
|
@ -3780,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
|
|||
* the block.
|
||||
*/
|
||||
start_pfn = zone->zone_start_pfn;
|
||||
end_pfn = start_pfn + zone->spanned_pages;
|
||||
end_pfn = zone_end_pfn(zone);
|
||||
start_pfn = roundup(start_pfn, pageblock_nr_pages);
|
||||
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
|
||||
pageblock_order;
|
||||
|
@ -3876,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
|
|||
set_page_links(page, zone, nid, pfn);
|
||||
mminit_verify_page_links(page, zone, nid, pfn);
|
||||
init_page_count(page);
|
||||
reset_page_mapcount(page);
|
||||
reset_page_last_nid(page);
|
||||
page_mapcount_reset(page);
|
||||
page_nid_reset_last(page);
|
||||
SetPageReserved(page);
|
||||
/*
|
||||
* Mark the block movable so that blocks are reserved for
|
||||
|
@ -3894,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
|
|||
* pfn out of zone.
|
||||
*/
|
||||
if ((z->zone_start_pfn <= pfn)
|
||||
&& (pfn < z->zone_start_pfn + z->spanned_pages)
|
||||
&& (pfn < zone_end_pfn(z))
|
||||
&& !(pfn & (pageblock_nr_pages - 1)))
|
||||
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
||||
|
||||
|
@ -3932,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone)
|
|||
*
|
||||
* OK, so we don't know how big the cache is. So guess.
|
||||
*/
|
||||
batch = zone->present_pages / 1024;
|
||||
batch = zone->managed_pages / 1024;
|
||||
if (batch * PAGE_SIZE > 512 * 1024)
|
||||
batch = (512 * 1024) / PAGE_SIZE;
|
||||
batch /= 4; /* We effectively *= 4 below */
|
||||
|
@ -4016,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
|
|||
|
||||
if (percpu_pagelist_fraction)
|
||||
setup_pagelist_highmark(pcp,
|
||||
(zone->present_pages /
|
||||
(zone->managed_pages /
|
||||
percpu_pagelist_fraction));
|
||||
}
|
||||
}
|
||||
|
@ -4372,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
|
|||
return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
|
||||
}
|
||||
|
||||
/**
|
||||
* sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
|
||||
*
|
||||
* zone_movable_limit is initialized as 0. This function will try to get
|
||||
* the first ZONE_MOVABLE pfn of each node from movablemem_map, and
|
||||
* assigne them to zone_movable_limit.
|
||||
* zone_movable_limit[nid] == 0 means no limit for the node.
|
||||
*
|
||||
* Note: Each range is represented as [start_pfn, end_pfn)
|
||||
*/
|
||||
static void __meminit sanitize_zone_movable_limit(void)
|
||||
{
|
||||
int map_pos = 0, i, nid;
|
||||
unsigned long start_pfn, end_pfn;
|
||||
|
||||
if (!movablemem_map.nr_map)
|
||||
return;
|
||||
|
||||
/* Iterate all ranges from minimum to maximum */
|
||||
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
|
||||
/*
|
||||
* If we have found lowest pfn of ZONE_MOVABLE of the node
|
||||
* specified by user, just go on to check next range.
|
||||
*/
|
||||
if (zone_movable_limit[nid])
|
||||
continue;
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA
|
||||
/* Skip DMA memory. */
|
||||
if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
|
||||
start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA32
|
||||
/* Skip DMA32 memory. */
|
||||
if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
|
||||
start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
/* Skip lowmem if ZONE_MOVABLE is highmem. */
|
||||
if (zone_movable_is_highmem() &&
|
||||
start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
|
||||
start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
|
||||
#endif
|
||||
|
||||
if (start_pfn >= end_pfn)
|
||||
continue;
|
||||
|
||||
while (map_pos < movablemem_map.nr_map) {
|
||||
if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
|
||||
break;
|
||||
|
||||
if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
|
||||
map_pos++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* The start_pfn of ZONE_MOVABLE is either the minimum
|
||||
* pfn specified by movablemem_map, or 0, which means
|
||||
* the node has no ZONE_MOVABLE.
|
||||
*/
|
||||
zone_movable_limit[nid] = max(start_pfn,
|
||||
movablemem_map.map[map_pos].start_pfn);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
|
||||
unsigned long zone_type,
|
||||
|
@ -4389,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
|
|||
|
||||
return zholes_size[zone_type];
|
||||
}
|
||||
|
||||
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
|
||||
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
|
||||
|
@ -4573,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
|
|||
nr_all_pages += freesize;
|
||||
|
||||
zone->spanned_pages = size;
|
||||
zone->present_pages = freesize;
|
||||
zone->present_pages = realsize;
|
||||
/*
|
||||
* Set an approximate value for lowmem here, it will be adjusted
|
||||
* when the bootmem allocator frees pages into the buddy system.
|
||||
|
@ -4625,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
|
|||
* for the buddy allocator to function correctly.
|
||||
*/
|
||||
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
|
||||
end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
|
||||
end = pgdat_end_pfn(pgdat);
|
||||
end = ALIGN(end, MAX_ORDER_NR_PAGES);
|
||||
size = (end - start) * sizeof(struct page);
|
||||
map = alloc_remap(pgdat->node_id, size);
|
||||
|
@ -4831,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
|
|||
required_kernelcore = max(required_kernelcore, corepages);
|
||||
}
|
||||
|
||||
/* If kernelcore was not specified, there is no ZONE_MOVABLE */
|
||||
if (!required_kernelcore)
|
||||
/*
|
||||
* If neither kernelcore/movablecore nor movablemem_map is specified,
|
||||
* there is no ZONE_MOVABLE. But if movablemem_map is specified, the
|
||||
* start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
|
||||
*/
|
||||
if (!required_kernelcore) {
|
||||
if (movablemem_map.nr_map)
|
||||
memcpy(zone_movable_pfn, zone_movable_limit,
|
||||
sizeof(zone_movable_pfn));
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
|
||||
find_usable_zone_for_movable();
|
||||
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
|
||||
|
||||
restart:
|
||||
|
@ -4864,10 +4981,24 @@ restart:
|
|||
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
|
||||
unsigned long size_pages;
|
||||
|
||||
/*
|
||||
* Find more memory for kernelcore in
|
||||
* [zone_movable_pfn[nid], zone_movable_limit[nid]).
|
||||
*/
|
||||
start_pfn = max(start_pfn, zone_movable_pfn[nid]);
|
||||
if (start_pfn >= end_pfn)
|
||||
continue;
|
||||
|
||||
if (zone_movable_limit[nid]) {
|
||||
end_pfn = min(end_pfn, zone_movable_limit[nid]);
|
||||
/* No range left for kernelcore in this node */
|
||||
if (start_pfn >= end_pfn) {
|
||||
zone_movable_pfn[nid] =
|
||||
zone_movable_limit[nid];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Account for what is only usable for kernelcore */
|
||||
if (start_pfn < usable_startpfn) {
|
||||
unsigned long kernel_pages;
|
||||
|
@ -4927,12 +5058,12 @@ restart:
|
|||
if (usable_nodes && required_kernelcore > usable_nodes)
|
||||
goto restart;
|
||||
|
||||
out:
|
||||
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
|
||||
for (nid = 0; nid < MAX_NUMNODES; nid++)
|
||||
zone_movable_pfn[nid] =
|
||||
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
|
||||
|
||||
out:
|
||||
/* restore the node_state */
|
||||
node_states[N_MEMORY] = saved_node_state;
|
||||
}
|
||||
|
@ -4995,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
|
|||
|
||||
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
|
||||
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
|
||||
find_usable_zone_for_movable();
|
||||
sanitize_zone_movable_limit();
|
||||
find_zone_movable_pfns_for_nodes();
|
||||
|
||||
/* Print out the zone ranges */
|
||||
|
@ -5078,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p)
|
|||
early_param("kernelcore", cmdline_parse_kernelcore);
|
||||
early_param("movablecore", cmdline_parse_movablecore);
|
||||
|
||||
/**
|
||||
* movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
|
||||
* @start_pfn: start pfn of the range to be checked
|
||||
* @end_pfn: end pfn of the range to be checked (exclusive)
|
||||
*
|
||||
* This function checks if a given memory range [start_pfn, end_pfn) overlaps
|
||||
* the movablemem_map.map[] array.
|
||||
*
|
||||
* Return: index of the first overlapped element in movablemem_map.map[]
|
||||
* or -1 if they don't overlap each other.
|
||||
*/
|
||||
int __init movablemem_map_overlap(unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
int overlap;
|
||||
|
||||
if (!movablemem_map.nr_map)
|
||||
return -1;
|
||||
|
||||
for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
|
||||
if (start_pfn < movablemem_map.map[overlap].end_pfn)
|
||||
break;
|
||||
|
||||
if (overlap == movablemem_map.nr_map ||
|
||||
end_pfn <= movablemem_map.map[overlap].start_pfn)
|
||||
return -1;
|
||||
|
||||
return overlap;
|
||||
}
|
||||
|
||||
/**
|
||||
* insert_movablemem_map - Insert a memory range in to movablemem_map.map.
|
||||
* @start_pfn: start pfn of the range
|
||||
* @end_pfn: end pfn of the range
|
||||
*
|
||||
* This function will also merge the overlapped ranges, and sort the array
|
||||
* by start_pfn in monotonic increasing order.
|
||||
*/
|
||||
void __init insert_movablemem_map(unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
int pos, overlap;
|
||||
|
||||
/*
|
||||
* pos will be at the 1st overlapped range, or the position
|
||||
* where the element should be inserted.
|
||||
*/
|
||||
for (pos = 0; pos < movablemem_map.nr_map; pos++)
|
||||
if (start_pfn <= movablemem_map.map[pos].end_pfn)
|
||||
break;
|
||||
|
||||
/* If there is no overlapped range, just insert the element. */
|
||||
if (pos == movablemem_map.nr_map ||
|
||||
end_pfn < movablemem_map.map[pos].start_pfn) {
|
||||
/*
|
||||
* If pos is not the end of array, we need to move all
|
||||
* the rest elements backward.
|
||||
*/
|
||||
if (pos < movablemem_map.nr_map)
|
||||
memmove(&movablemem_map.map[pos+1],
|
||||
&movablemem_map.map[pos],
|
||||
sizeof(struct movablemem_entry) *
|
||||
(movablemem_map.nr_map - pos));
|
||||
movablemem_map.map[pos].start_pfn = start_pfn;
|
||||
movablemem_map.map[pos].end_pfn = end_pfn;
|
||||
movablemem_map.nr_map++;
|
||||
return;
|
||||
}
|
||||
|
||||
/* overlap will be at the last overlapped range */
|
||||
for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
|
||||
if (end_pfn < movablemem_map.map[overlap].start_pfn)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If there are more ranges overlapped, we need to merge them,
|
||||
* and move the rest elements forward.
|
||||
*/
|
||||
overlap--;
|
||||
movablemem_map.map[pos].start_pfn = min(start_pfn,
|
||||
movablemem_map.map[pos].start_pfn);
|
||||
movablemem_map.map[pos].end_pfn = max(end_pfn,
|
||||
movablemem_map.map[overlap].end_pfn);
|
||||
|
||||
if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
|
||||
memmove(&movablemem_map.map[pos+1],
|
||||
&movablemem_map.map[overlap+1],
|
||||
sizeof(struct movablemem_entry) *
|
||||
(movablemem_map.nr_map - overlap - 1));
|
||||
|
||||
movablemem_map.nr_map -= overlap - pos;
|
||||
}
|
||||
|
||||
/**
|
||||
* movablemem_map_add_region - Add a memory range into movablemem_map.
|
||||
* @start: physical start address of range
|
||||
* @end: physical end address of range
|
||||
*
|
||||
* This function transform the physical address into pfn, and then add the
|
||||
* range into movablemem_map by calling insert_movablemem_map().
|
||||
*/
|
||||
static void __init movablemem_map_add_region(u64 start, u64 size)
|
||||
{
|
||||
unsigned long start_pfn, end_pfn;
|
||||
|
||||
/* In case size == 0 or start + size overflows */
|
||||
if (start + size <= start)
|
||||
return;
|
||||
|
||||
if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
|
||||
pr_err("movablemem_map: too many entries;"
|
||||
" ignoring [mem %#010llx-%#010llx]\n",
|
||||
(unsigned long long) start,
|
||||
(unsigned long long) (start + size - 1));
|
||||
return;
|
||||
}
|
||||
|
||||
start_pfn = PFN_DOWN(start);
|
||||
end_pfn = PFN_UP(start + size);
|
||||
insert_movablemem_map(start_pfn, end_pfn);
|
||||
}
|
||||
|
||||
/*
|
||||
* cmdline_parse_movablemem_map - Parse boot option movablemem_map.
|
||||
* @p: The boot option of the following format:
|
||||
* movablemem_map=nn[KMG]@ss[KMG]
|
||||
*
|
||||
* This option sets the memory range [ss, ss+nn) to be used as movable memory.
|
||||
*
|
||||
* Return: 0 on success or -EINVAL on failure.
|
||||
*/
|
||||
static int __init cmdline_parse_movablemem_map(char *p)
|
||||
{
|
||||
char *oldp;
|
||||
u64 start_at, mem_size;
|
||||
|
||||
if (!p)
|
||||
goto err;
|
||||
|
||||
if (!strcmp(p, "acpi"))
|
||||
movablemem_map.acpi = true;
|
||||
|
||||
/*
|
||||
* If user decide to use info from BIOS, all the other user specified
|
||||
* ranges will be ingored.
|
||||
*/
|
||||
if (movablemem_map.acpi) {
|
||||
if (movablemem_map.nr_map) {
|
||||
memset(movablemem_map.map, 0,
|
||||
sizeof(struct movablemem_entry)
|
||||
* movablemem_map.nr_map);
|
||||
movablemem_map.nr_map = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
oldp = p;
|
||||
mem_size = memparse(p, &p);
|
||||
if (p == oldp)
|
||||
goto err;
|
||||
|
||||
if (*p == '@') {
|
||||
oldp = ++p;
|
||||
start_at = memparse(p, &p);
|
||||
if (p == oldp || *p != '\0')
|
||||
goto err;
|
||||
|
||||
movablemem_map_add_region(start_at, mem_size);
|
||||
return 0;
|
||||
}
|
||||
err:
|
||||
return -EINVAL;
|
||||
}
|
||||
early_param("movablemem_map", cmdline_parse_movablemem_map);
|
||||
|
||||
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
|
||||
|
||||
/**
|
||||
|
@ -5160,8 +5468,8 @@ static void calculate_totalreserve_pages(void)
|
|||
/* we treat the high watermark as reserved pages. */
|
||||
max += high_wmark_pages(zone);
|
||||
|
||||
if (max > zone->present_pages)
|
||||
max = zone->present_pages;
|
||||
if (max > zone->managed_pages)
|
||||
max = zone->managed_pages;
|
||||
reserve_pages += max;
|
||||
/*
|
||||
* Lowmem reserves are not available to
|
||||
|
@ -5193,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void)
|
|||
for_each_online_pgdat(pgdat) {
|
||||
for (j = 0; j < MAX_NR_ZONES; j++) {
|
||||
struct zone *zone = pgdat->node_zones + j;
|
||||
unsigned long present_pages = zone->present_pages;
|
||||
unsigned long managed_pages = zone->managed_pages;
|
||||
|
||||
zone->lowmem_reserve[j] = 0;
|
||||
|
||||
|
@ -5207,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void)
|
|||
sysctl_lowmem_reserve_ratio[idx] = 1;
|
||||
|
||||
lower_zone = pgdat->node_zones + idx;
|
||||
lower_zone->lowmem_reserve[j] = present_pages /
|
||||
lower_zone->lowmem_reserve[j] = managed_pages /
|
||||
sysctl_lowmem_reserve_ratio[idx];
|
||||
present_pages += lower_zone->present_pages;
|
||||
managed_pages += lower_zone->managed_pages;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -5228,14 +5536,14 @@ static void __setup_per_zone_wmarks(void)
|
|||
/* Calculate total number of !ZONE_HIGHMEM pages */
|
||||
for_each_zone(zone) {
|
||||
if (!is_highmem(zone))
|
||||
lowmem_pages += zone->present_pages;
|
||||
lowmem_pages += zone->managed_pages;
|
||||
}
|
||||
|
||||
for_each_zone(zone) {
|
||||
u64 tmp;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
tmp = (u64)pages_min * zone->present_pages;
|
||||
tmp = (u64)pages_min * zone->managed_pages;
|
||||
do_div(tmp, lowmem_pages);
|
||||
if (is_highmem(zone)) {
|
||||
/*
|
||||
|
@ -5247,13 +5555,10 @@ static void __setup_per_zone_wmarks(void)
|
|||
* deltas controls asynch page reclaim, and so should
|
||||
* not be capped for highmem.
|
||||
*/
|
||||
int min_pages;
|
||||
unsigned long min_pages;
|
||||
|
||||
min_pages = zone->present_pages / 1024;
|
||||
if (min_pages < SWAP_CLUSTER_MAX)
|
||||
min_pages = SWAP_CLUSTER_MAX;
|
||||
if (min_pages > 128)
|
||||
min_pages = 128;
|
||||
min_pages = zone->managed_pages / 1024;
|
||||
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
|
||||
zone->watermark[WMARK_MIN] = min_pages;
|
||||
} else {
|
||||
/*
|
||||
|
@ -5314,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
|
|||
unsigned int gb, ratio;
|
||||
|
||||
/* Zone size in gigabytes */
|
||||
gb = zone->present_pages >> (30 - PAGE_SHIFT);
|
||||
gb = zone->managed_pages >> (30 - PAGE_SHIFT);
|
||||
if (gb)
|
||||
ratio = int_sqrt(10 * gb);
|
||||
else
|
||||
|
@ -5400,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
|
|||
return rc;
|
||||
|
||||
for_each_zone(zone)
|
||||
zone->min_unmapped_pages = (zone->present_pages *
|
||||
zone->min_unmapped_pages = (zone->managed_pages *
|
||||
sysctl_min_unmapped_ratio) / 100;
|
||||
return 0;
|
||||
}
|
||||
|
@ -5416,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
|
|||
return rc;
|
||||
|
||||
for_each_zone(zone)
|
||||
zone->min_slab_pages = (zone->present_pages *
|
||||
zone->min_slab_pages = (zone->managed_pages *
|
||||
sysctl_min_slab_ratio) / 100;
|
||||
return 0;
|
||||
}
|
||||
|
@ -5458,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
|
|||
for_each_populated_zone(zone) {
|
||||
for_each_possible_cpu(cpu) {
|
||||
unsigned long high;
|
||||
high = zone->present_pages / percpu_pagelist_fraction;
|
||||
high = zone->managed_pages / percpu_pagelist_fraction;
|
||||
setup_pagelist_highmark(
|
||||
per_cpu_ptr(zone->pageset, cpu), high);
|
||||
}
|
||||
|
@ -5645,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
|
|||
pfn = page_to_pfn(page);
|
||||
bitmap = get_pageblock_bitmap(zone, pfn);
|
||||
bitidx = pfn_to_bitidx(zone, pfn);
|
||||
VM_BUG_ON(pfn < zone->zone_start_pfn);
|
||||
VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
|
||||
VM_BUG_ON(!zone_spans_pfn(zone, pfn));
|
||||
|
||||
for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
|
||||
if (flags & value)
|
||||
|
@ -5744,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page)
|
|||
|
||||
zone = page_zone(page);
|
||||
pfn = page_to_pfn(page);
|
||||
if (zone->zone_start_pfn > pfn ||
|
||||
zone->zone_start_pfn + zone->spanned_pages <= pfn)
|
||||
if (!zone_spans_pfn(zone, pfn))
|
||||
return false;
|
||||
|
||||
return !has_unmovable_pages(zone, page, 0, true);
|
||||
|
@ -5801,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
|
|||
&cc->migratepages);
|
||||
cc->nr_migratepages -= nr_reclaimed;
|
||||
|
||||
ret = migrate_pages(&cc->migratepages,
|
||||
alloc_migrate_target,
|
||||
0, false, MIGRATE_SYNC,
|
||||
MR_CMA);
|
||||
ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
|
||||
0, MIGRATE_SYNC, MR_CMA);
|
||||
}
|
||||
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
return ret > 0 ? 0 : ret;
|
||||
if (ret < 0) {
|
||||
putback_movable_pages(&cc->migratepages);
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
|
|||
*/
|
||||
if (rwsem_is_locked(&anon_vma->root->rwsem)) {
|
||||
anon_vma_lock_write(anon_vma);
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
}
|
||||
|
||||
kmem_cache_free(anon_vma_cachep, anon_vma);
|
||||
|
@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
|
|||
avc = NULL;
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
|
||||
if (unlikely(allocated))
|
||||
put_anon_vma(allocated);
|
||||
|
@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
|
|||
vma->anon_vma = anon_vma;
|
||||
anon_vma_lock_write(anon_vma);
|
||||
anon_vma_chain_link(vma, avc, anon_vma);
|
||||
anon_vma_unlock(anon_vma);
|
||||
anon_vma_unlock_write(anon_vma);
|
||||
|
||||
return 0;
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue