Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - most of the rest of MM
 - KASAN updates
 - lib/ updates
 - checkpatch updates
 - some binfmt_elf changes
 - various misc bits

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (115 commits)
  kernel/exit.c: avoid undefined behaviour when calling wait4()
  kernel/signal.c: avoid undefined behaviour in kill_something_info
  binfmt_elf: safely increment argv pointers
  s390: reduce ELF_ET_DYN_BASE
  powerpc: move ELF_ET_DYN_BASE to 4GB / 4MB
  arm64: move ELF_ET_DYN_BASE to 4GB / 4MB
  arm: move ELF_ET_DYN_BASE to 4MB
  binfmt_elf: use ELF_ET_DYN_BASE only for PIE
  fs, epoll: short circuit fetching events if thread has been killed
  checkpatch: improve multi-line alignment test
  checkpatch: improve macro reuse test
  checkpatch: change format of --color argument to --color[=WHEN]
  checkpatch: silence perl 5.26.0 unescaped left brace warnings
  checkpatch: improve tests for multiple line function definitions
  checkpatch: remove false warning for commit reference
  checkpatch: fix stepping through statements with $stat and ctx_statement_block
  checkpatch: [HLP]LIST_HEAD is also declaration
  checkpatch: warn when a MAINTAINERS entry isn't [A-Z]:\t
  checkpatch: improve the unnecessary OOM message test
  lib/bsearch.c: micro-optimize pivot position calculation
  ...
commit 9967468c0a
103 changed files with 1539 additions and 1250 deletions
@@ -789,23 +789,46 @@ way to trigger. Applications should do whatever they can to help the
system. It might be too late to consult with vmstat or any other
statistics, so it's advisable to take an immediate action.

The events are propagated upward until the event is handled, i.e. the
events are not pass-through. Here is what this means: for example you have
three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
and C, and suppose group C experiences some pressure. In this situation,
only group C will receive the notification, i.e. groups A and B will not
receive it. This is done to avoid excessive "broadcasting" of messages,
which disturbs the system and which is especially bad if we are low on
memory or thrashing. So, organize the cgroups wisely, or propagate the
events manually (or, ask us to implement the pass-through events,
explaining why would you need them.)

By default, events are propagated upward until the event is handled, i.e. the
events are not pass-through. For example, you have three cgroups: A->B->C. Now
you set up an event listener on cgroups A, B and C, and suppose group C
experiences some pressure. In this situation, only group C will receive the
notification, i.e. groups A and B will not receive it. This is done to avoid
excessive "broadcasting" of messages, which disturbs the system and which is
especially bad if we are low on memory or thrashing. Group B will receive
notification only if there are no event listeners for group C.

There are three optional modes that specify different propagation behavior:

- "default": this is the default behavior specified above. This mode is the
  same as omitting the optional mode parameter, preserved for backwards
  compatibility.

- "hierarchy": events always propagate up to the root, similar to the default
  behavior, except that propagation continues regardless of whether there are
  event listeners at each level. In the above example, groups A, B, and C will
  receive notification of memory pressure.

- "local": events are pass-through, i.e. a listener only receives
  notifications when memory pressure is experienced in the memcg for which
  the notification is registered. In the above example, group C will receive
  notification if registered for "local" notification and the group
  experiences memory pressure. However, group B will never receive
  notification, regardless of whether there is an event listener for group C,
  if group B is registered for local notification.

The level and event notification mode ("hierarchy" or "local", if necessary)
are specified by a comma-delimited string, i.e. "low,hierarchy" specifies
hierarchical, pass-through notification for all ancestor memcgs. The default,
non-pass-through notification does not specify a mode. "medium,local"
specifies pass-through notification for the medium level.

The file memory.pressure_level is only used to set up an eventfd. To
register a notification, an application must:

- create an eventfd using eventfd(2);
- open memory.pressure_level;
- write a string like "<event_fd> <fd of memory.pressure_level> <level>"
- write a string like "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
  to cgroup.event_control.

Application will be notified through eventfd when memory pressure is at

@@ -821,7 +844,7 @@ Test:
  # cd /sys/fs/cgroup/memory/
  # mkdir foo
  # cd foo
  # cgroup_event_listener memory.pressure_level low &
  # cgroup_event_listener memory.pressure_level low,hierarchy &
  # echo 8000000 > memory.limit_in_bytes
  # echo 8000000 > memory.memsw.limit_in_bytes
  # echo $$ > tasks

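The cgroup_event_listener helper used in the test above wraps exactly this
registration sequence. Purely as an illustration (this sketch is not part of
the patch set), the registration steps with the new "<level[,mode]>" syntax
could be open-coded roughly as follows, assuming the "foo" group created in
the test:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/foo";
	char path[256], line[64];
	uint64_t counter;
	int efd, pfd, cfd;

	efd = eventfd(0, 0);				/* 1. create an eventfd */
	snprintf(path, sizeof(path), "%s/memory.pressure_level", cg);
	pfd = open(path, O_RDONLY);			/* 2. open memory.pressure_level */
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	cfd = open(path, O_WRONLY);
	if (efd < 0 || pfd < 0 || cfd < 0) {
		perror("setup");
		return 1;
	}

	/* 3. register "<event_fd> <pressure_level fd> <level[,mode]>" */
	snprintf(line, sizeof(line), "%d %d low,hierarchy", efd, pfd);
	if (write(cfd, line, strlen(line)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}

	/* Block until the kernel signals memory pressure for this subtree. */
	if (read(efd, &counter, sizeof(counter)) == sizeof(counter))
		printf("low pressure events: %llu\n",
		       (unsigned long long)counter);
	return 0;
}

Reading the eventfd blocks until at least one event has been signalled and,
as with any eventfd, returns the count of events accumulated since the
previous read.
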
@@ -282,20 +282,26 @@ offlined it is possible to change the individual block's state by writing to the
% echo online > /sys/devices/system/memory/memoryXXX/state

This onlining will not change the ZONE type of the target memory block,
If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
If the memory block doesn't belong to any zone, an appropriate kernel zone
(usually ZONE_NORMAL) will be used unless the movable_node kernel command line
option is specified, in which case ZONE_MOVABLE will be used.

You can explicitly request to associate it with ZONE_MOVABLE by

% echo online_movable > /sys/devices/system/memory/memoryXXX/state
(NOTE: current limit: this memory block must be adjacent to ZONE_MOVABLE)

And if the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
Or you can explicitly request a kernel zone (usually ZONE_NORMAL) by:

% echo online_kernel > /sys/devices/system/memory/memoryXXX/state
(NOTE: current limit: this memory block must be adjacent to ZONE_NORMAL)

An explicit zone onlining can fail (e.g. when the range is already within
an existing and incompatible zone).

After this, memory block XXX's state will be 'online' and the amount of
available memory will be increased.

Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA).
This may be changed in future.

@@ -240,6 +240,26 @@ fragmentation index is <= extfrag_threshold. The default value is 500.

==============================================================

highmem_is_dirtyable

Available only for systems with CONFIG_HIGHMEM enabled (32-bit systems).

This parameter controls whether the high memory is considered for dirty
writers throttling. This is not the case by default, which means that
only the amount of memory directly visible/usable by the kernel can
be dirtied. As a result, on systems with a large amount of memory and
lowmem basically depleted, writers might be throttled too early and
streaming writes can get very slow.

Changing the value to non-zero would allow more memory to be dirtied
and thus allow writers to write more data which can be flushed to the
storage more effectively. Note that this also comes with a risk of
premature OOM killer invocations because some writers (e.g. direct block
device writes) can only use the low memory and they can fill it up with
dirty data without any throttling.

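Side note, not part of the patch: on a CONFIG_HIGHMEM kernel this knob is
exposed as /proc/sys/vm/highmem_is_dirtyable, so it can be enabled with
"sysctl -w vm.highmem_is_dirtyable=1", or programmatically along these lines:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Only exists on CONFIG_HIGHMEM (32-bit) kernels. */
	int fd = open("/proc/sys/vm/highmem_is_dirtyable", O_WRONLY);

	if (fd < 0) {
		perror("open");		/* e.g. 64-bit kernel: knob absent */
		return 1;
	}
	if (write(fd, "1\n", 2) != 2)	/* allow highmem to be dirtied */
		perror("write");
	close(fd);
	return 0;
}
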
==============================================================

hugepages_treat_as_movable

This parameter controls whether we can allocate hugepages from ZONE_MOVABLE

MAINTAINERS | 11

@@ -10559,6 +10559,17 @@ W: http://wireless.kernel.org/en/users/Drivers/p54
S: Obsolete
F: drivers/net/wireless/intersil/prism54/

PROC SYSCTL
M: "Luis R. Rodriguez" <mcgrof@kernel.org>
M: Kees Cook <keescook@chromium.org>
L: linux-kernel@vger.kernel.org
L: linux-fsdevel@vger.kernel.org
S: Maintained
F: fs/proc/proc_sysctl.c
F: include/linux/sysctl.h
F: kernel/sysctl.c
F: tools/testing/selftests/sysctl/

PS3 NETWORK SUPPORT
M: Geoff Levand <geoff@infradead.org>
L: netdev@vger.kernel.org

@ -33,6 +33,7 @@ extern void error(char *);
|
|||
/* Not needed, but used in some headers pulled in by decompressors */
|
||||
extern char * strstr(const char * s1, const char *s2);
|
||||
extern size_t strlen(const char *s);
|
||||
extern int memcmp(const void *cs, const void *ct, size_t count);
|
||||
|
||||
#ifdef CONFIG_KERNEL_GZIP
|
||||
#include "../../../../lib/decompress_inflate.c"
|
||||
|
|
|
@ -112,12 +112,8 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
|
|||
#define CORE_DUMP_USE_REGSET
|
||||
#define ELF_EXEC_PAGESIZE 4096
|
||||
|
||||
/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
|
||||
use of this is to invoke "./ld.so someprog" to test out a new version of
|
||||
the loader. We need to make sure that it is out of the way of the program
|
||||
that it will "exec", and that there is sufficient room for the brk. */
|
||||
|
||||
#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
|
||||
/* This is the base location for PIE (ET_DYN with INTERP) loads. */
|
||||
#define ELF_ET_DYN_BASE 0x400000UL
|
||||
|
||||
/* When the program starts, a1 contains a pointer to a function to be
|
||||
registered with atexit, as per the SVR4 ABI. A value of 0 means we
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/initrd.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/root_dev.h>
|
||||
|
@ -91,8 +92,6 @@ __tagtable(ATAG_VIDEOTEXT, parse_tag_videotext);
|
|||
#ifdef CONFIG_BLK_DEV_RAM
|
||||
static int __init parse_tag_ramdisk(const struct tag *tag)
|
||||
{
|
||||
extern int rd_size, rd_image_start, rd_prompt, rd_doload;
|
||||
|
||||
rd_image_start = tag->u.ramdisk.start;
|
||||
rd_doload = (tag->u.ramdisk.flags & 1) == 0;
|
||||
rd_prompt = (tag->u.ramdisk.flags & 2) == 0;
|
||||
|
|
|
@ -113,12 +113,11 @@
|
|||
#define ELF_EXEC_PAGESIZE PAGE_SIZE
|
||||
|
||||
/*
|
||||
* This is the location that an ET_DYN program is loaded if exec'ed. Typical
|
||||
* use of this is to invoke "./ld.so someprog" to test out a new version of
|
||||
* the loader. We need to make sure that it is out of the way of the program
|
||||
* that it will "exec", and that there is sufficient room for the brk.
|
||||
* This is the base location for PIE (ET_DYN with INTERP) loads. On
|
||||
* 64-bit, this is raised to 4GB to leave the entire 32-bit address
|
||||
* space open for things that want to use the area for 32-bit pointers.
|
||||
*/
|
||||
#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3)
|
||||
#define ELF_ET_DYN_BASE 0x100000000UL
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
||||
|
@ -174,7 +173,8 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
|
|||
|
||||
#ifdef CONFIG_COMPAT
|
||||
|
||||
#define COMPAT_ELF_ET_DYN_BASE (2 * TASK_SIZE_32 / 3)
|
||||
/* PIE load location for compat arm. Must match ARM ELF_ET_DYN_BASE. */
|
||||
#define COMPAT_ELF_ET_DYN_BASE 0x000400000UL
|
||||
|
||||
/* AArch32 registers. */
|
||||
#define COMPAT_ELF_NGREG 18
|
||||
|
|
|
@ -191,14 +191,8 @@ void __init kasan_init(void)
|
|||
if (start >= end)
|
||||
break;
|
||||
|
||||
/*
|
||||
* end + 1 here is intentional. We check several shadow bytes in
|
||||
* advance to slightly speed up fastpath. In some rare cases
|
||||
* we could cross boundary of mapped shadow, so we just map
|
||||
* some more here.
|
||||
*/
|
||||
vmemmap_populate((unsigned long)kasan_mem_to_shadow(start),
|
||||
(unsigned long)kasan_mem_to_shadow(end) + 1,
|
||||
(unsigned long)kasan_mem_to_shadow(end),
|
||||
pfn_to_nid(virt_to_pfn(start)));
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
|
||||
generic-y += clkdev.h
|
||||
generic-y += device.h
|
||||
generic-y += exec.h
|
||||
generic-y += extable.h
|
||||
generic-y += fb.h
|
||||
generic-y += irq_work.h
|
||||
generic-y += mcs_spinlock.h
|
||||
generic-y += mm-arch-hooks.h
|
||||
|
|
|
@ -76,6 +76,7 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v);
|
|||
* - if (*ptr != test) then orig = *ptr;
|
||||
*/
|
||||
extern uint64_t __cmpxchg_64(uint64_t test, uint64_t new, volatile uint64_t *v);
|
||||
#define cmpxchg64(p, o, n) __cmpxchg_64((o), (n), (p))
|
||||
|
||||
#ifndef CONFIG_FRV_OUTOFLINE_ATOMIC_OPS
|
||||
|
||||
|
|
|
@ -1,7 +0,0 @@
|
|||
/*
|
||||
* Arch specific extensions to struct device
|
||||
*
|
||||
* This file is released under the GPLv2
|
||||
*/
|
||||
#include <asm-generic/device.h>
|
||||
|
|
@ -1,12 +0,0 @@
|
|||
#ifndef _ASM_FB_H_
|
||||
#define _ASM_FB_H_
|
||||
#include <linux/fb.h>
|
||||
|
||||
#define fb_pgprotect(...) do {} while (0)
|
||||
|
||||
static inline int fb_is_primary_device(struct fb_info *info)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* _ASM_FB_H_ */
|
|
@ -317,7 +317,8 @@ const struct exception_table_entry *search_module_dbetables(unsigned long addr)
|
|||
|
||||
spin_lock_irqsave(&dbe_lock, flags);
|
||||
list_for_each_entry(dbe, &dbe_list, dbe_list) {
|
||||
e = search_extable(dbe->dbe_start, dbe->dbe_end - 1, addr);
|
||||
e = search_extable(dbe->dbe_start,
|
||||
dbe->dbe_end - dbe->dbe_start, addr);
|
||||
if (e)
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -429,7 +429,8 @@ static const struct exception_table_entry *search_dbe_tables(unsigned long addr)
|
|||
{
|
||||
const struct exception_table_entry *e;
|
||||
|
||||
e = search_extable(__start___dbe_table, __stop___dbe_table - 1, addr);
|
||||
e = search_extable(__start___dbe_table,
|
||||
__stop___dbe_table - __start___dbe_table, addr);
|
||||
if (!e)
|
||||
e = search_module_dbetables(addr);
|
||||
return e;
|
||||
|
|
|
@ -23,12 +23,13 @@
|
|||
#define CORE_DUMP_USE_REGSET
|
||||
#define ELF_EXEC_PAGESIZE PAGE_SIZE
|
||||
|
||||
/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
|
||||
use of this is to invoke "./ld.so someprog" to test out a new version of
|
||||
the loader. We need to make sure that it is out of the way of the program
|
||||
that it will "exec", and that there is sufficient room for the brk. */
|
||||
|
||||
#define ELF_ET_DYN_BASE 0x20000000
|
||||
/*
|
||||
* This is the base location for PIE (ET_DYN with INTERP) loads. On
|
||||
* 64-bit, this is raised to 4GB to leave the entire 32-bit address
|
||||
* space open for things that want to use the area for 32-bit pointers.
|
||||
*/
|
||||
#define ELF_ET_DYN_BASE (is_32bit_task() ? 0x000400000UL : \
|
||||
0x100000000UL)
|
||||
|
||||
#define ELF_CORE_EFLAGS (is_elf2_task() ? 2 : 0)
|
||||
|
||||
|
|
|
@ -193,14 +193,13 @@ struct arch_elf_state {
|
|||
#define CORE_DUMP_USE_REGSET
|
||||
#define ELF_EXEC_PAGESIZE 4096
|
||||
|
||||
/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
|
||||
use of this is to invoke "./ld.so someprog" to test out a new version of
|
||||
the loader. We need to make sure that it is out of the way of the program
|
||||
that it will "exec", and that there is sufficient room for the brk. 64-bit
|
||||
tasks are aligned to 4GB. */
|
||||
#define ELF_ET_DYN_BASE (is_compat_task() ? \
|
||||
(STACK_TOP / 3 * 2) : \
|
||||
(STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))
|
||||
/*
|
||||
* This is the base location for PIE (ET_DYN with INTERP) loads. On
|
||||
* 64-bit, this is raised to 4GB to leave the entire 32-bit address
|
||||
* space open for things that want to use the area for 32-bit pointers.
|
||||
*/
|
||||
#define ELF_ET_DYN_BASE (is_compat_task() ? 0x000400000UL : \
|
||||
0x100000000UL)
|
||||
|
||||
/* This yields a mask that user programs can use to figure out what
|
||||
instruction set this CPU supports. */
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
* License. See the file "COPYING" in the main directory of this archive
|
||||
* for more details.
|
||||
*/
|
||||
#include <linux/bsearch.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/extable.h>
|
||||
#include <linux/uaccess.h>
|
||||
|
@ -40,10 +41,23 @@ static const struct exception_table_entry *check_exception_ranges(unsigned long
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static int cmp_ex_search(const void *key, const void *elt)
|
||||
{
|
||||
const struct exception_table_entry *_elt = elt;
|
||||
unsigned long _key = *(unsigned long *)key;
|
||||
|
||||
/* avoid overflow */
|
||||
if (_key > _elt->insn)
|
||||
return 1;
|
||||
if (_key < _elt->insn)
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Simple binary search */
|
||||
const struct exception_table_entry *
|
||||
search_extable(const struct exception_table_entry *first,
|
||||
const struct exception_table_entry *last,
|
||||
search_extable(const struct exception_table_entry *base,
|
||||
const size_t num,
|
||||
unsigned long value)
|
||||
{
|
||||
const struct exception_table_entry *mid;
|
||||
|
@ -52,20 +66,8 @@ search_extable(const struct exception_table_entry *first,
|
|||
if (mid)
|
||||
return mid;
|
||||
|
||||
while (first <= last) {
|
||||
long diff;
|
||||
|
||||
mid = (last - first) / 2 + first;
|
||||
diff = mid->insn - value;
|
||||
if (diff == 0)
|
||||
return mid;
|
||||
else if (diff < 0)
|
||||
first = mid+1;
|
||||
else
|
||||
last = mid-1;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
return bsearch(&value, base, num,
|
||||
sizeof(struct exception_table_entry), cmp_ex_search);
|
||||
}
|
||||
|
||||
int fixup_exception(struct pt_regs *regs)
|
||||
|
|
|
@ -13,11 +13,11 @@ void sort_extable(struct exception_table_entry *start,
|
|||
|
||||
/* Caller knows they are in a range if ret->fixup == 0 */
|
||||
const struct exception_table_entry *
|
||||
search_extable(const struct exception_table_entry *start,
|
||||
const struct exception_table_entry *last,
|
||||
search_extable(const struct exception_table_entry *base,
|
||||
const size_t num,
|
||||
unsigned long value)
|
||||
{
|
||||
const struct exception_table_entry *walk;
|
||||
int i;
|
||||
|
||||
/* Single insn entries are encoded as:
|
||||
* word 1: insn address
|
||||
|
@ -37,30 +37,30 @@ search_extable(const struct exception_table_entry *start,
|
|||
*/
|
||||
|
||||
/* 1. Try to find an exact match. */
|
||||
for (walk = start; walk <= last; walk++) {
|
||||
if (walk->fixup == 0) {
|
||||
for (i = 0; i < num; i++) {
|
||||
if (base[i].fixup == 0) {
|
||||
/* A range entry, skip both parts. */
|
||||
walk++;
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* A deleted entry; see trim_init_extable */
|
||||
if (walk->fixup == -1)
|
||||
if (base[i].fixup == -1)
|
||||
continue;
|
||||
|
||||
if (walk->insn == value)
|
||||
return walk;
|
||||
if (base[i].insn == value)
|
||||
return &base[i];
|
||||
}
|
||||
|
||||
/* 2. Try to find a range match. */
|
||||
for (walk = start; walk <= (last - 1); walk++) {
|
||||
if (walk->fixup)
|
||||
for (i = 0; i < (num - 1); i++) {
|
||||
if (base[i].fixup)
|
||||
continue;
|
||||
|
||||
if (walk[0].insn <= value && walk[1].insn > value)
|
||||
return walk;
|
||||
if (base[i].insn <= value && base[i + 1].insn > value)
|
||||
return &base[i];
|
||||
|
||||
walk++;
|
||||
i++;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
|
|
@ -245,12 +245,13 @@ extern int force_personality32;
|
|||
#define CORE_DUMP_USE_REGSET
|
||||
#define ELF_EXEC_PAGESIZE 4096
|
||||
|
||||
/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
|
||||
use of this is to invoke "./ld.so someprog" to test out a new version of
|
||||
the loader. We need to make sure that it is out of the way of the program
|
||||
that it will "exec", and that there is sufficient room for the brk. */
|
||||
|
||||
#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
|
||||
/*
|
||||
* This is the base location for PIE (ET_DYN with INTERP) loads. On
|
||||
* 64-bit, this is raised to 4GB to leave the entire 32-bit address
|
||||
* space open for things that want to use the area for 32-bit pointers.
|
||||
*/
|
||||
#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \
|
||||
0x100000000UL)
|
||||
|
||||
/* This yields a mask that user programs can use to figure out what
|
||||
instruction set this CPU supports. This could be done in user space,
|
||||
|
|
|
@ -23,12 +23,7 @@ static int __init map_range(struct range *range)
|
|||
start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
|
||||
end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
|
||||
|
||||
/*
|
||||
* end + 1 here is intentional. We check several shadow bytes in advance
|
||||
* to slightly speed up fastpath. In some rare cases we could cross
|
||||
* boundary of mapped shadow, so we just map some more here.
|
||||
*/
|
||||
return vmemmap_populate(start, end + 1, NUMA_NO_NODE);
|
||||
return vmemmap_populate(start, end, NUMA_NO_NODE);
|
||||
}
|
||||
|
||||
static void __init clear_pgds(unsigned long start,
|
||||
|
|
|
@ -288,7 +288,7 @@ static void node_device_release(struct device *dev)
|
|||
*
|
||||
* Initialize and register the node device.
|
||||
*/
|
||||
static int register_node(struct node *node, int num, struct node *parent)
|
||||
static int register_node(struct node *node, int num)
|
||||
{
|
||||
int error;
|
||||
|
||||
|
@ -567,19 +567,14 @@ static void init_node_hugetlb_work(int nid) { }
|
|||
|
||||
int __register_one_node(int nid)
|
||||
{
|
||||
int p_node = parent_node(nid);
|
||||
struct node *parent = NULL;
|
||||
int error;
|
||||
int cpu;
|
||||
|
||||
if (p_node != nid)
|
||||
parent = node_devices[p_node];
|
||||
|
||||
node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
|
||||
if (!node_devices[nid])
|
||||
return -ENOMEM;
|
||||
|
||||
error = register_node(node_devices[nid], nid, parent);
|
||||
error = register_node(node_devices[nid], nid);
|
||||
|
||||
/* link cpu under this node */
|
||||
for_each_present_cpu(cpu) {
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/initrd.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/major.h>
|
||||
|
|
|
@ -68,13 +68,11 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
|
|||
|
||||
bool zcomp_available_algorithm(const char *comp)
|
||||
{
|
||||
int i = 0;
|
||||
int i;
|
||||
|
||||
while (backends[i]) {
|
||||
if (sysfs_streq(comp, backends[i]))
|
||||
return true;
|
||||
i++;
|
||||
}
|
||||
i = __sysfs_match_string(backends, -1, comp);
|
||||
if (i >= 0)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* Crypto does not ignore a trailing new line symbol,
|
||||
|
|
|
@ -1124,7 +1124,7 @@ static struct attribute *zram_disk_attrs[] = {
|
|||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group zram_disk_attr_group = {
|
||||
static const struct attribute_group zram_disk_attr_group = {
|
||||
.attrs = zram_disk_attrs,
|
||||
};
|
||||
|
||||
|
|
|
@ -163,8 +163,6 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
|
|||
unsigned long p = bprm->p;
|
||||
int argc = bprm->argc;
|
||||
int envc = bprm->envc;
|
||||
elf_addr_t __user *argv;
|
||||
elf_addr_t __user *envp;
|
||||
elf_addr_t __user *sp;
|
||||
elf_addr_t __user *u_platform;
|
||||
elf_addr_t __user *u_base_platform;
|
||||
|
@ -304,38 +302,38 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
|
|||
/* Now, let's put argc (and argv, envp if appropriate) on the stack */
|
||||
if (__put_user(argc, sp++))
|
||||
return -EFAULT;
|
||||
argv = sp;
|
||||
envp = argv + argc + 1;
|
||||
|
||||
/* Populate argv and envp */
|
||||
/* Populate list of argv pointers back to argv strings. */
|
||||
p = current->mm->arg_end = current->mm->arg_start;
|
||||
while (argc-- > 0) {
|
||||
size_t len;
|
||||
if (__put_user((elf_addr_t)p, argv++))
|
||||
if (__put_user((elf_addr_t)p, sp++))
|
||||
return -EFAULT;
|
||||
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
|
||||
if (!len || len > MAX_ARG_STRLEN)
|
||||
return -EINVAL;
|
||||
p += len;
|
||||
}
|
||||
if (__put_user(0, argv))
|
||||
if (__put_user(0, sp++))
|
||||
return -EFAULT;
|
||||
current->mm->arg_end = current->mm->env_start = p;
|
||||
current->mm->arg_end = p;
|
||||
|
||||
/* Populate list of envp pointers back to envp strings. */
|
||||
current->mm->env_end = current->mm->env_start = p;
|
||||
while (envc-- > 0) {
|
||||
size_t len;
|
||||
if (__put_user((elf_addr_t)p, envp++))
|
||||
if (__put_user((elf_addr_t)p, sp++))
|
||||
return -EFAULT;
|
||||
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
|
||||
if (!len || len > MAX_ARG_STRLEN)
|
||||
return -EINVAL;
|
||||
p += len;
|
||||
}
|
||||
if (__put_user(0, envp))
|
||||
if (__put_user(0, sp++))
|
||||
return -EFAULT;
|
||||
current->mm->env_end = p;
|
||||
|
||||
/* Put the elf_info on the stack in the right place. */
|
||||
sp = (elf_addr_t __user *)envp + 1;
|
||||
if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
|
@ -927,17 +925,60 @@ static int load_elf_binary(struct linux_binprm *bprm)
|
|||
elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
|
||||
|
||||
vaddr = elf_ppnt->p_vaddr;
|
||||
/*
|
||||
* If we are loading ET_EXEC or we have already performed
|
||||
* the ET_DYN load_addr calculations, proceed normally.
|
||||
*/
|
||||
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
|
||||
elf_flags |= MAP_FIXED;
|
||||
} else if (loc->elf_ex.e_type == ET_DYN) {
|
||||
/* Try and get dynamic programs out of the way of the
|
||||
* default mmap base, as well as whatever program they
|
||||
* might try to exec. This is because the brk will
|
||||
* follow the loader, and is not movable. */
|
||||
load_bias = ELF_ET_DYN_BASE - vaddr;
|
||||
if (current->flags & PF_RANDOMIZE)
|
||||
load_bias += arch_mmap_rnd();
|
||||
load_bias = ELF_PAGESTART(load_bias);
|
||||
/*
|
||||
* This logic is run once for the first LOAD Program
|
||||
* Header for ET_DYN binaries to calculate the
|
||||
* randomization (load_bias) for all the LOAD
|
||||
* Program Headers, and to calculate the entire
|
||||
* size of the ELF mapping (total_size). (Note that
|
||||
* load_addr_set is set to true later once the
|
||||
* initial mapping is performed.)
|
||||
*
|
||||
* There are effectively two types of ET_DYN
|
||||
* binaries: programs (i.e. PIE: ET_DYN with INTERP)
|
||||
* and loaders (ET_DYN without INTERP, since they
|
||||
* _are_ the ELF interpreter). The loaders must
|
||||
* be loaded away from programs since the program
|
||||
* may otherwise collide with the loader (especially
|
||||
* for ET_EXEC which does not have a randomized
|
||||
* position). For example to handle invocations of
|
||||
* "./ld.so someprog" to test out a new version of
|
||||
* the loader, the subsequent program that the
|
||||
* loader loads must avoid the loader itself, so
|
||||
* they cannot share the same load range. Sufficient
|
||||
* room for the brk must be allocated with the
|
||||
* loader as well, since brk must be available with
|
||||
* the loader.
|
||||
*
|
||||
* Therefore, programs are loaded offset from
|
||||
* ELF_ET_DYN_BASE and loaders are loaded into the
|
||||
* independently randomized mmap region (0 load_bias
|
||||
* without MAP_FIXED).
|
||||
*/
|
||||
if (elf_interpreter) {
|
||||
load_bias = ELF_ET_DYN_BASE;
|
||||
if (current->flags & PF_RANDOMIZE)
|
||||
load_bias += arch_mmap_rnd();
|
||||
elf_flags |= MAP_FIXED;
|
||||
} else
|
||||
load_bias = 0;
|
||||
|
||||
/*
|
||||
* Since load_bias is used for all subsequent loading
|
||||
* calculations, we must lower it by the first vaddr
|
||||
* so that the remaining calculations based on the
|
||||
* ELF vaddrs will be correctly offset. The result
|
||||
* is then page aligned.
|
||||
*/
|
||||
load_bias = ELF_PAGESTART(load_bias - vaddr);
|
||||
|
||||
total_size = total_mapping_size(elf_phdata,
|
||||
loc->elf_ex.e_phnum);
|
||||
if (!total_size) {
|
||||
|
|
43
fs/buffer.c
43
fs/buffer.c
|
@ -1281,44 +1281,31 @@ static inline void check_irqs_on(void)
|
|||
}
|
||||
|
||||
/*
|
||||
* The LRU management algorithm is dopey-but-simple. Sorry.
|
||||
* Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
|
||||
* inserted at the front, and the buffer_head at the back if any is evicted.
|
||||
* Or, if already in the LRU it is moved to the front.
|
||||
*/
|
||||
static void bh_lru_install(struct buffer_head *bh)
|
||||
{
|
||||
struct buffer_head *evictee = NULL;
|
||||
struct buffer_head *evictee = bh;
|
||||
struct bh_lru *b;
|
||||
int i;
|
||||
|
||||
check_irqs_on();
|
||||
bh_lru_lock();
|
||||
if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
|
||||
struct buffer_head *bhs[BH_LRU_SIZE];
|
||||
int in;
|
||||
int out = 0;
|
||||
|
||||
get_bh(bh);
|
||||
bhs[out++] = bh;
|
||||
for (in = 0; in < BH_LRU_SIZE; in++) {
|
||||
struct buffer_head *bh2 =
|
||||
__this_cpu_read(bh_lrus.bhs[in]);
|
||||
|
||||
if (bh2 == bh) {
|
||||
__brelse(bh2);
|
||||
} else {
|
||||
if (out >= BH_LRU_SIZE) {
|
||||
BUG_ON(evictee != NULL);
|
||||
evictee = bh2;
|
||||
} else {
|
||||
bhs[out++] = bh2;
|
||||
}
|
||||
}
|
||||
b = this_cpu_ptr(&bh_lrus);
|
||||
for (i = 0; i < BH_LRU_SIZE; i++) {
|
||||
swap(evictee, b->bhs[i]);
|
||||
if (evictee == bh) {
|
||||
bh_lru_unlock();
|
||||
return;
|
||||
}
|
||||
while (out < BH_LRU_SIZE)
|
||||
bhs[out++] = NULL;
|
||||
memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
|
||||
}
|
||||
bh_lru_unlock();
|
||||
|
||||
if (evictee)
|
||||
__brelse(evictee);
|
||||
get_bh(bh);
|
||||
bh_lru_unlock();
|
||||
brelse(evictee);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -1160,11 +1160,12 @@ void shrink_dcache_sb(struct super_block *sb)
|
|||
LIST_HEAD(dispose);
|
||||
|
||||
freed = list_lru_walk(&sb->s_dentry_lru,
|
||||
dentry_lru_isolate_shrink, &dispose, UINT_MAX);
|
||||
dentry_lru_isolate_shrink, &dispose, 1024);
|
||||
|
||||
this_cpu_sub(nr_dentry_unused, freed);
|
||||
shrink_dentry_list(&dispose);
|
||||
} while (freed > 0);
|
||||
cond_resched();
|
||||
} while (list_lru_count(&sb->s_dentry_lru) > 0);
|
||||
}
|
||||
EXPORT_SYMBOL(shrink_dcache_sb);
|
||||
|
||||
|
|
|
@ -1748,6 +1748,16 @@ fetch_events:
|
|||
* to TASK_INTERRUPTIBLE before doing the checks.
|
||||
*/
|
||||
set_current_state(TASK_INTERRUPTIBLE);
|
||||
/*
|
||||
* Always short-circuit for fatal signals to allow
|
||||
* threads to make a timely exit without the chance of
|
||||
* finding more events available and fetching
|
||||
* repeatedly.
|
||||
*/
|
||||
if (fatal_signal_pending(current)) {
|
||||
res = -EINTR;
|
||||
break;
|
||||
}
|
||||
if (ep_events_available(ep) || timed_out)
|
||||
break;
|
||||
if (signal_pending(current)) {
|
||||
|
|
|
@ -851,6 +851,16 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
|
|||
return MIGRATEPAGE_SUCCESS;
|
||||
}
|
||||
|
||||
static int hugetlbfs_error_remove_page(struct address_space *mapping,
|
||||
struct page *page)
|
||||
{
|
||||
struct inode *inode = mapping->host;
|
||||
|
||||
remove_huge_page(page);
|
||||
hugetlb_fix_reserve_counts(inode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
|
||||
{
|
||||
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
|
||||
|
@ -966,6 +976,7 @@ static const struct address_space_operations hugetlbfs_aops = {
|
|||
.write_end = hugetlbfs_write_end,
|
||||
.set_page_dirty = hugetlbfs_set_page_dirty,
|
||||
.migratepage = hugetlbfs_migrate_page,
|
||||
.error_remove_page = hugetlbfs_error_remove_page,
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -180,7 +180,6 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
|
|||
}
|
||||
|
||||
static DEFINE_IDA(proc_inum_ida);
|
||||
static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
|
||||
|
||||
#define PROC_DYNAMIC_FIRST 0xF0000000U
|
||||
|
||||
|
@ -190,37 +189,20 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
|
|||
*/
|
||||
int proc_alloc_inum(unsigned int *inum)
|
||||
{
|
||||
unsigned int i;
|
||||
int error;
|
||||
int i;
|
||||
|
||||
retry:
|
||||
if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL))
|
||||
return -ENOMEM;
|
||||
i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1,
|
||||
GFP_KERNEL);
|
||||
if (i < 0)
|
||||
return i;
|
||||
|
||||
spin_lock_irq(&proc_inum_lock);
|
||||
error = ida_get_new(&proc_inum_ida, &i);
|
||||
spin_unlock_irq(&proc_inum_lock);
|
||||
if (error == -EAGAIN)
|
||||
goto retry;
|
||||
else if (error)
|
||||
return error;
|
||||
|
||||
if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
|
||||
spin_lock_irq(&proc_inum_lock);
|
||||
ida_remove(&proc_inum_ida, i);
|
||||
spin_unlock_irq(&proc_inum_lock);
|
||||
return -ENOSPC;
|
||||
}
|
||||
*inum = PROC_DYNAMIC_FIRST + i;
|
||||
*inum = PROC_DYNAMIC_FIRST + (unsigned int)i;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void proc_free_inum(unsigned int inum)
|
||||
{
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&proc_inum_lock, flags);
|
||||
ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
|
||||
spin_unlock_irqrestore(&proc_inum_lock, flags);
|
||||
ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -298,7 +298,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
|
|||
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
|
||||
}
|
||||
|
||||
/* We don't show the stack guard page in /proc/maps */
|
||||
start = vma->vm_start;
|
||||
end = vma->vm_end;
|
||||
|
||||
|
|
|
@ -97,6 +97,7 @@ extern void warn_slowpath_null(const char *file, const int line);
|
|||
|
||||
/* used internally by panic.c */
|
||||
struct warn_args;
|
||||
struct pt_regs;
|
||||
|
||||
void __warn(const char *file, int line, void *caller, unsigned taint,
|
||||
struct pt_regs *regs, struct warn_args *args);
|
||||
|
|
|
@ -104,22 +104,9 @@ static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
|
|||
return percpu_counter_read_positive(&wb->stat[item]);
|
||||
}
|
||||
|
||||
static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
|
||||
enum wb_stat_item item)
|
||||
{
|
||||
return percpu_counter_sum_positive(&wb->stat[item]);
|
||||
}
|
||||
|
||||
static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
|
||||
{
|
||||
s64 sum;
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
sum = __wb_stat_sum(wb, item);
|
||||
local_irq_restore(flags);
|
||||
|
||||
return sum;
|
||||
return percpu_counter_sum_positive(&wb->stat[item]);
|
||||
}
|
||||
|
||||
extern void wb_writeout_inc(struct bdi_writeback *wb);
|
||||
|
|
|
@ -112,9 +112,8 @@ extern int __bitmap_intersects(const unsigned long *bitmap1,
|
|||
extern int __bitmap_subset(const unsigned long *bitmap1,
|
||||
const unsigned long *bitmap2, unsigned int nbits);
|
||||
extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
|
||||
|
||||
extern void bitmap_set(unsigned long *map, unsigned int start, int len);
|
||||
extern void bitmap_clear(unsigned long *map, unsigned int start, int len);
|
||||
extern void __bitmap_set(unsigned long *map, unsigned int start, int len);
|
||||
extern void __bitmap_clear(unsigned long *map, unsigned int start, int len);
|
||||
|
||||
extern unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
|
||||
unsigned long size,
|
||||
|
@ -267,10 +266,8 @@ static inline int bitmap_equal(const unsigned long *src1,
|
|||
{
|
||||
if (small_const_nbits(nbits))
|
||||
return !((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits));
|
||||
#ifdef CONFIG_S390
|
||||
if (__builtin_constant_p(nbits) && (nbits % BITS_PER_LONG) == 0)
|
||||
if (__builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8))
|
||||
return !memcmp(src1, src2, nbits / 8);
|
||||
#endif
|
||||
return __bitmap_equal(src1, src2, nbits);
|
||||
}
|
||||
|
||||
|
@ -315,6 +312,30 @@ static __always_inline int bitmap_weight(const unsigned long *src, unsigned int
|
|||
return __bitmap_weight(src, nbits);
|
||||
}
|
||||
|
||||
static __always_inline void bitmap_set(unsigned long *map, unsigned int start,
|
||||
unsigned int nbits)
|
||||
{
|
||||
if (__builtin_constant_p(nbits) && nbits == 1)
|
||||
__set_bit(start, map);
|
||||
else if (__builtin_constant_p(start & 7) && IS_ALIGNED(start, 8) &&
|
||||
__builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8))
|
||||
memset((char *)map + start / 8, 0xff, nbits / 8);
|
||||
else
|
||||
__bitmap_set(map, start, nbits);
|
||||
}
|
||||
|
||||
static __always_inline void bitmap_clear(unsigned long *map, unsigned int start,
|
||||
unsigned int nbits)
|
||||
{
|
||||
if (__builtin_constant_p(nbits) && nbits == 1)
|
||||
__clear_bit(start, map);
|
||||
else if (__builtin_constant_p(start & 7) && IS_ALIGNED(start, 8) &&
|
||||
__builtin_constant_p(nbits & 7) && IS_ALIGNED(nbits, 8))
|
||||
memset((char *)map + start / 8, 0, nbits / 8);
|
||||
else
|
||||
__bitmap_clear(map, start, nbits);
|
||||
}
|
||||
|
||||
static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
|
||||
unsigned int shift, int nbits)
|
||||
{
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
#include <asm/bug.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/build_bug.h>
|
||||
|
||||
enum bug_trap_type {
|
||||
BUG_TRAP_TYPE_NONE = 0,
|
||||
|
@ -13,80 +14,9 @@ enum bug_trap_type {
|
|||
struct pt_regs;
|
||||
|
||||
#ifdef __CHECKER__
|
||||
#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0)
|
||||
#define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0)
|
||||
#define BUILD_BUG_ON_ZERO(e) (0)
|
||||
#define BUILD_BUG_ON_NULL(e) ((void*)0)
|
||||
#define BUILD_BUG_ON_INVALID(e) (0)
|
||||
#define BUILD_BUG_ON_MSG(cond, msg) (0)
|
||||
#define BUILD_BUG_ON(condition) (0)
|
||||
#define BUILD_BUG() (0)
|
||||
#define MAYBE_BUILD_BUG_ON(cond) (0)
|
||||
#else /* __CHECKER__ */
|
||||
|
||||
/* Force a compilation error if a constant expression is not a power of 2 */
|
||||
#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \
|
||||
BUILD_BUG_ON(((n) & ((n) - 1)) != 0)
|
||||
#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \
|
||||
BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0))
|
||||
|
||||
/* Force a compilation error if condition is true, but also produce a
|
||||
result (of value 0 and type size_t), so the expression can be used
|
||||
e.g. in a structure initializer (or where-ever else comma expressions
|
||||
aren't permitted). */
|
||||
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
|
||||
#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
|
||||
|
||||
/*
|
||||
* BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the
|
||||
* expression but avoids the generation of any code, even if that expression
|
||||
* has side-effects.
|
||||
*/
|
||||
#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
|
||||
|
||||
/**
|
||||
* BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied
|
||||
* error message.
|
||||
* @condition: the condition which the compiler should know is false.
|
||||
*
|
||||
* See BUILD_BUG_ON for description.
|
||||
*/
|
||||
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
|
||||
|
||||
/**
|
||||
* BUILD_BUG_ON - break compile if a condition is true.
|
||||
* @condition: the condition which the compiler should know is false.
|
||||
*
|
||||
* If you have some code which relies on certain constants being equal, or
|
||||
* some other compile-time-evaluated condition, you should use BUILD_BUG_ON to
|
||||
* detect if someone changes it.
|
||||
*
|
||||
* The implementation uses gcc's reluctance to create a negative array, but gcc
|
||||
* (as of 4.4) only emits that error for obvious cases (e.g. not arguments to
|
||||
* inline functions). Luckily, in 4.3 they added the "error" function
|
||||
* attribute just for this type of case. Thus, we use a negative sized array
|
||||
* (should always create an error on gcc versions older than 4.4) and then call
|
||||
* an undefined function with the error attribute (should always create an
|
||||
* error on gcc 4.3 and later). If for some reason, neither creates a
|
||||
* compile-time error, we'll still have a link-time error, which is harder to
|
||||
* track down.
|
||||
*/
|
||||
#ifndef __OPTIMIZE__
|
||||
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
|
||||
#else
|
||||
#define BUILD_BUG_ON(condition) \
|
||||
BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* BUILD_BUG - break compile if used.
|
||||
*
|
||||
* If you have some code that you expect the compiler to eliminate at
|
||||
* build time, you should use BUILD_BUG to detect if it is
|
||||
* unexpectedly used.
|
||||
*/
|
||||
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
|
||||
|
||||
#define MAYBE_BUILD_BUG_ON(cond) \
|
||||
do { \
|
||||
if (__builtin_constant_p((cond))) \
|
||||
|
|
84
include/linux/build_bug.h
Normal file
84
include/linux/build_bug.h
Normal file
|
@ -0,0 +1,84 @@
|
|||
#ifndef _LINUX_BUILD_BUG_H
|
||||
#define _LINUX_BUILD_BUG_H
|
||||
|
||||
#include <linux/compiler.h>
|
||||
|
||||
#ifdef __CHECKER__
|
||||
#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0)
|
||||
#define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0)
|
||||
#define BUILD_BUG_ON_ZERO(e) (0)
|
||||
#define BUILD_BUG_ON_NULL(e) ((void *)0)
|
||||
#define BUILD_BUG_ON_INVALID(e) (0)
|
||||
#define BUILD_BUG_ON_MSG(cond, msg) (0)
|
||||
#define BUILD_BUG_ON(condition) (0)
|
||||
#define BUILD_BUG() (0)
|
||||
#else /* __CHECKER__ */
|
||||
|
||||
/* Force a compilation error if a constant expression is not a power of 2 */
|
||||
#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \
|
||||
BUILD_BUG_ON(((n) & ((n) - 1)) != 0)
|
||||
#define BUILD_BUG_ON_NOT_POWER_OF_2(n) \
|
||||
BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0))
|
||||
|
||||
/*
|
||||
* Force a compilation error if condition is true, but also produce a
|
||||
* result (of value 0 and type size_t), so the expression can be used
|
||||
* e.g. in a structure initializer (or where-ever else comma expressions
|
||||
* aren't permitted).
|
||||
*/
|
||||
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:(-!!(e)); }))
|
||||
#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:(-!!(e)); }))
|
||||
|
||||
/*
|
||||
* BUILD_BUG_ON_INVALID() permits the compiler to check the validity of the
|
||||
* expression but avoids the generation of any code, even if that expression
|
||||
* has side-effects.
|
||||
*/
|
||||
#define BUILD_BUG_ON_INVALID(e) ((void)(sizeof((__force long)(e))))
|
||||
|
||||
/**
|
||||
* BUILD_BUG_ON_MSG - break compile if a condition is true & emit supplied
|
||||
* error message.
|
||||
* @condition: the condition which the compiler should know is false.
|
||||
*
|
||||
* See BUILD_BUG_ON for description.
|
||||
*/
|
||||
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
|
||||
|
||||
/**
|
||||
* BUILD_BUG_ON - break compile if a condition is true.
|
||||
* @condition: the condition which the compiler should know is false.
|
||||
*
|
||||
* If you have some code which relies on certain constants being equal, or
|
||||
* some other compile-time-evaluated condition, you should use BUILD_BUG_ON to
|
||||
* detect if someone changes it.
|
||||
*
|
||||
* The implementation uses gcc's reluctance to create a negative array, but gcc
|
||||
* (as of 4.4) only emits that error for obvious cases (e.g. not arguments to
|
||||
* inline functions). Luckily, in 4.3 they added the "error" function
|
||||
* attribute just for this type of case. Thus, we use a negative sized array
|
||||
* (should always create an error on gcc versions older than 4.4) and then call
|
||||
* an undefined function with the error attribute (should always create an
|
||||
* error on gcc 4.3 and later). If for some reason, neither creates a
|
||||
* compile-time error, we'll still have a link-time error, which is harder to
|
||||
* track down.
|
||||
*/
|
||||
#ifndef __OPTIMIZE__
|
||||
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
|
||||
#else
|
||||
#define BUILD_BUG_ON(condition) \
|
||||
BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* BUILD_BUG - break compile if used.
|
||||
*
|
||||
* If you have some code that you expect the compiler to eliminate at
|
||||
* build time, you should use BUILD_BUG to detect if it is
|
||||
* unexpectedly used.
|
||||
*/
|
||||
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
|
||||
|
||||
#endif /* __CHECKER__ */
|
||||
|
||||
#endif /* _LINUX_BUILD_BUG_H */
|
|
@ -154,11 +154,6 @@ static inline unsigned int dax_radix_order(void *entry)
|
|||
#endif
|
||||
int dax_pfn_mkwrite(struct vm_fault *vmf);
|
||||
|
||||
static inline bool vma_is_dax(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
|
||||
}
|
||||
|
||||
static inline bool dax_mapping(struct address_space *mapping)
|
||||
{
|
||||
return mapping->host && IS_DAX(mapping->host);
|
||||
|
|
|
@ -2,13 +2,14 @@
|
|||
#define _LINUX_EXTABLE_H
|
||||
|
||||
#include <linux/stddef.h> /* for NULL */
|
||||
#include <linux/types.h>
|
||||
|
||||
struct module;
|
||||
struct exception_table_entry;
|
||||
|
||||
const struct exception_table_entry *
|
||||
search_extable(const struct exception_table_entry *first,
|
||||
const struct exception_table_entry *last,
|
||||
search_extable(const struct exception_table_entry *base,
|
||||
const size_t num,
|
||||
unsigned long value);
|
||||
void sort_extable(struct exception_table_entry *start,
|
||||
struct exception_table_entry *finish);
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <linux/bug.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/rwsem.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/semaphore.h>
|
||||
#include <linux/fcntl.h>
|
||||
|
@ -3127,6 +3128,11 @@ static inline bool io_is_direct(struct file *filp)
|
|||
return (filp->f_flags & O_DIRECT) || IS_DAX(filp->f_mapping->host);
|
||||
}
|
||||
|
||||
static inline bool vma_is_dax(struct vm_area_struct *vma)
|
||||
{
|
||||
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
|
||||
}
|
||||
|
||||
static inline int iocb_flags(struct file *file)
|
||||
{
|
||||
int res = 0;
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
#ifndef _LINUX_HUGE_MM_H
|
||||
#define _LINUX_HUGE_MM_H
|
||||
|
||||
#include <linux/sched/coredump.h>
|
||||
|
||||
#include <linux/fs.h> /* only for vma_is_dax() */
|
||||
|
||||
extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
|
||||
extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
|
||||
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
|
||||
|
@ -85,14 +89,32 @@ extern struct kobj_attribute shmem_enabled_attr;
|
|||
|
||||
extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
|
||||
|
||||
#define transparent_hugepage_enabled(__vma) \
|
||||
((transparent_hugepage_flags & \
|
||||
(1<<TRANSPARENT_HUGEPAGE_FLAG) || \
|
||||
(transparent_hugepage_flags & \
|
||||
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG) && \
|
||||
((__vma)->vm_flags & VM_HUGEPAGE))) && \
|
||||
!((__vma)->vm_flags & VM_NOHUGEPAGE) && \
|
||||
!is_vma_temporary_stack(__vma))
|
||||
extern unsigned long transparent_hugepage_flags;
|
||||
|
||||
static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma)
|
||||
{
|
||||
if (vma->vm_flags & VM_NOHUGEPAGE)
|
||||
return false;
|
||||
|
||||
if (is_vma_temporary_stack(vma))
|
||||
return false;
|
||||
|
||||
if (test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
|
||||
return false;
|
||||
|
||||
if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG))
|
||||
return true;
|
||||
|
||||
if (vma_is_dax(vma))
|
||||
return true;
|
||||
|
||||
if (transparent_hugepage_flags &
|
||||
(1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
|
||||
return !!(vma->vm_flags & VM_HUGEPAGE);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#define transparent_hugepage_use_zero_page() \
|
||||
(transparent_hugepage_flags & \
|
||||
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
|
||||
|
@ -104,8 +126,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
|
|||
#define transparent_hugepage_debug_cow() 0
|
||||
#endif /* CONFIG_DEBUG_VM */
|
||||
|
||||
extern unsigned long transparent_hugepage_flags;
|
||||
|
||||
extern unsigned long thp_get_unmapped_area(struct file *filp,
|
||||
unsigned long addr, unsigned long len, unsigned long pgoff,
|
||||
unsigned long flags);
|
||||
|
@ -224,7 +244,10 @@ void mm_put_huge_zero_page(struct mm_struct *mm);
|
|||
|
||||
#define hpage_nr_pages(x) 1
|
||||
|
||||
#define transparent_hugepage_enabled(__vma) 0
|
||||
static inline bool transparent_hugepage_enabled(struct vm_area_struct *vma)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void prep_transhuge_page(struct page *page) {}
|
||||
|
||||
|
|
|
@ -116,7 +116,6 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
|
|||
vm_flags_t vm_flags);
|
||||
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
|
||||
long freed);
|
||||
int dequeue_hwpoisoned_huge_page(struct page *page);
|
||||
bool isolate_huge_page(struct page *page, struct list_head *list);
|
||||
void putback_active_hugepage(struct page *page);
|
||||
void free_huge_page(struct page *page);
|
||||
|
@ -192,10 +191,6 @@ static inline void hugetlb_show_meminfo(void)
|
|||
#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
|
||||
src_addr, pagep) ({ BUG(); 0; })
|
||||
#define huge_pte_offset(mm, address, sz) 0
|
||||
static inline int dequeue_hwpoisoned_huge_page(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool isolate_huge_page(struct page *page, struct list_head *list)
|
||||
{
|
||||
|
@ -354,6 +349,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
|
|||
struct page *alloc_huge_page_node(struct hstate *h, int nid);
|
||||
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
|
||||
unsigned long addr, int avoid_reserve);
|
||||
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
|
||||
nodemask_t *nmask);
|
||||
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
|
||||
pgoff_t idx);
|
||||
|
||||
|
@ -472,6 +469,7 @@ static inline pgoff_t basepage_index(struct page *page)
|
|||
return __basepage_index(page);
|
||||
}
|
||||
|
||||
extern int dissolve_free_huge_page(struct page *page);
|
||||
extern int dissolve_free_huge_pages(unsigned long start_pfn,
|
||||
unsigned long end_pfn);
|
||||
static inline bool hugepage_migration_supported(struct hstate *h)
|
||||
|
@ -528,6 +526,7 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
|
|||
struct hstate {};
|
||||
#define alloc_huge_page(v, a, r) NULL
|
||||
#define alloc_huge_page_node(h, nid) NULL
|
||||
#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
|
||||
#define alloc_huge_page_noerr(v, a, r) NULL
|
||||
#define alloc_bootmem_huge_page(h) NULL
|
||||
#define hstate_file(f) NULL
|
||||
|
@ -550,15 +549,37 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
|
|||
{
|
||||
return 1;
|
||||
}
|
||||
#define hstate_index_to_shift(index) 0
|
||||
#define hstate_index(h) 0
|
||||
|
||||
static inline unsigned hstate_index_to_shift(unsigned index)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int hstate_index(struct hstate *h)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline pgoff_t basepage_index(struct page *page)
|
||||
{
|
||||
return page->index;
|
||||
}
|
||||
#define dissolve_free_huge_pages(s, e) 0
|
||||
#define hugepage_migration_supported(h) false
|
||||
|
||||
static inline int dissolve_free_huge_page(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int dissolve_free_huge_pages(unsigned long start_pfn,
|
||||
unsigned long end_pfn)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool hugepage_migration_supported(struct hstate *h)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
|
||||
struct mm_struct *mm, pte_t *pte)
|
||||
|
|
|
@ -10,6 +10,9 @@ extern int rd_prompt;
|
|||
/* starting block # of image */
|
||||
extern int rd_image_start;
|
||||
|
||||
/* size of a single RAM disk */
|
||||
extern unsigned long rd_size;
|
||||
|
||||
/* 1 if it is not an error if initrd_start < memory_start */
|
||||
extern int initrd_below_start_ok;
|
||||
|
||||
|
|
|
@ -48,7 +48,8 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
|
|||
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
|
||||
if ((khugepaged_always() ||
|
||||
(khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
|
||||
!(vm_flags & VM_NOHUGEPAGE))
|
||||
!(vm_flags & VM_NOHUGEPAGE) &&
|
||||
!test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
|
||||
if (__khugepaged_enter(vma->vm_mm))
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
|
|
|
@ -44,6 +44,7 @@ struct list_lru_node {
|
|||
/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
|
||||
struct list_lru_memcg *memcg_lrus;
|
||||
#endif
|
||||
long nr_items;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
struct list_lru {
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include <linux/mm.h>
|
||||
#include <linux/mempolicy.h>
|
||||
#include <linux/migrate_mode.h>
|
||||
#include <linux/hugetlb.h>
|
||||
|
||||
typedef struct page *new_page_t(struct page *page, unsigned long private,
|
||||
int **reason);
|
||||
|
@ -30,6 +31,21 @@ enum migrate_reason {
|
|||
/* In mm/debug.c; also keep sync with include/trace/events/migrate.h */
|
||||
extern char *migrate_reason_names[MR_TYPES];
|
||||
|
||||
static inline struct page *new_page_nodemask(struct page *page,
|
||||
int preferred_nid, nodemask_t *nodemask)
|
||||
{
|
||||
gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
|
||||
|
||||
if (PageHuge(page))
|
||||
return alloc_huge_page_nodemask(page_hstate(compound_head(page)),
|
||||
preferred_nid, nodemask);
|
||||
|
||||
if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
|
||||
gfp_mask |= __GFP_HIGHMEM;
|
||||
|
||||
return __alloc_pages_nodemask(gfp_mask, 0, preferred_nid, nodemask);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MIGRATION
|
||||
|
||||
extern void putback_movable_pages(struct list_head *l);
|
||||
|
|
|
@ -603,12 +603,9 @@ extern struct page *mem_map;
|
|||
#endif
|
||||
|
||||
/*
|
||||
* The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
|
||||
* (mostly NUMA machines?) to denote a higher-level memory zone than the
|
||||
* zone denotes.
|
||||
*
|
||||
* On NUMA machines, each NUMA node would have a pg_data_t to describe
|
||||
* it's memory layout.
|
||||
* it's memory layout. On UMA machines there is a single pglist_data which
|
||||
* describes the whole memory.
|
||||
*
|
||||
* Memory statistics and page replacement data structures are maintained on a
|
||||
* per-zone basis.
|
||||
|
@ -1058,6 +1055,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
|
|||
!defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
|
||||
static inline unsigned long early_pfn_to_nid(unsigned long pfn)
|
||||
{
|
||||
BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA));
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -174,6 +174,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
|
|||
VM_BUG_ON_PAGE(page_count(page) != 0, page);
|
||||
VM_BUG_ON(count == 0);
|
||||
|
||||
smp_mb();
|
||||
atomic_set(&page->_refcount, count);
|
||||
if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
|
||||
__page_ref_unfreeze(page, count);
|
||||
|
|
|
@@ -68,7 +68,10 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */
 #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */
+#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */
+#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
 
-#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
+#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+MMF_DISABLE_THP_MASK)
 
 #endif /* _LINUX_SCHED_COREDUMP_H */
|
||||
|
|
|
@@ -277,6 +277,7 @@ extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
+extern void lru_add_drain_all_cpuslocked(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);

@@ -331,7 +332,7 @@ extern void kswapd_stop(int nid);
 #include <linux/blk_types.h> /* for bio_end_io_t */
 
 /* linux/mm/page_io.c */
-extern int swap_readpage(struct page *);
+extern int swap_readpage(struct page *page, bool do_poll);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
 extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,

@@ -362,7 +363,8 @@ extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
 extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
-struct vm_area_struct *vma, unsigned long addr);
+struct vm_area_struct *vma, unsigned long addr,
+bool do_poll);
 extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
 struct vm_area_struct *vma, unsigned long addr,
 bool *new_page_allocated);
|
||||
|
|
|
@@ -196,15 +196,6 @@ static inline void num_poisoned_pages_dec(void)
 atomic_long_dec(&num_poisoned_pages);
 }
 
-static inline void num_poisoned_pages_add(long num)
-{
-atomic_long_add(num, &num_poisoned_pages);
-}
-
-static inline void num_poisoned_pages_sub(long num)
-{
-atomic_long_sub(num, &num_poisoned_pages);
-}
 #else
 
 static inline swp_entry_t make_hwpoison_entry(struct page *page)
|
||||
|
|
|
@@ -257,7 +257,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
 
 COMPACTION_STATUS
 COMPACTION_PRIORITY
-COMPACTION_FEEDBACK
+/* COMPACTION_FEEDBACK are defines not enums. Not needed here. */
 ZONE_TYPE
 LRU_NAMES
|
||||
|
||||
|
|
|
@@ -70,6 +70,86 @@ TRACE_EVENT(reclaim_retry_zone,
 __entry->wmark_check)
 );
 
+TRACE_EVENT(mark_victim,
+TP_PROTO(int pid),
+
+TP_ARGS(pid),
+
+TP_STRUCT__entry(
+__field(int, pid)
+),
+
+TP_fast_assign(
+__entry->pid = pid;
+),
+
+TP_printk("pid=%d", __entry->pid)
+);
+
+TRACE_EVENT(wake_reaper,
+TP_PROTO(int pid),
+
+TP_ARGS(pid),
+
+TP_STRUCT__entry(
+__field(int, pid)
+),
+
+TP_fast_assign(
+__entry->pid = pid;
+),
+
+TP_printk("pid=%d", __entry->pid)
+);
+
+TRACE_EVENT(start_task_reaping,
+TP_PROTO(int pid),
+
+TP_ARGS(pid),
+
+TP_STRUCT__entry(
+__field(int, pid)
+),
+
+TP_fast_assign(
+__entry->pid = pid;
+),
+
+TP_printk("pid=%d", __entry->pid)
+);
+
+TRACE_EVENT(finish_task_reaping,
+TP_PROTO(int pid),
+
+TP_ARGS(pid),
+
+TP_STRUCT__entry(
+__field(int, pid)
+),
+
+TP_fast_assign(
+__entry->pid = pid;
+),
+
+TP_printk("pid=%d", __entry->pid)
+);
+
+TRACE_EVENT(skip_task_reaping,
+TP_PROTO(int pid),
+
+TP_ARGS(pid),
+
+TP_STRUCT__entry(
+__field(int, pid)
+),
+
+TP_fast_assign(
+__entry->pid = pid;
+),
+
+TP_printk("pid=%d", __entry->pid)
+);
+
 #ifdef CONFIG_COMPACTION
 TRACE_EVENT(compact_retry,
|
||||
|
||||
|
|
|
@@ -1639,6 +1639,10 @@ long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
 __WNOTHREAD|__WCLONE|__WALL))
 return -EINVAL;
 
+/* -INT_MIN is not defined */
+if (upid == INT_MIN)
+return -ESRCH;
+
 if (upid == -1)
 type = PIDTYPE_MAX;
 else if (upid < 0) {
|
||||
|
|
|
@@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
 const struct exception_table_entry *e;
 
-e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+e = search_extable(__start___ex_table,
+__stop___ex_table - __start___ex_table, addr);
 if (!e)
 e = search_module_extables(addr);
 return e;
|
||||
|
|
|
@@ -5,6 +5,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/security.h>
+#include <linux/sort.h>
 #include <linux/syscalls.h>
 #include <linux/user_namespace.h>
 #include <linux/vmalloc.h>

@@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info,
 return 0;
 }
 
-/* a simple Shell sort */
+static int gid_cmp(const void *_a, const void *_b)
+{
+kgid_t a = *(kgid_t *)_a;
+kgid_t b = *(kgid_t *)_b;
+
+return gid_gt(a, b) - gid_lt(a, b);
+}
+
 static void groups_sort(struct group_info *group_info)
 {
-int base, max, stride;
-int gidsetsize = group_info->ngroups;
-
-for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
-; /* nothing */
-stride /= 3;
-
-while (stride) {
-max = gidsetsize - stride;
-for (base = 0; base < max; base++) {
-int left = base;
-int right = left + stride;
-kgid_t tmp = group_info->gid[right];
-
-while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
-group_info->gid[right] = group_info->gid[left];
-right = left;
-left -= stride;
-}
-group_info->gid[right] = tmp;
-}
-stride /= 3;
-}
+sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
+gid_cmp, NULL);
 }
 
 /* a simple bsearch */
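Note on the comparator above: gid_cmp() uses the "greater-than minus less-than" three-way compare rather than subtracting the two values, which cannot overflow. A minimal user-space sketch of the same idiom with the C library's qsort(), using plain ints instead of kgid_t purely for illustration (not part of the patch):

#include <stdio.h>
#include <stdlib.h>

/* Three-way compare without subtraction: returns -1, 0 or 1. */
static int int_cmp(const void *_a, const void *_b)
{
	int a = *(const int *)_a;
	int b = *(const int *)_b;

	return (a > b) - (a < b);
}

int main(void)
{
	int v[] = { 42, 7, 19, 7, 3 };
	size_t n = sizeof(v) / sizeof(v[0]);

	/* Sort ascending, exactly as sort() does for the gid array. */
	qsort(v, n, sizeof(v[0]), int_cmp);
	for (size_t i = 0; i < n; i++)
		printf("%d ", v[i]);
	printf("\n");
	return 0;
}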
|
||||
|
|
|
@@ -28,12 +28,6 @@
 
 #include <asm/sections.h>
 
-#ifdef CONFIG_KALLSYMS_ALL
-#define all_var 1
-#else
-#define all_var 0
-#endif
-
 /*
  * These will be re-linked against their real values
  * during the second link stage.

@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr)
 
 static int is_ksym_addr(unsigned long addr)
 {
-if (all_var)
+if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
 return is_kernel(addr);
 
 return is_kernel_text(addr) || is_kernel_inittext(addr);

@@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
 if (!symbol_end) {
 if (is_kernel_inittext(addr))
 symbol_end = (unsigned long)_einittext;
-else if (all_var)
+else if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
 symbol_end = (unsigned long)_end;
 else
 symbol_end = (unsigned long)_etext;
|
||||
|
|
|
@@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = {
 NULL
 };
 
-static struct attribute_group kernel_attr_group = {
+static const struct attribute_group kernel_attr_group = {
 .attrs = kernel_attrs,
 };
|
||||
|
||||
|
|
|
@@ -4196,7 +4196,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
 goto out;
 
 e = search_extable(mod->extable,
-mod->extable + mod->num_exentries - 1,
+mod->num_exentries,
 addr);
 out:
 preempt_enable();
|
||||
|
|
|
@@ -1402,6 +1402,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
 return ret;
 }
 
+/* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */
+if (pid == INT_MIN)
+return -ESRCH;
+
 read_lock(&tasklist_lock);
 if (pid != -1) {
 ret = __kill_pgrp_info(sig, info,
|
||||
|
|
|
@@ -2360,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 case PR_GET_THP_DISABLE:
 if (arg2 || arg3 || arg4 || arg5)
 return -EINVAL;
-error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
 break;
 case PR_SET_THP_DISABLE:
 if (arg3 || arg4 || arg5)

@@ -2368,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 if (down_write_killable(&me->mm->mmap_sem))
 return -EINTR;
 if (arg2)
-me->mm->def_flags |= VM_NOHUGEPAGE;
+set_bit(MMF_DISABLE_THP, &me->mm->flags);
 else
-me->mm->def_flags &= ~VM_NOHUGEPAGE;
+clear_bit(MMF_DISABLE_THP, &me->mm->flags);
 up_write(&me->mm->mmap_sem);
 break;
 case PR_MPX_ENABLE_MANAGEMENT:
|
||||
|
|
|
@@ -1594,7 +1594,7 @@ config RBTREE_TEST
 
 config INTERVAL_TREE_TEST
 tristate "Interval tree test"
-depends on m && DEBUG_KERNEL
+depends on DEBUG_KERNEL
 select INTERVAL_TREE
 help
 A benchmark measuring the performance of the interval tree library
|
||||
|
|
|
@@ -251,7 +251,7 @@ int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
 }
 EXPORT_SYMBOL(__bitmap_weight);
 
-void bitmap_set(unsigned long *map, unsigned int start, int len)
+void __bitmap_set(unsigned long *map, unsigned int start, int len)
 {
 unsigned long *p = map + BIT_WORD(start);
 const unsigned int size = start + len;

@@ -270,9 +270,9 @@ void bitmap_set(unsigned long *map, unsigned int start, int len)
 *p |= mask_to_set;
 }
 }
-EXPORT_SYMBOL(bitmap_set);
+EXPORT_SYMBOL(__bitmap_set);
 
-void bitmap_clear(unsigned long *map, unsigned int start, int len)
+void __bitmap_clear(unsigned long *map, unsigned int start, int len)
 {
 unsigned long *p = map + BIT_WORD(start);
 const unsigned int size = start + len;

@@ -291,7 +291,7 @@ void bitmap_clear(unsigned long *map, unsigned int start, int len)
 *p &= ~mask_to_clear;
 }
 }
-EXPORT_SYMBOL(bitmap_clear);
+EXPORT_SYMBOL(__bitmap_clear);
 
 /**
  * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
|
||||
|
|
|
@@ -33,19 +33,21 @@
 void *bsearch(const void *key, const void *base, size_t num, size_t size,
 int (*cmp)(const void *key, const void *elt))
 {
-size_t start = 0, end = num;
+const char *pivot;
 int result;
 
-while (start < end) {
-size_t mid = start + (end - start) / 2;
+while (num > 0) {
+pivot = base + (num >> 1) * size;
+result = cmp(key, pivot);
 
-result = cmp(key, base + mid * size);
-if (result < 0)
-end = mid;
-else if (result > 0)
-start = mid + 1;
-else
-return (void *)base + mid * size;
+if (result == 0)
+return (void *)pivot;
+
+if (result > 0) {
+base = pivot + size;
+num--;
+}
+num >>= 1;
 }
 
 return NULL;
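For context, the rewritten loop above tracks a pivot pointer and a shrinking element count instead of start/end indices, so the pivot address is computed with one multiplication per iteration. A standalone user-space sketch of the same pivot-walking approach; the function and comparator names here are invented for illustration and this is not the kernel source:

#include <stdio.h>
#include <stddef.h>

/* Binary search that advances base past the pivot and halves num. */
static void *bsearch_pivot(const void *key, const void *base, size_t num,
			   size_t size,
			   int (*cmp)(const void *key, const void *elt))
{
	const char *pivot;
	int result;

	while (num > 0) {
		pivot = (const char *)base + (num >> 1) * size;
		result = cmp(key, pivot);

		if (result == 0)
			return (void *)pivot;

		if (result > 0) {
			/* Key is above the pivot: drop pivot and everything below it. */
			base = pivot + size;
			num--;
		}
		num >>= 1;
	}
	return NULL;
}

static int cmp_int(const void *key, const void *elt)
{
	int k = *(const int *)key, e = *(const int *)elt;

	return (k > e) - (k < e);
}

int main(void)
{
	int sorted[] = { 2, 4, 8, 16, 32 };
	int key = 16;
	int *hit = bsearch_pivot(&key, sorted, 5, sizeof(int), cmp_int);

	printf("%s\n", hit ? "found" : "not found");
	return 0;
}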
|
||||
|
|
|
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/bsearch.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sort.h>

@@ -51,7 +52,7 @@ static void swap_ex(void *a, void *b, int size)
  * This is used both for the kernel exception table and for
  * the exception tables of modules that get loaded.
  */
-static int cmp_ex(const void *a, const void *b)
+static int cmp_ex_sort(const void *a, const void *b)
 {
 const struct exception_table_entry *x = a, *y = b;
 

@@ -67,7 +68,7 @@ void sort_extable(struct exception_table_entry *start,
 struct exception_table_entry *finish)
 {
 sort(start, finish - start, sizeof(struct exception_table_entry),
-cmp_ex, swap_ex);
+cmp_ex_sort, swap_ex);
 }
 
 #ifdef CONFIG_MODULES

@@ -93,6 +94,20 @@ void trim_init_extable(struct module *m)
 #endif /* !ARCH_HAS_SORT_EXTABLE */
 
 #ifndef ARCH_HAS_SEARCH_EXTABLE
+
+static int cmp_ex_search(const void *key, const void *elt)
+{
+const struct exception_table_entry *_elt = elt;
+unsigned long _key = *(unsigned long *)key;
+
+/* avoid overflow */
+if (_key > ex_to_insn(_elt))
+return 1;
+if (_key < ex_to_insn(_elt))
+return -1;
+return 0;
+}
+
 /*
  * Search one exception table for an entry corresponding to the
  * given instruction address, and return the address of the entry,

@@ -101,25 +116,11 @@
  * already sorted.
  */
 const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-const struct exception_table_entry *last,
+search_extable(const struct exception_table_entry *base,
+const size_t num,
 unsigned long value)
 {
-while (first <= last) {
-const struct exception_table_entry *mid;
-
-mid = ((last - first) >> 1) + first;
-/*
- * careful, the distance between value and insn
- * can be larger than MAX_LONG:
- */
-if (ex_to_insn(mid) < value)
-first = mid + 1;
-else if (ex_to_insn(mid) > value)
-last = mid - 1;
-else
-return mid;
-}
-return NULL;
+return bsearch(&value, base, num,
+sizeof(struct exception_table_entry), cmp_ex_search);
 }
 #endif
|
||||
|
|
|
@ -1,27 +1,38 @@
|
|||
#include <linux/module.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/interval_tree.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/slab.h>
|
||||
#include <asm/timex.h>
|
||||
|
||||
#define NODES 100
|
||||
#define PERF_LOOPS 100000
|
||||
#define SEARCHES 100
|
||||
#define SEARCH_LOOPS 10000
|
||||
#define __param(type, name, init, msg) \
|
||||
static type name = init; \
|
||||
module_param(name, type, 0444); \
|
||||
MODULE_PARM_DESC(name, msg);
|
||||
|
||||
__param(int, nnodes, 100, "Number of nodes in the interval tree");
|
||||
__param(int, perf_loops, 100000, "Number of iterations modifying the tree");
|
||||
|
||||
__param(int, nsearches, 100, "Number of searches to the interval tree");
|
||||
__param(int, search_loops, 10000, "Number of iterations searching the tree");
|
||||
__param(bool, search_all, false, "Searches will iterate all nodes in the tree");
|
||||
|
||||
__param(uint, max_endpoint, ~0, "Largest value for the interval's endpoint");
|
||||
|
||||
static struct rb_root root = RB_ROOT;
|
||||
static struct interval_tree_node nodes[NODES];
|
||||
static u32 queries[SEARCHES];
|
||||
static struct interval_tree_node *nodes = NULL;
|
||||
static u32 *queries = NULL;
|
||||
|
||||
static struct rnd_state rnd;
|
||||
|
||||
static inline unsigned long
|
||||
search(unsigned long query, struct rb_root *root)
|
||||
search(struct rb_root *root, unsigned long start, unsigned long last)
|
||||
{
|
||||
struct interval_tree_node *node;
|
||||
unsigned long results = 0;
|
||||
|
||||
for (node = interval_tree_iter_first(root, query, query); node;
|
||||
node = interval_tree_iter_next(node, query, query))
|
||||
for (node = interval_tree_iter_first(root, start, last); node;
|
||||
node = interval_tree_iter_next(node, start, last))
|
||||
results++;
|
||||
return results;
|
||||
}
|
||||
|
@ -29,19 +40,22 @@ search(unsigned long query, struct rb_root *root)
|
|||
static void init(void)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < NODES; i++) {
|
||||
u32 a = prandom_u32_state(&rnd);
|
||||
u32 b = prandom_u32_state(&rnd);
|
||||
if (a <= b) {
|
||||
nodes[i].start = a;
|
||||
nodes[i].last = b;
|
||||
} else {
|
||||
nodes[i].start = b;
|
||||
nodes[i].last = a;
|
||||
}
|
||||
|
||||
for (i = 0; i < nnodes; i++) {
|
||||
u32 b = (prandom_u32_state(&rnd) >> 4) % max_endpoint;
|
||||
u32 a = (prandom_u32_state(&rnd) >> 4) % b;
|
||||
|
||||
nodes[i].start = a;
|
||||
nodes[i].last = b;
|
||||
}
|
||||
for (i = 0; i < SEARCHES; i++)
|
||||
queries[i] = prandom_u32_state(&rnd);
|
||||
|
||||
/*
|
||||
* Limit the search scope to what the user defined.
|
||||
* Otherwise we are merely measuring empty walks,
|
||||
* which is pointless.
|
||||
*/
|
||||
for (i = 0; i < nsearches; i++)
|
||||
queries[i] = (prandom_u32_state(&rnd) >> 4) % max_endpoint;
|
||||
}
|
||||
|
||||
static int interval_tree_test_init(void)
|
||||
|
@ -50,6 +64,16 @@ static int interval_tree_test_init(void)
|
|||
unsigned long results;
|
||||
cycles_t time1, time2, time;
|
||||
|
||||
nodes = kmalloc(nnodes * sizeof(struct interval_tree_node), GFP_KERNEL);
|
||||
if (!nodes)
|
||||
return -ENOMEM;
|
||||
|
||||
queries = kmalloc(nsearches * sizeof(int), GFP_KERNEL);
|
||||
if (!queries) {
|
||||
kfree(nodes);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
printk(KERN_ALERT "interval tree insert/remove");
|
||||
|
||||
prandom_seed_state(&rnd, 3141592653589793238ULL);
|
||||
|
@ -57,39 +81,46 @@ static int interval_tree_test_init(void)
|
|||
|
||||
time1 = get_cycles();
|
||||
|
||||
for (i = 0; i < PERF_LOOPS; i++) {
|
||||
for (j = 0; j < NODES; j++)
|
||||
for (i = 0; i < perf_loops; i++) {
|
||||
for (j = 0; j < nnodes; j++)
|
||||
interval_tree_insert(nodes + j, &root);
|
||||
for (j = 0; j < NODES; j++)
|
||||
for (j = 0; j < nnodes; j++)
|
||||
interval_tree_remove(nodes + j, &root);
|
||||
}
|
||||
|
||||
time2 = get_cycles();
|
||||
time = time2 - time1;
|
||||
|
||||
time = div_u64(time, PERF_LOOPS);
|
||||
time = div_u64(time, perf_loops);
|
||||
printk(" -> %llu cycles\n", (unsigned long long)time);
|
||||
|
||||
printk(KERN_ALERT "interval tree search");
|
||||
|
||||
for (j = 0; j < NODES; j++)
|
||||
for (j = 0; j < nnodes; j++)
|
||||
interval_tree_insert(nodes + j, &root);
|
||||
|
||||
time1 = get_cycles();
|
||||
|
||||
results = 0;
|
||||
for (i = 0; i < SEARCH_LOOPS; i++)
|
||||
for (j = 0; j < SEARCHES; j++)
|
||||
results += search(queries[j], &root);
|
||||
for (i = 0; i < search_loops; i++)
|
||||
for (j = 0; j < nsearches; j++) {
|
||||
unsigned long start = search_all ? 0 : queries[j];
|
||||
unsigned long last = search_all ? max_endpoint : queries[j];
|
||||
|
||||
results += search(&root, start, last);
|
||||
}
|
||||
|
||||
time2 = get_cycles();
|
||||
time = time2 - time1;
|
||||
|
||||
time = div_u64(time, SEARCH_LOOPS);
|
||||
results = div_u64(results, SEARCH_LOOPS);
|
||||
time = div_u64(time, search_loops);
|
||||
results = div_u64(results, search_loops);
|
||||
printk(" -> %llu cycles (%lu results)\n",
|
||||
(unsigned long long)time, results);
|
||||
|
||||
kfree(queries);
|
||||
kfree(nodes);
|
||||
|
||||
return -EAGAIN; /* Fail will directly unload the module */
|
||||
}
|
||||
|
||||
|
|
|
@@ -51,13 +51,15 @@ unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long
 
 res = 0;
 rv = 0;
-while (*s) {
+while (1) {
+unsigned int c = *s;
+unsigned int lc = c | 0x20; /* don't tolower() this line */
 unsigned int val;
 
-if ('0' <= *s && *s <= '9')
-val = *s - '0';
-else if ('a' <= _tolower(*s) && _tolower(*s) <= 'f')
-val = _tolower(*s) - 'a' + 10;
+if ('0' <= c && c <= '9')
+val = c - '0';
+else if ('a' <= lc && lc <= 'f')
+val = lc - 'a' + 10;
 else
 break;
|
||||
|
||||
|
|
|
@@ -211,11 +211,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 int i;
 
 size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
-if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
-gfp != GFP_KERNEL)
+if (gfp != GFP_KERNEL)
 tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY);
-if (tbl == NULL && gfp == GFP_KERNEL)
-tbl = vzalloc(size);
+else
+tbl = kvzalloc(size, gfp);
 
 size = nbuckets;
|
||||
|
||||
|
|
|
@ -333,10 +333,39 @@ static void __init test_bitmap_u32_array_conversions(void)
|
|||
}
|
||||
}
|
||||
|
||||
static void noinline __init test_mem_optimisations(void)
|
||||
{
|
||||
DECLARE_BITMAP(bmap1, 1024);
|
||||
DECLARE_BITMAP(bmap2, 1024);
|
||||
unsigned int start, nbits;
|
||||
|
||||
for (start = 0; start < 1024; start += 8) {
|
||||
memset(bmap1, 0x5a, sizeof(bmap1));
|
||||
memset(bmap2, 0x5a, sizeof(bmap2));
|
||||
for (nbits = 0; nbits < 1024 - start; nbits += 8) {
|
||||
bitmap_set(bmap1, start, nbits);
|
||||
__bitmap_set(bmap2, start, nbits);
|
||||
if (!bitmap_equal(bmap1, bmap2, 1024))
|
||||
printk("set not equal %d %d\n", start, nbits);
|
||||
if (!__bitmap_equal(bmap1, bmap2, 1024))
|
||||
printk("set not __equal %d %d\n", start, nbits);
|
||||
|
||||
bitmap_clear(bmap1, start, nbits);
|
||||
__bitmap_clear(bmap2, start, nbits);
|
||||
if (!bitmap_equal(bmap1, bmap2, 1024))
|
||||
printk("clear not equal %d %d\n", start, nbits);
|
||||
if (!__bitmap_equal(bmap1, bmap2, 1024))
|
||||
printk("clear not __equal %d %d\n", start,
|
||||
nbits);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int __init test_bitmap_init(void)
|
||||
{
|
||||
test_zero_fill_copy();
|
||||
test_bitmap_u32_array_conversions();
|
||||
test_mem_optimisations();
|
||||
|
||||
if (failed_tests == 0)
|
||||
pr_info("all %u tests passed\n", total_tests);
|
||||
|
|
|
@@ -161,7 +161,6 @@ config MEMORY_HOTPLUG
 bool "Allow for memory hot-add"
 depends on SPARSEMEM || X86_64_ACPI_NUMA
 depends on ARCH_ENABLE_MEMORY_HOTPLUG
-depends on COMPILE_TEST || !KASAN
 
 config MEMORY_HOTPLUG_SPARSE
 def_bool y
|
||||
|
|
|
@@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
 {
 unsigned long flags;
 struct page *page = alloc_page(balloon_mapping_gfp_mask() |
-__GFP_NOMEMALLOC | __GFP_NORETRY);
+__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO);
 if (!page)
 return NULL;
|
||||
|
||||
|
|
mm/cma.c
|
@ -59,7 +59,7 @@ const char *cma_get_name(const struct cma *cma)
|
|||
}
|
||||
|
||||
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
|
||||
int align_order)
|
||||
unsigned int align_order)
|
||||
{
|
||||
if (align_order <= cma->order_per_bit)
|
||||
return 0;
|
||||
|
@ -67,17 +67,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
|
|||
}
|
||||
|
||||
/*
|
||||
* Find a PFN aligned to the specified order and return an offset represented in
|
||||
* order_per_bits.
|
||||
* Find the offset of the base PFN from the specified align_order.
|
||||
* The value returned is represented in order_per_bits.
|
||||
*/
|
||||
static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
|
||||
int align_order)
|
||||
unsigned int align_order)
|
||||
{
|
||||
if (align_order <= cma->order_per_bit)
|
||||
return 0;
|
||||
|
||||
return (ALIGN(cma->base_pfn, (1UL << align_order))
|
||||
- cma->base_pfn) >> cma->order_per_bit;
|
||||
return (cma->base_pfn & ((1UL << align_order) - 1))
|
||||
>> cma->order_per_bit;
|
||||
}
|
||||
|
||||
static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
|
||||
|
@ -127,7 +124,7 @@ static int __init cma_activate_area(struct cma *cma)
|
|||
* to be in the same zone.
|
||||
*/
|
||||
if (page_zone(pfn_to_page(pfn)) != zone)
|
||||
goto err;
|
||||
goto not_in_zone;
|
||||
}
|
||||
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
|
||||
} while (--i);
|
||||
|
@ -141,7 +138,8 @@ static int __init cma_activate_area(struct cma *cma)
|
|||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
not_in_zone:
|
||||
pr_err("CMA area %s could not be activated\n", cma->name);
|
||||
kfree(cma->bitmap);
|
||||
cma->count = 0;
|
||||
return -EINVAL;
|
||||
|
|
|
@@ -239,14 +239,16 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 /* Leave page->index set: truncation lookup relies upon it */
 
 /* hugetlb pages do not participate in page cache accounting. */
-if (!PageHuge(page))
-__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+if (PageHuge(page))
+return;
+
+__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
 if (PageSwapBacked(page)) {
 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
 if (PageTransHuge(page))
 __dec_node_page_state(page, NR_SHMEM_THPS);
 } else {
-VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
+VM_BUG_ON_PAGE(PageTransHuge(page), page);
 }
|
||||
|
||||
/*
|
||||
|
|
mm/hugetlb.c
|
@ -20,9 +20,9 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/string_helpers.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/page-isolation.h>
|
||||
#include <linux/jhash.h>
|
||||
|
||||
#include <asm/page.h>
|
||||
|
@ -872,7 +872,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
|
|||
struct page *page;
|
||||
|
||||
list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
|
||||
if (!is_migrate_isolate_page(page))
|
||||
if (!PageHWPoison(page))
|
||||
break;
|
||||
/*
|
||||
* if 'non-isolated free hugepage' not found on the list,
|
||||
|
@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
|
|||
return page;
|
||||
}
|
||||
|
||||
static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
|
||||
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
|
||||
nodemask_t *nmask)
|
||||
{
|
||||
struct page *page;
|
||||
int node;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
struct zonelist *zonelist;
|
||||
struct zone *zone;
|
||||
struct zoneref *z;
|
||||
int node = -1;
|
||||
|
||||
if (nid != NUMA_NO_NODE)
|
||||
return dequeue_huge_page_node_exact(h, nid);
|
||||
zonelist = node_zonelist(nid, gfp_mask);
|
||||
|
||||
retry_cpuset:
|
||||
cpuset_mems_cookie = read_mems_allowed_begin();
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
|
||||
struct page *page;
|
||||
|
||||
if (!cpuset_zone_allowed(zone, gfp_mask))
|
||||
continue;
|
||||
/*
|
||||
* no need to ask again on the same node. Pool is node rather than
|
||||
* zone aware
|
||||
*/
|
||||
if (zone_to_nid(zone) == node)
|
||||
continue;
|
||||
node = zone_to_nid(zone);
|
||||
|
||||
for_each_online_node(node) {
|
||||
page = dequeue_huge_page_node_exact(h, node);
|
||||
if (page)
|
||||
return page;
|
||||
}
|
||||
if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
|
||||
goto retry_cpuset;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
|
|||
unsigned long address, int avoid_reserve,
|
||||
long chg)
|
||||
{
|
||||
struct page *page = NULL;
|
||||
struct page *page;
|
||||
struct mempolicy *mpol;
|
||||
nodemask_t *nodemask;
|
||||
gfp_t gfp_mask;
|
||||
nodemask_t *nodemask;
|
||||
int nid;
|
||||
struct zonelist *zonelist;
|
||||
struct zone *zone;
|
||||
struct zoneref *z;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
|
||||
/*
|
||||
* A child process with MAP_PRIVATE mappings created by their parent
|
||||
|
@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
|
|||
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
|
||||
goto err;
|
||||
|
||||
retry_cpuset:
|
||||
cpuset_mems_cookie = read_mems_allowed_begin();
|
||||
gfp_mask = htlb_alloc_mask(h);
|
||||
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
|
||||
zonelist = node_zonelist(nid, gfp_mask);
|
||||
|
||||
for_each_zone_zonelist_nodemask(zone, z, zonelist,
|
||||
MAX_NR_ZONES - 1, nodemask) {
|
||||
if (cpuset_zone_allowed(zone, gfp_mask)) {
|
||||
page = dequeue_huge_page_node(h, zone_to_nid(zone));
|
||||
if (page) {
|
||||
if (avoid_reserve)
|
||||
break;
|
||||
if (!vma_has_reserves(vma, chg))
|
||||
break;
|
||||
|
||||
SetPagePrivate(page);
|
||||
h->resv_huge_pages--;
|
||||
break;
|
||||
}
|
||||
}
|
||||
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
|
||||
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
|
||||
SetPagePrivate(page);
|
||||
h->resv_huge_pages--;
|
||||
}
|
||||
|
||||
mpol_cond_put(mpol);
|
||||
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
|
||||
goto retry_cpuset;
|
||||
return page;
|
||||
|
||||
err:
|
||||
|
@ -1460,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
|
|||
* number of free hugepages would be reduced below the number of reserved
|
||||
* hugepages.
|
||||
*/
|
||||
static int dissolve_free_huge_page(struct page *page)
|
||||
int dissolve_free_huge_page(struct page *page)
|
||||
{
|
||||
int rc = 0;
|
||||
|
||||
|
@ -1473,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page)
|
|||
rc = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* Move PageHWPoison flag from head page to the raw error page,
|
||||
* which makes any subpages rather than the error page reusable.
|
||||
*/
|
||||
if (PageHWPoison(head) && page != head) {
|
||||
SetPageHWPoison(page);
|
||||
ClearPageHWPoison(head);
|
||||
}
|
||||
list_del(&head->lru);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
|
@ -1513,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
|
|||
return rc;
|
||||
}
|
||||
|
||||
/*
|
||||
* There are 3 ways this can get called:
|
||||
* 1. With vma+addr: we use the VMA's memory policy
|
||||
* 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
|
||||
* page from any node, and let the buddy allocator itself figure
|
||||
* it out.
|
||||
* 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
|
||||
* strictly from 'nid'
|
||||
*/
|
||||
static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr, int nid)
|
||||
gfp_t gfp_mask, int nid, nodemask_t *nmask)
|
||||
{
|
||||
int order = huge_page_order(h);
|
||||
gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
|
||||
unsigned int cpuset_mems_cookie;
|
||||
|
||||
/*
|
||||
* We need a VMA to get a memory policy. If we do not
|
||||
* have one, we use the 'nid' argument.
|
||||
*
|
||||
* The mempolicy stuff below has some non-inlined bits
|
||||
* and calls ->vm_ops. That makes it hard to optimize at
|
||||
* compile-time, even when NUMA is off and it does
|
||||
* nothing. This helps the compiler optimize it out.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
|
||||
/*
|
||||
* If a specific node is requested, make sure to
|
||||
* get memory from there, but only when a node
|
||||
* is explicitly specified.
|
||||
*/
|
||||
if (nid != NUMA_NO_NODE)
|
||||
gfp |= __GFP_THISNODE;
|
||||
/*
|
||||
* Make sure to call something that can handle
|
||||
* nid=NUMA_NO_NODE
|
||||
*/
|
||||
return alloc_pages_node(nid, gfp, order);
|
||||
}
|
||||
|
||||
/*
|
||||
* OK, so we have a VMA. Fetch the mempolicy and try to
|
||||
* allocate a huge page with it. We will only reach this
|
||||
* when CONFIG_NUMA=y.
|
||||
*/
|
||||
do {
|
||||
struct page *page;
|
||||
struct mempolicy *mpol;
|
||||
int nid;
|
||||
nodemask_t *nodemask;
|
||||
|
||||
cpuset_mems_cookie = read_mems_allowed_begin();
|
||||
nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
|
||||
mpol_cond_put(mpol);
|
||||
page = __alloc_pages_nodemask(gfp, order, nid, nodemask);
|
||||
if (page)
|
||||
return page;
|
||||
} while (read_mems_allowed_retry(cpuset_mems_cookie));
|
||||
|
||||
return NULL;
|
||||
gfp_mask |= __GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
|
||||
if (nid == NUMA_NO_NODE)
|
||||
nid = numa_mem_id();
|
||||
return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
|
||||
}
|
||||
|
||||
/*
|
||||
* There are two ways to allocate a huge page:
|
||||
* 1. When you have a VMA and an address (like a fault)
|
||||
* 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
|
||||
*
|
||||
* 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
|
||||
* this case which signifies that the allocation should be done with
|
||||
* respect for the VMA's memory policy.
|
||||
*
|
||||
* For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
|
||||
* implies that memory policies will not be taken in to account.
|
||||
*/
|
||||
static struct page *__alloc_buddy_huge_page(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr, int nid)
|
||||
static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
|
||||
int nid, nodemask_t *nmask)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned int r_nid;
|
||||
|
@ -1596,15 +1540,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
|
|||
if (hstate_is_gigantic(h))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Make sure that anyone specifying 'nid' is not also specifying a VMA.
|
||||
* This makes sure the caller is picking _one_ of the modes with which
|
||||
* we can call this function, not both.
|
||||
*/
|
||||
if (vma || (addr != -1)) {
|
||||
VM_WARN_ON_ONCE(addr == -1);
|
||||
VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
|
||||
}
|
||||
/*
|
||||
* Assume we will successfully allocate the surplus page to
|
||||
* prevent racing processes from causing the surplus to exceed
|
||||
|
@ -1638,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
|
|||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
|
||||
page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (page) {
|
||||
|
@ -1662,19 +1597,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
|
|||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a huge page from 'nid'. Note, 'nid' may be
|
||||
* NUMA_NO_NODE, which means that it may be allocated
|
||||
* anywhere.
|
||||
*/
|
||||
static
|
||||
struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
|
||||
{
|
||||
unsigned long addr = -1;
|
||||
|
||||
return __alloc_buddy_huge_page(h, NULL, addr, nid);
|
||||
}
|
||||
|
||||
/*
|
||||
* Use the VMA's mpolicy to allocate a huge page from the buddy.
|
||||
*/
|
||||
|
@ -1682,7 +1604,17 @@ static
|
|||
struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
|
||||
struct page *page;
|
||||
struct mempolicy *mpol;
|
||||
gfp_t gfp_mask = htlb_alloc_mask(h);
|
||||
int nid;
|
||||
nodemask_t *nodemask;
|
||||
|
||||
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
|
||||
page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
|
||||
mpol_cond_put(mpol);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1692,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
|
|||
*/
|
||||
struct page *alloc_huge_page_node(struct hstate *h, int nid)
|
||||
{
|
||||
gfp_t gfp_mask = htlb_alloc_mask(h);
|
||||
struct page *page = NULL;
|
||||
|
||||
if (nid != NUMA_NO_NODE)
|
||||
gfp_mask |= __GFP_THISNODE;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (h->free_huge_pages - h->resv_huge_pages > 0)
|
||||
page = dequeue_huge_page_node(h, nid);
|
||||
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
if (!page)
|
||||
page = __alloc_buddy_huge_page_no_mpol(h, nid);
|
||||
page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
|
||||
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
|
||||
nodemask_t *nmask)
|
||||
{
|
||||
gfp_t gfp_mask = htlb_alloc_mask(h);
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
if (h->free_huge_pages - h->resv_huge_pages > 0) {
|
||||
struct page *page;
|
||||
|
||||
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
|
||||
if (page) {
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return page;
|
||||
}
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
|
||||
/* No reservations, try to overcommit */
|
||||
|
||||
return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
|
||||
}
|
||||
|
||||
/*
|
||||
* Increase the hugetlb pool such that it can accommodate a reservation
|
||||
* of size 'delta'.
|
||||
|
@ -1730,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
|
|||
retry:
|
||||
spin_unlock(&hugetlb_lock);
|
||||
for (i = 0; i < needed; i++) {
|
||||
page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
|
||||
page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
|
||||
NUMA_NO_NODE, NULL);
|
||||
if (!page) {
|
||||
alloc_ok = false;
|
||||
break;
|
||||
}
|
||||
list_add(&page->lru, &surplus_list);
|
||||
cond_resched();
|
||||
}
|
||||
allocated += i;
|
||||
|
||||
|
@ -2204,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
|
|||
} else if (!alloc_fresh_huge_page(h,
|
||||
&node_states[N_MEMORY]))
|
||||
break;
|
||||
cond_resched();
|
||||
}
|
||||
if (i < h->max_huge_pages) {
|
||||
char buf[32];
|
||||
|
||||
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
|
||||
pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
|
||||
h->max_huge_pages, buf, i);
|
||||
h->max_huge_pages = i;
|
||||
}
|
||||
h->max_huge_pages = i;
|
||||
}
|
||||
|
||||
static void __init hugetlb_init_hstates(void)
|
||||
|
@ -2223,26 +2192,16 @@ static void __init hugetlb_init_hstates(void)
|
|||
VM_BUG_ON(minimum_order == UINT_MAX);
|
||||
}
|
||||
|
||||
static char * __init memfmt(char *buf, unsigned long n)
|
||||
{
|
||||
if (n >= (1UL << 30))
|
||||
sprintf(buf, "%lu GB", n >> 30);
|
||||
else if (n >= (1UL << 20))
|
||||
sprintf(buf, "%lu MB", n >> 20);
|
||||
else
|
||||
sprintf(buf, "%lu KB", n >> 10);
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void __init report_hugepages(void)
|
||||
{
|
||||
struct hstate *h;
|
||||
|
||||
for_each_hstate(h) {
|
||||
char buf[32];
|
||||
|
||||
string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
|
||||
pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
|
||||
memfmt(buf, huge_page_size(h)),
|
||||
h->free_huge_pages);
|
||||
buf, h->free_huge_pages);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2801,6 +2760,11 @@ static int __init hugetlb_init(void)
|
|||
return 0;
|
||||
|
||||
if (!size_to_hstate(default_hstate_size)) {
|
||||
if (default_hstate_size != 0) {
|
||||
pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
|
||||
default_hstate_size, HPAGE_SIZE);
|
||||
}
|
||||
|
||||
default_hstate_size = HPAGE_SIZE;
|
||||
if (!size_to_hstate(default_hstate_size))
|
||||
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
|
||||
|
@ -4739,40 +4703,6 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
|
|||
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
|
||||
/*
|
||||
* This function is called from memory failure code.
|
||||
*/
|
||||
int dequeue_hwpoisoned_huge_page(struct page *hpage)
|
||||
{
|
||||
struct hstate *h = page_hstate(hpage);
|
||||
int nid = page_to_nid(hpage);
|
||||
int ret = -EBUSY;
|
||||
|
||||
spin_lock(&hugetlb_lock);
|
||||
/*
|
||||
* Just checking !page_huge_active is not enough, because that could be
|
||||
* an isolated/hwpoisoned hugepage (which have >0 refcount).
|
||||
*/
|
||||
if (!page_huge_active(hpage) && !page_count(hpage)) {
|
||||
/*
|
||||
* Hwpoisoned hugepage isn't linked to activelist or freelist,
|
||||
* but dangling hpage->lru can trigger list-debug warnings
|
||||
* (this happens when we call unpoison_memory() on it),
|
||||
* so let it point to itself with list_del_init().
|
||||
*/
|
||||
list_del_init(&hpage->lru);
|
||||
set_page_refcounted(hpage);
|
||||
h->free_huge_pages--;
|
||||
h->free_huge_pages_node[nid]--;
|
||||
ret = 0;
|
||||
}
|
||||
spin_unlock(&hugetlb_lock);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool isolate_huge_page(struct page *page, struct list_head *list)
|
||||
{
|
||||
bool ret = true;
|
||||
|
|
mm/kasan/kasan.c
|
@ -134,97 +134,33 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr)
|
|||
return false;
|
||||
}
|
||||
|
||||
static __always_inline bool memory_is_poisoned_2(unsigned long addr)
|
||||
static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
|
||||
unsigned long size)
|
||||
{
|
||||
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
|
||||
u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
|
||||
|
||||
if (unlikely(*shadow_addr)) {
|
||||
if (memory_is_poisoned_1(addr + 1))
|
||||
return true;
|
||||
/*
|
||||
* Access crosses 8(shadow size)-byte boundary. Such access maps
|
||||
* into 2 shadow bytes, so we need to check them both.
|
||||
*/
|
||||
if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
|
||||
return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
|
||||
|
||||
/*
|
||||
* If single shadow byte covers 2-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the first
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
|
||||
return false;
|
||||
|
||||
return unlikely(*(u8 *)shadow_addr);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static __always_inline bool memory_is_poisoned_4(unsigned long addr)
|
||||
{
|
||||
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
|
||||
|
||||
if (unlikely(*shadow_addr)) {
|
||||
if (memory_is_poisoned_1(addr + 3))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If single shadow byte covers 4-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the first
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
|
||||
return false;
|
||||
|
||||
return unlikely(*(u8 *)shadow_addr);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static __always_inline bool memory_is_poisoned_8(unsigned long addr)
|
||||
{
|
||||
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
|
||||
|
||||
if (unlikely(*shadow_addr)) {
|
||||
if (memory_is_poisoned_1(addr + 7))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If single shadow byte covers 8-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the first
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
|
||||
return false;
|
||||
|
||||
return unlikely(*(u8 *)shadow_addr);
|
||||
}
|
||||
|
||||
return false;
|
||||
return memory_is_poisoned_1(addr + size - 1);
|
||||
}
|
||||
|
||||
static __always_inline bool memory_is_poisoned_16(unsigned long addr)
|
||||
{
|
||||
u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
|
||||
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
|
||||
|
||||
if (unlikely(*shadow_addr)) {
|
||||
u16 shadow_first_bytes = *(u16 *)shadow_addr;
|
||||
/* Unaligned 16-bytes access maps into 3 shadow bytes. */
|
||||
if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
|
||||
return *shadow_addr || memory_is_poisoned_1(addr + 15);
|
||||
|
||||
if (unlikely(shadow_first_bytes))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If two shadow bytes covers 16-byte access, we don't
|
||||
* need to do anything more. Otherwise, test the last
|
||||
* shadow byte.
|
||||
*/
|
||||
if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
|
||||
return false;
|
||||
|
||||
return memory_is_poisoned_1(addr + 15);
|
||||
}
|
||||
|
||||
return false;
|
||||
return *shadow_addr;
|
||||
}
|
||||
|
||||
static __always_inline unsigned long bytes_is_zero(const u8 *start,
|
||||
static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
|
||||
size_t size)
|
||||
{
|
||||
while (size) {
|
||||
|
@ -237,7 +173,7 @@ static __always_inline unsigned long bytes_is_zero(const u8 *start,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static __always_inline unsigned long memory_is_zero(const void *start,
|
||||
static __always_inline unsigned long memory_is_nonzero(const void *start,
|
||||
const void *end)
|
||||
{
|
||||
unsigned int words;
|
||||
|
@ -245,11 +181,11 @@ static __always_inline unsigned long memory_is_zero(const void *start,
|
|||
unsigned int prefix = (unsigned long)start % 8;
|
||||
|
||||
if (end - start <= 16)
|
||||
return bytes_is_zero(start, end - start);
|
||||
return bytes_is_nonzero(start, end - start);
|
||||
|
||||
if (prefix) {
|
||||
prefix = 8 - prefix;
|
||||
ret = bytes_is_zero(start, prefix);
|
||||
ret = bytes_is_nonzero(start, prefix);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
start += prefix;
|
||||
|
@ -258,12 +194,12 @@ static __always_inline unsigned long memory_is_zero(const void *start,
|
|||
words = (end - start) / 8;
|
||||
while (words) {
|
||||
if (unlikely(*(u64 *)start))
|
||||
return bytes_is_zero(start, 8);
|
||||
return bytes_is_nonzero(start, 8);
|
||||
start += 8;
|
||||
words--;
|
||||
}
|
||||
|
||||
return bytes_is_zero(start, (end - start) % 8);
|
||||
return bytes_is_nonzero(start, (end - start) % 8);
|
||||
}
|
||||
|
||||
static __always_inline bool memory_is_poisoned_n(unsigned long addr,
|
||||
|
@ -271,7 +207,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
|
|||
{
|
||||
unsigned long ret;
|
||||
|
||||
ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
|
||||
ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
|
||||
kasan_mem_to_shadow((void *)addr + size - 1) + 1);
|
||||
|
||||
if (unlikely(ret)) {
|
||||
|
@ -292,11 +228,9 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
|
|||
case 1:
|
||||
return memory_is_poisoned_1(addr);
|
||||
case 2:
|
||||
return memory_is_poisoned_2(addr);
|
||||
case 4:
|
||||
return memory_is_poisoned_4(addr);
|
||||
case 8:
|
||||
return memory_is_poisoned_8(addr);
|
||||
return memory_is_poisoned_2_4_8(addr, size);
|
||||
case 16:
|
||||
return memory_is_poisoned_16(addr);
|
||||
default:
|
||||
|
@ -803,17 +737,47 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size)
|
|||
EXPORT_SYMBOL(__asan_unpoison_stack_memory);
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG
|
||||
static int kasan_mem_notifier(struct notifier_block *nb,
|
||||
static int __meminit kasan_mem_notifier(struct notifier_block *nb,
|
||||
unsigned long action, void *data)
|
||||
{
|
||||
return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
|
||||
struct memory_notify *mem_data = data;
|
||||
unsigned long nr_shadow_pages, start_kaddr, shadow_start;
|
||||
unsigned long shadow_end, shadow_size;
|
||||
|
||||
nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
|
||||
start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
|
||||
shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
|
||||
shadow_size = nr_shadow_pages << PAGE_SHIFT;
|
||||
shadow_end = shadow_start + shadow_size;
|
||||
|
||||
if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
|
||||
WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
|
||||
return NOTIFY_BAD;
|
||||
|
||||
switch (action) {
|
||||
case MEM_GOING_ONLINE: {
|
||||
void *ret;
|
||||
|
||||
ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
|
||||
shadow_end, GFP_KERNEL,
|
||||
PAGE_KERNEL, VM_NO_GUARD,
|
||||
pfn_to_nid(mem_data->start_pfn),
|
||||
__builtin_return_address(0));
|
||||
if (!ret)
|
||||
return NOTIFY_BAD;
|
||||
|
||||
kmemleak_ignore(ret);
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
case MEM_OFFLINE:
|
||||
vfree((void *)shadow_start);
|
||||
}
|
||||
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
static int __init kasan_memhotplug_init(void)
|
||||
{
|
||||
pr_info("WARNING: KASAN doesn't support memory hot-add\n");
|
||||
pr_info("Memory hot-add will be disabled\n");
|
||||
|
||||
hotplug_memory_notifier(kasan_mem_notifier, 0);
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -118,6 +118,18 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr,
|
|||
|
||||
do {
|
||||
next = p4d_addr_end(addr, end);
|
||||
if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) {
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
|
||||
pud = pud_offset(p4d, addr);
|
||||
pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
|
||||
pmd = pmd_offset(pud, addr);
|
||||
pmd_populate_kernel(&init_mm, pmd,
|
||||
lm_alias(kasan_zero_pte));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (p4d_none(*p4d)) {
|
||||
p4d_populate(&init_mm, p4d,
|
||||
|
|
|
@ -107,7 +107,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
|
|||
return bug_type;
|
||||
}
|
||||
|
||||
const char *get_wild_bug_type(struct kasan_access_info *info)
|
||||
static const char *get_wild_bug_type(struct kasan_access_info *info)
|
||||
{
|
||||
const char *bug_type = "unknown-crash";
|
||||
|
||||
|
|
|
@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
|
|||
static bool hugepage_vma_check(struct vm_area_struct *vma)
|
||||
{
|
||||
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
|
||||
(vma->vm_flags & VM_NOHUGEPAGE))
|
||||
(vma->vm_flags & VM_NOHUGEPAGE) ||
|
||||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
|
||||
return false;
|
||||
if (shmem_file(vma->vm_file)) {
|
||||
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
|
||||
|
|
|
@ -117,6 +117,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
|
|||
l = list_lru_from_kmem(nlru, item);
|
||||
list_add_tail(item, &l->list);
|
||||
l->nr_items++;
|
||||
nlru->nr_items++;
|
||||
spin_unlock(&nlru->lock);
|
||||
return true;
|
||||
}
|
||||
|
@ -136,6 +137,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
|
|||
l = list_lru_from_kmem(nlru, item);
|
||||
list_del_init(item);
|
||||
l->nr_items--;
|
||||
nlru->nr_items--;
|
||||
spin_unlock(&nlru->lock);
|
||||
return true;
|
||||
}
|
||||
|
@ -183,15 +185,10 @@ EXPORT_SYMBOL_GPL(list_lru_count_one);
|
|||
|
||||
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
|
||||
{
|
||||
long count = 0;
|
||||
int memcg_idx;
|
||||
struct list_lru_node *nlru;
|
||||
|
||||
count += __list_lru_count_one(lru, nid, -1);
|
||||
if (list_lru_memcg_aware(lru)) {
|
||||
for_each_memcg_cache_index(memcg_idx)
|
||||
count += __list_lru_count_one(lru, nid, memcg_idx);
|
||||
}
|
||||
return count;
|
||||
nlru = &lru->node[nid];
|
||||
return nlru->nr_items;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(list_lru_count_node);
|
||||
|
||||
|
@ -226,6 +223,7 @@ restart:
|
|||
assert_spin_locked(&nlru->lock);
|
||||
case LRU_REMOVED:
|
||||
isolated++;
|
||||
nlru->nr_items--;
|
||||
/*
|
||||
* If the lru lock has been dropped, our list
|
||||
* traversal is now invalid and so we have to
|
||||
|
|
mm/madvise.c
|
@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
|
|||
continue;
|
||||
|
||||
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
|
||||
vma, index);
|
||||
vma, index, false);
|
||||
if (page)
|
||||
put_page(page);
|
||||
}
|
||||
|
@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
|
|||
}
|
||||
swap = radix_to_swp_entry(page);
|
||||
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
|
||||
NULL, 0);
|
||||
NULL, 0, false);
|
||||
if (page)
|
||||
put_page(page);
|
||||
}
|
||||
|
@ -451,9 +451,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
|
|||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct mmu_gather tlb;
|
||||
|
||||
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
|
||||
return -EINVAL;
|
||||
|
||||
/* MADV_FREE works for only anon vma at the moment */
|
||||
if (!vma_is_anonymous(vma))
|
||||
return -EINVAL;
|
||||
|
@ -477,14 +474,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
|
|||
return 0;
|
||||
}
|
||||
|
||||
static long madvise_free(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
*prev = vma;
|
||||
return madvise_free_single_vma(vma, start, end);
|
||||
}
|
||||
|
||||
/*
|
||||
* Application no longer needs these pages. If the pages are dirty,
|
||||
* it's OK to just throw them away. The app will be more careful about
|
||||
|
@ -504,9 +493,17 @@ static long madvise_free(struct vm_area_struct *vma,
|
|||
* An interface that causes the system to free clean pages and flush
|
||||
* dirty pages is already available as msync(MS_INVALIDATE).
|
||||
*/
|
||||
static long madvise_dontneed(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end)
|
||||
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
zap_page_range(vma, start, end - start);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long madvise_dontneed_free(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end,
|
||||
int behavior)
|
||||
{
|
||||
*prev = vma;
|
||||
if (!can_madv_dontneed_vma(vma))
|
||||
|
@ -526,7 +523,8 @@ static long madvise_dontneed(struct vm_area_struct *vma,
|
|||
* is also < vma->vm_end. If start <
|
||||
* vma->vm_start it means an hole materialized
|
||||
* in the user address space within the
|
||||
* virtual range passed to MADV_DONTNEED.
|
||||
* virtual range passed to MADV_DONTNEED
|
||||
* or MADV_FREE.
|
||||
*/
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
@ -537,7 +535,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
|
|||
* Don't fail if end > vma->vm_end. If the old
|
||||
* vma was splitted while the mmap_sem was
|
||||
* released the effect of the concurrent
|
||||
* operation may not cause MADV_DONTNEED to
|
||||
* operation may not cause madvise() to
|
||||
* have an undefined result. There may be an
|
||||
* adjacent next vma that we'll walk
|
||||
* next. userfaultfd_remove() will generate an
|
||||
|
@ -549,8 +547,13 @@ static long madvise_dontneed(struct vm_area_struct *vma,
|
|||
}
|
||||
VM_WARN_ON(start >= end);
|
||||
}
|
||||
zap_page_range(vma, start, end - start);
|
||||
return 0;
|
||||
|
||||
if (behavior == MADV_DONTNEED)
|
||||
return madvise_dontneed_single_vma(vma, start, end);
|
||||
else if (behavior == MADV_FREE)
|
||||
return madvise_free_single_vma(vma, start, end);
|
||||
else
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -656,9 +659,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
|||
case MADV_WILLNEED:
|
||||
return madvise_willneed(vma, prev, start, end);
|
||||
case MADV_FREE:
|
||||
return madvise_free(vma, prev, start, end);
|
||||
case MADV_DONTNEED:
|
||||
return madvise_dontneed(vma, prev, start, end);
|
||||
return madvise_dontneed_free(vma, prev, start, end, behavior);
|
||||
default:
|
||||
return madvise_behavior(vma, prev, start, end, behavior);
|
||||
}
|
||||
|
|
|
@ -631,7 +631,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
|
|||
val = __this_cpu_read(memcg->stat->nr_page_events);
|
||||
next = __this_cpu_read(memcg->stat->targets[target]);
|
||||
/* from time_after() in jiffies.h */
|
||||
if ((long)next - (long)val < 0) {
|
||||
if ((long)(next - val) < 0) {
|
||||
switch (target) {
|
||||
case MEM_CGROUP_TARGET_THRESH:
|
||||
next = val + THRESHOLDS_EVENTS_TARGET;
|
||||
|
@ -5317,38 +5317,52 @@ struct cgroup_subsys memory_cgrp_subsys = {
|
|||
|
||||
/**
|
||||
* mem_cgroup_low - check if memory consumption is below the normal range
|
||||
* @root: the highest ancestor to consider
|
||||
* @root: the top ancestor of the sub-tree being checked
|
||||
* @memcg: the memory cgroup to check
|
||||
*
|
||||
* Returns %true if memory consumption of @memcg, and that of all
|
||||
* configurable ancestors up to @root, is below the normal range.
|
||||
* ancestors up to (but not including) @root, is below the normal range.
|
||||
*
|
||||
* @root is exclusive; it is never low when looked at directly and isn't
|
||||
* checked when traversing the hierarchy.
|
||||
*
|
||||
* Excluding @root enables using memory.low to prioritize memory usage
|
||||
* between cgroups within a subtree of the hierarchy that is limited by
|
||||
* memory.high or memory.max.
|
||||
*
|
||||
* For example, given cgroup A with children B and C:
|
||||
*
|
||||
* A
|
||||
* / \
|
||||
* B C
|
||||
*
|
||||
* and
|
||||
*
|
||||
* 1. A/memory.current > A/memory.high
|
||||
* 2. A/B/memory.current < A/B/memory.low
|
||||
* 3. A/C/memory.current >= A/C/memory.low
|
||||
*
|
||||
* As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
|
||||
* should reclaim from 'C' until 'A' is no longer high or until we can
|
||||
* no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
|
||||
* mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
|
||||
* low and we will reclaim indiscriminately from both 'B' and 'C'.
|
||||
*/
|
||||
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
|
||||
{
|
||||
if (mem_cgroup_disabled())
|
||||
return false;
|
||||
|
||||
/*
|
||||
* The toplevel group doesn't have a configurable range, so
|
||||
* it's never low when looked at directly, and it is not
|
||||
* considered an ancestor when assessing the hierarchy.
|
||||
*/
|
||||
|
||||
if (memcg == root_mem_cgroup)
|
||||
if (!root)
|
||||
root = root_mem_cgroup;
|
||||
if (memcg == root)
|
||||
return false;
|
||||
|
||||
if (page_counter_read(&memcg->memory) >= memcg->low)
|
||||
return false;
|
||||
|
||||
while (memcg != root) {
|
||||
memcg = parent_mem_cgroup(memcg);
|
||||
|
||||
if (memcg == root_mem_cgroup)
|
||||
break;
|
||||
|
||||
for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
|
||||
if (page_counter_read(&memcg->memory) >= memcg->low)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
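The A/B/C scenario in the comment above can be reproduced with a toy user-space model. The struct and the numbers below are invented for illustration; only the shape of the walk (check the cgroup and every ancestor strictly below @root, never @root itself) mirrors the new mem_cgroup_low().

/* Toy model, assumption-laden, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct cg {
	const char *name;
	struct cg *parent;
	unsigned long usage;	/* stands in for page_counter_read() */
	unsigned long low;	/* stands in for memcg->low */
};

static bool cg_low(struct cg *root, struct cg *memcg)
{
	if (memcg == root)
		return false;	/* @root itself is never "low" */

	for (; memcg != root; memcg = memcg->parent)
		if (memcg->usage >= memcg->low)
			return false;

	return true;
}

int main(void)
{
	struct cg A = { "A", NULL, 900, 0 };	/* over memory.high in the example */
	struct cg B = { "B", &A, 100, 200 };	/* under its memory.low */
	struct cg C = { "C", &A, 300, 200 };	/* over its memory.low */

	printf("B low? %d  C low? %d\n", cg_low(&A, &B), cg_low(&A, &C));
	/* prints "B low? 1  C low? 0": reclaim pressure should go to C */
	return 0;
}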
@ -49,7 +49,6 @@
|
|||
#include <linux/swap.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/migrate.h>
|
||||
#include <linux/page-isolation.h>
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/swapops.h>
|
||||
|
@ -555,6 +554,39 @@ static int delete_from_lru_cache(struct page *p)
|
|||
return -EIO;
|
||||
}
|
||||
|
||||
static int truncate_error_page(struct page *p, unsigned long pfn,
|
||||
struct address_space *mapping)
|
||||
{
|
||||
int ret = MF_FAILED;
|
||||
|
||||
if (mapping->a_ops->error_remove_page) {
|
||||
int err = mapping->a_ops->error_remove_page(mapping, p);
|
||||
|
||||
if (err != 0) {
|
||||
pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
|
||||
pfn, err);
|
||||
} else if (page_has_private(p) &&
|
||||
!try_to_release_page(p, GFP_NOIO)) {
|
||||
pr_info("Memory failure: %#lx: failed to release buffers\n",
|
||||
pfn);
|
||||
} else {
|
||||
ret = MF_RECOVERED;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If the file system doesn't support it just invalidate
|
||||
* This fails on dirty or anything with private pages
|
||||
*/
|
||||
if (invalidate_inode_page(p))
|
||||
ret = MF_RECOVERED;
|
||||
else
|
||||
pr_info("Memory failure: %#lx: Failed to invalidate\n",
|
||||
pfn);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Error hit kernel page.
|
||||
* Do nothing, try to be lucky and not touch this instead. For a few cases we
|
||||
|
@ -579,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
|
|||
*/
|
||||
static int me_pagecache_clean(struct page *p, unsigned long pfn)
|
||||
{
|
||||
int err;
|
||||
int ret = MF_FAILED;
|
||||
struct address_space *mapping;
|
||||
|
||||
delete_from_lru_cache(p);
|
||||
|
@ -612,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
|
|||
*
|
||||
* Open: to take i_mutex or not for this? Right now we don't.
|
||||
*/
|
||||
if (mapping->a_ops->error_remove_page) {
|
||||
err = mapping->a_ops->error_remove_page(mapping, p);
|
||||
if (err != 0) {
|
||||
pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
|
||||
pfn, err);
|
||||
} else if (page_has_private(p) &&
|
||||
!try_to_release_page(p, GFP_NOIO)) {
|
||||
pr_info("Memory failure: %#lx: failed to release buffers\n",
|
||||
pfn);
|
||||
} else {
|
||||
ret = MF_RECOVERED;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* If the file system doesn't support it just invalidate
|
||||
* This fails on dirty or anything with private pages
|
||||
*/
|
||||
if (invalidate_inode_page(p))
|
||||
ret = MF_RECOVERED;
|
||||
else
|
||||
pr_info("Memory failure: %#lx: Failed to invalidate\n",
|
||||
pfn);
|
||||
}
|
||||
return ret;
|
||||
return truncate_error_page(p, pfn, mapping);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -741,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn)
|
|||
{
|
||||
int res = 0;
|
||||
struct page *hpage = compound_head(p);
|
||||
struct address_space *mapping;
|
||||
|
||||
if (!PageHuge(hpage))
|
||||
return MF_DELAYED;
|
||||
|
||||
/*
|
||||
* We can safely recover from error on free or reserved (i.e.
|
||||
* not in-use) hugepage by dequeuing it from freelist.
|
||||
* To check whether a hugepage is in-use or not, we can't use
|
||||
* page->lru because it can be used in other hugepage operations,
|
||||
* such as __unmap_hugepage_range() and gather_surplus_pages().
|
||||
* So instead we use page_mapping() and PageAnon().
|
||||
*/
|
||||
if (!(page_mapping(hpage) || PageAnon(hpage))) {
|
||||
res = dequeue_hwpoisoned_huge_page(hpage);
|
||||
if (!res)
|
||||
return MF_RECOVERED;
|
||||
mapping = page_mapping(hpage);
|
||||
if (mapping) {
|
||||
res = truncate_error_page(hpage, pfn, mapping);
|
||||
} else {
|
||||
unlock_page(hpage);
|
||||
/*
|
||||
* migration entry prevents later access on error anonymous
|
||||
* hugepage, so we can free and dissolve it into buddy to
|
||||
* save healthy subpages.
|
||||
*/
|
||||
if (PageAnon(hpage))
|
||||
put_page(hpage);
|
||||
dissolve_free_huge_page(p);
|
||||
res = MF_RECOVERED;
|
||||
lock_page(hpage);
|
||||
}
|
||||
return MF_DELAYED;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -857,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p,
|
|||
count = page_count(p) - 1;
|
||||
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
|
||||
count--;
|
||||
if (count != 0) {
|
||||
if (count > 0) {
|
||||
pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
|
||||
pfn, action_page_types[ps->type], count);
|
||||
result = MF_FAILED;
|
||||
|
@ -1010,20 +1022,84 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
|
|||
return unmap_success;
|
||||
}
|
||||
|
||||
static void set_page_hwpoison_huge_page(struct page *hpage)
|
||||
static int identify_page_state(unsigned long pfn, struct page *p,
|
||||
unsigned long page_flags)
|
||||
{
|
||||
int i;
|
||||
int nr_pages = 1 << compound_order(hpage);
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
SetPageHWPoison(hpage + i);
|
||||
struct page_state *ps;
|
||||
|
||||
/*
|
||||
* The first check uses the current page flags which may not have any
|
||||
* relevant information. The second check with the saved page flags is
|
||||
* carried out only if the first check can't determine the page status.
|
||||
*/
|
||||
for (ps = error_states;; ps++)
|
||||
if ((p->flags & ps->mask) == ps->res)
|
||||
break;
|
||||
|
||||
page_flags |= (p->flags & (1UL << PG_dirty));
|
||||
|
||||
if (!ps->mask)
|
||||
for (ps = error_states;; ps++)
|
||||
if ((page_flags & ps->mask) == ps->res)
|
||||
break;
|
||||
return page_action(ps, p, pfn);
|
||||
}
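identify_page_state() reuses the classic error_states idiom: scan a table of (mask, res) pairs, most specific first and terminated by a catch-all entry, then retry with the flags saved before unmapping if the live flags match nothing specific. Below is a self-contained sketch of that idiom with made-up flag bits, not the kernel's real page-flags layout.

/* Stand-alone sketch of the (mask, res) table lookup. */
#include <stdio.h>

#define PG_DIRTY	(1UL << 0)
#define PG_LRU		(1UL << 1)
#define PG_SWAPCACHE	(1UL << 2)

struct page_state {
	unsigned long mask;
	unsigned long res;
	const char *msg;
};

/* Most specific entries first; the catch-all (mask == 0) terminates the scan. */
static const struct page_state error_states[] = {
	{ PG_SWAPCACHE | PG_DIRTY, PG_SWAPCACHE | PG_DIRTY, "dirty swapcache" },
	{ PG_LRU | PG_DIRTY,       PG_LRU | PG_DIRTY,       "dirty LRU"       },
	{ PG_LRU,                  PG_LRU,                  "clean LRU"       },
	{ 0,                       0,                       "unknown"         },
};

static const char *classify(unsigned long live, unsigned long saved)
{
	const struct page_state *ps;

	for (ps = error_states; ; ps++)
		if ((live & ps->mask) == ps->res)
			break;

	/* Fall back to the flags saved before the page was unmapped. */
	if (!ps->mask)
		for (ps = error_states; ; ps++)
			if ((saved & ps->mask) == ps->res)
				break;

	return ps->msg;
}

int main(void)
{
	printf("%s\n", classify(0, PG_LRU | PG_DIRTY));	/* "dirty LRU" */
	printf("%s\n", classify(PG_LRU, 0));		/* "clean LRU" */
	return 0;
}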
|
||||
|
||||
static void clear_page_hwpoison_huge_page(struct page *hpage)
|
||||
static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags)
|
||||
{
|
||||
int i;
|
||||
int nr_pages = 1 << compound_order(hpage);
|
||||
for (i = 0; i < nr_pages; i++)
|
||||
ClearPageHWPoison(hpage + i);
|
||||
struct page *p = pfn_to_page(pfn);
|
||||
struct page *head = compound_head(p);
|
||||
int res;
|
||||
unsigned long page_flags;
|
||||
|
||||
if (TestSetPageHWPoison(head)) {
|
||||
pr_err("Memory failure: %#lx: already hardware poisoned\n",
|
||||
pfn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
num_poisoned_pages_inc();
|
||||
|
||||
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
|
||||
/*
|
||||
* Check "filter hit" and "race with other subpage."
|
||||
*/
|
||||
lock_page(head);
|
||||
if (PageHWPoison(head)) {
|
||||
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|
||||
|| (p != head && TestSetPageHWPoison(head))) {
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(head);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
unlock_page(head);
|
||||
dissolve_free_huge_page(p);
|
||||
action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
|
||||
return 0;
|
||||
}
|
||||
|
||||
lock_page(head);
|
||||
page_flags = head->flags;
|
||||
|
||||
if (!PageHWPoison(head)) {
|
||||
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(head);
|
||||
put_hwpoison_page(head);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) {
|
||||
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
|
||||
res = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
res = identify_page_state(pfn, p, page_flags);
|
||||
out:
|
||||
unlock_page(head);
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1046,12 +1122,10 @@ static void clear_page_hwpoison_huge_page(struct page *hpage)
|
|||
*/
|
||||
int memory_failure(unsigned long pfn, int trapno, int flags)
|
||||
{
|
||||
struct page_state *ps;
|
||||
struct page *p;
|
||||
struct page *hpage;
|
||||
struct page *orig_head;
|
||||
int res;
|
||||
unsigned int nr_pages;
|
||||
unsigned long page_flags;
|
||||
|
||||
if (!sysctl_memory_failure_recovery)
|
||||
|
@ -1064,34 +1138,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
}
|
||||
|
||||
p = pfn_to_page(pfn);
|
||||
orig_head = hpage = compound_head(p);
|
||||
if (PageHuge(p))
|
||||
return memory_failure_hugetlb(pfn, trapno, flags);
|
||||
if (TestSetPageHWPoison(p)) {
|
||||
pr_err("Memory failure: %#lx: already hardware poisoned\n",
|
||||
pfn);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Currently errors on hugetlbfs pages are measured in hugepage units,
|
||||
* so nr_pages should be 1 << compound_order. OTOH when errors are on
|
||||
* transparent hugepages, they are supposed to be split and error
|
||||
* measurement is done in normal page units. So nr_pages should be one
|
||||
* in this case.
|
||||
*/
|
||||
if (PageHuge(p))
|
||||
nr_pages = 1 << compound_order(hpage);
|
||||
else /* normal page or thp */
|
||||
nr_pages = 1;
|
||||
num_poisoned_pages_add(nr_pages);
|
||||
orig_head = hpage = compound_head(p);
|
||||
num_poisoned_pages_inc();
|
||||
|
||||
/*
|
||||
* We need/can do nothing about count=0 pages.
|
||||
* 1) it's a free page, and therefore in safe hand:
|
||||
* prep_new_page() will be the gate keeper.
|
||||
* 2) it's a free hugepage, which is also safe:
|
||||
* an affected hugepage will be dequeued from hugepage freelist,
|
||||
* so there's no concern about reusing it ever after.
|
||||
* 3) it's part of a non-compound high order page.
|
||||
* 2) it's part of a non-compound high order page.
|
||||
* Implies some kernel user: cannot stop them from
|
||||
* R/W the page; let's pray that the page has been
|
||||
* used and will be freed some time later.
|
||||
|
@ -1102,32 +1164,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
if (is_free_buddy_page(p)) {
|
||||
action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
|
||||
return 0;
|
||||
} else if (PageHuge(hpage)) {
|
||||
/*
|
||||
* Check "filter hit" and "race with other subpage."
|
||||
*/
|
||||
lock_page(hpage);
|
||||
if (PageHWPoison(hpage)) {
|
||||
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|
||||
|| (p != hpage && TestSetPageHWPoison(hpage))) {
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
unlock_page(hpage);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
res = dequeue_hwpoisoned_huge_page(hpage);
|
||||
action_result(pfn, MF_MSG_FREE_HUGE,
|
||||
res ? MF_IGNORED : MF_DELAYED);
|
||||
unlock_page(hpage);
|
||||
return res;
|
||||
} else {
|
||||
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
|
||||
return -EBUSY;
|
||||
}
|
||||
}
|
||||
|
||||
if (!PageHuge(p) && PageTransHuge(hpage)) {
|
||||
if (PageTransHuge(hpage)) {
|
||||
lock_page(p);
|
||||
if (!PageAnon(p) || unlikely(split_huge_page(p))) {
|
||||
unlock_page(p);
|
||||
|
@ -1138,7 +1181,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
pr_err("Memory failure: %#lx: thp split failed\n",
|
||||
pfn);
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
num_poisoned_pages_dec();
|
||||
put_hwpoison_page(p);
|
||||
return -EBUSY;
|
||||
}
|
||||
|
@ -1165,7 +1208,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
return 0;
|
||||
}
|
||||
|
||||
lock_page(hpage);
|
||||
lock_page(p);
|
||||
|
||||
/*
|
||||
* The page could have changed compound pages during the locking.
|
||||
|
@ -1194,41 +1237,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
*/
|
||||
if (!PageHWPoison(p)) {
|
||||
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
unlock_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(p);
|
||||
put_hwpoison_page(p);
|
||||
return 0;
|
||||
}
|
||||
if (hwpoison_filter(p)) {
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
unlock_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(p);
|
||||
put_hwpoison_page(p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
|
||||
if (!PageTransTail(p) && !PageLRU(p))
|
||||
goto identify_page_state;
|
||||
|
||||
/*
|
||||
* For error on the tail page, we should set PG_hwpoison
|
||||
* on the head page to show that the hugepage is hwpoisoned
|
||||
*/
|
||||
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
|
||||
action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
|
||||
unlock_page(hpage);
|
||||
put_hwpoison_page(hpage);
|
||||
return 0;
|
||||
}
|
||||
/*
|
||||
* Set PG_hwpoison on all pages in an error hugepage,
|
||||
* because containment is done in hugepage unit for now.
|
||||
* Since we have done TestSetPageHWPoison() for the head page with
|
||||
* page lock held, we can safely set PG_hwpoison bits on tail pages.
|
||||
*/
|
||||
if (PageHuge(p))
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
|
||||
/*
|
||||
* It's very difficult to mess with pages currently under IO
|
||||
* and in many cases impossible, so we just avoid it here.
|
||||
|
@ -1258,25 +1282,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
|
|||
}
|
||||
|
||||
identify_page_state:
|
||||
res = -EBUSY;
|
||||
/*
|
||||
* The first check uses the current page flags which may not have any
|
||||
* relevant information. The second check with the saved page flags is
|
||||
* carried out only if the first check can't determine the page status.
|
||||
*/
|
||||
for (ps = error_states;; ps++)
|
||||
if ((p->flags & ps->mask) == ps->res)
|
||||
break;
|
||||
|
||||
page_flags |= (p->flags & (1UL << PG_dirty));
|
||||
|
||||
if (!ps->mask)
|
||||
for (ps = error_states;; ps++)
|
||||
if ((page_flags & ps->mask) == ps->res)
|
||||
break;
|
||||
res = page_action(ps, p, pfn);
|
||||
res = identify_page_state(pfn, p, page_flags);
|
||||
out:
|
||||
unlock_page(hpage);
|
||||
unlock_page(p);
|
||||
return res;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memory_failure);
|
||||
|
@ -1398,7 +1406,6 @@ int unpoison_memory(unsigned long pfn)
|
|||
struct page *page;
|
||||
struct page *p;
|
||||
int freeit = 0;
|
||||
unsigned int nr_pages;
|
||||
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
|
||||
|
@ -1443,20 +1450,7 @@ int unpoison_memory(unsigned long pfn)
|
|||
return 0;
|
||||
}
|
||||
|
||||
nr_pages = 1 << compound_order(page);
|
||||
|
||||
if (!get_hwpoison_page(p)) {
|
||||
/*
|
||||
* Since HWPoisoned hugepage should have non-zero refcount,
|
||||
* race between memory failure and unpoison seems to happen.
|
||||
* In such case unpoison fails and memory failure runs
|
||||
* to the end.
|
||||
*/
|
||||
if (PageHuge(page)) {
|
||||
unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_dec();
|
||||
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
|
||||
|
@ -1474,10 +1468,8 @@ int unpoison_memory(unsigned long pfn)
|
|||
if (TestClearPageHWPoison(page)) {
|
||||
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
num_poisoned_pages_sub(nr_pages);
|
||||
num_poisoned_pages_dec();
|
||||
freeit = 1;
|
||||
if (PageHuge(page))
|
||||
clear_page_hwpoison_huge_page(page);
|
||||
}
|
||||
unlock_page(page);
|
||||
|
||||
|
@ -1492,16 +1484,8 @@ EXPORT_SYMBOL(unpoison_memory);
|
|||
static struct page *new_page(struct page *p, unsigned long private, int **x)
|
||||
{
|
||||
int nid = page_to_nid(p);
|
||||
if (PageHuge(p)) {
|
||||
struct hstate *hstate = page_hstate(compound_head(p));
|
||||
|
||||
if (hstate_is_gigantic(hstate))
|
||||
return alloc_huge_page_node(hstate, NUMA_NO_NODE);
|
||||
|
||||
return alloc_huge_page_node(hstate, nid);
|
||||
} else {
|
||||
return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
|
||||
}
|
||||
return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1608,15 +1592,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
|
|||
if (ret > 0)
|
||||
ret = -EIO;
|
||||
} else {
|
||||
/* overcommit hugetlb page will be freed to buddy */
|
||||
if (PageHuge(page)) {
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
dequeue_hwpoisoned_huge_page(hpage);
|
||||
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||
} else {
|
||||
SetPageHWPoison(page);
|
||||
num_poisoned_pages_inc();
|
||||
}
|
||||
if (PageHuge(page))
|
||||
dissolve_free_huge_page(page);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -1732,15 +1709,12 @@ static int soft_offline_in_use_page(struct page *page, int flags)
|
|||
|
||||
static void soft_offline_free_page(struct page *page)
|
||||
{
|
||||
if (PageHuge(page)) {
|
||||
struct page *hpage = compound_head(page);
|
||||
struct page *head = compound_head(page);
|
||||
|
||||
set_page_hwpoison_huge_page(hpage);
|
||||
if (!dequeue_hwpoisoned_huge_page(hpage))
|
||||
num_poisoned_pages_add(1 << compound_order(hpage));
|
||||
} else {
|
||||
if (!TestSetPageHWPoison(page))
|
||||
num_poisoned_pages_inc();
|
||||
if (!TestSetPageHWPoison(head)) {
|
||||
num_poisoned_pages_inc();
|
||||
if (PageHuge(head))
|
||||
dissolve_free_huge_page(page);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -3262,14 +3262,14 @@ static int fault_around_bytes_set(void *data, u64 val)
|
|||
fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */
|
||||
return 0;
|
||||
}
|
||||
DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops,
|
||||
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
|
||||
fault_around_bytes_get, fault_around_bytes_set, "%llu\n");
|
||||
|
||||
static int __init fault_around_debugfs(void)
|
||||
{
|
||||
void *ret;
|
||||
|
||||
ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL,
|
||||
ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
|
||||
&fault_around_bytes_fops);
|
||||
if (!ret)
|
||||
pr_warn("Failed to create fault_around_bytes in debugfs");
|
||||
|
|
|
@ -52,32 +52,17 @@ static void generic_online_page(struct page *page);
|
|||
static online_page_callback_t online_page_callback = generic_online_page;
|
||||
static DEFINE_MUTEX(online_page_callback_lock);
|
||||
|
||||
/* The same as the cpu_hotplug lock, but for memory hotplug. */
|
||||
static struct {
|
||||
struct task_struct *active_writer;
|
||||
struct mutex lock; /* Synchronizes accesses to refcount, */
|
||||
/*
|
||||
* Also blocks the new readers during
|
||||
* an ongoing mem hotplug operation.
|
||||
*/
|
||||
int refcount;
|
||||
DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
struct lockdep_map dep_map;
|
||||
#endif
|
||||
} mem_hotplug = {
|
||||
.active_writer = NULL,
|
||||
.lock = __MUTEX_INITIALIZER(mem_hotplug.lock),
|
||||
.refcount = 0,
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
.dep_map = {.name = "mem_hotplug.lock" },
|
||||
#endif
|
||||
};
|
||||
void get_online_mems(void)
|
||||
{
|
||||
percpu_down_read(&mem_hotplug_lock);
|
||||
}
|
||||
|
||||
/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */
|
||||
#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map)
|
||||
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
|
||||
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
|
||||
void put_online_mems(void)
|
||||
{
|
||||
percpu_up_read(&mem_hotplug_lock);
|
||||
}
|
||||
|
||||
bool movable_node_enabled = false;
|
||||
|
||||
|
@ -99,60 +84,16 @@ static int __init setup_memhp_default_state(char *str)
|
|||
}
|
||||
__setup("memhp_default_state=", setup_memhp_default_state);
|
||||
|
||||
void get_online_mems(void)
|
||||
{
|
||||
might_sleep();
|
||||
if (mem_hotplug.active_writer == current)
|
||||
return;
|
||||
memhp_lock_acquire_read();
|
||||
mutex_lock(&mem_hotplug.lock);
|
||||
mem_hotplug.refcount++;
|
||||
mutex_unlock(&mem_hotplug.lock);
|
||||
|
||||
}
|
||||
|
||||
void put_online_mems(void)
|
||||
{
|
||||
if (mem_hotplug.active_writer == current)
|
||||
return;
|
||||
mutex_lock(&mem_hotplug.lock);
|
||||
|
||||
if (WARN_ON(!mem_hotplug.refcount))
|
||||
mem_hotplug.refcount++; /* try to fix things up */
|
||||
|
||||
if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer))
|
||||
wake_up_process(mem_hotplug.active_writer);
|
||||
mutex_unlock(&mem_hotplug.lock);
|
||||
memhp_lock_release();
|
||||
|
||||
}
|
||||
|
||||
/* Serializes write accesses to mem_hotplug.active_writer. */
|
||||
static DEFINE_MUTEX(memory_add_remove_lock);
|
||||
|
||||
void mem_hotplug_begin(void)
|
||||
{
|
||||
mutex_lock(&memory_add_remove_lock);
|
||||
|
||||
mem_hotplug.active_writer = current;
|
||||
|
||||
memhp_lock_acquire();
|
||||
for (;;) {
|
||||
mutex_lock(&mem_hotplug.lock);
|
||||
if (likely(!mem_hotplug.refcount))
|
||||
break;
|
||||
__set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
mutex_unlock(&mem_hotplug.lock);
|
||||
schedule();
|
||||
}
|
||||
cpus_read_lock();
|
||||
percpu_down_write(&mem_hotplug_lock);
|
||||
}
|
||||
|
||||
void mem_hotplug_done(void)
|
||||
{
|
||||
mem_hotplug.active_writer = NULL;
|
||||
mutex_unlock(&mem_hotplug.lock);
|
||||
memhp_lock_release();
|
||||
mutex_unlock(&memory_add_remove_lock);
|
||||
percpu_up_write(&mem_hotplug_lock);
|
||||
cpus_read_unlock();
|
||||
}
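The hand-rolled refcount-plus-mutex scheme is gone; readers and the hotplug writer now go through one percpu_rw_semaphore. The user-space analogue below uses a pthread rwlock to show the same protocol: many concurrent readers via get/put_online_mems(), one exclusive writer per hotplug operation. The function names mirror the kernel ones purely for readability. Build with cc -pthread.

/* User-space analogue of the simplified locking scheme, not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mem_hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

static void get_online_mems(void)   { pthread_rwlock_rdlock(&mem_hotplug_lock); }
static void put_online_mems(void)   { pthread_rwlock_unlock(&mem_hotplug_lock); }
static void mem_hotplug_begin(void) { pthread_rwlock_wrlock(&mem_hotplug_lock); }
static void mem_hotplug_done(void)  { pthread_rwlock_unlock(&mem_hotplug_lock); }

static void *reader(void *arg)
{
	get_online_mems();
	printf("reader %ld sees a stable memory layout\n", (long)arg);
	put_online_mems();
	return NULL;
}

int main(void)
{
	pthread_t t[2];

	for (long i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, reader, (void *)i);

	mem_hotplug_begin();	/* blocks until all readers have left */
	printf("hotplug writer runs exclusively\n");
	mem_hotplug_done();

	for (int i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	return 0;
}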
|
||||
|
||||
/* add this memory to iomem resource */
|
||||
|
@ -580,11 +521,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
|
|||
{
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
int nr_pages = PAGES_PER_SECTION;
|
||||
int zone_type;
|
||||
unsigned long flags;
|
||||
|
||||
zone_type = zone - pgdat->node_zones;
|
||||
|
||||
pgdat_resize_lock(zone->zone_pgdat, &flags);
|
||||
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
|
||||
shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
|
||||
|
@ -934,6 +872,19 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
|
|||
return &pgdat->node_zones[ZONE_NORMAL];
|
||||
}
|
||||
|
||||
static inline bool movable_pfn_range(int nid, struct zone *default_zone,
|
||||
unsigned long start_pfn, unsigned long nr_pages)
|
||||
{
|
||||
if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
|
||||
MMOP_ONLINE_KERNEL))
|
||||
return true;
|
||||
|
||||
if (!movable_node_is_enabled())
|
||||
return false;
|
||||
|
||||
return !zone_intersects(default_zone, start_pfn, nr_pages);
|
||||
}
|
||||
|
||||
/*
|
||||
* Associates the given pfn range with the given node and the zone appropriate
|
||||
* for the given online type.
|
||||
|
@ -949,10 +900,10 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
|
|||
/*
|
||||
* MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use
|
||||
* movable zone if that is not possible (e.g. we are within
|
||||
* or past the existing movable zone)
|
||||
* or past the existing movable zone). movable_node overrides
|
||||
* this default and defaults to movable zone
|
||||
*/
|
||||
if (!allow_online_pfn_range(nid, start_pfn, nr_pages,
|
||||
MMOP_ONLINE_KERNEL))
|
||||
if (movable_pfn_range(nid, zone, start_pfn, nr_pages))
|
||||
zone = movable_zone;
|
||||
} else if (online_type == MMOP_ONLINE_MOVABLE) {
|
||||
zone = &pgdat->node_zones[ZONE_MOVABLE];
|
||||
|
@ -1268,7 +1219,7 @@ register_fail:
|
|||
|
||||
error:
|
||||
/* rollback pgdat allocation and others */
|
||||
if (new_pgdat)
|
||||
if (new_pgdat && pgdat)
|
||||
rollback_node_hotadd(nid, pgdat);
|
||||
memblock_remove(start, size);
|
||||
|
||||
|
@ -1420,32 +1371,19 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
|
|||
static struct page *new_node_page(struct page *page, unsigned long private,
|
||||
int **result)
|
||||
{
|
||||
gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
|
||||
int nid = page_to_nid(page);
|
||||
nodemask_t nmask = node_states[N_MEMORY];
|
||||
struct page *new_page = NULL;
|
||||
|
||||
/*
|
||||
* TODO: allocate a destination hugepage from a nearest neighbor node,
|
||||
* accordance with memory policy of the user process if possible. For
|
||||
* now as a simple work-around, we use the next node for destination.
|
||||
* try to allocate from a different node but reuse this node if there
|
||||
* are no other online nodes to be used (e.g. we are offlining a part
|
||||
* of the only existing node)
|
||||
*/
|
||||
if (PageHuge(page))
|
||||
return alloc_huge_page_node(page_hstate(compound_head(page)),
|
||||
next_node_in(nid, nmask));
|
||||
|
||||
node_clear(nid, nmask);
|
||||
if (nodes_empty(nmask))
|
||||
node_set(nid, nmask);
|
||||
|
||||
if (PageHighMem(page)
|
||||
|| (zone_idx(page_zone(page)) == ZONE_MOVABLE))
|
||||
gfp_mask |= __GFP_HIGHMEM;
|
||||
|
||||
if (!nodes_empty(nmask))
|
||||
new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask);
|
||||
if (!new_page)
|
||||
new_page = __alloc_pages(gfp_mask, 0, nid);
|
||||
|
||||
return new_page;
|
||||
return new_page_nodemask(page, nid, &nmask);
|
||||
}
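new_node_page() now prefers any node other than the one being offlined and falls back to it only when no other node has memory. The sketch below models that nodemask juggling with a plain bitmask; pick_target_node() and its first-fit choice (via a GCC/Clang builtin) are illustrative, not the allocator's actual placement policy.

/* Toy sketch of the nodemask fallback, not the real allocator. */
#include <stdio.h>

static int pick_target_node(unsigned long nodes_with_memory, int src_nid)
{
	unsigned long nmask = nodes_with_memory & ~(1UL << src_nid);

	if (!nmask)			/* src is the only node with memory */
		nmask = 1UL << src_nid;

	/* pick the lowest-numbered allowed node, like a first-fit walk */
	return __builtin_ctzl(nmask);
}

int main(void)
{
	printf("%d\n", pick_target_node(0x3, 0));	/* nodes {0,1}, offlining 0 -> 1 */
	printf("%d\n", pick_target_node(0x1, 0));	/* only node 0 exists    -> 0 */
	return 0;
}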
|
||||
|
||||
#define NR_OFFLINE_AT_ONCE_PAGES (256)
|
||||
|
@ -1728,7 +1666,7 @@ repeat:
|
|||
goto failed_removal;
|
||||
ret = 0;
|
||||
if (drain) {
|
||||
lru_add_drain_all();
|
||||
lru_add_drain_all_cpuslocked();
|
||||
cond_resched();
|
||||
drain_all_pages(zone);
|
||||
}
|
||||
|
@ -1749,7 +1687,7 @@ repeat:
|
|||
}
|
||||
}
|
||||
/* drain all zone's lru pagevec, this is asynchronous... */
|
||||
lru_add_drain_all();
|
||||
lru_add_drain_all_cpuslocked();
|
||||
yield();
|
||||
/* drain pcp pages, this is synchronous. */
|
||||
drain_all_pages(zone);
|
||||
|
|
mm/migrate.c | 17
|
@ -1252,6 +1252,8 @@ put_anon:
|
|||
out:
|
||||
if (rc != -EAGAIN)
|
||||
putback_active_hugepage(hpage);
|
||||
if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
|
||||
num_poisoned_pages_inc();
|
||||
|
||||
/*
|
||||
* If migration was not successful and there's a freeing callback, use
|
||||
|
@ -1914,7 +1916,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
|||
int page_lru = page_is_file_cache(page);
|
||||
unsigned long mmun_start = address & HPAGE_PMD_MASK;
|
||||
unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
|
||||
pmd_t orig_entry;
|
||||
|
||||
/*
|
||||
* Rate-limit the amount of data that is being migrated to a node.
|
||||
|
@ -1957,8 +1958,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
|
|||
/* Recheck the target PMD */
|
||||
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
|
||||
fail_putback:
|
||||
if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
|
||||
spin_unlock(ptl);
|
||||
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
|
||||
|
||||
|
@ -1980,7 +1980,6 @@ fail_putback:
|
|||
goto out_unlock;
|
||||
}
|
||||
|
||||
orig_entry = *pmd;
|
||||
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
|
||||
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
|
||||
|
||||
|
@ -1997,15 +1996,7 @@ fail_putback:
|
|||
set_pmd_at(mm, mmun_start, pmd, entry);
|
||||
update_mmu_cache_pmd(vma, address, &entry);
|
||||
|
||||
if (page_count(page) != 2) {
|
||||
set_pmd_at(mm, mmun_start, pmd, orig_entry);
|
||||
flush_pmd_tlb_range(vma, mmun_start, mmun_end);
|
||||
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
|
||||
update_mmu_cache_pmd(vma, address, &entry);
|
||||
page_remove_rmap(new_page, true);
|
||||
goto fail_putback;
|
||||
}
|
||||
|
||||
page_ref_unfreeze(page, 2);
|
||||
mlock_migrate_page(new_page, page);
|
||||
page_remove_rmap(page, true);
|
||||
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
|
||||
|
|
mm/mmap.c | 19
|
@ -2177,7 +2177,6 @@ static int acct_stack_growth(struct vm_area_struct *vma,
|
|||
unsigned long size, unsigned long grow)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct rlimit *rlim = current->signal->rlim;
|
||||
unsigned long new_start;
|
||||
|
||||
/* address space limit tests */
|
||||
|
@ -2185,7 +2184,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
|
|||
return -ENOMEM;
|
||||
|
||||
/* Stack limit test */
|
||||
if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
|
||||
if (size > rlimit(RLIMIT_STACK))
|
||||
return -ENOMEM;
|
||||
|
||||
/* mlock limit tests */
|
||||
|
@ -2193,7 +2192,7 @@ static int acct_stack_growth(struct vm_area_struct *vma,
|
|||
unsigned long locked;
|
||||
unsigned long limit;
|
||||
locked = mm->locked_vm + grow;
|
||||
limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
|
||||
limit = rlimit(RLIMIT_MEMLOCK);
|
||||
limit >>= PAGE_SHIFT;
|
||||
if (locked > limit && !capable(CAP_IPC_LOCK))
|
||||
return -ENOMEM;
|
||||
|
@ -2244,7 +2243,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
|
|||
gap_addr = TASK_SIZE;
|
||||
|
||||
next = vma->vm_next;
|
||||
if (next && next->vm_start < gap_addr) {
|
||||
if (next && next->vm_start < gap_addr &&
|
||||
(next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
|
||||
if (!(next->vm_flags & VM_GROWSUP))
|
||||
return -ENOMEM;
|
||||
/* Check that both stack segments have the same anon_vma? */
|
||||
|
@ -2315,7 +2315,6 @@ int expand_downwards(struct vm_area_struct *vma,
|
|||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct vm_area_struct *prev;
|
||||
unsigned long gap_addr;
|
||||
int error;
|
||||
|
||||
address &= PAGE_MASK;
|
||||
|
@ -2324,14 +2323,12 @@ int expand_downwards(struct vm_area_struct *vma,
|
|||
return error;
|
||||
|
||||
/* Enforce stack_guard_gap */
|
||||
gap_addr = address - stack_guard_gap;
|
||||
if (gap_addr > address)
|
||||
return -ENOMEM;
|
||||
prev = vma->vm_prev;
|
||||
if (prev && prev->vm_end > gap_addr) {
|
||||
if (!(prev->vm_flags & VM_GROWSDOWN))
|
||||
/* Check that both stack segments have the same anon_vma? */
|
||||
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
|
||||
(prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) {
|
||||
if (address - prev->vm_end < stack_guard_gap)
|
||||
return -ENOMEM;
|
||||
/* Check that both stack segments have the same anon_vma? */
|
||||
}
|
||||
|
||||
/* We must make sure the anon_vma is allocated. */
|
||||
|
|
|
@ -490,6 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
|||
|
||||
if (!down_read_trylock(&mm->mmap_sem)) {
|
||||
ret = false;
|
||||
trace_skip_task_reaping(tsk->pid);
|
||||
goto unlock_oom;
|
||||
}
|
||||
|
||||
|
@ -500,9 +501,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
|||
*/
|
||||
if (!mmget_not_zero(mm)) {
|
||||
up_read(&mm->mmap_sem);
|
||||
trace_skip_task_reaping(tsk->pid);
|
||||
goto unlock_oom;
|
||||
}
|
||||
|
||||
trace_start_task_reaping(tsk->pid);
|
||||
|
||||
/*
|
||||
* Tell all users of get_user/copy_from_user etc... that the content
|
||||
* is no longer stable. No barriers really needed because unmapping
|
||||
|
@ -544,6 +548,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
|||
* put the oom_reaper out of the way.
|
||||
*/
|
||||
mmput_async(mm);
|
||||
trace_finish_task_reaping(tsk->pid);
|
||||
unlock_oom:
|
||||
mutex_unlock(&oom_lock);
|
||||
return ret;
|
||||
|
@ -615,6 +620,7 @@ static void wake_oom_reaper(struct task_struct *tsk)
|
|||
tsk->oom_reaper_list = oom_reaper_list;
|
||||
oom_reaper_list = tsk;
|
||||
spin_unlock(&oom_reaper_lock);
|
||||
trace_wake_reaper(tsk->pid);
|
||||
wake_up(&oom_reaper_wait);
|
||||
}
|
||||
|
||||
|
@ -666,6 +672,7 @@ static void mark_oom_victim(struct task_struct *tsk)
|
|||
*/
|
||||
__thaw_task(tsk);
|
||||
atomic_inc(&oom_victims);
|
||||
trace_mark_victim(tsk->pid);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -2206,19 +2206,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
|
|||
* list of requested migratetype, possibly along with other pages from the same
|
||||
* block, depending on fragmentation avoidance heuristics. Returns true if
|
||||
* fallback was found so that __rmqueue_smallest() can grab it.
|
||||
*
|
||||
* The use of signed ints for order and current_order is a deliberate
|
||||
* deviation from the rest of this file, to make the for loop
|
||||
* condition simpler.
|
||||
*/
|
||||
static inline bool
|
||||
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
|
||||
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
|
||||
{
|
||||
struct free_area *area;
|
||||
unsigned int current_order;
|
||||
int current_order;
|
||||
struct page *page;
|
||||
int fallback_mt;
|
||||
bool can_steal;
|
||||
|
||||
/* Find the largest possible block of pages in the other list */
|
||||
for (current_order = MAX_ORDER-1;
|
||||
current_order >= order && current_order <= MAX_ORDER-1;
|
||||
/*
|
||||
* Find the largest available free page in the other list. This roughly
|
||||
* approximates finding the pageblock with the most free pages, which
|
||||
* would be too costly to do exactly.
|
||||
*/
|
||||
for (current_order = MAX_ORDER - 1; current_order >= order;
|
||||
--current_order) {
|
||||
area = &(zone->free_area[current_order]);
|
||||
fallback_mt = find_suitable_fallback(area, current_order,
|
||||
|
@ -2226,19 +2233,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
|
|||
if (fallback_mt == -1)
|
||||
continue;
|
||||
|
||||
page = list_first_entry(&area->free_list[fallback_mt],
|
||||
struct page, lru);
|
||||
/*
|
||||
* We cannot steal all free pages from the pageblock and the
|
||||
* requested migratetype is movable. In that case it's better to
|
||||
* steal and split the smallest available page instead of the
|
||||
* largest available page, because even if the next movable
|
||||
* allocation falls back into a different pageblock than this
|
||||
* one, it won't cause permanent fragmentation.
|
||||
*/
|
||||
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
|
||||
&& current_order > order)
|
||||
goto find_smallest;
|
||||
|
||||
steal_suitable_fallback(zone, page, start_migratetype,
|
||||
can_steal);
|
||||
|
||||
trace_mm_page_alloc_extfrag(page, order, current_order,
|
||||
start_migratetype, fallback_mt);
|
||||
|
||||
return true;
|
||||
goto do_steal;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
find_smallest:
|
||||
for (current_order = order; current_order < MAX_ORDER;
|
||||
current_order++) {
|
||||
area = &(zone->free_area[current_order]);
|
||||
fallback_mt = find_suitable_fallback(area, current_order,
|
||||
start_migratetype, false, &can_steal);
|
||||
if (fallback_mt != -1)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* This should not happen - we already found a suitable fallback
|
||||
* when looking for the largest page.
|
||||
*/
|
||||
VM_BUG_ON(current_order == MAX_ORDER);
|
||||
|
||||
do_steal:
|
||||
page = list_first_entry(&area->free_list[fallback_mt],
|
||||
struct page, lru);
|
||||
|
||||
steal_suitable_fallback(zone, page, start_migratetype, can_steal);
|
||||
|
||||
trace_mm_page_alloc_extfrag(page, order, current_order,
|
||||
start_migratetype, fallback_mt);
|
||||
|
||||
return true;
|
||||
|
||||
}
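The comments above describe a two-phase search: take the largest suitable fallback block, except for movable requests that cannot steal a whole pageblock, which restart from the smallest sufficient order to limit long-term fragmentation. Below is a stand-alone model of just that control flow; free_count[] and the booleans are stand-ins for the real free_area state and find_suitable_fallback() result.

/* Self-contained model of the two-phase fallback search. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

static int fallback_order(const int free_count[MAX_ORDER], int order,
			  bool movable, bool can_steal_whole_block)
{
	int current_order;

	/* Phase 1: largest available block, roughly "most free pages". */
	for (current_order = MAX_ORDER - 1; current_order >= order;
	     current_order--) {
		if (!free_count[current_order])
			continue;
		if (!can_steal_whole_block && movable && current_order > order)
			goto find_smallest;
		return current_order;
	}
	return -1;

find_smallest:
	/* Phase 2: smallest block that still satisfies the request. */
	for (current_order = order; current_order < MAX_ORDER; current_order++)
		if (free_count[current_order])
			return current_order;
	return -1;
}

int main(void)
{
	int free_count[MAX_ORDER] = { 0, 4, 0, 2, 0, 0, 0, 0, 1, 0, 0 };

	/* Unmovable request: takes the largest block (order 8). */
	printf("%d\n", fallback_order(free_count, 1, false, false));
	/* Movable request that can't steal a whole pageblock: order 1 instead. */
	printf("%d\n", fallback_order(free_count, 1, true, false));
	return 0;
}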
|
||||
|
||||
/*
|
||||
|
@ -5240,7 +5278,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
|
|||
#endif
|
||||
/* we have to stop all cpus to guarantee there is no user
|
||||
of zonelist */
|
||||
stop_machine(__build_all_zonelists, pgdat, NULL);
|
||||
stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
|
||||
/* cpuset refresh routine should be here */
|
||||
}
|
||||
vm_total_pages = nr_free_pagecache_pages();
|
||||
|
|
mm/page_io.c | 23
|
@ -117,6 +117,7 @@ static void swap_slot_free_notify(struct page *page)
|
|||
static void end_swap_bio_read(struct bio *bio)
|
||||
{
|
||||
struct page *page = bio->bi_io_vec[0].bv_page;
|
||||
struct task_struct *waiter = bio->bi_private;
|
||||
|
||||
if (bio->bi_status) {
|
||||
SetPageError(page);
|
||||
|
@ -132,7 +133,9 @@ static void end_swap_bio_read(struct bio *bio)
|
|||
swap_slot_free_notify(page);
|
||||
out:
|
||||
unlock_page(page);
|
||||
WRITE_ONCE(bio->bi_private, NULL);
|
||||
bio_put(bio);
|
||||
wake_up_process(waiter);
|
||||
}
|
||||
|
||||
int generic_swapfile_activate(struct swap_info_struct *sis,
|
||||
|
@ -329,11 +332,13 @@ out:
|
|||
return ret;
|
||||
}
|
||||
|
||||
int swap_readpage(struct page *page)
|
||||
int swap_readpage(struct page *page, bool do_poll)
|
||||
{
|
||||
struct bio *bio;
|
||||
int ret = 0;
|
||||
struct swap_info_struct *sis = page_swap_info(page);
|
||||
blk_qc_t qc;
|
||||
struct block_device *bdev;
|
||||
|
||||
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
|
@ -372,9 +377,23 @@ int swap_readpage(struct page *page)
|
|||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
bdev = bio->bi_bdev;
|
||||
bio->bi_private = current;
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
count_vm_event(PSWPIN);
|
||||
submit_bio(bio);
|
||||
bio_get(bio);
|
||||
qc = submit_bio(bio);
|
||||
while (do_poll) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (!READ_ONCE(bio->bi_private))
|
||||
break;
|
||||
|
||||
if (!blk_mq_poll(bdev_get_queue(bdev), qc))
|
||||
break;
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
bio_put(bio);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
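For the single-page, non-readahead case swap_readpage() now busy-polls the block queue instead of sleeping, using bio->bi_private being cleared by end_swap_bio_read() as the completion signal. The user-space sketch below reproduces only that synchronization pattern with C11 atomics and two threads; it is an analogy for the submitter/completion handshake, not block-layer code. Build with cc -pthread.

/* User-space analogy of the polled completion pattern. */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) bi_private;	/* NULL means "I/O completed" */

static void *completion_side(void *arg)
{
	(void)arg;
	/* ... the device finishes the read here ... */
	atomic_store(&bi_private, NULL);	/* like end_swap_bio_read() */
	return NULL;
}

int main(void)
{
	pthread_t t;
	int waiter = 42;

	atomic_store(&bi_private, &waiter);	/* like bio->bi_private = current */
	pthread_create(&t, NULL, completion_side, NULL);

	while (atomic_load(&bi_private))	/* like the blk_mq_poll() loop */
		sched_yield();			/* stand-in for polling the queue */

	printf("read completed, waiter resumes\n");
	pthread_join(t, NULL);
	return 0;
}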
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
#include <linux/memory.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/page_owner.h>
|
||||
#include <linux/migrate.h>
|
||||
#include "internal.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
|
@ -294,20 +295,5 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
|
|||
struct page *alloc_migrate_target(struct page *page, unsigned long private,
|
||||
int **resultp)
|
||||
{
|
||||
gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
|
||||
|
||||
/*
|
||||
* TODO: allocate a destination hugepage from a nearest neighbor node,
|
||||
* accordance with memory policy of the user process if possible. For
|
||||
* now as a simple work-around, we use the next node for destination.
|
||||
*/
|
||||
if (PageHuge(page))
|
||||
return alloc_huge_page_node(page_hstate(compound_head(page)),
|
||||
next_node_in(page_to_nid(page),
|
||||
node_online_map));
|
||||
|
||||
if (PageHighMem(page))
|
||||
gfp_mask |= __GFP_HIGHMEM;
|
||||
|
||||
return alloc_page(gfp_mask);
|
||||
return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
|
||||
}
|
||||
|
|
|
@ -281,7 +281,11 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
|
|||
continue;
|
||||
|
||||
if (PageBuddy(page)) {
|
||||
pfn += (1UL << page_order(page)) - 1;
|
||||
unsigned long freepage_order;
|
||||
|
||||
freepage_order = page_order_unsafe(page);
|
||||
if (freepage_order < MAX_ORDER)
|
||||
pfn += (1UL << freepage_order) - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf)
|
|||
}
|
||||
|
||||
sgp = SGP_CACHE;
|
||||
if (vma->vm_flags & VM_HUGEPAGE)
|
||||
sgp = SGP_HUGE;
|
||||
else if (vma->vm_flags & VM_NOHUGEPAGE)
|
||||
|
||||
if ((vma->vm_flags & VM_NOHUGEPAGE) ||
|
||||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
|
||||
sgp = SGP_NOHUGE;
|
||||
else if (vma->vm_flags & VM_HUGEPAGE)
|
||||
sgp = SGP_HUGE;
|
||||
|
||||
error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
|
||||
gfp, vma, vmf, &ret);
|
||||
|
|
mm/swap.c | 11
|
@ -688,7 +688,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
|
|||
|
||||
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
|
||||
|
||||
void lru_add_drain_all(void)
|
||||
void lru_add_drain_all_cpuslocked(void)
|
||||
{
|
||||
static DEFINE_MUTEX(lock);
|
||||
static struct cpumask has_work;
|
||||
|
@ -702,7 +702,6 @@ void lru_add_drain_all(void)
|
|||
return;
|
||||
|
||||
mutex_lock(&lock);
|
||||
get_online_cpus();
|
||||
cpumask_clear(&has_work);
|
||||
|
||||
for_each_online_cpu(cpu) {
|
||||
|
@ -722,10 +721,16 @@ void lru_add_drain_all(void)
|
|||
for_each_cpu(cpu, &has_work)
|
||||
flush_work(&per_cpu(lru_add_drain_work, cpu));
|
||||
|
||||
put_online_cpus();
|
||||
mutex_unlock(&lock);
|
||||
}
|
||||
|
||||
void lru_add_drain_all(void)
|
||||
{
|
||||
get_online_cpus();
|
||||
lru_add_drain_all_cpuslocked();
|
||||
put_online_cpus();
|
||||
}
|
||||
|
||||
/**
|
||||
* release_pages - batched put_page()
|
||||
* @pages: array of pages to release
|
||||
|
|
|
@ -273,11 +273,11 @@ int free_swap_slot(swp_entry_t entry)
|
|||
{
|
||||
struct swap_slots_cache *cache;
|
||||
|
||||
cache = &get_cpu_var(swp_slots);
|
||||
cache = raw_cpu_ptr(&swp_slots);
|
||||
if (use_swap_slot_cache && cache->slots_ret) {
|
||||
spin_lock_irq(&cache->free_lock);
|
||||
/* Swap slots cache may be deactivated before acquiring lock */
|
||||
if (!use_swap_slot_cache) {
|
||||
if (!use_swap_slot_cache || !cache->slots_ret) {
|
||||
spin_unlock_irq(&cache->free_lock);
|
||||
goto direct_free;
|
||||
}
|
||||
|
@ -297,7 +297,6 @@ int free_swap_slot(swp_entry_t entry)
|
|||
direct_free:
|
||||
swapcache_free_entries(&entry, 1);
|
||||
}
|
||||
put_cpu_var(swp_slots);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -412,14 +412,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
|||
* the swap entry is no longer in use.
|
||||
*/
|
||||
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
struct vm_area_struct *vma, unsigned long addr, bool do_poll)
|
||||
{
|
||||
bool page_was_allocated;
|
||||
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
|
||||
vma, addr, &page_was_allocated);
|
||||
|
||||
if (page_was_allocated)
|
||||
swap_readpage(retpage);
|
||||
swap_readpage(retpage, do_poll);
|
||||
|
||||
return retpage;
|
||||
}
|
||||
|
@ -496,11 +496,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
|
|||
unsigned long start_offset, end_offset;
|
||||
unsigned long mask;
|
||||
struct blk_plug plug;
|
||||
bool do_poll = true;
|
||||
|
||||
mask = swapin_nr_pages(offset) - 1;
|
||||
if (!mask)
|
||||
goto skip;
|
||||
|
||||
do_poll = false;
|
||||
/* Read a page_cluster sized and aligned cluster around offset. */
|
||||
start_offset = offset & ~mask;
|
||||
end_offset = offset | mask;
|
||||
|
@ -511,7 +513,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
|
|||
for (offset = start_offset; offset <= end_offset ; offset++) {
|
||||
/* Ok, do the async read-ahead now */
|
||||
page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
|
||||
gfp_mask, vma, addr);
|
||||
gfp_mask, vma, addr, false);
|
||||
if (!page)
|
||||
continue;
|
||||
if (offset != entry_offset && likely(!PageTransCompound(page)))
|
||||
|
@ -522,7 +524,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
|
|||
|
||||
lru_add_drain(); /* Push any new pages onto the LRU now */
|
||||
skip:
|
||||
return read_swap_cache_async(entry, gfp_mask, vma, addr);
|
||||
return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
|
||||
}
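swapin_readahead() reads a cluster-aligned window around the faulting offset and only polls synchronously when that window collapses to a single page. The window arithmetic itself is simple mask math, shown here as a tiny demo; the swapin_nr_pages() value is hypothetical.

/* Demo of the readahead window arithmetic, not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long offset = 1234;	/* faulting swap offset */
	unsigned long nr_pages = 8;	/* hypothetical swapin_nr_pages() result */
	unsigned long mask = nr_pages - 1;

	unsigned long start_offset = offset & ~mask;	/* round down */
	unsigned long end_offset = offset | mask;	/* round up, inclusive */

	printf("read offsets %lu..%lu around %lu\n",
	       start_offset, end_offset, offset);	/* 1232..1239 around 1234 */
	return 0;
}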
|
||||
|
||||
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
|
||||
|
|
|
@ -1868,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
|
|||
swap_map = &si->swap_map[i];
|
||||
entry = swp_entry(type, i);
|
||||
page = read_swap_cache_async(entry,
|
||||
GFP_HIGHUSER_MOVABLE, NULL, 0);
|
||||
GFP_HIGHUSER_MOVABLE, NULL, 0, false);
|
||||
if (!page) {
|
||||
/*
|
||||
* Either swap_duplicate() failed because entry
|
||||
|
|
|
@ -530,9 +530,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
|
|||
} else if (PageTransHuge(page)) {
|
||||
index += HPAGE_PMD_NR - 1;
|
||||
i += HPAGE_PMD_NR - 1;
|
||||
/* 'end' is in the middle of THP */
|
||||
if (index == round_down(end, HPAGE_PMD_NR))
|
||||
/*
|
||||
* 'end' is in the middle of THP. Don't
|
||||
* invalidate the page as the part outside of
|
||||
* 'end' could be still useful.
|
||||
*/
|
||||
if (index > end) {
|
||||
unlock_page(page);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
ret = invalidate_inode_page(page);
|
||||
|
|
mm/vmalloc.c | 10
|
@ -325,6 +325,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
|
|||
|
||||
/*** Global kva allocator ***/
|
||||
|
||||
#define VM_LAZY_FREE 0x02
|
||||
#define VM_VM_AREA 0x04
|
||||
|
||||
static DEFINE_SPINLOCK(vmap_area_lock);
|
||||
|
@ -1497,6 +1498,7 @@ struct vm_struct *remove_vm_area(const void *addr)
|
|||
spin_lock(&vmap_area_lock);
|
||||
va->vm = NULL;
|
||||
va->flags &= ~VM_VM_AREA;
|
||||
va->flags |= VM_LAZY_FREE;
|
||||
spin_unlock(&vmap_area_lock);
|
||||
|
||||
vmap_debug_free_range(va->va_start, va->va_end);
|
||||
|
@ -2704,8 +2706,14 @@ static int s_show(struct seq_file *m, void *p)
|
|||
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
|
||||
* behalf of vmap area is being tear down or vm_map_ram allocation.
|
||||
*/
|
||||
if (!(va->flags & VM_VM_AREA))
|
||||
if (!(va->flags & VM_VM_AREA)) {
|
||||
seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
|
||||
(void *)va->va_start, (void *)va->va_end,
|
||||
va->va_end - va->va_start,
|
||||
va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
v = va->vm;
|
||||
|
||||
|
|
mm/vmpressure.c | 122
|
@ -93,12 +93,25 @@ enum vmpressure_levels {
|
|||
VMPRESSURE_NUM_LEVELS,
|
||||
};
|
||||
|
||||
enum vmpressure_modes {
|
||||
VMPRESSURE_NO_PASSTHROUGH = 0,
|
||||
VMPRESSURE_HIERARCHY,
|
||||
VMPRESSURE_LOCAL,
|
||||
VMPRESSURE_NUM_MODES,
|
||||
};
|
||||
|
||||
static const char * const vmpressure_str_levels[] = {
|
||||
[VMPRESSURE_LOW] = "low",
|
||||
[VMPRESSURE_MEDIUM] = "medium",
|
||||
[VMPRESSURE_CRITICAL] = "critical",
|
||||
};
|
||||
|
||||
static const char * const vmpressure_str_modes[] = {
|
||||
[VMPRESSURE_NO_PASSTHROUGH] = "default",
|
||||
[VMPRESSURE_HIERARCHY] = "hierarchy",
|
||||
[VMPRESSURE_LOCAL] = "local",
|
||||
};
|
||||
|
||||
static enum vmpressure_levels vmpressure_level(unsigned long pressure)
|
||||
{
|
||||
if (pressure >= vmpressure_level_critical)
|
||||
|
@ -141,27 +154,31 @@ out:
|
|||
struct vmpressure_event {
|
||||
struct eventfd_ctx *efd;
|
||||
enum vmpressure_levels level;
|
||||
enum vmpressure_modes mode;
|
||||
struct list_head node;
|
||||
};
|
||||
|
||||
static bool vmpressure_event(struct vmpressure *vmpr,
|
||||
enum vmpressure_levels level)
|
||||
const enum vmpressure_levels level,
|
||||
bool ancestor, bool signalled)
|
||||
{
|
||||
struct vmpressure_event *ev;
|
||||
bool signalled = false;
|
||||
bool ret = false;
|
||||
|
||||
mutex_lock(&vmpr->events_lock);
|
||||
|
||||
list_for_each_entry(ev, &vmpr->events, node) {
|
||||
if (level >= ev->level) {
|
||||
eventfd_signal(ev->efd, 1);
|
||||
signalled = true;
|
||||
}
|
||||
if (ancestor && ev->mode == VMPRESSURE_LOCAL)
|
||||
continue;
|
||||
if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
|
||||
continue;
|
||||
if (level < ev->level)
|
||||
continue;
|
||||
eventfd_signal(ev->efd, 1);
|
||||
ret = true;
|
||||
}
|
||||
|
||||
mutex_unlock(&vmpr->events_lock);
|
||||
|
||||
return signalled;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void vmpressure_work_fn(struct work_struct *work)
|
||||
|
@ -170,6 +187,8 @@ static void vmpressure_work_fn(struct work_struct *work)
|
|||
unsigned long scanned;
|
||||
unsigned long reclaimed;
|
||||
enum vmpressure_levels level;
|
||||
bool ancestor = false;
|
||||
bool signalled = false;
|
||||
|
||||
spin_lock(&vmpr->sr_lock);
|
||||
/*
|
||||
|
@ -194,12 +213,9 @@ static void vmpressure_work_fn(struct work_struct *work)
|
|||
level = vmpressure_calc_level(scanned, reclaimed);
|
||||
|
||||
do {
|
||||
if (vmpressure_event(vmpr, level))
|
||||
break;
|
||||
/*
|
||||
* If not handled, propagate the event upward into the
|
||||
* hierarchy.
|
||||
*/
|
||||
if (vmpressure_event(vmpr, level, ancestor, signalled))
|
||||
signalled = true;
|
||||
ancestor = true;
|
||||
} while ((vmpr = vmpressure_parent(vmpr)));
|
||||
}
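The per-listener decision in the reworked vmpressure_event() boils down to: "local" listeners ignore events coming from descendants, "default" listeners ignore events a lower level already handled, and "hierarchy" listeners always fire once the level threshold is met. A compact stand-alone model of that decision follows; the shortened enum names are my own labels for the user-visible mode strings.

/* Stand-alone model of the per-listener signalling decision. */
#include <stdbool.h>
#include <stdio.h>

enum mode { NO_PASSTHROUGH, HIERARCHY, LOCAL };

static bool should_signal(enum mode mode, int event_level, int ev_level,
			  bool ancestor, bool signalled)
{
	if (ancestor && mode == LOCAL)
		return false;
	if (signalled && mode == NO_PASSTHROUGH)
		return false;
	return event_level >= ev_level;
}

int main(void)
{
	/* An event at level 2 reaches an ancestor after a child already signalled. */
	printf("default:   %d\n", should_signal(NO_PASSTHROUGH, 2, 1, true, true));
	printf("hierarchy: %d\n", should_signal(HIERARCHY, 2, 1, true, true));
	printf("local:     %d\n", should_signal(LOCAL, 2, 1, true, true));
	return 0;
}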
|
||||
|
||||
|
@ -326,17 +342,40 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
|
|||
vmpressure(gfp, memcg, true, vmpressure_win, 0);
|
||||
}
|
||||
|
||||
static enum vmpressure_levels str_to_level(const char *arg)
|
||||
{
|
||||
enum vmpressure_levels level;
|
||||
|
||||
for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++)
|
||||
if (!strcmp(vmpressure_str_levels[level], arg))
|
||||
return level;
|
||||
return -1;
|
||||
}
|
||||
|
||||
static enum vmpressure_modes str_to_mode(const char *arg)
|
||||
{
|
||||
enum vmpressure_modes mode;
|
||||
|
||||
for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++)
|
||||
if (!strcmp(vmpressure_str_modes[mode], arg))
|
||||
return mode;
|
||||
return -1;
|
||||
}
|
||||
|
||||
#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
|
||||
|
||||
/**
|
||||
* vmpressure_register_event() - Bind vmpressure notifications to an eventfd
|
||||
* @memcg: memcg that is interested in vmpressure notifications
|
||||
* @eventfd: eventfd context to link notifications with
|
||||
* @args: event arguments (used to set up a pressure level threshold)
|
||||
* @args: event arguments (pressure level threshold, optional mode)
|
||||
*
|
||||
* This function associates eventfd context with the vmpressure
|
||||
* infrastructure, so that the notifications will be delivered to the
|
||||
* @eventfd. The @args parameter is a string that denotes pressure level
|
||||
* threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
|
||||
* "critical").
|
||||
* @eventfd. The @args parameter is a comma-delimited string that denotes a
|
||||
* pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
|
||||
* or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
|
||||
* "hierarchy" or "local").
|
||||
*
|
||||
* To be used as memcg event method.
|
||||
*/
|
||||
|
@ -345,28 +384,53 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
|
|||
{
|
||||
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
|
||||
struct vmpressure_event *ev;
|
||||
int level;
|
||||
enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
|
||||
enum vmpressure_levels level = -1;
|
||||
char *spec, *spec_orig;
|
||||
char *token;
|
||||
int ret = 0;
|
||||
|
||||
for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) {
|
||||
if (!strcmp(vmpressure_str_levels[level], args))
|
||||
break;
|
||||
spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL);
|
||||
if (!spec) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);
|
||||
|
||||
/* Find required level */
|
||||
token = strsep(&spec, ",");
|
||||
level = str_to_level(token);
|
||||
if (level == -1) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (level >= VMPRESSURE_NUM_LEVELS)
|
||||
return -EINVAL;
|
||||
/* Find optional mode */
|
||||
token = strsep(&spec, ",");
|
||||
if (token) {
|
||||
mode = str_to_mode(token);
|
||||
if (mode == -1) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
|
||||
if (!ev)
|
||||
return -ENOMEM;
|
||||
if (!ev) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ev->efd = eventfd;
|
||||
ev->level = level;
|
||||
ev->mode = mode;
|
||||
|
||||
mutex_lock(&vmpr->events_lock);
|
||||
list_add(&ev->node, &vmpr->events);
|
||||
mutex_unlock(&vmpr->events_lock);
|
||||
|
||||
return 0;
|
||||
out:
|
||||
kfree(spec_orig);
|
||||
return ret;
|
||||
}
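vmpressure_register_event() now accepts "<level>[,<mode>]" and tokenizes it with strsep(). The user-space sketch below mirrors that parsing; strsep() is a glibc/BSD extension, and the buffer size and helper names here are my own, not the kernel's.

/* User-space sketch of the "threshold[,mode]" parsing. */
#include <stdio.h>
#include <string.h>

static const char * const levels[] = { "low", "medium", "critical" };
static const char * const modes[]  = { "default", "hierarchy", "local" };

static int lookup(const char * const tbl[], int n, const char *arg)
{
	for (int i = 0; i < n; i++)
		if (arg && !strcmp(tbl[i], arg))
			return i;
	return -1;
}

static int parse_args(const char *args, int *level, int *mode)
{
	char buf[32];
	char *spec = buf, *token;

	snprintf(buf, sizeof(buf), "%s", args);

	token = strsep(&spec, ",");		/* required level */
	*level = lookup(levels, 3, token);
	if (*level < 0)
		return -1;

	token = strsep(&spec, ",");		/* optional mode, defaults to "default" */
	*mode = token ? lookup(modes, 3, token) : 0;
	return *mode < 0 ? -1 : 0;
}

int main(void)
{
	int level, mode;

	if (!parse_args("critical,hierarchy", &level, &mode))
		printf("level=%s mode=%s\n", levels[level], modes[mode]);
	return 0;
}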
|
||||
|
||||
/**
|
||||
|
|
mm/vmscan.c | 13
|
@ -2228,8 +2228,17 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
|
|||
}
|
||||
|
||||
if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
|
||||
scan_balance = SCAN_ANON;
|
||||
goto out;
|
||||
/*
|
||||
* Force SCAN_ANON if there are enough inactive
|
||||
* anonymous pages on the LRU in eligible zones.
|
||||
* Otherwise, the small LRU gets thrashed.
|
||||
*/
|
||||
if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
|
||||
lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
|
||||
>> sc->priority) {
|
||||
scan_balance = SCAN_ANON;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}