diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5abcbdc743fa..086dcbadce09 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1256,9 +1256,9 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
 	btrfs_assert_tree_locked(buf);
 
 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
-				     -buf->len,
-				     fs_info->dirty_metadata_batch);
+		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+					 -buf->len,
+					 fs_info->dirty_metadata_batch);
 		/* ugh, clear_extent_buffer_dirty needs to lock the page */
 		btrfs_set_lock_blocking(buf);
 		clear_extent_buffer_dirty(buf);
@@ -4047,9 +4047,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			buf->start, transid, fs_info->generation);
 	was_dirty = set_extent_buffer_dirty(buf);
 	if (!was_dirty)
-		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
-				     buf->len,
-				     fs_info->dirty_metadata_batch);
+		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+					 buf->len,
+					 fs_info->dirty_metadata_batch);
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
 		btrfs_print_leaf(fs_info, buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7a18b5762ac9..556484cf5d93 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3577,9 +3577,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
 		spin_unlock(&eb->refs_lock);
 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
-				     -eb->len,
-				     fs_info->dirty_metadata_batch);
+		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+					 -eb->len,
+					 fs_info->dirty_metadata_batch);
 		ret = 1;
 	} else {
 		spin_unlock(&eb->refs_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8d050314591c..06dea7c89bbd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1763,8 +1763,8 @@ static void btrfs_set_bit_hook(void *private_data,
 		if (btrfs_is_testing(fs_info))
 			return;
 
-		__percpu_counter_add(&fs_info->delalloc_bytes, len,
-				     fs_info->delalloc_batch);
+		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
+					 fs_info->delalloc_batch);
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
 		if (*bits & EXTENT_DEFRAG)
@@ -1838,8 +1838,8 @@ static void btrfs_clear_bit_hook(void *private_data,
 					&inode->vfs_inode,
 					state->start, len);
 
-		__percpu_counter_add(&fs_info->delalloc_bytes, -len,
-				     fs_info->delalloc_batch);
+		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
+					 fs_info->delalloc_batch);
 		spin_lock(&inode->lock);
 		inode->delalloc_bytes -= len;
 		if (do_list && inode->delalloc_bytes == 0 &&
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d249546da15e..43d07f9c4e9e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1211,7 +1211,7 @@ xfs_mod_icount(
 	struct xfs_mount	*mp,
 	int64_t			delta)
 {
-	__percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
+	percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
 	if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
 		ASSERT(0);
 		percpu_counter_add(&mp->m_icount, -delta);
@@ -1290,7 +1290,7 @@ xfs_mod_fdblocks(
 	else
 		batch = XFS_FDBLOCKS_BATCH;
 
-	__percpu_counter_add(&mp->m_fdblocks, delta, batch);
+	percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
 	if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
 				     XFS_FDBLOCKS_BATCH) >= 0) {
 		/* we had space! */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 557d84063934..ace73f96eb1e 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,7 +66,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
 static inline void __add_wb_stat(struct bdi_writeback *wb,
 				 enum wb_stat_item item, s64 amount)
 {
-	__percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
+	percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
 static inline void __inc_wb_stat(struct bdi_writeback *wb,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 01b62e7bac74..7104bea8dab1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -518,7 +518,7 @@ static inline void blkg_stat_exit(struct blkg_stat *stat)
  */
 static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
 {
-	__percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+	percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
@@ -597,14 +597,14 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
 
-	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
 
 	if (op_is_sync(op))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 
-	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 634c4c51fe3a..c8367041fafd 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -22,7 +22,7 @@ unsigned long vm_memory_committed(void);
 
 static inline void vm_acct_memory(long pages)
 {
-	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
+	percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
 }
 
 static inline void vm_unacct_memory(long pages)
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 84a109449610..ec065387f443 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -39,7 +39,8 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
+void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
+			      s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
 
@@ -50,7 +51,7 @@ static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
-	__percpu_counter_add(fbc, amount, percpu_counter_batch);
+	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
 }
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
@@ -136,7 +137,7 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 }
 
 static inline void
-__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
 {
 	percpu_counter_add(fbc, amount);
 }
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 440c1e9d0623..6fdcd2427776 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -154,12 +154,12 @@ static inline int frag_mem_limit(struct netns_frags *nf)
 
 static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
+	percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch);
 }
 
 static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
+	percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch);
 }
 
 static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
new file mode 100644
index 000000000000..ad34b1bae047
--- /dev/null
+++ b/include/trace/events/percpu.h
@@ -0,0 +1,125 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM percpu
+
+#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PERCPU_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(percpu_alloc_percpu,
+
+	TP_PROTO(bool reserved, bool is_atomic, size_t size,
+		 size_t align, void *base_addr, int off, void __percpu *ptr),
+
+	TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),
+
+	TP_STRUCT__entry(
+		__field( bool, reserved )
+		__field( bool, is_atomic )
+		__field( size_t, size )
+		__field( size_t, align )
+		__field( void *, base_addr )
+		__field( int, off )
+		__field( void __percpu *, ptr )
+	),
+
+	TP_fast_assign(
+		__entry->reserved = reserved;
+		__entry->is_atomic = is_atomic;
+		__entry->size = size;
+		__entry->align = align;
+		__entry->base_addr = base_addr;
+		__entry->off = off;
+		__entry->ptr = ptr;
+	),
+
+	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
+		  __entry->reserved, __entry->is_atomic,
+		  __entry->size, __entry->align,
+		  __entry->base_addr, __entry->off, __entry->ptr)
+);
+
+TRACE_EVENT(percpu_free_percpu,
+
+	TP_PROTO(void *base_addr, int off, void __percpu *ptr),
+
+	TP_ARGS(base_addr, off, ptr),
+
+	TP_STRUCT__entry(
+		__field( void *, base_addr )
+		__field( int, off )
+		__field( void __percpu *, ptr )
+	),
+
+	TP_fast_assign(
+		__entry->base_addr = base_addr;
+		__entry->off = off;
+		__entry->ptr = ptr;
+	),
+
+	TP_printk("base_addr=%p off=%d ptr=%p",
+		  __entry->base_addr, __entry->off, __entry->ptr)
+);
+
+TRACE_EVENT(percpu_alloc_percpu_fail,
+
+	TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),
+
+	TP_ARGS(reserved, is_atomic, size, align),
+
+	TP_STRUCT__entry(
+		__field( bool, reserved )
+		__field( bool, is_atomic )
+		__field( size_t, size )
+		__field( size_t, align )
+	),
+
+	TP_fast_assign(
+		__entry->reserved = reserved;
+		__entry->is_atomic = is_atomic;
+		__entry->size = size;
+		__entry->align = align;
+	),
+
+	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
+		  __entry->reserved, __entry->is_atomic,
+		  __entry->size, __entry->align)
+);
+
+TRACE_EVENT(percpu_create_chunk,
+
+	TP_PROTO(void *base_addr),
+
+	TP_ARGS(base_addr),
+
+	TP_STRUCT__entry(
+		__field( void *, base_addr )
+	),
+
+	TP_fast_assign(
+		__entry->base_addr = base_addr;
+	),
+
+	TP_printk("base_addr=%p", __entry->base_addr)
+);
+
+TRACE_EVENT(percpu_destroy_chunk,
+
+	TP_PROTO(void *base_addr),
+
+	TP_ARGS(base_addr),
+
+	TP_STRUCT__entry(
+		__field( void *, base_addr )
+	),
+
+	TP_fast_assign(
+		__entry->base_addr = base_addr;
+	),
+
+	TP_printk("base_addr=%p", __entry->base_addr)
+);
+
+#endif /* _TRACE_PERCPU_H */
+
+#include <trace/define_trace.h>
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
index a71cf1bdd4c9..2cc1f94e03a1 100644
--- a/lib/flex_proportions.c
+++ b/lib/flex_proportions.c
@@ -207,7 +207,7 @@ static void fprop_reflect_period_percpu(struct fprop_global *p,
 		if (val < (nr_cpu_ids * PROP_BATCH))
 			val = percpu_counter_sum(&pl->events);
 
-		__percpu_counter_add(&pl->events,
+		percpu_counter_add_batch(&pl->events,
 			-val + (val >> (period-pl->period)), PROP_BATCH);
 	} else
 		percpu_counter_set(&pl->events, 0);
@@ -219,7 +219,7 @@ static void fprop_reflect_period_percpu(struct fprop_global *p,
 void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
 {
 	fprop_reflect_period_percpu(p, pl);
-	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
+	percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
 	percpu_counter_add(&p->events, 1);
 }
@@ -267,6 +267,6 @@ void __fprop_inc_percpu_max(struct fprop_global *p,
 		return;
 	} else
 		fprop_reflect_period_percpu(p, pl);
-	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
+	percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
 	percpu_counter_add(&p->events, 1);
 }
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 9c21000df0b5..8ee7e5ec21be 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -72,7 +72,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 }
 EXPORT_SYMBOL(percpu_counter_set);
 
-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
 {
 	s64 count;
 
@@ -89,7 +89,7 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
 	}
 	preempt_enable();
 }
-EXPORT_SYMBOL(__percpu_counter_add);
+EXPORT_SYMBOL(percpu_counter_add_batch);
 
 /*
  * Add up all the per-cpu counts, return the result. This is a more accurate
diff --git a/mm/Kconfig b/mm/Kconfig
index 398b46064544..665cb370ad38 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -706,3 +706,11 @@ config ARCH_USES_HIGH_VMA_FLAGS
 	bool
 config ARCH_HAS_PKEYS
 	bool
+
+config PERCPU_STATS
+	bool "Collect percpu memory statistics"
+	default n
+	help
+	  This feature collects and exposes statistics via debugfs. The
+	  information includes global and per chunk statistics, which can
+	  be used to help understand percpu memory usage.
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a50..411bd24d4a7c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
 obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
 obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
+obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
new file mode 100644
index 000000000000..cd2442e13d8f
--- /dev/null
+++ b/mm/percpu-internal.h
@@ -0,0 +1,166 @@
+#ifndef _MM_PERCPU_INTERNAL_H
+#define _MM_PERCPU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/percpu.h>
+
+struct pcpu_chunk {
+#ifdef CONFIG_PERCPU_STATS
+	int nr_alloc;			/* # of allocations */
+	size_t max_alloc_size;		/* largest allocation size */
+#endif
+
+	struct list_head list;		/* linked to pcpu_slot lists */
+	int free_size;			/* free bytes in the chunk */
+	int contig_hint;		/* max contiguous size hint */
+	void *base_addr;		/* base address of this chunk */
+
+	int map_used;			/* # of map entries used before the sentry */
+	int map_alloc;			/* # of map entries allocated */
+	int *map;			/* allocation map */
+	struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
+
+	void *data;			/* chunk data */
+	int first_free;			/* no free below this */
+	bool immutable;			/* no [de]population allowed */
+	bool has_reserved;		/* Indicates if chunk has reserved space
+					   at the beginning. Reserved chunk will
+					   contain reservation for static chunk.
+					   Dynamic chunk will contain reservation
+					   for static and reserved chunks. */
+	int nr_populated;		/* # of populated pages */
+	unsigned long populated[];	/* populated bitmap */
+};
+
+extern spinlock_t pcpu_lock;
+
+extern struct list_head *pcpu_slot;
+extern int pcpu_nr_slots;
+
+extern struct pcpu_chunk *pcpu_first_chunk;
+extern struct pcpu_chunk *pcpu_reserved_chunk;
+
+#ifdef CONFIG_PERCPU_STATS
+
+#include <linux/spinlock.h>
+
+struct percpu_stats {
+	u64 nr_alloc;			/* lifetime # of allocations */
+	u64 nr_dealloc;			/* lifetime # of deallocations */
+	u64 nr_cur_alloc;		/* current # of allocations */
+	u64 nr_max_alloc;		/* max # of live allocations */
+	u32 nr_chunks;			/* current # of live chunks */
+	u32 nr_max_chunks;		/* max # of live chunks */
+	size_t min_alloc_size;		/* min allocation size */
+	size_t max_alloc_size;		/* max allocation size */
+};
+
+extern struct percpu_stats pcpu_stats;
+extern struct pcpu_alloc_info pcpu_stats_ai;
+
+/*
+ * For debug purposes. We don't care about the flexible array.
+ */
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+	memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));
+
+	/* initialize min_alloc_size to unit_size */
+	pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
+}
+
+/*
+ * pcpu_stats_area_alloc - increment area allocation stats
+ * @chunk: the location of the area being allocated
+ * @size: size of area to allocate in bytes
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+	lockdep_assert_held(&pcpu_lock);
+
+	pcpu_stats.nr_alloc++;
+	pcpu_stats.nr_cur_alloc++;
+	pcpu_stats.nr_max_alloc =
+		max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc);
+	pcpu_stats.min_alloc_size =
+		min(pcpu_stats.min_alloc_size, size);
+	pcpu_stats.max_alloc_size =
+		max(pcpu_stats.max_alloc_size, size);
+
+	chunk->nr_alloc++;
+	chunk->max_alloc_size = max(chunk->max_alloc_size, size);
+}
+
+/*
+ * pcpu_stats_area_dealloc - decrement allocation stats
+ * @chunk: the location of the area being deallocated
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+	lockdep_assert_held(&pcpu_lock);
+
+	pcpu_stats.nr_dealloc++;
+	pcpu_stats.nr_cur_alloc--;
+
+	chunk->nr_alloc--;
+}
+
+/*
+ * pcpu_stats_chunk_alloc - increment chunk stats
+ */
+static inline void pcpu_stats_chunk_alloc(void)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	pcpu_stats.nr_chunks++;
+	pcpu_stats.nr_max_chunks =
+		max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks);
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+/*
+ * pcpu_stats_chunk_dealloc - decrement chunk stats
+ */
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	pcpu_stats.nr_chunks--;
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+#else
+
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+}
+
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+}
+
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+}
+
+static inline void pcpu_stats_chunk_alloc(void)
+{
+}
+
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+}
+
+#endif /* !CONFIG_PERCPU_STATS */
+
+#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d66911ff42d9..eb58aa4c0997 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -72,6 +72,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
 	pcpu_chunk_populated(chunk, 0, nr_pages);
 	spin_unlock_irq(&pcpu_lock);
 
+	pcpu_stats_chunk_alloc();
+	trace_percpu_create_chunk(chunk->base_addr);
+
 	return chunk;
 }
 
@@ -79,7 +82,13 @@ static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
 {
 	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
 
-	if (chunk && chunk->data)
+	if (!chunk)
+		return;
+
+	pcpu_stats_chunk_dealloc();
+	trace_percpu_destroy_chunk(chunk->base_addr);
+
+	if (chunk->data)
 		__free_pages(chunk->data, order_base_2(nr_pages));
 	pcpu_free_chunk(chunk);
 }
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
new file mode 100644
index 000000000000..03524a56eeff
--- /dev/null
+++ b/mm/percpu-stats.c
@@ -0,0 +1,222 @@
+/*
+ * mm/percpu-stats.c
+ *
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou
+ *
+ * This file is released under the GPLv2.
+ *
+ * Prints statistics about the percpu allocator and backing chunks.
+ */
+#include <linux/debugfs.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/sort.h>
+#include <linux/vmalloc.h>
+
+#include "percpu-internal.h"
+
+#define P(X, Y) \
+	seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y)
+
+struct percpu_stats pcpu_stats;
+struct pcpu_alloc_info pcpu_stats_ai;
+
+static int cmpint(const void *a, const void *b)
+{
+	return *(int *)a - *(int *)b;
+}
+
+/*
+ * Iterates over all chunks to find the max # of map entries used.
+ */
+static int find_max_map_used(void)
+{
+	struct pcpu_chunk *chunk;
+	int slot, max_map_used;
+
+	max_map_used = 0;
+	for (slot = 0; slot < pcpu_nr_slots; slot++)
+		list_for_each_entry(chunk, &pcpu_slot[slot], list)
+			max_map_used = max(max_map_used, chunk->map_used);
+
+	return max_map_used;
+}
+
+/*
+ * Prints out chunk state. Fragmentation is considered between
+ * the beginning of the chunk to the last allocation.
+ */
+static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
+			    void *buffer)
+{
+	int i, s_index, last_alloc, alloc_sign, as_len;
+	int *alloc_sizes, *p;
+	/* statistics */
+	int sum_frag = 0, max_frag = 0;
+	int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
+
+	alloc_sizes = buffer;
+	s_index = chunk->has_reserved ? 1 : 0;
+
+	/* find last allocation */
+	last_alloc = -1;
+	for (i = chunk->map_used - 1; i >= s_index; i--) {
+		if (chunk->map[i] & 1) {
+			last_alloc = i;
+			break;
+		}
+	}
+
+	/* if the chunk is not empty - ignoring reserve */
+	if (last_alloc >= s_index) {
+		as_len = last_alloc + 1 - s_index;
+
+		/*
+		 * Iterate through chunk map computing size info.
+		 * The first bit is overloaded to be a used flag.
+		 * negative = free space, positive = allocated
+		 */
+		for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) {
+			alloc_sign = (*p & 1) ? 1 : -1;
+			alloc_sizes[i] = alloc_sign *
+				((p[1] & ~1) - (p[0] & ~1));
+		}
+
+		sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL);
+
+		/* Iterate through the unallocated fragments. */
+		for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
+			sum_frag -= *p;
+			max_frag = max(max_frag, -1 * (*p));
+		}
+
+		cur_min_alloc = alloc_sizes[i];
+		cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2];
+		cur_max_alloc = alloc_sizes[as_len - 1];
+	}
+
+	P("nr_alloc", chunk->nr_alloc);
+	P("max_alloc_size", chunk->max_alloc_size);
+	P("free_size", chunk->free_size);
+	P("contig_hint", chunk->contig_hint);
+	P("sum_frag", sum_frag);
+	P("max_frag", max_frag);
+	P("cur_min_alloc", cur_min_alloc);
+	P("cur_med_alloc", cur_med_alloc);
+	P("cur_max_alloc", cur_max_alloc);
+	seq_putc(m, '\n');
+}
+
+static int percpu_stats_show(struct seq_file *m, void *v)
+{
+	struct pcpu_chunk *chunk;
+	int slot, max_map_used;
+	void *buffer;
+
+alloc_buffer:
+	spin_lock_irq(&pcpu_lock);
+	max_map_used = find_max_map_used();
+	spin_unlock_irq(&pcpu_lock);
+
+	buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0]));
+	if (!buffer)
+		return -ENOMEM;
+
+	spin_lock_irq(&pcpu_lock);
+
+	/* if the buffer allocated earlier is too small */
+	if (max_map_used < find_max_map_used()) {
+		spin_unlock_irq(&pcpu_lock);
+		vfree(buffer);
+		goto alloc_buffer;
+	}
+
+#define PL(X) \
+	seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X)
+
+	seq_printf(m,
+			"Percpu Memory Statistics\n"
+			"Allocation Info:\n"
+			"----------------------------------------\n");
+	PL(unit_size);
+	PL(static_size);
+	PL(reserved_size);
+	PL(dyn_size);
+	PL(atom_size);
+	PL(alloc_size);
+	seq_putc(m, '\n');
+
+#undef PL
+
+#define PU(X) \
+	seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X)
+
+	seq_printf(m,
+			"Global Stats:\n"
+			"----------------------------------------\n");
+	PU(nr_alloc);
+	PU(nr_dealloc);
+	PU(nr_cur_alloc);
+	PU(nr_max_alloc);
+	PU(nr_chunks);
+	PU(nr_max_chunks);
+	PU(min_alloc_size);
+	PU(max_alloc_size);
+	seq_putc(m, '\n');
+
+#undef PU
+
+	seq_printf(m,
+			"Per Chunk Stats:\n"
+			"----------------------------------------\n");
+
+	if (pcpu_reserved_chunk) {
+		seq_puts(m, "Chunk: <- Reserved Chunk\n");
+		chunk_map_stats(m, pcpu_reserved_chunk, buffer);
+	}
+
+	for (slot = 0; slot < pcpu_nr_slots; slot++) {
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (chunk == pcpu_first_chunk) {
+				seq_puts(m, "Chunk: <- First Chunk\n");
+				chunk_map_stats(m, chunk, buffer);
+			} else {
+				seq_puts(m, "Chunk:\n");
+				chunk_map_stats(m, chunk, buffer);
+			}
+		}
+	}
+
+	spin_unlock_irq(&pcpu_lock);
+
+	vfree(buffer);
+
+	return 0;
+}
+
+static int percpu_stats_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, percpu_stats_show, NULL);
+}
+
+static const struct file_operations percpu_stats_fops = {
+	.open		= percpu_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init init_percpu_stats_debugfs(void)
+{
+	debugfs_create_file("percpu_stats", 0444, NULL, NULL,
+			&percpu_stats_fops);
+
+	return 0;
+}
+
+late_initcall(init_percpu_stats_debugfs);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9ac639499bd1..15dab691ea70 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -343,12 +343,22 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
 
 	chunk->data = vms;
 	chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+
+	pcpu_stats_chunk_alloc();
+	trace_percpu_create_chunk(chunk->base_addr);
+
 	return chunk;
 }
 
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
 {
-	if (chunk && chunk->data)
+	if (!chunk)
+		return;
+
+	pcpu_stats_chunk_dealloc();
+	trace_percpu_destroy_chunk(chunk->base_addr);
+
+	if (chunk->data)
 		pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
 	pcpu_free_chunk(chunk);
 }
diff --git a/mm/percpu.c b/mm/percpu.c
index e0aa8ae7bde7..bd4130a69bbc 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,11 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/percpu.h>
+
+#include "percpu-internal.h"
+
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
 #define PCPU_ATOMIC_MAP_MARGIN_LOW	32
@@ -103,53 +108,35 @@
 #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
 #endif	/* CONFIG_SMP */
 
-struct pcpu_chunk {
-	struct list_head list;		/* linked to pcpu_slot lists */
-	int free_size;			/* free bytes in the chunk */
-	int contig_hint;		/* max contiguous size hint */
-	void *base_addr;		/* base address of this chunk */
-
-	int map_used;			/* # of map entries used before the sentry */
-	int map_alloc;			/* # of map entries allocated */
-	int *map;			/* allocation map */
-	struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
-
-	void *data;			/* chunk data */
-	int first_free;			/* no free below this */
-	bool immutable;			/* no [de]population allowed */
-	int nr_populated;		/* # of populated pages */
-	unsigned long populated[];	/* populated bitmap */
-};
-
-static int pcpu_unit_pages __read_mostly;
-static int pcpu_unit_size __read_mostly;
-static int pcpu_nr_units __read_mostly;
-static int pcpu_atom_size __read_mostly;
-static int pcpu_nr_slots __read_mostly;
-static size_t pcpu_chunk_struct_size __read_mostly;
+static int pcpu_unit_pages __ro_after_init;
+static int pcpu_unit_size __ro_after_init;
+static int pcpu_nr_units __ro_after_init;
+static int pcpu_atom_size __ro_after_init;
+int pcpu_nr_slots __ro_after_init;
+static size_t pcpu_chunk_struct_size __ro_after_init;
 
 /* cpus with the lowest and highest unit addresses */
-static unsigned int pcpu_low_unit_cpu __read_mostly;
-static unsigned int pcpu_high_unit_cpu __read_mostly;
+static unsigned int pcpu_low_unit_cpu __ro_after_init;
+static unsigned int pcpu_high_unit_cpu __ro_after_init;
 
 /* the address of the first chunk which starts with the kernel static area */
-void *pcpu_base_addr __read_mostly;
+void *pcpu_base_addr __ro_after_init;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
-const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */
+static const int *pcpu_unit_map __ro_after_init;	/* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
 
 /* group information, used for vm allocation */
-static int pcpu_nr_groups __read_mostly;
-static const unsigned long *pcpu_group_offsets __read_mostly;
-static const size_t *pcpu_group_sizes __read_mostly;
+static int pcpu_nr_groups __ro_after_init;
+static const unsigned long *pcpu_group_offsets __ro_after_init;
+static const size_t *pcpu_group_sizes __ro_after_init;
 
 /*
  * The first chunk which always exists. Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
-static struct pcpu_chunk *pcpu_first_chunk;
+struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
 
 /*
  * Optional reserved chunk. This chunk reserves part of the first
@@ -158,13 +145,13 @@ static struct pcpu_chunk *pcpu_first_chunk;
  * area doesn't exist, the following variables contain NULL and 0
  * respectively.
  */
-static struct pcpu_chunk *pcpu_reserved_chunk;
-static int pcpu_reserved_chunk_limit;
+struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
+static int pcpu_reserved_chunk_limit __ro_after_init;
 
-static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
+DEFINE_SPINLOCK(pcpu_lock);		/* all internal data structures */
 static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
 
-static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
 
 /* chunks which need their map areas extended, protected by pcpu_lock */
 static LIST_HEAD(pcpu_map_extend_chunks);
@@ -672,6 +659,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
 	int to_free = 0;
 	int *p;
 
+	lockdep_assert_held(&pcpu_lock);
+	pcpu_stats_area_dealloc(chunk);
+
 	freeme |= 1;	/* we are searching for <given offset, in use> pair */
 
 	i = 0;
@@ -735,6 +725,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 	chunk->map[0] = 0;
 	chunk->map[1] = pcpu_unit_size | 1;
 	chunk->map_used = 1;
+	chunk->has_reserved = false;
 
 	INIT_LIST_HEAD(&chunk->list);
 	INIT_LIST_HEAD(&chunk->map_extend_list);
@@ -965,8 +956,10 @@ restart:
 	 * tasks to create chunks simultaneously. Serialize and create iff
 	 * there's still no empty chunk after grabbing the mutex.
 	 */
-	if (is_atomic)
+	if (is_atomic) {
+		err = "atomic alloc failed, no space left";
 		goto fail;
+	}
 
 	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
 		chunk = pcpu_create_chunk();
@@ -984,6 +977,7 @@ restart:
 	goto restart;
 
 area_found:
+	pcpu_stats_area_alloc(chunk, size);
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 
 	/* populate if not all pages are already there */
@@ -1026,11 +1020,17 @@ area_found:
 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
 	kmemleak_alloc_percpu(ptr, size, gfp);
+
+	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
+			chunk->base_addr, off, ptr);
+
 	return ptr;
 
 fail_unlock:
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 fail:
+	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
+
 	if (!is_atomic && warn_limit) {
 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
 			size, align, is_atomic, err);
@@ -1280,6 +1280,8 @@ void free_percpu(void __percpu *ptr)
 		}
 	}
 
+	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
+
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
@@ -1656,6 +1658,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 
+	pcpu_stats_save_ai(ai);
+
 	/*
 	 * Allocate chunk slots. The additional last slot is for
 	 * empty chunks.
@@ -1699,6 +1703,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	if (schunk->free_size)
 		schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
 	schunk->map[schunk->map_used] |= 1;
+	schunk->has_reserved = true;
 
 	/* init dynamic chunk if necessary */
 	if (dyn_size) {
@@ -1717,6 +1722,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 		dchunk->map[1] = pcpu_reserved_chunk_limit;
 		dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
 		dchunk->map_used = 2;
+		dchunk->has_reserved = true;
 	}
 
 	/* link the first chunk in */
@@ -1725,6 +1731,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_count_occupied_pages(pcpu_first_chunk, 1);
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
+	pcpu_stats_chunk_alloc();
+	trace_percpu_create_chunk(base_addr);
+
 	/* we're done */
 	pcpu_base_addr = base_addr;
 	return 0;
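
The counter rename above is mechanical: percpu_counter_add_batch() takes exactly the arguments the old __percpu_counter_add() took, with the caller still supplying the per-CPU batch threshold explicitly. A minimal caller-side sketch of the renamed API follows; the counter, batch size, and demo_* function names are made up for illustration and are not part of the patch:

	#include <linux/gfp.h>
	#include <linux/percpu_counter.h>

	#define DEMO_BATCH	32	/* illustrative batch size, not from the patch */

	static struct percpu_counter demo_bytes;

	static int demo_setup(void)
	{
		/* 0 is the initial value, GFP_KERNEL the allocation context */
		return percpu_counter_init(&demo_bytes, 0, GFP_KERNEL);
	}

	static void demo_account(s64 len)
	{
		/* was: __percpu_counter_add(&demo_bytes, len, DEMO_BATCH); */
		percpu_counter_add_batch(&demo_bytes, len, DEMO_BATCH);
	}

	static s64 demo_total(void)
	{
		/* fold all per-CPU deltas into one accurate total */
		return percpu_counter_sum(&demo_bytes);
	}

	static void demo_teardown(void)
	{
		percpu_counter_destroy(&demo_bytes);
	}

With CONFIG_PERCPU_STATS=y, the allocator statistics added above are read from the "percpu_stats" file this patch creates in the debugfs root (conventionally mounted at /sys/kernel/debug), and the new tracepoints appear under the "percpu" trace event group defined by TRACE_SYSTEM above.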