From 1ed8d48c57bf7400eac7b8dc622ab0413715cafb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:52 -0700 Subject: [PATCH 01/45] writeback: bdi_for_each_wb() iteration is memcg ID based not blkcg wb's (bdi_writeback's) are currently keyed by memcg ID; however, in an earlier implementation, wb's were keyed by blkcg ID. bdi_for_each_wb() walks bdi->cgwb_tree in the ascending ID order and allows iterations to start from an arbitrary ID which is used to interrupt and resume iterations. Unfortunately, while changing wb to be keyed by memcg ID instead of blkcg, bdi_for_each_wb() was missed and is still assuming that wb's are keyed by blkcg ID. This doesn't affect iterations which don't get interrupted but bdi_split_work_to_wbs() makes use of iteration resuming on allocation failures and thus may incorrectly skip or repeat wb's. Fix it by changing bdi_for_each_wb() to take memcg IDs instead of blkcg IDs and updating bdi_split_work_to_wbs() accordingly. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 6 +++--- include/linux/backing-dev.h | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 518c6294bf6c..c9def2115aca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -839,7 +839,7 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, bool skip_if_busy) { long nr_pages = base_work->nr_pages; - int next_blkcg_id = 0; + int next_memcg_id = 0; struct bdi_writeback *wb; struct wb_iter iter; @@ -849,14 +849,14 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, return; restart: rcu_read_lock(); - bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { + bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { if (!wb_has_dirty_io(wb) || (skip_if_busy && writeback_in_progress(wb))) continue; base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); if (!wb_clone_and_queue_work(wb, base_work)) { - next_blkcg_id = wb->blkcg_css->id + 1; + next_memcg_id = wb->memcg_css->id + 1; rcu_read_unlock(); wb_wait_for_single_work(bdi, base_work); goto restart; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0fe9df983ab7..23ebb946e66f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked) } struct wb_iter { - int start_blkcg_id; + int start_memcg_id; struct radix_tree_iter tree_iter; void **slot; }; @@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, WARN_ON_ONCE(!rcu_read_lock_held()); - if (iter->start_blkcg_id >= 0) { - iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id); - iter->start_blkcg_id = -1; + if (iter->start_memcg_id >= 0) { + iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id); + iter->start_memcg_id = -1; } else { iter->slot = radix_tree_next_slot(iter->slot, titer, 0); } @@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter, static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter, struct backing_dev_info *bdi, - int start_blkcg_id) + int start_memcg_id) { - iter->start_blkcg_id = start_blkcg_id; + iter->start_memcg_id = start_memcg_id; - if (start_blkcg_id) + if (start_memcg_id) return __wb_iter_next(iter, bdi); else return &bdi->wb; } /** - * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order + * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order * @wb_cur: 
cursor struct bdi_writeback pointer * @bdi: bdi to walk wb's of * @iter: pointer to struct wb_iter to be used as iteration buffer - * @start_blkcg_id: blkcg ID to start iteration from + * @start_memcg_id: memcg ID to start iteration from * * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending - * blkcg ID order starting from @start_blkcg_id. @iter is struct wb_iter + * memcg ID order starting from @start_memcg_id. @iter is struct wb_iter * to be used as temp storage during iteration. rcu_read_lock() must be * held throughout iteration. */ -#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id) \ - for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id); \ +#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id) \ + for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id); \ (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi)) #else /* CONFIG_CGROUP_WRITEBACK */ From 8a1270cda7b473c8ea71a7a44613465fe0708488 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:53 -0700 Subject: [PATCH 02/45] writeback: remove wb_writeback_work->single_wait/done wb_writeback_work->single_wait/done are used for the wait mechanism for synchronous wb_work (wb_writeback_work) items which are issued when bdi_split_work_to_wbs() fails to allocate memory for asynchronous wb_work items; however, there's no reason to use a separate wait mechanism for this. bdi_split_work_to_wbs() can simply use on-stack fallback wb_work item and separate wb_completion to wait for it. This patch removes wb_work->single_wait/done and the related code and make bdi_split_work_to_wbs() use on-stack fallback wb_work and wb_completion instead. Signed-off-by: Tejun Heo Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 116 ++++++++++++---------------------------------- 1 file changed, 30 insertions(+), 86 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c9def2115aca..38e103506484 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,8 +53,6 @@ struct wb_writeback_work { unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ - unsigned int single_wait:1; - unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ @@ -181,11 +179,8 @@ static void wb_queue_work(struct bdi_writeback *wb, trace_writeback_queue(wb->bdi, work); spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) { - if (work->single_wait) - work->single_done = 1; + if (!test_bit(WB_registered, &wb->state)) goto out_unlock; - } if (work->done) atomic_inc(&work->done->cnt); list_add_tail(&work->list, &wb->work_list); @@ -737,32 +732,6 @@ int inode_congested(struct inode *inode, int cong_bits) } EXPORT_SYMBOL_GPL(inode_congested); -/** - * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work - * @bdi: bdi the work item was issued to - * @work: work item to wait for - * - * Wait for the completion of @work which was issued to one of @bdi's - * bdi_writeback's. The caller must have set @work->single_wait before - * issuing it. This wait operates independently fo - * wb_wait_for_completion() and also disables automatic freeing of @work. 
- */ -static void wb_wait_for_single_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) -{ - if (WARN_ON_ONCE(!work->single_wait)) - return; - - wait_event(bdi->wb_waitq, work->single_done); - - /* - * Paired with smp_wmb() in wb_do_writeback() and ensures that all - * modifications to @work prior to assertion of ->single_done is - * visible to the caller once this function returns. - */ - smp_rmb(); -} - /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to @@ -791,38 +760,6 @@ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); } -/** - * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb - * @wb: target bdi_writeback - * @base_work: source wb_writeback_work - * - * Try to make a clone of @base_work and issue it to @wb. If cloning - * succeeds, %true is returned; otherwise, @base_work is issued directly - * and %false is returned. In the latter case, the caller is required to - * wait for @base_work's completion using wb_wait_for_single_work(). - * - * A clone is auto-freed on completion. @base_work never is. - */ -static bool wb_clone_and_queue_work(struct bdi_writeback *wb, - struct wb_writeback_work *base_work) -{ - struct wb_writeback_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - *work = *base_work; - work->auto_free = 1; - work->single_wait = 0; - } else { - work = base_work; - work->auto_free = 0; - work->single_wait = 1; - } - work->single_done = 0; - wb_queue_work(wb, work); - return work != base_work; -} - /** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info @@ -838,7 +775,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { - long nr_pages = base_work->nr_pages; int next_memcg_id = 0; struct bdi_writeback *wb; struct wb_iter iter; @@ -850,17 +786,39 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, restart: rcu_read_lock(); bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) { + DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done); + struct wb_writeback_work fallback_work; + struct wb_writeback_work *work; + long nr_pages; + if (!wb_has_dirty_io(wb) || (skip_if_busy && writeback_in_progress(wb))) continue; - base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); - if (!wb_clone_and_queue_work(wb, base_work)) { - next_memcg_id = wb->memcg_css->id + 1; - rcu_read_unlock(); - wb_wait_for_single_work(bdi, base_work); - goto restart; + nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 1; + wb_queue_work(wb, work); + continue; } + + /* alloc failed, execute synchronously using on-stack fallback */ + work = &fallback_work; + *work = *base_work; + work->nr_pages = nr_pages; + work->auto_free = 0; + work->done = &fallback_work_done; + + wb_queue_work(wb, work); + + next_memcg_id = wb->memcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_completion(bdi, &fallback_work_done); + goto restart; } rcu_read_unlock(); } @@ -902,8 +860,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, if (bdi_has_dirty_io(bdi) && (!skip_if_busy || !writeback_in_progress(&bdi->wb))) { base_work->auto_free = 0; - base_work->single_wait = 0; - base_work->single_done = 0; wb_queue_work(&bdi->wb, base_work); } } 
@@ -1794,26 +1750,14 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { struct wb_completion *done = work->done; - bool need_wake_up = false; trace_writeback_exec(wb->bdi, work); wrote += wb_writeback(wb, work); - if (work->single_wait) { - WARN_ON_ONCE(work->auto_free); - /* paired w/ rmb in wb_wait_for_single_work() */ - smp_wmb(); - work->single_done = 1; - need_wake_up = true; - } else if (work->auto_free) { + if (work->auto_free) kfree(work); - } - if (done && atomic_dec_and_test(&done->cnt)) - need_wake_up = true; - - if (need_wake_up) wake_up_all(&wb->bdi->wb_waitq); } From 60292bcc1b240a6708aa531c8ece1da2a78d560a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:54 -0700 Subject: [PATCH 03/45] writeback: explain why @inode is allowed to be NULL for inode_congested() Signed-off-by: Tejun Heo Suggested-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 38e103506484..128404310fc0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -701,7 +701,7 @@ EXPORT_SYMBOL_GPL(wbc_account_io); /** * inode_congested - test whether an inode is congested - * @inode: inode to test for congestion + * @inode: inode to test for congestion (may be NULL) * @cong_bits: mask of WB_[a]sync_congested bits to test * * Tests whether @inode is congested. @cong_bits is the mask of congestion @@ -711,6 +711,9 @@ EXPORT_SYMBOL_GPL(wbc_account_io); * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg * associated with @inode is congested; otherwise, the root wb's congestion * state is used. + * + * @inode is allowed to be NULL as this function is often called on + * mapping->host which is NULL for the swapper space. */ int inode_congested(struct inode *inode, int cong_bits) { From 9acee9c551f045d2c5b5261aa587331423fd7d92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:55 -0700 Subject: [PATCH 04/45] kernfs: implement kernfs_path_len() Add a function to determine the path length of a kernfs node. This for now will be used by writeback tracepoint updates. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- fs/kernfs/dir.c | 23 +++++++++++++++++++++++ include/linux/kernfs.h | 4 ++++ 2 files changed, 27 insertions(+) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 2d48d28e1640..91e004518237 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -91,6 +91,29 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) return ret; } +/** + * kernfs_path_len - determine the length of the full path of a given node + * @kn: kernfs_node of interest + * + * The returned length doesn't include the space for the terminating '\0'. 
+ */ +size_t kernfs_path_len(struct kernfs_node *kn) +{ + size_t len = 0; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + do { + len += strlen(kn->name) + 1; + kn = kn->parent; + } while (kn && kn->parent); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return len; +} + /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 123be25ea15a..5d4e9c4b821d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); +size_t kernfs_path_len(struct kernfs_node *kn); char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); @@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } +static inline size_t kernfs_path_len(struct kernfs_node *kn) +{ return 0; } + static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return NULL; } From 5634cc2aa9aebc77bc862992e7805469dcf83dac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:56 -0700 Subject: [PATCH 05/45] writeback: update writeback tracepoints to report cgroup The following tracepoints are updated to report the cgroup used during cgroup writeback. * writeback_write_inode[_start] * writeback_queue * writeback_exec * writeback_start * writeback_written * writeback_wait * writeback_nowork * writeback_wake_background * wbc_writepage * writeback_queue_io * bdi_dirty_ratelimit * balance_dirty_pages * writeback_sb_inodes_requeue * writeback_single_inode[_start] Note that writeback_bdi_register is separated out from writeback_class as reporting cgroup doesn't make sense to it. Tracepoints which take bdi are updated to take bdi_writeback instead. Signed-off-by: Tejun Heo Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 14 +-- include/trace/events/writeback.h | 182 ++++++++++++++++++++++++------- mm/page-writeback.c | 6 +- 3 files changed, 152 insertions(+), 50 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 128404310fc0..f4f0f228a530 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -176,7 +176,7 @@ static void wb_wakeup(struct bdi_writeback *wb) static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { - trace_writeback_queue(wb->bdi, work); + trace_writeback_queue(wb, work); spin_lock_bh(&wb->work_lock); if (!test_bit(WB_registered, &wb->state)) @@ -883,7 +883,7 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(wb->bdi); + trace_writeback_nowork(wb); wb_wakeup(wb); return; } @@ -913,7 +913,7 @@ void wb_start_background_writeback(struct bdi_writeback *wb) * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. 
*/ - trace_writeback_wake_background(wb->bdi); + trace_writeback_wake_background(wb); wb_wakeup(wb); } @@ -1616,14 +1616,14 @@ static long wb_writeback(struct bdi_writeback *wb, } else if (work->for_background) oldest_jif = jiffies; - trace_writeback_start(wb->bdi, work); + trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) queue_io(wb, work); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); - trace_writeback_written(wb->bdi, work); + trace_writeback_written(wb, work); wb_update_bandwidth(wb, wb_start); @@ -1648,7 +1648,7 @@ static long wb_writeback(struct bdi_writeback *wb, * we'll just busyloop. */ if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb->bdi, work); + trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); @@ -1754,7 +1754,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) while ((work = get_next_work_item(wb)) != NULL) { struct wb_completion *done = work->done; - trace_writeback_exec(wb->bdi, work); + trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a7aa607a4c55..fff846b512e6 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_ARGS(inode, flags) ); +#ifdef CREATE_TRACE_POINTS +#ifdef CONFIG_CGROUP_WRITEBACK + +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) +{ + return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1; +} + +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) +{ + struct cgroup *cgrp = wb->memcg_css->cgroup; + char *path; + + path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1); + WARN_ON_ONCE(path != buf); +} + +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) +{ + if (wbc->wb) + return __trace_wb_cgroup_size(wbc->wb); + else + return 2; +} + +static inline void __trace_wbc_assign_cgroup(char *buf, + struct writeback_control *wbc) +{ + if (wbc->wb) + __trace_wb_assign_cgroup(buf, wbc->wb); + else + strcpy(buf, "/"); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb) +{ + return 2; +} + +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb) +{ + strcpy(buf, "/"); +} + +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc) +{ + return 2; +} + +static inline void __trace_wbc_assign_cgroup(char *buf, + struct writeback_control *wbc) +{ + strcpy(buf, "/"); +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ +#endif /* CREATE_TRACE_POINTS */ + DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), @@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, __array(char, name, 32) __field(unsigned long, ino) __field(int, sync_mode) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), - TP_printk("bdi %s: ino=%lu sync_mode=%d", + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s", __entry->name, __entry->ino, - __entry->sync_mode + __entry->sync_mode, + 
__get_str(cgroup) ) ); @@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, ); DECLARE_EVENT_CLASS(writeback_work_class, - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), - TP_ARGS(bdi, work), + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), + TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) @@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, range_cyclic) __field(int, for_background) __field(int, reason) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( strncpy(__entry->name, - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); + wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; @@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " - "kupdate=%d range_cyclic=%d background=%d reason=%s", + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, @@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, - __print_symbolic(__entry->reason, WB_WORK_REASON) + __print_symbolic(__entry->reason, WB_WORK_REASON), + __get_str(cgroup) ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ - TP_ARGS(bdi, work)) + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ + TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -230,6 +296,30 @@ TRACE_EVENT(writeback_pages_written, ); DECLARE_EVENT_CLASS(writeback_class, + TP_PROTO(struct bdi_writeback *wb), + TP_ARGS(wb), + TP_STRUCT__entry( + __array(char, name, 32) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); + __trace_wb_assign_cgroup(__get_str(cgroup), wb); + ), + TP_printk("bdi %s: cgroup=%s", + __entry->name, + __get_str(cgroup) + ) +); +#define DEFINE_WRITEBACK_EVENT(name) \ +DEFINE_EVENT(writeback_class, name, \ + TP_PROTO(struct bdi_writeback *wb), \ + TP_ARGS(wb)) + +DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_wake_background); + +TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( @@ -239,17 +329,9 @@ DECLARE_EVENT_CLASS(writeback_class, strncpy(__entry->name, dev_name(bdi->dev), 32); ), TP_printk("bdi %s", - __entry->name + __entry->name ) ); -#define DEFINE_WRITEBACK_EVENT(name) \ -DEFINE_EVENT(writeback_class, name, \ - TP_PROTO(struct backing_dev_info *bdi), \ - TP_ARGS(bdi)) - -DEFINE_WRITEBACK_EVENT(writeback_nowork); -DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), @@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class, __field(int, range_cyclic) __field(long, range_start) __field(long, 
range_end) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " - "start=0x%lx end=0x%lx", + "start=0x%lx end=0x%lx cgroup=%s", __entry->name, __entry->nr_to_write, __entry->pages_skipped, @@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, - __entry->range_end) + __entry->range_end, + __get_str(cgroup) + ) ) #define DEFINE_WBC_EVENT(name) \ @@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io, __field(long, age) __field(int, moved) __field(int, reason) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( unsigned long *older_than_this = work->older_than_this; @@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io, (jiffies - *older_than_this) * 1000 / HZ : -1; __entry->moved = moved; __entry->reason = work->reason; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s", + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s", __entry->name, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, - __print_symbolic(__entry->reason, WB_WORK_REASON) + __print_symbolic(__entry->reason, WB_WORK_REASON), + __get_str(cgroup) ) ); @@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state, TRACE_EVENT(bdi_dirty_ratelimit, - TP_PROTO(struct backing_dev_info *bdi, + TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), - TP_ARGS(bdi, dirty_rate, task_ratelimit), + TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) @@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit, __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); - __entry->write_bw = KBps(bdi->wb.write_bandwidth); - __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); + __entry->write_bw = KBps(wb->write_bandwidth); + __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); - __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); + __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = - KBps(bdi->wb.balanced_dirty_ratelimit); + KBps(wb->balanced_dirty_ratelimit); + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " - "balanced_dirty_ratelimit=%lu", + "balanced_dirty_ratelimit=%lu cgroup=%s", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ - __entry->balanced_dirty_ratelimit /* the balanced ratelimit */ + __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ + __get_str(cgroup) ) ); 
TRACE_EVENT(balance_dirty_pages, - TP_PROTO(struct backing_dev_info *bdi, + TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, @@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages, long pause, unsigned long start_time), - TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, + TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), @@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages, __field( long, pause) __field(unsigned long, period) __field( long, think) + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; - strlcpy(__entry->bdi, dev_name(bdi->dev), 32); + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + @@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages, __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); ), @@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages, "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " - "paused=%lu pause=%ld period=%lu think=%ld", + "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s", __entry->bdi, __entry->limit, __entry->setpoint, @@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ - __entry->think /* ms */ + __entry->think, /* ms */ + __get_str(cgroup) ) ); @@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __field(unsigned long, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) + __dynamic_array(char, cgroup, + __trace_wb_cgroup_size(inode_to_wb(inode))) ), TP_fast_assign( @@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue, __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; + __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode)); ), - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu", + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s", __entry->name, __entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, - (jiffies - __entry->dirtied_when) / HZ + (jiffies - __entry->dirtied_when) / HZ, + __get_str(cgroup) ) ); @@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc)) ), TP_fast_assign( @@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " - "index=%lu to_write=%ld wrote=%lu", + "index=%lu to_write=%ld wrote=%lu cgroup=%s", __entry->name, __entry->ino, show_inode_state(__entry->state), @@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, - __entry->wrote + __entry->wrote, + __get_str(cgroup) ) ); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 
22cddd3e5de8..802beeb6b86b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; - trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); + trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, @@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct address_space *mapping, * do a reset, as it may be a light dirtier. */ if (pause < min_pause) { - trace_balance_dirty_pages(bdi, + trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, @@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct address_space *mapping, } pause: - trace_balance_dirty_pages(bdi, + trace_balance_dirty_pages(wb, sdtc->thresh, sdtc->bg_thresh, sdtc->dirty, From 4ebc1c61d6185604c97fd0b0355ab668052044ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:57 -0700 Subject: [PATCH 06/45] cfq-iosched: simplify control flow in cfq_get_queue() cfq_get_queue()'s control flow looks like the following. async_cfqq = NULL; cfqq = NULL; if (!is_sync) { ... async_cfqq = ...; cfqq = *async_cfqq; } if (!cfqq) cfqq = ...; if (!is_sync && !(*async_cfqq)) ...; The only thing the local variable init, the second if, and the async_cfqq test in the third if achieves is to skip cfqq creation and installation if *async_cfqq was already non-NULL. This is needlessly complicated with different tests examining the same condition. Simplify it to the following. if (!is_sync) { ... async_cfqq = ...; cfqq = *async_cfqq; if (cfqq) goto out; } cfqq = ...; if (!is_sync) ...; out: Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c62bb2e650b8..e91093d0c0eb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3731,8 +3731,8 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue **async_cfqq; + struct cfq_queue *cfqq; if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3742,19 +3742,20 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, } async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); /* * pin the queue now that it's allocated, scheduler exit will prune it */ - if (!is_sync && !(*async_cfqq)) { + if (!is_sync) { cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; return cfqq; } From 95e5d6f62693f27d9011ec307eb32c6126314ea3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:58 -0700 Subject: [PATCH 07/45] cfq-iosched: fix async oom queue handling Async cfqq's (cfq_queue's) are shared across cfq_data. 
When cfq_get_queue() obtains a new queue from cfq_find_alloc_queue(), it stashes the pointer in cfq_data and reuses it from then on; however, the function doesn't consider that cfq_find_alloc_queue() may return the oom_cfqq under memory pressure and installs the returned queue unconditionally. If the oom_cfqq is installed as an async cfqq, cfq_set_request() will continue calling cfq_get_queue() hoping to replace it with a proper queue; however, cfq_get_queue() will keep returning the cached queue for the slot - the oom_cfqq. Fix it by skipping caching if the queue is the oom one. Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e91093d0c0eb..baf0b706169d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3751,7 +3751,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, /* * pin the queue now that it's allocated, scheduler exit will prune it */ - if (!is_sync) { + if (!is_sync && cfqq != &cfqd->oom_cfqq) { cfqq->ref++; *async_cfqq = cfqq; } From bce6133b09013f70d41a678d262a12147ed43889 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:54:59 -0700 Subject: [PATCH 08/45] cfq-iosched: fix oom cfq_queue ref leak in cfq_set_request() If the cfq_queue cached in cfq_io_cq is the oom one, cfq_set_request() replaces it by invoking cfq_get_queue() again without putting the oom queue leaking the reference it was holding. While oom queues are not released through reference counting, they're still reference counted and this can theoretically lead to the reference count overflowing and incorrectly invoke the usual release path on it. Fix it by making cfq_set_request() put the ref it was holding. Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index baf0b706169d..8acfb7a3b2ba 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4299,6 +4299,8 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { + if (cfqq) + cfq_put_queue(cfqq); cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { From 563180a44b7d7978f44e9776eedfbbc550c2398d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:00 -0700 Subject: [PATCH 09/45] cfq-iosched: minor cleanups * Some were accessing cic->cfqq[] directly. Always use cic_to_cfqq() and cic_set_cfqq(). * check_ioprio_changed() doesn't need to verify cfq_get_queue()'s return for NULL. It's always non-NULL. Simplify accordingly. This patch doesn't cause any functional changes. 
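For reference, cic_to_cfqq() and cic_set_cfqq() are thin helpers over the
cic->cfqq[] array, roughly of the following shape (illustrative sketch; the
exact signatures live in cfq-iosched.c):

	static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic,
						    bool is_sync)
	{
		return cic->cfqq[is_sync];
	}

	static inline void cic_set_cfqq(struct cfq_io_cq *cic,
					struct cfq_queue *cfqq, bool is_sync)
	{
		cic->cfqq[is_sync] = cfqq;
	}

Funneling all accesses through them keeps the sync/async indexing in one
place instead of open-coding BLK_RW_SYNC/BLK_RW_ASYNC array accesses at
each call site.
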
Signed-off-by: Tejun Heo Acked-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8acfb7a3b2ba..bf6f49cca311 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3506,14 +3506,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3572,18 +3572,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_ATOMIC); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); From d93a11f1cd890d4ea72f7cef75fac56801b099b3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:01 -0700 Subject: [PATCH 10/45] blkcg, cfq-iosched: use GFP_NOWAIT instead of GFP_ATOMIC for non-critical allocations blkcg performs several allocations to track IOs per cgroup and enforce resource control. Most of these allocations are performed lazily on demand in the IO path and thus can't involve reclaim path. Currently, these allocations use GFP_ATOMIC; however, blkcg can gracefully deal with occassional failures of these allocations by punting IOs to the root cgroup and there's no reason to reach into the emergency reserve. This patch replaces GFP_ATOMIC with GFP_NOWAIT for the following allocations. * bdi_writeback_congested and blkcg_gq allocations in blkg_create(). * radix tree node allocations for blkcg->blkg_tree. * cfq_queue allocation on ioprio changes. Signed-off-by: Tejun Heo Suggested-and-Reviewed-by: Jeff Moyer Suggested-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++++---- block/cfq-iosched.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d6283b3f5db5..1db904f95502 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(blkg_lookup); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, @@ -208,7 +208,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, } wb_congested = wb_congested_get_create(&q->backing_dev_info, - blkcg->css.id, GFP_ATOMIC); + blkcg->css.id, GFP_NOWAIT); if (!wb_congested) { ret = -ENOMEM; goto err_put_css; @@ -216,7 +216,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_congested; @@ -882,7 +882,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index bf6f49cca311..5f119292a254 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3575,7 +3575,7 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) cfqq = cic_to_cfqq(cic, false); if (cfqq) { cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_ATOMIC); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_NOWAIT); cic_set_cfqq(cic, cfqq, false); } From 2da8de0bb799bf2bdfa893e5a1e294eb6bafba62 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:02 -0700 Subject: [PATCH 11/45] cfq-iosched: remove @gfp_mask from cfq_find_alloc_queue() Even when allocations fail, cfq_find_alloc_queue() always returns a valid cfq_queue by falling back to the oom cfq_queue. As such, there isn't much point in taking @gfp_mask and trying "harder" if __GFP_WAIT is set. GFP_NOWAIT allocations don't fail often and even when they do the degraded behavior is acceptable and temporary. After all, the only reason get_request(), which ultimately determines the gfp_mask, cares about __GFP_WAIT is to guarantee request allocation, assuming IO forward progress, for callers which are willing to wait. There's no reason for cfq_find_alloc_queue() to behave differently on __GFP_WAIT when it already has a fallback mechanism. Remove @gfp_mask from cfq_find_alloc_queue() and propagate the changes to its callers. This simplifies the function quite a bit and will help making async queues per-cfq_group. v2: Updated to reflect GFP_ATOMIC -> GPF_NOWAIT. 
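With @gfp_mask gone, the allocation path reduces to a single opportunistic
GFP_NOWAIT attempt with the oom queue as a permanent fallback, roughly as
follows (a sketch of the resulting shape; the hunk below has the exact
change):

	cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO,
				     cfqd->queue->node);
	if (cfqq) {
		cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
		cfq_init_prio_data(cfqq, cic);
		cfq_link_cfqq_cfqg(cfqq, cfqg);
		cfq_log_cfqq(cfqd, cfqq, "alloced");
	} else {
		/* allocation failed - fall back to the permanent oom queue */
		cfqq = &cfqd->oom_cfqq;
	}
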
Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 45 ++++++++++----------------------------------- 1 file changed, 10 insertions(+), 35 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5f119292a254..146b03d64b7e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -883,8 +883,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -3575,7 +3574,7 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) cfqq = cic_to_cfqq(cic, false); if (cfqq) { cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, GFP_NOWAIT); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); cic_set_cfqq(cic, cfqq, false); } @@ -3643,13 +3642,12 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; + struct cfq_queue *cfqq; struct cfq_group *cfqg; -retry: rcu_read_lock(); blkcg = bio_blkcg(bio); @@ -3666,27 +3664,9 @@ retry: * originally, since it should just be a temporary situation. */ if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - + cfqq = kmem_cache_alloc_node(cfq_pool, + GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, cic); @@ -3696,9 +3676,6 @@ retry: cfqq = &cfqd->oom_cfqq; } out: - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); - rcu_read_unlock(); return cfqq; } @@ -3723,7 +3700,7 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); @@ -3742,7 +3719,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -4286,8 +4263,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4297,7 +4272,7 @@ new_queue: if (!cfqq || cfqq == &cfqd->oom_cfqq) { if (cfqq) cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, 
gfp_mask); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* From 322731ed0dd2d8a7f11307e0444257f48580a0de Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:03 -0700 Subject: [PATCH 12/45] cfq-iosched: move cfq_group determination from cfq_find_alloc_queue() to cfq_get_queue() This is necessary for making async cfq_cgroups per-cfq_group instead of per-cfq_data. While this change makes cfq_get_queue() perform RCU locking and look up cfq_group even when it reuses async queue, the extra overhead is extremely unlikely to be noticeable given that this is already sitting behind cic->cfqq[] cache and the overall cost of cfq operation. Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 146b03d64b7e..f3ea8a198def 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3641,21 +3641,10 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio) +cfq_find_alloc_queue(struct cfq_data *cfqd, struct cfq_group *cfqg, bool is_sync, + struct cfq_io_cq *cic, struct bio *bio) { - struct blkcg *blkcg; struct cfq_queue *cfqq; - struct cfq_group *cfqg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } cfqq = cic_to_cfqq(cic, is_sync); @@ -3675,8 +3664,6 @@ cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, } else cfqq = &cfqd->oom_cfqq; } -out: - rcu_read_unlock(); return cfqq; } @@ -3706,6 +3693,14 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq; struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_create_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3719,7 +3714,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio); + cfqq = cfq_find_alloc_queue(cfqd, cfqg, is_sync, cic, bio); /* * pin the queue now that it's allocated, scheduler exit will prune it @@ -3730,6 +3725,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, } out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } From d4aad7ff04dfd00f2a69356a48054d6be84dda31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:04 -0700 Subject: [PATCH 13/45] cfq-iosched: fold cfq_find_alloc_queue() into cfq_get_queue() cfq_find_alloc_queue() checks whether a queue actually needs to be allocated, which is unnecessary as its sole caller, cfq_get_queue(), only calls it if so. Also, the oom queue fallback logic is scattered between cfq_get_queue() and cfq_find_alloc_queue(). There really isn't much going on in the latter and things can be made simpler by folding it into cfq_get_queue(). This patch collapses cfq_find_alloc_queue() into cfq_get_queue(). 
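The resulting control flow in cfq_get_queue() looks roughly like the
following (sketch only; the diff below has the exact code):

	async_cfqq = NULL;
	if (!is_sync) {
		...
		async_cfqq = cfq_async_queue_prio(...);
		cfqq = *async_cfqq;
		if (cfqq)
			goto out;
	}

	cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, ...);
	if (!cfqq) {
		cfqq = &cfqd->oom_cfqq;
		goto out;
	}

	/* init cfqq, set prio data, link it to its cfq_group */

	if (async_cfqq) {
		/* a new async queue was created, pin and remember */
		cfqq->ref++;
		*async_cfqq = cfqq;
	}
out:
	cfqq->ref++;
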
The change is fairly straight-forward with one exception - async_cfqq is now initialized to NULL and the "!is_sync" test in the last if conditional is replaced with "async_cfqq" test. This is because gcc (5.1.1) gets confused for some reason and warns that async_cfqq may be used uninitialized otherwise. Oh well, the code isn't necessarily worse this way. This patch doesn't cause any functional difference. v2: Updated to reflect GFP_ATOMIC -> GPF_NOWAIT. Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 49 ++++++++++++++------------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f3ea8a198def..fa2a2f7dbecd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3640,33 +3640,6 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, struct cfq_group *cfqg, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio) -{ - struct cfq_queue *cfqq; - - cfqq = cic_to_cfqq(cic, is_sync); - - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. - */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = kmem_cache_alloc_node(cfq_pool, - GFP_NOWAIT | __GFP_ZERO, - cfqd->queue->node); - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; - } - return cfqq; -} - static struct cfq_queue ** cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) { @@ -3691,7 +3664,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - struct cfq_queue **async_cfqq; + struct cfq_queue **async_cfqq = NULL; struct cfq_queue *cfqq; struct cfq_group *cfqg; @@ -3714,12 +3687,20 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, goto out; } - cfqq = cfq_find_alloc_queue(cfqd, cfqg, is_sync, cic, bio); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && cfqq != &cfqd->oom_cfqq) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } @@ -4469,7 +4450,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked. 
Lose From 60a837077e2b5672ebbd4c8bd67ca443951bbc92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:05 -0700 Subject: [PATCH 14/45] cfq-iosched: charge async IOs to the appropriate blkcg's instead of the root Up until now, all async IOs were queued to async queues which are shared across the whole request_queue, which means that blkcg resource control is completely void on async IOs including all writeback IOs. It was done this way because writeback didn't support writeback and there was no way of telling which writeback IO belonged to which cgroup; however, writeback recently became cgroup aware and writeback bio's are sent down properly tagged with the blkcg's to charge them against. This patch makes async cfq_queues per-cfq_cgroup instead of per-cfq_data so that each async IO is charged to the blkcg that it was tagged for instead of unconditionally attributing it to root. * cfq_data->async_cfqq and ->async_idle_cfqq are moved to cfq_group and alloc / destroy paths are updated accordingly. * cfq_link_cfqq_cfqg() no longer overrides @cfqg to root for async queues. * check_blkcg_changed() now also invalidates async queues as they no longer stay the same across cgroups. After this patch, cfq's proportional IO control through blkio.weight works correctly when cgroup writeback is in use. Signed-off-by: Tejun Heo Reviewed-by: Jeff Moyer Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 85 ++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 43 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fa2a2f7dbecd..9c9ec7cc9f99 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -305,6 +305,11 @@ struct cfq_group { struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -370,12 +375,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -401,6 +400,7 @@ struct cfq_data { }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -1596,13 +1596,26 @@ static void cfq_pd_init(struct blkcg_gq *blkg) static void cfq_pd_offline(struct blkcg_gq *blkg) { + struct cfq_group *cfqg = blkg_to_cfqg(blkg); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... 
*/ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); + cfqg_stats_xfer_dead(cfqg); } /* offset delta from cfqg->stats to cfqg->dead_stats */ @@ -1665,10 +1678,6 @@ static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -3609,7 +3618,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; + struct cfq_queue *cfqq; uint64_t serial_nr; rcu_read_lock(); @@ -3623,15 +3632,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); + /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. + */ + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); + } + + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); } cic->blkcg_serial_nr = serial_nr; @@ -3641,18 +3657,18 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3681,7 +3697,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, ioprio = task_nice_ioprio(tsk); ioprio_class = task_nice_ioclass(tsk); } - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; if (cfqq) goto out; @@ -4355,21 +4371,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4382,8 +4383,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); 
cfq_shutdown_timer_wq(cfqd); From 401efbf835040dd2ebca54f78d58fc8e3c51f91d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:06 -0700 Subject: [PATCH 15/45] blkcg: remove unnecessary request_list->blkg NULL test in blk_put_rl() Since ec13b1d6f0a0 ("blkcg: always create the blkcg_gq for the root blkcg"), a request_list always has its blkg associated. Drop unnecessary rl->blkg NULL test from blk_put_rl(). Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1b62d768c7df..9711fc277c02 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -394,8 +394,7 @@ root_rl: */ static inline void blk_put_rl(struct request_list *rl) { - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + if (rl->blkg->blkcg != &blkcg_root) blkg_put(rl->blkg); } From 994b78327458ea14a1743196ee0560c73ace37f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:07 -0700 Subject: [PATCH 16/45] blkcg: use blkg_free() in blkcg_init_queue() failure path When blkcg_init_queue() fails midway after creating a new blkg, it performs kfree() directly; however, this doesn't free the policy data areas. Make it use blkg_free() instead. In turn, blkg_free() is updated to handle root request_list special case. While this fixes a possible memory leak, it's on an unlikely failure path of an already cold path and the size leaked per occurrence is miniscule too. I don't think it needs to be tagged for -stable. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1db904f95502..cd251830038a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -70,7 +70,8 @@ static void blkg_free(struct blkcg_gq *blkg) for (i = 0; i < BLKCG_MAX_POLS; i++) kfree(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); kfree(blkg); } @@ -938,7 +939,7 @@ int blkcg_init_queue(struct request_queue *q) radix_tree_preload_end(); if (IS_ERR(blkg)) { - kfree(new_blkg); + blkg_free(new_blkg); return PTR_ERR(blkg); } From bc915e61cde25d0b429f536cec9e83039bf23504 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:08 -0700 Subject: [PATCH 17/45] blkcg: remove unnecessary blkcg_root handling from css_alloc/free paths blkcg_css_alloc() bypasses policy data allocation and blkcg_css_free() bypasses policy data and blkcg freeing for blkcg_root. There's no reason to to treat policy data any differently for blkcg_root. If the root css gets allocated after policies are registered, policy registration path will add policy data; otherwise, the alloc path will. The free path isn't never invoked for root csses. This patch removes the unnecessary special handling of blkcg_root from css_alloc/free paths. 
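In shorthand, the alloc path this converges on looks like the following (a sketch only, not the verbatim kernel code; the full diff is below). The root css reuses the static blkcg_root, everything else is kzalloc'd, and from that point on the per-policy setup no longer cares which case it is:

    if (!parent_css)
        blkcg = &blkcg_root;    /* static, never freed */
    else
        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
    /* ... per-policy data setup below runs identically for both ... */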
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index cd251830038a..f91a4e09e0c9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -823,18 +823,15 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); mutex_unlock(&blkcg_pol_mutex); - if (blkcg != &blkcg_root) { - int i; - - for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); - kfree(blkcg); - } + for (i = 0; i < BLKCG_MAX_POLS; i++) + kfree(blkcg->pd[i]); + kfree(blkcg); } static struct cgroup_subsys_state * @@ -848,13 +845,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!parent_css) { blkcg = &blkcg_root; - goto done; - } - - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); - goto free_blkcg; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } for (i = 0; i < BLKCG_MAX_POLS ; i++) { @@ -881,7 +877,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) pol->cpd_init_fn(blkcg); } -done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); From 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:09 -0700 Subject: [PATCH 18/45] blkcg: restructure blkg_policy_data allocation in blkcg_activate_policy() When a policy gets activated, it needs to allocate and install its policy data on all existing blkg's (blkcg_gq's). Because blkg iteration is protected by a spinlock, it currently counts the total number of blkg's in the system, allocates the matching number of policy data on a list and installs them during a single iteration. This can be simplified by using speculative GFP_NOWAIT allocations while iterating and falling back to a preallocated policy data on failure. If the preallocated one has already been consumed, it releases the lock, preallocate with GFP_KERNEL and then restarts the iteration. This can be a bit more expensive than before but policy activation is a very cold path and shouldn't matter. 
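Schematically, the allocation strategy reduces to the shape below. This is a hedged sketch, not kernel code: struct entry, struct item, alloc_item() and the walked list are placeholders standing in for blkg's and their policy data; only the control flow matters. The key property is that the preallocated object is consumed at most once per lock drop, so the loop always makes forward progress even if GFP_NOWAIT keeps failing.

    static int install_on_all_entries(struct list_head *head, spinlock_t *lock)
    {
        struct item *prealloc = NULL;
        struct entry *pos;
    retry:
        if (!prealloc) {
            prealloc = alloc_item(GFP_KERNEL);      /* may sleep */
            if (!prealloc)
                return -ENOMEM;
        }

        spin_lock_irq(lock);
        list_for_each_entry(pos, head, node) {
            struct item *it;

            if (pos->item)                          /* already populated */
                continue;

            it = alloc_item(GFP_NOWAIT);            /* opportunistic, atomic */
            if (!it)
                swap(it, prealloc);                 /* fall back to the spare */
            if (!it) {
                /* spare spent: drop the lock, refill, restart the walk */
                spin_unlock_irq(lock);
                goto retry;
            }
            pos->item = it;
        }
        spin_unlock_irq(lock);

        kfree(prealloc);                            /* NULL or the unused spare */
        return 0;
    }

The restart is safe because already-populated entries are skipped on the next pass, so at most one GFP_KERNEL allocation happens per lock drop.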
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 55 +++++++++++++++----------------------- include/linux/blk-cgroup.h | 3 --- 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f91a4e09e0c9..9e9b0df339ee 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1047,65 +1047,52 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); + struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg; - struct blkg_policy_data *pd, *nd; - int cnt = 0, ret; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* count and allocate policy_data for all existing blkgs */ blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - spin_unlock_irq(q->queue_lock); - - /* allocate per-blkg policy data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds and cpds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, nd, &pds, alloc_node) - kfree(pd); + kfree(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9711fc277c02..db822880242a 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -80,9 +80,6 @@ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ struct blkcg_gq *blkg; int plid; - - /* used during policy activation */ - struct list_head alloc_node; }; /* From 3e41871046bfe0ba7d122a1f14f0c1db2dca0256 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:10 -0700 Subject: [PATCH 19/45] blkcg: make blkcg_activate_policy() allow NULL ->pd_init_fn blkg_create() allows NULL ->pd_init_fn() but blkcg_activate_policy() doesn't. As both in-kernel policies implement ->pd_init_fn, it currently doesn't break anything. Update blkcg_activate_policy() so that its behavior is consistent with blkg_create(). 
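The whole fix amounts to the optional-callback guard already used on the blkg_create() side (two-line sketch); policies that need no per-blkg init can then simply leave ->pd_init_fn NULL:

    if (pol->pd_init_fn)        /* may legitimately be NULL */
        pol->pd_init_fn(blkg);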
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9e9b0df339ee..4defbbabc0ff 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1083,7 +1083,8 @@ pd_prealloc: blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); + if (pol->pd_init_fn) + pol->pd_init_fn(blkg); } __set_bit(pol->plid, q->blkcg_pols); From 001bea73e70efdf48a9e00188cf302f6b6aed2bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:11 -0700 Subject: [PATCH 20/45] blkcg: replace blkcg_policy->pd_size with ->pd_alloc/free_fn() methods A blkg (blkcg_gq) represents the relationship between a cgroup and request_queue. Each active policy has a pd (blkg_policy_data) on each blkg. The pd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->pd_size larger than the size of pd. This is a bit unusual but was done this way mostly to simplify error handling and all the existing use cases could be handled this way; however, this is becoming too restrictive now that percpu memory can be allocated without blocking. This introduces two new mandatory blkcg_policy methods - pd_alloc_fn() and pd_free_fn() - which are used to allocate and release pd for a given policy. As pd allocation is now done from policy side, it can simply allocate a larger area which embeds pd at the beginning. This change makes ->pd_size pointless. Removed. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 21 +++++++++++---------- block/blk-throttle.c | 13 ++++++++++++- block/cfq-iosched.c | 13 ++++++++++++- include/linux/blk-cgroup.h | 18 +++++++++--------- 4 files changed, 44 insertions(+), 21 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4defbbabc0ff..d1bc6099bd1e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -68,7 +68,8 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->blkcg != &blkcg_root) blk_exit_rl(&blkg->rl); @@ -114,7 +115,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -1057,7 +1058,7 @@ int blkcg_activate_policy(struct request_queue *q, blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { - pd_prealloc = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); if (!pd_prealloc) { ret = -ENOMEM; goto out_bypass_end; @@ -1072,7 +1073,7 @@ pd_prealloc: if (blkg->pd[pol->plid]) continue; - pd = kzalloc_node(pol->pd_size, GFP_NOWAIT, q->node); + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); if (!pd) swap(pd, pd_prealloc); if (!pd) { @@ -1093,7 +1094,8 @@ pd_prealloc: spin_unlock_irq(q->queue_lock); out_bypass_end: blk_queue_bypass_end(q); - kfree(pd_prealloc); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1128,8 +1130,10 @@ void blkcg_deactivate_policy(struct request_queue *q, if (pol->pd_exit_fn) pol->pd_exit_fn(blkg); - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + 
pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1151,9 +1155,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b23193518ac7..f1dd691c5359 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -403,6 +403,11 @@ static void throtl_service_queue_exit(struct throtl_service_queue *sq) del_timer_sync(&sq->pending_timer); } +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) +{ + return kzalloc_node(sizeof(struct throtl_grp), gfp, node); +} + static void throtl_pd_init(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); @@ -493,6 +498,11 @@ static void throtl_pd_exit(struct blkcg_gq *blkg) throtl_service_queue_exit(&tg->service_queue); } +static void throtl_pd_free(struct blkg_policy_data *pd) +{ + kfree(pd); +} + static void throtl_pd_reset_stats(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); @@ -1468,12 +1478,13 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), .cftypes = throtl_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_exit_fn = throtl_pd_exit, + .pd_free_fn = throtl_pd_free, .pd_reset_stats_fn = throtl_pd_reset_stats, }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 9c9ec7cc9f99..69ce2883099e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1582,6 +1582,11 @@ static void cfq_cpd_init(const struct blkcg *blkcg) } } +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) +{ + return kzalloc_node(sizeof(struct cfq_group), gfp, node); +} + static void cfq_pd_init(struct blkcg_gq *blkg) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1618,6 +1623,11 @@ static void cfq_pd_offline(struct blkcg_gq *blkg) cfqg_stats_xfer_dead(cfqg); } +static void cfq_pd_free(struct blkg_policy_data *pd) +{ + return kfree(pd); +} + /* offset delta from cfqg->stats to cfqg->dead_stats */ static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - offsetof(struct cfq_group, stats); @@ -4633,13 +4643,14 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, .cpd_init_fn = cfq_cpd_init, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db822880242a..bd173ea360ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -68,13 +68,11 @@ struct blkg_rwstat { * request_queue (q). This is used by blkcg policies which need to track * information per blkcg - q pair. * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. 
- * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. */ struct blkg_policy_data { /* the blkg and policy id this per-policy data belongs to */ @@ -126,16 +124,16 @@ struct blkcg_gq { }; typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); struct blkcg_policy { int plid; - /* policy specific private data size */ - size_t pd_size; /* policy specific per-blkcg data size */ size_t cpd_size; /* cgroup files for the policy */ @@ -143,10 +141,12 @@ struct blkcg_policy { /* operations */ blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; blkcg_pol_exit_pd_fn *pd_exit_fn; + blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; }; From 4fb72036fbf9c28de7a64b1d3f19b4ce9da1c6bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:12 -0700 Subject: [PATCH 21/45] blk-throttle: remove asynchrnous percpu stats allocation mechanism Because percpu allocator couldn't do non-blocking allocations, blk-throttle was forced to implement an ad-hoc asynchronous allocation mechanism for its percpu stats for cases where blkg's (blkcg_gq's) are allocated from an IO path without sleepable context. Now that percpu allocator can handle gfp_mask and blkg_policy_data alloc / free are handled by policy methods, the ad-hoc asynchronous allocation mechanism can be replaced with direct allocation from tg_stats_alloc_fn(). Rit it out. This ensures that an active throtl_grp always has valid non-NULL ->stats_cpu. Remove checks on it. 
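To illustrate why the ad-hoc worker can go away: with a gfp-aware percpu allocator, the stats can be set up inline from the pd allocation path. A sketch follows; alloc_tg_stats() is a made-up helper name, but alloc_percpu_gfp() is the real interface that makes this possible (the actual throtl_pd_alloc() conversion is in the diff below).

    static struct tg_stats_cpu __percpu *alloc_tg_stats(gfp_t gfp)
    {
        struct tg_stats_cpu __percpu *stats;
        int cpu;

        stats = alloc_percpu_gfp(struct tg_stats_cpu, gfp);
        if (!stats)
            return NULL;            /* ->pd_alloc_fn() simply fails */

        for_each_possible_cpu(cpu) {
            struct tg_stats_cpu *sc = per_cpu_ptr(stats, cpu);

            blkg_rwstat_init(&sc->service_bytes);
            blkg_rwstat_init(&sc->serviced);
        }
        return stats;
    }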
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-throttle.c | 112 ++++++++++--------------------------------- 1 file changed, 25 insertions(+), 87 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index f1dd691c5359..3c869768cfdd 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -144,9 +144,6 @@ struct throtl_grp { /* Per cpu stats pointer */ struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +165,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -256,53 +246,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -405,7 +348,27 @@ static void throtl_service_queue_exit(struct throtl_service_queue *sq) static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - return kzalloc_node(sizeof(struct throtl_grp), gfp, node); + struct throtl_grp *tg; + int cpu; + + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; + + tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp); + if (!tg->stats_cpu) { + kfree(tg); + return NULL; + } + + for_each_possible_cpu(cpu) { + struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); + + blkg_rwstat_init(&stats_cpu->service_bytes); + blkg_rwstat_init(&stats_cpu->serviced); + } + + return &tg->pd; } static void throtl_pd_init(struct blkcg_gq *blkg) @@ -413,7 +376,6 @@ static void throtl_pd_init(struct blkcg_gq *blkg) struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *parent_sq; - unsigned long flags; int rw; /* @@ -448,16 +410,6 @@ static void throtl_pd_init(struct blkcg_gq *blkg) tg->bps[WRITE] = -1; 
tg->iops[READ] = -1; tg->iops[WRITE] = -1; - - /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. - */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } /* @@ -487,20 +439,16 @@ static void throtl_pd_online(struct blkcg_gq *blkg) static void throtl_pd_exit(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); throtl_service_queue_exit(&tg->service_queue); } static void throtl_pd_free(struct blkg_policy_data *pd) { - kfree(pd); + struct throtl_grp *tg = pd_to_tg(pd); + + free_percpu(tg->stats_cpu); + kfree(tg); } static void throtl_pd_reset_stats(struct blkcg_gq *blkg) @@ -508,9 +456,6 @@ static void throtl_pd_reset_stats(struct blkcg_gq *blkg) struct throtl_grp *tg = blkg_to_tg(blkg); int cpu; - if (tg->stats_cpu == NULL) - return; - for_each_possible_cpu(cpu) { struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); @@ -973,10 +918,6 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, struct tg_stats_cpu *stats_cpu; unsigned long flags; - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - /* * Disabling interrupts to provide mutual exclusion between two * writes on same cpu. It probably is not needed for 64bit. Not @@ -1302,9 +1243,6 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, struct blkg_rwstat rwstat = { }, tmp; int i, cpu; - if (tg->stats_cpu == NULL) - return 0; - for_each_possible_cpu(cpu) { struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); From b2ce2643cc705aa9043642d7b6248ccfd8e20629 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:13 -0700 Subject: [PATCH 22/45] blk-throttle: clean up blkg_policy_data alloc/init/exit/free methods With the recent addition of alloc and free methods, things became messier. This patch reorganizes them according to the followings. * ->pd_alloc_fn() Responsible for allocation and static initializations - the ones which can be done independent of where the pd might be attached. * ->pd_init_fn() Initializations which require the knowledge of where the pd is attached. * ->pd_free_fn() The counter part of pd_alloc_fn(). Static de-init and freeing. This leaves ->pd_exit_fn() without any users. Removed. While at it, collapse an one liner function throtl_pd_exit(), which has only one user, into its user. 
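The division of labor, as a comment-only skeleton (illustrative; "example" is not a real policy, and the concrete throttle and cfq versions are in the diff below):

    static struct blkg_policy_data *example_pd_alloc(gfp_t gfp, int node)
    {
        /*
         * Allocation plus placement-independent setup only: zeroed
         * memory, list heads, RB nodes, timers, default limits,
         * percpu areas.  Nothing here may depend on the blkg.
         */
        return NULL;    /* real code returns &new_object->pd */
    }

    static void example_pd_init(struct blkg_policy_data *pd)
    {
        /*
         * Placement-dependent setup: parent service queue pointers,
         * weights inherited from the blkcg -- anything that needs to
         * know where in the hierarchy this pd got attached.
         */
    }

    static void example_pd_free(struct blkg_policy_data *pd)
    {
        /*
         * Exact mirror of pd_alloc: stop timers, free percpu areas,
         * kfree the embedding structure.  No separate ->pd_exit_fn().
         */
    }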
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 11 -------- block/blk-throttle.c | 57 +++++++++++++------------------------- block/cfq-iosched.c | 15 +++++++--- include/linux/blk-cgroup.h | 2 -- 4 files changed, 31 insertions(+), 54 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d1bc6099bd1e..acfb09af58a5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -402,15 +402,6 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); @@ -1127,8 +1118,6 @@ void blkcg_deactivate_policy(struct request_queue *q, if (pol->pd_offline_fn) pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); if (blkg->pd[pol->plid]) { pol->pd_free_fn(blkg->pd[pol->plid]); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 3c869768cfdd..c3a235b8ec7e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -330,26 +330,19 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) -{ - del_timer_sync(&sq->pending_timer); -} - static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { struct throtl_grp *tg; - int cpu; + int rw, cpu; tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) @@ -361,6 +354,19 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) return NULL; } + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + for_each_possible_cpu(cpu) { struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); @@ -375,8 +381,7 @@ static void throtl_pd_init(struct blkcg_gq *blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -391,25 +396,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. 
*/ - parent_sq = &td->service_queue; - + sq->parent_sq = &td->service_queue; if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; } /* @@ -436,17 +426,11 @@ static void throtl_pd_online(struct blkcg_gq *blkg) tg_update_has_rules(blkg_to_tg(blkg)); } -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - - throtl_service_queue_exit(&tg->service_queue); -} - static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); + del_timer_sync(&tg->service_queue.pending_timer); free_percpu(tg->stats_cpu); kfree(tg); } @@ -1421,7 +1405,6 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, .pd_free_fn = throtl_pd_free, .pd_reset_stats_fn = throtl_pd_reset_stats, }; @@ -1616,7 +1599,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 69ce2883099e..4b795c7250ef 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1584,7 +1584,17 @@ static void cfq_cpd_init(const struct blkcg *blkcg) static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { - return kzalloc_node(sizeof(struct cfq_group), gfp, node); + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; + + cfq_init_cfqg_base(cfqg); + cfqg_stats_init(&cfqg->stats); + cfqg_stats_init(&cfqg->dead_stats); + + return &cfqg->pd; } static void cfq_pd_init(struct blkcg_gq *blkg) @@ -1592,11 +1602,8 @@ static void cfq_pd_init(struct blkcg_gq *blkg) struct cfq_group *cfqg = blkg_to_cfqg(blkg); struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); - cfq_init_cfqg_base(cfqg); cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); } static void cfq_pd_offline(struct blkcg_gq *blkg) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bd173ea360ce..9879469b1b38 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -128,7 +128,6 @@ typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); @@ -145,7 +144,6 @@ struct blkcg_policy { blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; blkcg_pol_free_pd_fn *pd_free_fn; blkcg_pol_reset_pd_stats_fn 
*pd_reset_stats_fn; }; From a9520cd6f2ac1fbbf206b915946534c6dddbaae2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:14 -0700 Subject: [PATCH 23/45] blkcg: make blkcg_policy methods take a pointer to blkcg_policy_data The newly added ->pd_alloc_fn() and ->pd_free_fn() deal with pd (blkg_policy_data) while the older ones use blkg (blkcg_gq). As using blkg doesn't make sense for ->pd_alloc_fn() and after allocation pd can always be mapped to blkg and given that these are policy-specific methods, it makes sense to converge on pd. This patch makes all methods deal with pd instead of blkg. Most conversions are trivial. In blk-cgroup.c, a couple method invocation sites now test whether pd exists instead of policy state for consistency. This shouldn't cause any behavioral differences. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 18 ++++++++---------- block/blk-throttle.c | 13 +++++++------ block/cfq-iosched.c | 14 +++++++------- include/linux/blk-cgroup.h | 8 ++++---- 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index acfb09af58a5..8343450cffe2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -242,7 +242,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -256,7 +256,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; @@ -347,7 +347,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); } blkg->online = false; @@ -468,9 +468,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -1076,7 +1075,7 @@ pd_prealloc: pd->blkg = blkg; pd->plid = pol->plid; if (pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); @@ -1116,10 +1115,9 @@ void blkcg_deactivate_policy(struct request_queue *q, /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c3a235b8ec7e..c2c75477a6b2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -377,9 +377,10 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; struct throtl_service_queue *sq = &tg->service_queue; @@ -417,13 +418,13 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 
|| tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); + tg_update_has_rules(pd_to_tg(pd)); } static void throtl_pd_free(struct blkg_policy_data *pd) @@ -435,9 +436,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd) kfree(tg); } -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) +static void throtl_pd_reset_stats(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); int cpu; for_each_possible_cpu(cpu) { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4b795c7250ef..95e6b0cbaa84 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1597,18 +1597,18 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) return &cfqg->pd; } -static void cfq_pd_init(struct blkcg_gq *blkg) +static void cfq_pd_init(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg); + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); cfqg->weight = cgd->weight; cfqg->leaf_weight = cgd->leaf_weight; } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); int i; for (i = 0; i < IOPRIO_BE_NR; i++) { @@ -1661,9 +1661,9 @@ static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data * return a; } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); cfqg_stats_reset(&cfqg->dead_stats); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9879469b1b38..ddd4b8b252c7 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -125,11 +125,11 @@ struct blkcg_gq { typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; From 814376483e7d85b69a70634633f1f9d01c6ee0cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:15 -0700 Subject: [PATCH 24/45] blkcg: minor updates around blkcg_policy_data * Rename blkcg->pd[] to blkcg->cpd[] so that cpd is consistently used for blkcg_policy_data. * Make blkcg_policy->cpd_init_fn() take blkcg_policy_data instead of blkcg. This makes it consistent with blkg_policy_data methods and to-be-added cpd alloc/free methods. 
* blkcg_policy_data->blkcg and cpd_to_blkcg() added so that cpd_init_fn() can determine the associated blkcg from blkcg_policy_data. v2: blkcg_policy_data->blkcg initializations were missing. Added. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 24 +++++++++++++----------- block/cfq-iosched.c | 11 +++++------ include/linux/blk-cgroup.h | 14 ++++++++++---- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8343450cffe2..247c42c8c83b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -821,7 +821,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->pd[i]); + kfree(blkcg->cpd[i]); kfree(blkcg); } @@ -857,15 +857,16 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) if (!pol || !pol->cpd_size) continue; - BUG_ON(blkcg->pd[i]); + BUG_ON(blkcg->cpd[i]); cpd = kzalloc(pol->cpd_size, GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } - blkcg->pd[i] = cpd; + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -881,7 +882,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->pd[i]); + kfree(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -1168,9 +1169,10 @@ int blkcg_policy_register(struct blkcg_policy *pol) goto err_free_cpds; } - blkcg->pd[pol->plid] = cpd; + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; cpd->plid = pol->plid; - pol->cpd_init_fn(blkcg); + pol->cpd_init_fn(cpd); } } @@ -1186,8 +1188,8 @@ int blkcg_policy_register(struct blkcg_policy *pol) err_free_cpds: if (pol->cpd_size) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + kfree(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; } } blkcg_policy[pol->plid] = NULL; @@ -1222,8 +1224,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) if (pol->cpd_size) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->pd[pol->plid]); - blkcg->pd[pol->plid] = NULL; + kfree(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; } } blkcg_policy[pol->plid] = NULL; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 95e6b0cbaa84..dd6ea9ee6245 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -220,7 +220,7 @@ struct cfqg_stats { /* Per-cgroup data */ struct cfq_group_data { /* must be the first member */ - struct blkcg_policy_data pd; + struct blkcg_policy_data cpd; unsigned int weight; unsigned int leaf_weight; @@ -612,7 +612,7 @@ static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) static struct cfq_group_data *cpd_to_cfqgd(struct blkcg_policy_data *cpd) { - return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL; + return cpd ? 
container_of(cpd, struct cfq_group_data, cpd) : NULL; } static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) @@ -1568,12 +1568,11 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } -static void cfq_cpd_init(const struct blkcg *blkcg) +static void cfq_cpd_init(struct blkcg_policy_data *cpd) { - struct cfq_group_data *cgd = - cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]); + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); - if (blkcg == &blkcg_root) { + if (cpd_to_blkcg(cpd) == &blkcg_root) { cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; } else { diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index ddd4b8b252c7..7988d4749fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -45,7 +45,7 @@ struct blkcg { struct blkcg_gq *blkg_hint; struct hlist_head blkg_list; - struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK @@ -88,7 +88,8 @@ struct blkg_policy_data { * each policy handle per-blkcg data. */ struct blkcg_policy_data { - /* the policy id this per-policy data belongs to */ + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; int plid; }; @@ -123,7 +124,7 @@ struct blkcg_gq { struct rcu_head rcu_head; }; -typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -243,7 +244,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, struct blkcg_policy *pol) { - return blkcg ? blkcg->pd[pol->plid] : NULL; + return blkcg ? blkcg->cpd[pol->plid] : NULL; } /** @@ -257,6 +258,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) return pd ? pd->blkg : NULL; } +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? cpd->blkcg : NULL; +} + /** * blkg_path - format cgroup path of blkg * @blkg: blkg of interest From e4a9bde9589fdc51283755cdd75d47b27ca7c6fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:16 -0700 Subject: [PATCH 25/45] blkcg: replace blkcg_policy->cpd_size with ->cpd_alloc/free_fn() methods Each active policy has a cpd (blkcg_policy_data) on each blkcg. The cpd's were allocated by blkcg core and each policy could request to allocate extra space at the end by setting blkcg_policy->cpd_size larger than the size of cpd. This is a bit unusual but blkg (blkcg_gq) policy data used to be handled this way too so it made sense to be consistent; however, blkg policy data switched to alloc/free callbacks. This patch makes similar changes to cpd handling. blkcg_policy->cpd_alloc/free_fn() are added to replace ->cpd_size. As cpd allocation is now done from policy side, it can simply allocate a larger area which embeds cpd at the beginning. As ->cpd_alloc_fn() may be able to perform all necessary initializations, this patch makes ->cpd_init_fn() optional. 
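What a policy provides instead of cpd_size, sketched with a made-up example_cpd (the cfq conversion below is the real instance): embed blkcg_policy_data as the first member and hand blkcg core a pointer to that member. The gfp handling here is illustrative; an individual policy may choose to ignore it.

    struct example_cpd {
        struct blkcg_policy_data cpd;   /* must come first */
        unsigned int some_per_blkcg_knob;
    };

    static struct blkcg_policy_data *example_cpd_alloc(gfp_t gfp)
    {
        struct example_cpd *e;

        e = kzalloc(sizeof(*e), gfp);
        if (!e)
            return NULL;
        e->some_per_blkcg_knob = 42;    /* static defaults go here */
        return &e->cpd;
    }

    static void example_cpd_free(struct blkcg_policy_data *cpd)
    {
        kfree(container_of(cpd, struct example_cpd, cpd));
    }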
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 39 +++++++++++++++++++++++--------------- block/cfq-iosched.c | 19 ++++++++++++++++++- include/linux/blk-cgroup.h | 17 ++++++++++------- 3 files changed, 52 insertions(+), 23 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 247c42c8c83b..2b4354b6b5de 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -817,11 +817,15 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) int i; mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); - mutex_unlock(&blkcg_pol_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkcg->cpd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); + + mutex_unlock(&blkcg_pol_mutex); + kfree(blkcg); } @@ -854,11 +858,10 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. */ - if (!pol || !pol->cpd_size) + if (!pol || !pol->cpd_alloc_fn) continue; - BUG_ON(blkcg->cpd[i]); - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; @@ -866,7 +869,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; - pol->cpd_init_fn(cpd); + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -882,7 +886,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) free_pd_blkcg: for (i--; i >= 0; i--) - kfree(blkcg->cpd[i]); + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); free_blkcg: kfree(blkcg); mutex_unlock(&blkcg_pol_mutex); @@ -1159,11 +1164,11 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; - cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { mutex_unlock(&blkcg_pol_mutex); goto err_free_cpds; @@ -1186,10 +1191,12 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; @@ -1222,10 +1229,12 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_size) { + if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - kfree(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } } } blkcg_policy[pol->plid] = NULL; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index dd6ea9ee6245..a4429b366820 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1568,6 +1568,16 @@ static void cfqg_stats_init(struct cfqg_stats *stats) #endif } +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) +{ + struct cfq_group_data *cgd; + + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + static void cfq_cpd_init(struct blkcg_policy_data *cpd) { struct cfq_group_data 
*cgd = cpd_to_cfqgd(cpd); @@ -1581,6 +1591,11 @@ static void cfq_cpd_init(struct blkcg_policy_data *cpd) } } +static void cfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_cfqgd(cpd)); +} + static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { struct cfq_group *cfqg; @@ -4649,10 +4664,12 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .cpd_size = sizeof(struct cfq_group_data), .cftypes = cfq_blkcg_files, + .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7988d4749fff..15f2382bc723 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -81,11 +81,11 @@ struct blkg_policy_data { }; /* - * Policies that need to keep per-blkcg data which is independent - * from any request_queue associated to it must specify its size - * with the cpd_size field of the blkcg_policy structure and - * embed a blkcg_policy_data in it. cpd_init() is invoked to let - * each policy handle per-blkcg data. + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the blkcg and policy id this per-policy data belongs to */ @@ -124,7 +124,9 @@ struct blkcg_gq { struct rcu_head rcu_head; }; +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -134,13 +136,14 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; - /* policy specific per-blkcg data size */ - size_t cpd_size; /* cgroup files for the policy */ struct cftype *cftypes; /* operations */ + blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn; blkcg_pol_online_pd_fn *pd_online_fn; From 24f290466f79a6497f1654f64b9a841872cba3ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:17 -0700 Subject: [PATCH 26/45] blkcg: inline [__]blkg_lookup() blkg_lookup() checks whether the target queue is bypassing and, if not, calls __blkg_lookup() which first checks the lookup hint and then performs radix tree walk. The operations upto hint checking are trivial and there are many users of this function. This patch inlines blkg_lookup() and the fast path part of __blkg_lookup(). The radix tree lookup and hint update are now in blkg_lookup_slowpath(). This will help consolidating blkg handling by easing moving root blkcg short-circuit to inlined lookup fast path. 
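The resulting split, in shorthand (a sketch; lookup_fast() is a placeholder name and the real inline versions are in the header diff below): the hint check is cheap enough to inline at every call site, and only a hint miss pays for the radix tree walk in the slowpath.

    static inline struct blkcg_gq *lookup_fast(struct blkcg *blkcg,
                                               struct request_queue *q)
    {
        struct blkcg_gq *blkg = rcu_dereference(blkcg->blkg_hint);

        if (blkg && blkg->q == q)
            return blkg;                            /* hint hit, no tree walk */
        return blkg_lookup_slowpath(blkcg, q, false);   /* radix walk + hint update */
    }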
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 38 ++--------------------------- include/linux/blk-cgroup.h | 49 ++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 40 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2b4354b6b5de..52e637ed9d7e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -131,26 +131,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -169,25 +154,6 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); - /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 15f2382bc723..d5b54aa50582 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -155,7 +155,8 @@ struct blkcg_policy { extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q); int blkcg_init_queue(struct request_queue *q); @@ -231,6 +232,49 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) return css_to_blkcg(blkcg->css.parent); } +/** + * __blkg_lookup - internal version of blkg_lookup() + * @blkcg: blkcg of interest + * @q: request_queue of interest + * @update_hint: whether to update lookup hint with the result or not + * + * This is internal version and shouldn't be used by policy + * implementations. Looks up blkgs for the @blkcg - @q pair regardless of + * @q's bypass state. If @update_hint is %true, the caller should be + * holding @q->queue_lock and lookup hint is updated on success. 
+ */ +static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, + struct request_queue *q, + bool update_hint) +{ + struct blkcg_gq *blkg; + + blkg = rcu_dereference(blkcg->blkg_hint); + if (blkg && blkg->q == q) + return blkg; + + return blkg_lookup_slowpath(blkcg, q, update_hint); +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. This function should be called + * under RCU read lock and is guaranteed to return %NULL if @q is bypassing + * - see blk_queue_bypass_start() for details. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (unlikely(blk_queue_bypass(q))) + return NULL; + return __blkg_lookup(blkcg, q, false); +} + /** * blkg_to_pdata - get policy private data * @blkg: blkg of interest @@ -313,9 +357,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) call_rcu(&blkg->rcu_head, __blkg_release_rcu); } -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant From 85b6bc9db6d5ab6980b43c38b5cbd11d24414ce4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:18 -0700 Subject: [PATCH 27/45] blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup() Currently, both throttle and cfq policies implement their own root blkg (blkcg_gq) lookup fast path. This patch moves root blkg optimization from throtl_lookup_tg() to __blkg_lookup(). cfq-iosched currently doesn't use blkg_lookup() but will be converted and drop the optimization too. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 ------- include/linux/blk-cgroup.h | 3 +++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c2c75477a6b2..1f63fc834dc3 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -452,13 +452,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd) static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkcg *blkcg) { - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index d5b54aa50582..0609bce69f68 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -249,6 +249,9 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, { struct blkcg_gq *blkg; + if (blkcg == &blkcg_root) + return q->root_blkg; + blkg = rcu_dereference(blkcg->blkg_hint); if (blkg && blkg->q == q) return blkg; From c9589f03e490956628ff91a1da133216dc796b63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:19 -0700 Subject: [PATCH 28/45] blk-throttle: improve queue bypass handling If a queue is bypassing, all blkcg policies should become noops but blk-throttle wasn't. It only became noop if the queue was dying. While this wouldn't lead to an oops as falling back to the root blkg is safe in this case, this can be a bit surprising - a bypassing queue could still be applying throttle limits. 
Fix it by removing blk_queue_dying() test in throtl_lookup_create_tg() and testing blk_queue_bypass() in blk_throtl_bio() and bypassing before doing anything else. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-throttle.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1f63fc834dc3..900a777e01c2 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -475,7 +475,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, /* if %NULL and @q is alive, fall back to root_tg */ if (!IS_ERR(blkg)) tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) + else tg = td_root_tg(td); } @@ -1438,10 +1438,11 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) * IO group */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; + tg = throtl_lookup_create_tg(td, blkcg); sq = &tg->service_queue; while (true) { From ae11889636111199dbcf47283b4167f578b69472 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:20 -0700 Subject: [PATCH 29/45] blkcg: consolidate blkg creation in blkcg_bio_issue_check() blkg (blkcg_gq) currently is created by blkcg policies invoking blkg_lookup_create() which ends up repeating about the same code in different policies. Theoretically, this can avoid the overhead of looking and/or creating blkg's if blkcg is enabled but no policy is in use; however, the cost of blkg lookup / creation is very low especially if only the root blkcg is in use which is highly likely if no blkcg policy is in active use - it boils down to a single very predictable conditional and surrounding RCU protection. This patch consolidates blkg creation to a new function blkcg_bio_issue_check() which is called during bio issue from generic_make_request_checks(). blkcg_bio_issue_check() is now the only function which tries to create missing blkg's. The subsequent policy and request_list operations just perform blkg_lookup() and if missing falls back to the root. * blk_get_rl() no longer tries to create blkg. It uses blkg_lookup() instead of blkg_lookup_create(). * blk_throtl_bio() is now called from blkcg_bio_issue_check() with rcu read locked and blkg already looked up. Both throtl_lookup_tg() and throtl_lookup_create_tg() are dropped. * cfq is similarly updated. cfq_lookup_create_cfqg() is replaced with cfq_lookup_cfqg()which uses blkg_lookup(). This consolidates blkg handling and avoids unnecessary blkg creation retries under memory pressure. In addition, this provides a common bio entry point into blkcg where things like common accounting can be performed. v2: Build fixes for !CONFIG_CFQ_GROUP_IOSCHED and !CONFIG_BLK_DEV_THROTTLING. 
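For reference, the consolidated entry point reduces to the following shape (condensed from the include/linux/blk-cgroup.h hunk further down; the !CONFIG_BLK_CGROUP stub and the stat accounting added later in the series are omitted):

    static inline bool blkcg_bio_issue_check(struct request_queue *q,
                                             struct bio *bio)
    {
        struct blkcg *blkcg;
        struct blkcg_gq *blkg;
        bool throtl = false;

        rcu_read_lock();
        blkcg = bio_blkcg(bio);

        /* fast path: the blkg for this blkcg - q pair already exists */
        blkg = blkg_lookup(blkcg, q);
        if (unlikely(!blkg)) {
            /* slow path: create it under the queue lock */
            spin_lock_irq(q->queue_lock);
            blkg = blkg_lookup_create(blkcg, q);
            if (IS_ERR(blkg))
                blkg = NULL;    /* policies fall back to the root blkg */
            spin_unlock_irq(q->queue_lock);
        }

        /* blk-throttle now runs with RCU held and the blkg already looked up */
        throtl = blk_throtl_bio(q, blkg, bio);

        rcu_read_unlock();
        return !throtl;
    }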
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 4 +-- block/blk-throttle.c | 72 ++++---------------------------------- block/blk.h | 5 --- block/cfq-iosched.c | 33 ++++++----------- include/linux/blk-cgroup.h | 40 +++++++++++++++++++-- 6 files changed, 57 insertions(+), 99 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 52e637ed9d7e..097c4a670fa4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -153,6 +153,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, return NULL; } +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as @@ -295,7 +296,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { diff --git a/block/blk-core.c b/block/blk-core.c index 627ed0c593fb..140094f4eaca 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1889,8 +1889,8 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 900a777e01c2..29c22ed4b073 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -182,11 +182,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -449,39 +444,6 @@ static void throtl_pd_reset_stats(struct blkg_policy_data *pd) } } -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); -} - -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else - tg = td_root_tg(td); - } - - return tg; -} - static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { @@ -1403,46 +1365,26 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_reset_stats_fn = throtl_pd_reset_stats, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. 
If a group has no rules, - * just update the dispatch stats in lockless manner and return. - */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_bypass(q))) goto out_unlock; - tg = throtl_lookup_create_tg(td, blkcg); sq = &tg->service_queue; while (true) { @@ -1507,8 +1449,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we diff --git a/block/blk.h b/block/blk.h index 026d9594142b..d905b26fbff5 100644 --- a/block/blk.h +++ b/block/blk.h @@ -266,15 +266,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a4429b366820..0994f3b523a8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1683,28 +1683,15 @@ static void cfq_pd_reset_stats(struct blkg_policy_data *pd) cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. 
- */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; + struct blkcg_gq *blkg; - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } - - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) @@ -2108,8 +2095,8 @@ static struct cftype cfq_blkcg_files[] = { { } /* terminate */ }; #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -3716,7 +3703,7 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, struct cfq_group *cfqg; rcu_read_lock(); - cfqg = cfq_lookup_create_cfqg(cfqd, bio_blkcg(bio)); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); if (!cfqg) { cfqq = &cfqd->oom_cfqq; goto out; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 0609bce69f68..4d1659c7f84b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -421,8 +421,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, * or if either the blkcg or queue is going away. Fall back to * root_rl in such cases. */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) goto root_rl; blkg_get(blkg); @@ -636,6 +636,39 @@ static inline void blkg_rwstat_merge(struct blkg_rwstat *to, u64_stats_update_end(&to->syncp); } +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio); +#else +static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { return false; } +#endif + +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool throtl = false; + + rcu_read_lock(); + blkcg = bio_blkcg(bio); + + blkg = blkg_lookup(blkcg, q); + if (unlikely(!blkg)) { + spin_lock_irq(q->queue_lock); + blkg = blkg_lookup_create(blkcg, q); + if (IS_ERR(blkg)) + blkg = NULL; + spin_unlock_irq(q->queue_lock); + } + + throtl = blk_throtl_bio(q, blkg, bio); + + rcu_read_unlock(); + return !throtl; +} + #else /* CONFIG_BLK_CGROUP */ struct blkcg { @@ -689,6 +722,9 @@ static inline void blk_put_rl(struct request_list *rl) { } static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } +static inline bool blkcg_bio_issue_check(struct request_queue *q, + struct bio *bio) { return true; } + #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) From e6269c44546755094979ab53609e6e203a68c8ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:21 -0700 Subject: [PATCH 30/45] blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it cgroup stats are local to each cgroup and doesn't propagate to ancestors by default. 
When recursive stats are necessary, the sum is calculated over all the descendants. This initially was for backward compatibility to support both group-local and recursive stats but this mode of operation makes general sense as stat update is much hotter thafn reporting those stats. This however ends up losing recursive stats when a child is removed. To work around this, cfq-iosched adds its stats to its parent cfq_group->dead_stats which is summed up together when calculating recursive stats. It's planned that the core stats will be moved to blkcg_gq, so we want to move the mechanism for keeping track of the stats of dead children from cfq to blkcg core. This patch adds blkg_[rw]stat->aux_cnt which are atomic64_t's keeping track of auxiliary counts which are excluded when reading local counts but included for recursive. blkg_[rw]stat_merge() which were used by cfq to implement dead_stats are replaced by blkg_[rw]stat_add_aux(), and cfq now forwards stats of a dead cgroup to the aux counts of parent->stats instead of separate ->dead_stats. This will also help making blkg_[rw]stats per-cpu. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 +++--- block/cfq-iosched.c | 67 ++++++++++---------------------------- include/linux/blk-cgroup.h | 46 ++++++++++++++++++-------- 3 files changed, 57 insertions(+), 66 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 097c4a670fa4..ff79b52d1a0e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -584,7 +584,7 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); * @off: offset to the blkg_stat in @pd * * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue + * descendants and their aux counts. The caller must be holding the queue * lock for online tests. */ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) @@ -602,7 +602,8 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) struct blkg_stat *stat = (void *)pos_pd + off; if (pos_blkg->online) - sum += blkg_stat_read(stat); + sum += blkg_stat_read(stat) + + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -616,7 +617,7 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); * @off: offset to the blkg_stat in @pd * * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue + * descendants and their aux counts. The caller must be holding the queue * lock for online tests. 
*/ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, @@ -642,7 +643,8 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + sum.cnt[i] += tmp.cnt[i] + + atomic64_read(&rwstat->aux_cnt[i]); } rcu_read_unlock(); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0994f3b523a8..b272cfff7364 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -304,7 +304,6 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ /* async queue for each priority case */ struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; @@ -736,28 +735,28 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); + blkg_rwstat_add_aux(&to->serviced, &from->serviced); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. 
*/ @@ -770,10 +769,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -1606,7 +1603,6 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) cfq_init_cfqg_base(cfqg); cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); return &cfqg->pd; } @@ -1649,38 +1645,11 @@ static void cfq_pd_free(struct blkg_policy_data *pd) return kfree(pd); } -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; -} - -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) -{ - struct blkg_rwstat a, b; - - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; -} - static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, @@ -1883,7 +1852,7 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); + u64 sum = blkg_stat_recursive_sum(pd, off); return __blkg_prfill_u64(sf, pd, sum); } @@ -1891,7 +1860,7 @@ static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off); return __blkg_prfill_rwstat(sf, pd, &sum); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4d1659c7f84b..e8092276af58 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,14 +53,20 @@ struct blkcg { #endif }; +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ struct blkg_stat { struct u64_stats_sync syncp; uint64_t cnt; + atomic64_t aux_cnt; }; struct blkg_rwstat { struct u64_stats_sync syncp; uint64_t cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; /* @@ -483,6 +489,7 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, static inline void blkg_stat_init(struct blkg_stat *stat) { u64_stats_init(&stat->syncp); + atomic64_set(&stat->aux_cnt, 0); } /** @@ -504,8 +511,9 @@ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. 
+ * Read the current value of @stat. The returned value doesn't include the + * aux count. This function can be called without synchroniztion and takes + * care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { @@ -527,23 +535,31 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) static inline void blkg_stat_reset(struct blkg_stat *stat) { stat->cnt = 0; + atomic64_set(&stat->aux_cnt, 0); } /** - * blkg_stat_merge - merge a blkg_stat into another + * blkg_stat_add_aux - add a blkg_stat into another's aux count * @to: the destination blkg_stat * @from: the source * - * Add @from's count to @to. + * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) +static inline void blkg_stat_add_aux(struct blkg_stat *to, + struct blkg_stat *from) { - blkg_stat_add(to, blkg_stat_read(from)); + atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); } static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) { + int i; + u64_stats_init(&rwstat->syncp); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** @@ -614,26 +630,30 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) */ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { + int i; + memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&rwstat->aux_cnt[i], 0); } /** - * blkg_rwstat_merge - merge a blkg_rwstat into another + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count * @to: the destination blkg_rwstat * @from: the source * - * Add @from's counts to @to. + * Add @from's count including the aux one to @to's aux count. */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) { struct blkg_rwstat v = blkg_rwstat_read(from); int i; - u64_stats_update_begin(&to->syncp); for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); + atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); } #ifdef CONFIG_BLK_DEV_THROTTLING From 24bdb8ef068ebdc2a57ce715f0ab22d5da32832a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:22 -0700 Subject: [PATCH 31/45] blkcg: make blkcg_[rw]stat per-cpu blkcg_[rw]stat are used as stat counters for blkcg policies. It isn't per-cpu by itself and blk-throttle makes it per-cpu by wrapping around it. This patch makes blkcg_[rw]stat per-cpu and drop the ad-hoc per-cpu wrapping in blk-throttle. * blkg_[rw]stat->cnt is replaced with cpu_cnt which is struct percpu_counter. This makes syncp unnecessary as remote accesses are handled by percpu_counter itself. * blkg_[rw]stat_init() can now fail due to percpu allocation failure and thus are updated to return int. * percpu_counters need explicit freeing. blkg_[rw]stat_exit() added. * As blkg_rwstat->cpu_cnt[] can't be read directly anymore, reading and summing results are stored in ->aux_cnt[] instead. * Custom per-cpu stat implementation in blk-throttle is removed. This makes all blkcg stat counters per-cpu without complicating policy implmentations. 
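As a usage sketch of the new lifecycle (the caller below is hypothetical; the helpers are the ones defined in the include/linux/blk-cgroup.h hunk further down), initialization can now fail and the counters need an explicit exit:

    /* hypothetical caller illustrating the per-cpu blkg_rwstat lifecycle */
    static int example_account_write(struct blkg_rwstat *st)
    {
        int ret;

        /* percpu allocation may fail, so init now returns an error */
        ret = blkg_rwstat_init(st, GFP_KERNEL);
        if (ret)
            return ret;

        /* hot path: lockless per-cpu add, no syncp section needed */
        blkg_rwstat_add(st, REQ_WRITE | REQ_SYNC, 4096);

        /* reading sums all CPUs; blkg_rwstat_total() is READ + WRITE */
        pr_debug("total: %llu bytes\n",
                 (unsigned long long)blkg_rwstat_total(st));

        /* percpu counters must be freed explicitly */
        blkg_rwstat_exit(st);
        return 0;
    }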
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 +-- block/blk-throttle.c | 89 +++++++++----------------- block/cfq-iosched.c | 70 +++++++++++++++------ include/linux/blk-cgroup.h | 124 +++++++++++++++++++++---------------- 4 files changed, 155 insertions(+), 138 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ff79b52d1a0e..02a2d029b5a5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -539,9 +539,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -643,8 +644,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i] + - atomic64_read(&rwstat->aux_cnt[i]); + atomic64_add(atomic64_read(&tmp.aux_cnt[i]) + + atomic64_read(&rwstat->aux_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 29c22ed4b073..c0b2263a222a 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -142,8 +134,10 @@ struct throtl_grp { unsigned long slice_start[2]; unsigned long slice_end[2]; - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; }; struct throtl_data @@ -337,17 +331,15 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { struct throtl_grp *tg; - int rw, cpu; + int rw; tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) - return NULL; + goto err; - tg->stats_cpu = alloc_percpu_gfp(struct tg_stats_cpu, gfp); - if (!tg->stats_cpu) { - kfree(tg); - return NULL; - } + if (blkg_rwstat_init(&tg->service_bytes, gfp) || + blkg_rwstat_init(&tg->serviced, gfp)) + goto err_free_tg; throtl_service_queue_init(&tg->service_queue); @@ -362,14 +354,14 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->iops[READ] = -1; tg->iops[WRITE] = -1; - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *stats_cpu = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_init(&stats_cpu->service_bytes); - blkg_rwstat_init(&stats_cpu->serviced); - } - return &tg->pd; + +err_free_tg: + blkg_rwstat_exit(&tg->serviced); + blkg_rwstat_exit(&tg->service_bytes); + kfree(tg); +err: + return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -427,21 +419,17 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); - free_percpu(tg->stats_cpu); + 
blkg_rwstat_exit(&tg->serviced); + blkg_rwstat_exit(&tg->service_bytes); kfree(tg); } static void throtl_pd_reset_stats(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); - int cpu; - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } + blkg_rwstat_reset(&tg->service_bytes); + blkg_rwstat_reset(&tg->serviced); } static struct throtl_grp * @@ -855,7 +843,6 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, int rw) { struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; unsigned long flags; /* @@ -865,10 +852,8 @@ static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, */ local_irq_save(flags); - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); + blkg_rwstat_add(&tg->serviced, rw, 1); + blkg_rwstat_add(&tg->service_bytes, rw, bytes); local_irq_restore(flags); } @@ -1176,27 +1161,9 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) +static int tg_print_rwstat(struct seq_file *sf, void *v) { - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } @@ -1337,13 +1304,13 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = offsetof(struct throtl_grp, service_bytes), + .seq_show = tg_print_rwstat, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = offsetof(struct throtl_grp, serviced), + .seq_show = tg_print_rwstat, }, { } /* terminate */ }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b272cfff7364..71e55c91ee98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1542,27 +1542,55 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->service_bytes); + blkg_rwstat_exit(&stats->serviced); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); + blkg_stat_exit(&stats->sectors); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + 
blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} + +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->service_bytes, gfp) || + blkg_rwstat_init(&stats->serviced, gfp) || + blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + + blkg_stat_init(&stats->sectors, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; } static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) @@ -1602,7 +1630,10 @@ static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) return NULL; cfq_init_cfqg_base(cfqg); - cfqg_stats_init(&cfqg->stats); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } return &cfqg->pd; } @@ -1642,7 +1673,10 @@ static void cfq_pd_offline(struct blkg_policy_data *pd) static void cfq_pd_free(struct blkg_policy_data *pd) { - return kfree(pd); + struct cfq_group *cfqg = pd_to_cfqg(pd); + + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } static void cfq_pd_reset_stats(struct blkg_policy_data *pd) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index e8092276af58..fdc7ac08b1ce 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -14,12 +14,15 @@ */ #include -#include +#include #include #include #include #include +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -55,17 +58,16 @@ struct blkcg { /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. + * recursive. Used to carry stats of dead children, and, for blkg_rwstat, + * to carry result values from read and sum operations. 
*/ struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; + struct percpu_counter cpu_cnt; atomic64_t aux_cnt; }; struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; atomic64_t aux_cnt[BLKG_RWSTAT_NR]; }; @@ -486,10 +488,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) -static inline void blkg_stat_init(struct blkg_stat *stat) +static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp) { - u64_stats_init(&stat->syncp); + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static inline void blkg_stat_exit(struct blkg_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); } /** @@ -497,35 +510,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat) * @stat: target blkg_stat * @val: value to add * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. */ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) { - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); + __percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_stat_read - read the current value of a blkg_stat * @stat: blkg_stat to read - * - * Read the current value of @stat. The returned value doesn't include the - * aux count. This function can be called without synchroniztion and takes - * care of u64 atomicity. */ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) { - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; + return percpu_counter_sum_positive(&stat->cpu_cnt); } /** @@ -534,7 +533,7 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat) */ static inline void blkg_stat_reset(struct blkg_stat *stat) { - stat->cnt = 0; + percpu_counter_set(&stat->cpu_cnt, 0); atomic64_set(&stat->aux_cnt, 0); } @@ -552,14 +551,28 @@ static inline void blkg_stat_add_aux(struct blkg_stat *to, &to->aux_cnt); } -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) +static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) +{ + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} + +static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) { int i; - u64_stats_init(&rwstat->syncp); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_set(&rwstat->aux_cnt[i], 0); + percpu_counter_destroy(&rwstat->cpu_cnt[i]); } /** @@ -574,39 +587,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, int rw, uint64_t val) { - u64_stats_update_begin(&rwstat->syncp); + struct percpu_counter *cnt; if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - 
rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; - u64_stats_update_end(&rwstat->syncp); + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); + + if (rw & REQ_SYNC) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; + + __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); } /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. + * Read the current snapshot of @rwstat and return it in the aux counts. */ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) { - unsigned int start; - struct blkg_rwstat tmp; + struct blkg_rwstat result; + int i; - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); - - return tmp; + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_set(&result.aux_cnt[i], + percpu_counter_sum_positive(&rwstat->cpu_cnt[i])); + return result; } /** @@ -621,7 +633,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) { struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; + return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); } /** @@ -632,10 +645,10 @@ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) { int i; - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); atomic64_set(&rwstat->aux_cnt[i], 0); + } } /** @@ -652,7 +665,8 @@ static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, int i; for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(v.cnt[i] + atomic64_read(&from->aux_cnt[i]), + atomic64_add(atomic64_read(&v.aux_cnt[i]) + + atomic64_read(&from->aux_cnt[i]), &to->aux_cnt[i]); } From f12c74cab1635d67077ce8cc40da88b57980f637 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:23 -0700 Subject: [PATCH 32/45] blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq Currently, blkg_[rw]stat_recursive_sum() assume that the target counter is located in pd (blkg_policy_data); however, some counters are planned to be moved to blkg (blkcg_gq). This patch updates blkg_[rw]stat_recursive_sum() to take blkg and blkg_policy pointers instead of pd. If policy is NULL, it indexes into blkg. If non-NULL, into the blkg's pd of the policy. The existing usages are updated to maintain the current behaviors. 
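Illustratively, the two addressing modes are used as below. This is a sketch only, written in the style of the cfq prfill callbacks updated in this patch (blkcg_policy_cfq and struct cfq_group are local to cfq-iosched.c); the NULL-policy form targets a counter embedded directly in struct blkcg_gq, such as the ->stat_bytes counter added later in this series:

    /* sketch; a real caller is a prfill callback holding the queue lock */
    static void example_recursive_sums(struct seq_file *sf,
                                       struct blkg_policy_data *pd)
    {
        struct blkg_rwstat rwsum;
        u64 sum;

        /* counter inside a policy's per-blkg data: pass the policy and an
         * offset into its private data (cfq_group embeds its pd first) */
        sum = blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_cfq,
                                      offsetof(struct cfq_group, stats.time));

        /* counter embedded directly in struct blkcg_gq: pass a NULL policy */
        rwsum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), NULL,
                                          offsetof(struct blkcg_gq, stat_bytes));

        __blkg_prfill_u64(sf, pd, sum);
        __blkg_prfill_rwstat(sf, pd, &rwsum);
    }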
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 69 +++++++++++++++++++++++--------------- block/cfq-iosched.c | 8 ++--- include/linux/blk-cgroup.h | 7 ++-- 3 files changed, 50 insertions(+), 34 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 02a2d029b5a5..b26320720a3c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -581,30 +581,39 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and their aux counts. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; - if (pos_blkg->online) - sum += blkg_stat_read(stat) + - atomic64_read(&stat->aux_cnt); + if (!pos_blkg->online) + continue; + + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; + + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -614,33 +623,39 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and their aux counts. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. 
*/ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat, tmp; if (!pos_blkg->online) continue; + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + tmp = blkg_rwstat_read(rwstat); for (i = 0; i < BLKG_RWSTAT_NR; i++) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 71e55c91ee98..e861cc1ebd62 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1886,16 +1886,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = blkg_stat_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index fdc7ac08b1ce..4630ce8f9425 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -191,9 +191,10 @@ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off); struct blkg_conf_ctx { struct gendisk *disk; From 77ea733884eb5520f22c36def1309fe2ab61633e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:24 -0700 Subject: [PATCH 33/45] blkcg: move io_service_bytes and io_serviced stats into blkcg_gq Currently, both cfq-iosched and blk-throttle keep track of io_service_bytes and io_serviced stats. While keeping track of them separately may be useful during development, it doesn't make much sense otherwise. Also, blk-throttle was counting bio's as IOs while cfq-iosched request's, which is more confusing than informative. This patch adds ->stat_bytes and ->stat_ios to blkg (blkcg_gq), removes the counterparts from cfq-iosched and blk-throttle and let them print from the common blkg counters. The common counters are incremented during bio issue in blkcg_bio_issue_check(). The outputs are still filtered by whether the policy has blkg_policy_data on a given blkg, so cfq's output won't show up if it has never been used for a given blkg. 
The only times when the outputs would differ significantly are when policies are attached on the fly or elevators are switched back and forth. Those are quite exceptional operations and I don't think they warrant keeping separate counters. v3: Update blkio-controller.txt accordingly. v2: Account IOs during bio issues instead of request completions so that bio-based drivers can be handled the same way. Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 24 ++---- block/blk-cgroup.c | 98 ++++++++++++++++++++++ block/blk-throttle.c | 73 ++-------------- block/cfq-iosched.c | 32 ++----- include/linux/blk-cgroup.h | 14 ++++ 5 files changed, 133 insertions(+), 108 deletions(-) diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt index 68b6a6a470b0..12686bec37b9 100644 --- a/Documentation/cgroups/blkio-controller.txt +++ b/Documentation/cgroups/blkio-controller.txt @@ -201,7 +201,7 @@ Proportional weight policy files specifies the number of bytes. - blkio.io_serviced - - Number of IOs completed to/from the disk by the group. These + - Number of IOs (bio) issued to the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field @@ -327,18 +327,11 @@ Note: If both BW and IOPS rules are specified for a device, then IO is subjected to both the constraints. - blkio.throttle.io_serviced - - Number of IOs (bio) completed to/from the disk by the group (as - seen by throttling policy). These are further divided by the type - of operation - read or write, sync or async. First two fields specify - the major and minor number of the device, third field specifies the - operation type and the fourth field specifies the number of IOs. - - blkio.io_serviced does accounting as seen by CFQ and counts are in - number of requests (struct request). On the other hand, - blkio.throttle.io_serviced counts number of IO in terms of number - of bios as seen by throttling policy. These bios can later be - merged by elevator and total number of requests completed can be - lesser. + - Number of IOs (bio) issued to the disk by the group. These + are further divided by the type of operation - read or write, sync + or async. First two fields specify the major and minor number of the + device, third field specifies the operation type and the fourth field + specifies the number of IOs. - blkio.throttle.io_service_bytes - Number of bytes transferred to/from the disk by the group. These @@ -347,11 +340,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is device, third field specifies the operation type and the fourth field specifies the number of bytes. - These numbers should roughly be same as blkio.io_service_bytes as - updated by CFQ. The difference between two is that - blkio.io_service_bytes will not be updated if CFQ is not operating - on request queue. 
- Common files among various policies ----------------------------------- - blkio.reset_stats diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b26320720a3c..a25263ca39ca 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -73,6 +73,9 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->blkcg != &blkcg_root) blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -95,6 +98,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -300,6 +307,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -315,6 +323,12 @@ static void blkg_destroy(struct blkcg_gq *blkg) if (blkg->pd[i] && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); + } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -431,6 +445,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -579,6 +596,87 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. 
+ */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat * @blkg: blkg of interest diff --git a/block/blk-throttle.c b/block/blk-throttle.c index c0b2263a222a..bd3e4b2c7a9d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -133,11 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; }; struct throtl_data @@ -335,11 +330,7 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg = kzalloc_node(sizeof(*tg), gfp, node); if (!tg) - goto err; - - if (blkg_rwstat_init(&tg->service_bytes, gfp) || - blkg_rwstat_init(&tg->serviced, gfp)) - goto err_free_tg; + return NULL; throtl_service_queue_init(&tg->service_queue); @@ -355,13 +346,6 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) tg->iops[WRITE] = -1; return &tg->pd; - -err_free_tg: - blkg_rwstat_exit(&tg->serviced); - blkg_rwstat_exit(&tg->service_bytes); - kfree(tg); -err: - return NULL; } static void throtl_pd_init(struct blkg_policy_data *pd) @@ -419,19 +403,9 @@ static void throtl_pd_free(struct blkg_policy_data *pd) struct throtl_grp *tg = pd_to_tg(pd); del_timer_sync(&tg->service_queue.pending_timer); - blkg_rwstat_exit(&tg->serviced); - blkg_rwstat_exit(&tg->service_bytes); kfree(tg); } -static void throtl_pd_reset_stats(struct blkg_policy_data *pd) -{ - struct throtl_grp *tg = pd_to_tg(pd); - - blkg_rwstat_reset(&tg->service_bytes); - blkg_rwstat_reset(&tg->serviced); -} - static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { @@ -839,25 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. 
- */ - local_irq_save(flags); - - blkg_rwstat_add(&tg->serviced, rw, 1); - blkg_rwstat_add(&tg->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -871,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1161,13 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static int tg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1304,13 +1244,13 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct throtl_grp, service_bytes), - .seq_show = tg_print_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct throtl_grp, serviced), - .seq_show = tg_print_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, }, { } /* terminate */ }; @@ -1329,7 +1269,6 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, .pd_free_fn = throtl_pd_free, - .pd_reset_stats_fn = throtl_pd_reset_stats, }; bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e861cc1ebd62..a948d4df3fc3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -177,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -696,8 +692,6 @@ static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, uint64_t bytes, int rw) { blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, @@ -717,8 +711,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -738,8 +730,6 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued 
stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); - blkg_rwstat_add_aux(&to->serviced, &from->serviced); blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); @@ -1544,8 +1534,6 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) #ifdef CONFIG_CFQ_GROUP_IOSCHED static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_exit(&stats->service_bytes); - blkg_rwstat_exit(&stats->serviced); blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); @@ -1566,9 +1554,7 @@ static void cfqg_stats_exit(struct cfqg_stats *stats) static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) { - if (blkg_rwstat_init(&stats->service_bytes, gfp) || - blkg_rwstat_init(&stats->serviced, gfp) || - blkg_rwstat_init(&stats->merged, gfp) || + if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || @@ -1994,13 +1980,13 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -2036,13 +2022,13 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4630ce8f9425..286e1bde249f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -127,6 +127,9 @@ struct blkcg_gq { /* is this blkg online? 
protected by both blkcg and q locks */ bool online; + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; struct rcu_head rcu_head; @@ -190,6 +193,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); +int blkg_print_stat_bytes(struct seq_file *sf, void *v); +int blkg_print_stat_ios(struct seq_file *sf, void *v); +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off); @@ -700,6 +707,13 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); + if (!throtl) { + blkg = blkg ?: q->root_blkg; + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags, + bio->bi_iter.bi_size); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1); + } + rcu_read_unlock(); return !throtl; } From 702747cabe737fe9b358739443d539f10cc7c715 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:25 -0700 Subject: [PATCH 34/45] blkcg: remove cfqg_stats->sectors cfq_stats->sectors is a blkg_stat which keeps track of the total number of sectors serviced; however, this can be trivially calculated from blkcg_gq->stat_bytes. The only thing necessary is adding up READs and WRITEs and then dividing by sector size. Remove cfqg_stats->sectors and make cfq print "sectors" and "sectors_recursive" from stat_bytes. While this is a bit more code, it removes duplicate stat allocations and updates and ensures that the reported stats stay in tune with each other. 
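The consolidation works because the old counter held redundant information. A minimal, self-contained sketch of the derivation described above (the structure and helper names below are hypothetical, not the kernel API); cfqg_prfill_sectors() in the diff that follows does the same thing with blkg_rwstat_total() on blkcg_gq->stat_bytes:

#include <stdint.h>
#include <stdio.h>

/* toy stand-in for the per-group byte counters */
struct bytes_stat {
	uint64_t read_bytes;
	uint64_t write_bytes;
};

/* READ plus WRITE bytes, divided by the 512-byte sector size */
static uint64_t stat_to_sectors(const struct bytes_stat *s)
{
	return (s->read_bytes + s->write_bytes) >> 9;
}

int main(void)
{
	struct bytes_stat s = { .read_bytes = 4096, .write_bytes = 8192 };

	printf("%llu sectors\n", (unsigned long long)stat_to_sectors(&s));
	return 0;
}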
Signed-off-by: Tejun Heo Cc: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 55 +++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a948d4df3fc3..395476ab14fe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -185,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -688,12 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -782,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -1538,8 +1528,6 @@ static void cfqg_stats_exit(struct cfqg_stats *stats) blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); blkg_rwstat_exit(&stats->queued); - - blkg_stat_exit(&stats->sectors); blkg_stat_exit(&stats->time); #ifdef CONFIG_DEBUG_BLK_CGROUP blkg_stat_exit(&stats->unaccounted_time); @@ -1558,8 +1546,6 @@ static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || blkg_rwstat_init(&stats->queued, gfp) || - - blkg_stat_init(&stats->sectors, gfp) || blkg_stat_init(&stats->time, gfp)) goto err; @@ -1901,6 +1887,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1975,8 +1995,7 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = 
"sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = cfqg_print_stat_sectors, }, { .name = "io_service_bytes", @@ -2017,8 +2036,7 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", @@ -2888,7 +2906,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* From 3a7faeada20d72f07d3a7b13454859025cd50a36 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:26 -0700 Subject: [PATCH 35/45] blkcg: reduce stack usage of blkg_rwstat_recursive_sum() The recent percpu conversion of blkg_rwstat triggered the following warning in certain configurations. block/blk-cgroup.c:654:1: warning: the frame size of 1360 bytes is larger than 1024 bytes This is because blkg_rwstat now contains four percpu_counter which can be pretty big depending on debug options although it shouldn't be a problem in production configs. This patch removes one of the two local blkg_rwstat variables used by blkg_rwstat_recursive_sum() to reduce stack usage. Signed-off-by: Tejun Heo Cc: Vivek Goyal Reported-by: kbuild test robot Link: http://article.gmane.org/gmane.linux.kernel.cgroups/13835 Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a25263ca39ca..c82c5dbf9187 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -744,7 +744,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { - struct blkg_rwstat *rwstat, tmp; + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; @@ -754,12 +754,10 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, else rwstat = (void *)pos_blkg + off; - tmp = blkg_rwstat_read(rwstat); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(atomic64_read(&tmp.aux_cnt[i]) + - atomic64_read(&rwstat->aux_cnt[i]), - &sum.aux_cnt[i]); + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); From 5332dfc36483d2373d980526145789a354af2d49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:27 -0700 Subject: [PATCH 36/45] blkcg: remove unnecessary NULL checks from __cfqg_set_weight_device() blkg_to_cfqg() and blkcg_to_cfqgd() on a valid blkg with the policy enabled are guaranteed to return non-NULL and the counterpart in blk-throttle doesn't have these checks either. Remove the spurious NULL checks. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 395476ab14fe..bcf40266bd0a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1752,12 +1752,10 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, if (ret) return ret; - ret = -EINVAL; cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); - if (!cfqg || !cfqgd) - goto err; + ret = -EINVAL; if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = ctx.v; @@ -1769,7 +1767,6 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, ret = 0; } -err: blkg_conf_finish(&ctx); return ret ?: nbytes; } From 20386ce0143899ccb5bcbda714436a82d3029f33 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:28 -0700 Subject: [PATCH 37/45] blkcg: refine error codes returned during blkcg configuration blkcg currently returns -EINVAL for most errors which can be pretty confusing given that the failure modes are quite varied. Update the error returns so that * -EINVAL only for syntactic errors. * -ERANGE if the value is out of range. * -ENODEV if the target device can't be found. * -EOPNOTSUPP if the policy is not enabled on the target device. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 ++++++------ block/cfq-iosched.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c82c5dbf9187..0af3bff198ed 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -179,7 +179,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { - ret = -EINVAL; + ret = -ENODEV; goto err_free_blkg; } @@ -205,7 +205,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; + ret = -ENODEV; goto err_put_congested; } blkg_get(blkg->parent); @@ -279,7 +279,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? 
-ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -792,10 +792,10 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { put_disk(disk); - return -EINVAL; + return -ENODEV; } rcu_read_lock(); @@ -804,7 +804,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index bcf40266bd0a..38277e3ce286 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1755,7 +1755,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); - ret = -EINVAL; + ret = -ERANGE; if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = ctx.v; From c165b3e3c7bb68c2ed55a5ac2623f030d01d9567 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:29 -0700 Subject: [PATCH 38/45] blkcg: rename subsystem name from blkio to io blkio interface has become messy over time and is currently the largest. In addition to the inconsistent naming scheme, it has multiple stat files which report more or less the same thing, a number of debug stat files which expose internal details which shouldn't have been part of the public interface in the first place, recursive and non-recursive stats and leaf and non-leaf knobs. Both recursive vs. non-recursive and leaf vs. non-leaf distinctions don't make any sense on the unified hierarchy as only leaf cgroups can contain processes. cgroups is going through a major interface revision with the unified hierarchy involving significant fundamental usage changes and given that a significant portion of the interface doesn't make sense anymore, it's a good time to reorganize the interface. As the first step, this patch renames the external visible subsystem name from "blkio" to "io". This is more concise, matches the other two major subsystem names, "cpu" and "memory", and better suited as blkcg will be involved in anything writeback related too whether an actual block device is involved or not. As the subsystem legacy_name is set to "blkio", the only userland visible change outside the unified hierarchy is that blkcg is reported as "io" instead of "blkio" in the subsystem initialized message during boot. On the unified hierarchy, blkcg now appears as "io". 
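A quick way to observe the rename from user space is to look at the controller list on a unified-hierarchy mount; legacy mounts keep seeing "blkio" through legacy_name. The sketch below assumes a cgroup2 filesystem mounted at /sys/fs/cgroup, which is a property of the test system rather than of this patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/cgroup/cgroup.controllers", "r");
	char buf[256] = "";
	char *tok, *save = NULL;
	int found = 0;

	if (!f) {
		perror("cgroup.controllers");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		for (tok = strtok_r(buf, " \n", &save); tok;
		     tok = strtok_r(NULL, " \n", &save))
			if (!strcmp(tok, "io"))
				found = 1;
	fclose(f);
	printf("io controller %s\n", found ? "listed" : "not listed");
	return 0;
}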
Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-cgroup.c | 7 ++++--- include/linux/backing-dev.h | 2 +- include/linux/blk-cgroup.h | 4 ++-- include/linux/cgroup_subsys.h | 2 +- mm/backing-dev.c | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/block/bio.c b/block/bio.c index d6e5ba3399f0..c52222c6c69b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2046,7 +2046,7 @@ int bio_associate_current(struct bio *bio) get_io_context_active(ioc); bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, blkio_cgrp_id); + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } EXPORT_SYMBOL_GPL(bio_associate_current); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0af3bff198ed..fc197ea4c992 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1089,12 +1089,13 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, .legacy_cftypes = blkcg_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1104,7 +1105,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1266,7 +1267,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 23ebb946e66f..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). 
*/ - if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 286e1bde249f..db89acd2a864 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -221,7 +221,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + return css_to_blkcg(task_css(tsk, io_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -234,7 +234,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) static inline struct cgroup_subsys_state * task_get_blkcg_css(struct task_struct *task) { - return task_get_css(task, blkio_cgrp_id); + return task_get_css(task, io_cgrp_id); } /** diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb14403..86b5056104df 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -16,7 +16,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) +SUBSYS(io) #endif #if IS_ENABLED(CONFIG_MEMCG) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..d0ee90e72867 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi, int ret = 0; memcg = mem_cgroup_from_css(memcg_css); - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); blkcg_cgwb_list = &blkcg->cgwb_list; @@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &blkio_cgrp_subsys); + &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; From 880f50e228f80626dff6327a6e281e40286f5228 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:30 -0700 Subject: [PATCH 39/45] blkcg: mark existing cftypes as legacy blkcg is about to grow interface for the unified hierarchy. Add legacy to existing cftypes. * blkcg_policy->cftypes -> blkcg_policy->legacy_cftypes * blk-cgroup.c:blkcg_files -> blkcg_legacy_files * cfq-iosched.c:cfq_blkcg_files -> cfq_blkcg_legacy_files * blk-throttle.c:throtl_files -> throtl_legacy_files Pure renames. No functional change. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 ++++++------ block/blk-throttle.c | 4 ++-- block/cfq-iosched.c | 4 ++-- include/linux/blk-cgroup.h | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fc197ea4c992..429726bed560 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -847,7 +847,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); -struct cftype blkcg_files[] = { +struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, @@ -1094,7 +1094,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* @@ -1266,9 +1266,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) + if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, - pol->cftypes)); + pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; @@ -1305,8 +1305,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index bd3e4b2c7a9d..8b4f6b81bb72 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1217,7 +1217,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1263,7 +1263,7 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 38277e3ce286..baa8459f7064 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1944,7 +1944,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -4654,7 +4654,7 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index db89acd2a864..6e016e6fee87 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,7 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ - struct cftype *cftypes; + struct cftype *legacy_cftypes; /* operations */ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; From 36aa9e5f591e84d67aad2c5bff75e413d77660dd Mon Sep 17 00:00:00 2001 From: Tejun Heo 
Date: Tue, 18 Aug 2015 14:55:31 -0700 Subject: [PATCH 40/45] blkcg: move body parsing from blkg_conf_prep() to its callers Currently, blkg_conf_prep() expects input to be of the following form MAJ:MIN NUM and reads the NUM part into blkg_conf_ctx->v. This is quite restrictive and gets in the way in implementing blkcg interface for the unified hierarchy. This patch updates blkg_conf_prep() so that it expects MAJ:MIN BODY_STR where BODY_STR is an arbitrary string. blkg_conf_ctx->v is replaced with ->body which is a char pointer pointing to the start of BODY_STR. Parsing of the body is moved to blkg_conf_prep()'s callers. To allow using, for example, strsep() on blkg_conf_ctx->val, it is a non-const pointer and to accommodate that const is dropped from @input too. This doesn't cause any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 22 ++++++++++++++-------- block/blk-throttle.c | 18 ++++++++++++------ block/cfq-iosched.c | 17 +++++++++++------ include/linux/blk-cgroup.h | 4 ++-- 4 files changed, 39 insertions(+), 22 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 429726bed560..8ee1ca4d4f2f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "blk.h" @@ -773,23 +774,28 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). 
*/ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) return -ENODEV; @@ -826,7 +832,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8b4f6b81bb72..0e17c8f92d25 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1154,21 +1154,25 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); if (ret) return ret; + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + tg = blkg_to_tg(ctx.blkg); sq = &tg->service_queue; - if (!ctx.v) - ctx.v = -1; - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; + *(u64 *)((void *)tg + of_cft(of)->private) = v; else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1201,8 +1205,10 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, throtl_schedule_next_dispatch(sq->parent_sq, true); } + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index baa8459f7064..ea88d8905bbd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1747,26 +1747,31 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, struct cfq_group *cfqg; struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + if (!v || (v >= CFQ_WEIGHT_MIN && v <= CFQ_WEIGHT_MAX)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: cfqgd->weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 6e016e6fee87..85a4d989ae43 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -206,11 +206,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkg_conf_ctx { struct gendisk *disk; struct blkcg_gq *blkg; - u64 v; + char *body; }; int 
blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); + char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); From 69948b070ee2bc3cc253e862cbe2bb09b173d7bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:32 -0700 Subject: [PATCH 41/45] blkcg: separate out tg_conf_updated() from tg_set_conf() tg_set_conf() is largely consisted of parsing and setting the new config and the follow-up application and propagation. This patch separates out the latter part into tg_conf_updated(). This will be used to implement interface for the unified hierarchy. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 60 +++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0e17c8f92d25..a8bb2fd8f523 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1144,35 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static ssize_t tg_set_conf(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - u64 v; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - ret = -EINVAL; - if (sscanf(ctx.body, "%llu", &v) != 1) - goto out_finish; - if (!v) - v = -1; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = v; - else - *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1186,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1204,7 +1180,35 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg); ret = 0; out_finish: blkg_conf_finish(&ctx); From dd165eb3bb4ef16bcdb75417add40633f38c52b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:33 -0700 Subject: [PATCH 42/45] blkcg: misc preparations for unified hierarchy interface * Export blkg_dev_name() * Drop unnecessary @cft from __cfq_set_weight(). 
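Exporting blkg_dev_name() lets policy code outside blk-cgroup.c prefix its per-device output with the "MAJ:MIN" name itself, which is what the unified-hierarchy files added in the next patch rely on. A minimal sketch of the pattern (the function print_one_blkg is hypothetical):

#include <linux/blk-cgroup.h>
#include <linux/seq_file.h>

static void print_one_blkg(struct seq_file *sf, struct blkcg_gq *blkg, u64 val)
{
	const char *dname = blkg_dev_name(blkg);

	/* some drivers (e.g. floppy) instantiate a queue w/o a registered disk */
	if (!dname)
		return;
	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)val);
}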
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 ++- block/cfq-iosched.c | 8 ++++---- include/linux/blk-cgroup.h | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 8ee1ca4d4f2f..b5e72d756be1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -462,13 +462,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ea88d8905bbd..7a7230136e43 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1788,8 +1788,8 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, return __cfqg_set_weight_device(of, buf, nbytes, off, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool is_leaf_weight) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; @@ -1834,13 +1834,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 85a4d989ae43..b270aef519c6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -182,6 +182,7 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); +const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), From 2ee867dcfa2eaef1063b686da55c35878b2da4a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:34 -0700 Subject: [PATCH 43/45] blkcg: implement interface for the unified hierarchy blkcg interface grew to be the biggest of all controllers and unfortunately most inconsistent too. The interface files are inconsistent with a number of cloes duplicates. Some files have recursive variants while others don't. There's distinction between normal and leaf weights which isn't intuitive and there are a lot of stat knobs which don't make much sense outside of debugging and expose too much implementation details to userland. In the unified hierarchy, everything is always hierarchical and internal nodes can't have tasks rendering the two structural issues twisting the current interface. The interface has to be updated in a significant anyway and this is a good chance to revamp it as a whole. This patch implements blkcg interface for the unified hierarchy. * (from a previous patch) blkcg is identified by "io" instead of "blkio" on the unified hierarchy. Given that the whole interface is updated anyway, the rename shouldn't carry noticeable conversion overhead. 
* The original interface consisted of 27 files is replaced with the following three files. blkio.stat : per-blkcg stats blkio.weight : per-cgroup and per-cgroup-queue weight settings blkio.max : per-cgroup-queue bps and iops max limits Documentation/cgroups/unified-hierarchy.txt updated accordingly. v2: blkcg_policy->dfl_cftypes wasn't removed on blkcg_policy_unregister() corrupting the cftypes list. Fixed. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- Documentation/cgroups/unified-hierarchy.txt | 61 ++++++++++- block/blk-cgroup.c | 53 +++++++++ block/blk-throttle.c | 112 ++++++++++++++++++++ block/cfq-iosched.c | 61 ++++++++++- include/linux/blk-cgroup.h | 1 + 5 files changed, 279 insertions(+), 9 deletions(-) diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 1ee9caf29e57..bd1ce15d5178 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -27,7 +27,7 @@ CONTENTS 5-3-1. Format 5-3-2. Control Knobs 5-4. Per-Controller Changes - 5-4-1. blkio + 5-4-1. io 5-4-2. cpuset 5-4-3. memory 6. Planned Changes @@ -203,7 +203,7 @@ other issues. The mapping from nice level to weight isn't obvious or universal, and there are various other knobs which simply aren't available for tasks. -The blkio controller implicitly creates a hidden leaf node for each +The io controller implicitly creates a hidden leaf node for each cgroup to host the tasks. The hidden leaf has its own copies of all the knobs with "leaf_" prefixed. While this allows equivalent control over internal tasks, it's with serious drawbacks. It always adds an @@ -438,9 +438,62 @@ may be specified in any order and not all pairs have to be specified. 5-4. Per-Controller Changes -5-4-1. blkio +5-4-1. io -- blk-throttle becomes properly hierarchical. +- blkio is renamed to io. The interface is overhauled anyway. The + new name is more in line with the other two major controllers, cpu + and memory, and better suited given that it may be used for cgroup + writeback without involving block layer. + +- Everything including stat is always hierarchical making separate + recursive stat files pointless and, as no internal node can have + tasks, leaf weights are meaningless. The operation model is + simplified and the interface is overhauled accordingly. + + io.stat + + The stat file. The reported stats are from the point where + bio's are issued to request_queue. The stats are counted + independent of which policies are enabled. Each line in the + file follows the following format. More fields may later be + added at the end. + + $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS + + io.weight + + The weight setting, currently only available and effective if + cfq-iosched is in use for the target device. The weight is + between 10 and 1000 and defaults to 500. The first line + always contains the default weight in the following format to + use when per-device setting is missing. + + default $WEIGHT + + Subsequent lines list per-device weights of the following + format. + + $MAJ:$MIN $WEIGHT + + Writing "$WEIGHT" or "default $WEIGHT" changes the default + setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight + while "$MAJ:$MIN default" clears it. + + This file is available only on non-root cgroups. + + io.max + + The maximum bandwidth and/or iops setting, only available if + blk-throttle is enabled. The file is of the following format. 
+ + $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS + + ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are + read/write IOs per second. "max" indicates no limit. Writing + to the file follows the same format but the individual + settings may be ommitted or specified in any order. + + This file is available only on non-root cgroups. 5-4-2. cpuset diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b5e72d756be1..88bdb73bd5e0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -854,6 +854,53 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + +struct cftype blkcg_files[] = { + { + .name = "stat", + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", @@ -1101,6 +1148,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", #ifdef CONFIG_MEMCG @@ -1273,6 +1321,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); @@ -1312,6 +1363,8 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) goto out_unlock; /* kill the intf files first */ + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index a8bb2fd8f523..c75a2636dd40 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1265,6 +1265,117 @@ static struct cftype throtl_legacy_files[] = { { } /* terminate */ }; +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", 
tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, + }, + { } /* terminate */ +}; + static void throtl_shutdown_wq(struct request_queue *q) { struct throtl_data *td = q->td; @@ -1273,6 +1384,7 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { + .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, .pd_alloc_fn = throtl_pd_alloc, diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7a7230136e43..97da5719c87a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1740,7 +1740,7 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v) static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; @@ -1753,9 +1753,17 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, if (ret) return ret; - ret = -EINVAL; - if (sscanf(ctx.body, "%llu", &v) != 1) + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; goto out_finish; + } cfqg = blkg_to_cfqg(ctx.blkg); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1779,13 +1787,13 @@ out_finish: static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, 
off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, @@ -2103,6 +2111,48 @@ static struct cftype cfq_blkcg_legacy_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, struct blkcg *blkcg) @@ -4659,6 +4709,7 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { + .dfl_cftypes = cfq_blkcg_files, .legacy_cftypes = cfq_blkcg_legacy_files, .cpd_alloc_fn = cfq_cpd_alloc, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b270aef519c6..9a7c4bd45fff 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -148,6 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); struct blkcg_policy { int plid; /* cgroup files for the policy */ + struct cftype *dfl_cftypes; struct cftype *legacy_cftypes; /* operations */ From 3ecca62931ee6a30822a1ab7299bc8b8a21e5288 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:35 -0700 Subject: [PATCH 44/45] blkcg: s/CFQ_WEIGHT_*/CFQ_WEIGHT_LEGACY_*/ blkcg is gonna switch to cgroup common weight range as defined by CGROUP_WEIGHT_* on the unified hierarchy. In preparation, rename CFQ_WEIGHT_* constants to CFQ_WEIGHT_LEGACY_*. 
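The rename matters because the legacy 10..1000 scale (default 500) has to keep working next to the common 1..10000 scale (default 100) that the next patch adopts for the unified hierarchy. A condensed sketch of that per-hierarchy range selection (constant values taken from the surrounding diffs and commit messages; the helper name is hypothetical):

#include <stdbool.h>

#define CFQ_WEIGHT_LEGACY_MIN	10
#define CFQ_WEIGHT_LEGACY_DFL	500
#define CFQ_WEIGHT_LEGACY_MAX	1000

#define CGROUP_WEIGHT_MIN	1
#define CGROUP_WEIGHT_DFL	100
#define CGROUP_WEIGHT_MAX	10000

/* accept a weight against the scale of the hierarchy it was written on */
bool weight_in_range(unsigned long long val, bool on_dfl)
{
	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;

	return val >= min && val <= max;
}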
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 97da5719c87a..0fe721eaff28 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -68,9 +68,9 @@ static struct kmem_cache *cfq_pool; #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* blkio-related constants */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 struct cfq_ttime { unsigned long last_end_request; @@ -1580,11 +1580,11 @@ static void cfq_cpd_init(struct blkcg_policy_data *cpd) struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); if (cpd_to_blkcg(cpd) == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; + cgd->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cgd->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; } else { - cgd->weight = CFQ_WEIGHT_DEFAULT; - cgd->leaf_weight = CFQ_WEIGHT_DEFAULT; + cgd->weight = CFQ_WEIGHT_LEGACY_DFL; + cgd->leaf_weight = CFQ_WEIGHT_LEGACY_DFL; } } @@ -1769,7 +1769,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!v || (v >= CFQ_WEIGHT_MIN && v <= CFQ_WEIGHT_MAX)) { + if (!v || (v >= CFQ_WEIGHT_LEGACY_MIN && v <= CFQ_WEIGHT_LEGACY_MAX)) { if (!is_leaf_weight) { cfqg->dev_weight = v; cfqg->new_weight = v ?: cfqgd->weight; @@ -1804,7 +1804,7 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) + if (val < CFQ_WEIGHT_LEGACY_MIN || val > CFQ_WEIGHT_LEGACY_MAX) return -EINVAL; spin_lock_irq(&blkcg->lock); @@ -4513,8 +4513,8 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfq_init_cfqg_base(cfqd->root_group); #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; /* * Not strictly needed (since RB_ROOT just clears the node and we From 69d7fde5909b614114343974cfc52cb8ff30b544 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:36 -0700 Subject: [PATCH 45/45] blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy cgroup is trying to make interface consistent across different controllers. For weight based resource control, the knob should have the range [1, 10000] and default to 100. This patch updates cfq-iosched so that the weight range conforms. The internal calculations have enough range and the widening of the weight range shouldn't cause any problem. * blkcg_policy->cpd_bind_fn() is added. If present, this is invoked when blkcg is attached to a hierarchy. * cfq_cpd_init() is updated to use the new default value on the unified hierarchy. * cfq_cpd_bind() callback is implemented to clear per-blkg configs and apply the default config matching the hierarchy type. * cfqd->root_group->[leaf_]weight initialization in cfq_init_queue() is moved into !CONFIG_CFQ_GROUP_IOSCHED block. cfq_cpd_bind() is now responsible for initializing the initial weights when blkcg is enabled. 
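From user space the visible result is that io.weight on the unified hierarchy accepts the common scale. A hedged usage sketch, assuming a cgroup2 mount at /sys/fs/cgroup and an existing child cgroup named "grp" (both are properties of the test setup, not of the patch); per the documented format, writing "default $WEIGHT" updates the default weight:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/grp/io.weight";	/* hypothetical cgroup */
	const char *val = "default 100\n";	/* the new default on the unified hierarchy */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write io.weight");
	close(fd);
	return 0;
}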
Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- Documentation/cgroups/unified-hierarchy.txt | 2 +- block/blk-cgroup.c | 21 ++++++++ block/cfq-iosched.c | 55 +++++++++++++++------ include/linux/blk-cgroup.h | 2 + 4 files changed, 64 insertions(+), 16 deletions(-) diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index bd1ce15d5178..e0975c2cf03d 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -464,7 +464,7 @@ may be specified in any order and not all pairs have to be specified. The weight setting, currently only available and effective if cfq-iosched is in use for the target device. The weight is - between 10 and 1000 and defaults to 500. The first line + between 1 and 10000 and defaults to 100. The first line always contains the default weight in the following format to use when per-device setting is missing. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 88bdb73bd5e0..ac8370cb2515 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1143,11 +1143,32 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, + .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 0fe721eaff28..04de88463a98 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1522,6 +1522,9 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + static void cfqg_stats_exit(struct cfqg_stats *stats) { blkg_rwstat_exit(&stats->merged); @@ -1578,14 +1581,14 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) static void cfq_cpd_init(struct blkcg_policy_data *cpd) { struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; - if (cpd_to_blkcg(cpd) == &blkcg_root) { - cgd->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - cgd->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - } else { - cgd->weight = CFQ_WEIGHT_LEGACY_DFL; - cgd->leaf_weight = CFQ_WEIGHT_LEGACY_DFL; - } + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; } static void cfq_cpd_free(struct blkcg_policy_data *cpd) @@ -1593,6 +1596,19 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_cfqgd(cpd)); } +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup); + unsigned int weight = on_dfl ? 
CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); +} + static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { struct cfq_group *cfqg; @@ -1742,6 +1758,8 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; @@ -1769,7 +1787,7 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, cfqgd = blkcg_to_cfqgd(blkcg); ret = -ERANGE; - if (!v || (v >= CFQ_WEIGHT_LEGACY_MIN && v <= CFQ_WEIGHT_LEGACY_MAX)) { + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { cfqg->dev_weight = v; cfqg->new_weight = v ?: cfqgd->weight; @@ -1797,15 +1815,17 @@ static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, } static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, - bool is_leaf_weight) + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; struct cfq_group_data *cfqgd; int ret = 0; - if (val < CFQ_WEIGHT_LEGACY_MIN || val > CFQ_WEIGHT_LEGACY_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); cfqgd = blkcg_to_cfqgd(blkcg); @@ -1826,9 +1846,13 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) cfqg->new_leaf_weight = cfqgd->leaf_weight; } @@ -1842,13 +1866,13 @@ out: static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -2135,7 +2159,7 @@ static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, /* "WEIGHT" or "default WEIGHT" sets the default weight */ v = simple_strtoull(buf, &endp, 0); if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { - ret = __cfq_set_weight(of_css(of), v, false); + ret = __cfq_set_weight(of_css(of), v, true, false, false); return ret ?: nbytes; } @@ -4512,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); -#endif cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4715,6 +4739,7 @@ static struct blkcg_policy blkcg_policy_cfq = { .cpd_alloc_fn = cfq_cpd_alloc, .cpd_init_fn = cfq_cpd_init, .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, .pd_alloc_fn = cfq_pd_alloc, 
.pd_init_fn = cfq_pd_init, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9a7c4bd45fff..0a5cc7a1109b 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -138,6 +138,7 @@ struct blkcg_gq { typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); @@ -155,6 +156,7 @@ struct blkcg_policy { blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; blkcg_pol_init_cpd_fn *cpd_init_fn; blkcg_pol_free_cpd_fn *cpd_free_fn; + blkcg_pol_bind_cpd_fn *cpd_bind_fn; blkcg_pol_alloc_pd_fn *pd_alloc_fn; blkcg_pol_init_pd_fn *pd_init_fn;