From 57fb233f078beb5d0437a4ae575fbd4d9eb9c738 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Aug 2012 04:56:11 -0700 Subject: [PATCH 01/27] block: Reorder struct bio_set This is prep work for the next patch, which embeds a struct bio_list in struct bio_set. Signed-off-by: Kent Overstreet CC: Jens Axboe --- include/linux/bio.h | 66 ++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 820e7aaad4fd..93d3d17a300d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -298,39 +298,6 @@ static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ -/* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -#define BIO_POOL_SIZE 2 -#define BIOVEC_NR_POOLS 6 -#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) - -struct bio_set { - struct kmem_cache *bio_slab; - unsigned int front_pad; - - mempool_t *bio_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t *bio_integrity_pool; -#endif - mempool_t *bvec_pool; -}; - -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - -/* - * a small number of entries is fine, not going to be performance critical. - * basically we just need to survive - */ -#define BIO_SPLIT_ENTRIES 2 - #ifdef CONFIG_HIGHMEM /* * remember never ever reenable interrupts between a bvec_kmap_irq and @@ -527,6 +494,39 @@ static inline struct bio *bio_list_get(struct bio_list *bl) return bio; } +/* + * bio_set is used to allow other portions of the IO system to + * allocate their own private memory pools for bio and iovec structures. + * These memory pools in turn all allocate from the bio_slab + * and the bvec_slabs[]. + */ +#define BIO_POOL_SIZE 2 +#define BIOVEC_NR_POOLS 6 +#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) + +struct bio_set { + struct kmem_cache *bio_slab; + unsigned int front_pad; + + mempool_t *bio_pool; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + mempool_t *bio_integrity_pool; +#endif + mempool_t *bvec_pool; +}; + +struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +}; + +/* + * a small number of entries is fine, not going to be performance critical. + * basically we just need to survive + */ +#define BIO_SPLIT_ENTRIES 2 + #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) From df2cb6daa4cbc34406bc4b1ac9b9335df1083a72 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:33:46 -0700 Subject: [PATCH 02/27] block: Avoid deadlocks with bio allocation by stacking drivers Previously, if we ever try to allocate more than once from the same bio set while running under generic_make_request() (i.e. a stacking block driver), we risk deadlock. This is because of the code in generic_make_request() that converts recursion to iteration; any bios we submit won't actually be submitted (so they can complete and eventually be freed) until after we return - this means if we allocate a second bio, we're blocking the first one from ever being freed. Thus if enough threads call into a stacking block driver at the same time with bios that need multiple splits, and the bio_set's reserve gets used up, we deadlock. This can be worked around in the driver code - we could check if we're running under generic_make_request(), then mask out __GFP_WAIT when we go to allocate a bio, and if the allocation fails punt to workqueue and retry the allocation. But this is tricky and not a generic solution. This patch solves it for all users by inverting the previously described technique. We allocate a rescuer workqueue for each bio_set, and then in the allocation code if there are bios on current->bio_list we would be blocking, we punt them to the rescuer workqueue to be submitted. This guarantees forward progress for bio allocations under generic_make_request() provided each bio is submitted before allocating the next, and provided the bios are freed after they complete. Note that this doesn't do anything for allocation from other mempools. Instead of allocating per bio data structures from a mempool, code should use bio_set's front_pad. Tested it by forcing the rescue codepath to be taken (by disabling the first GFP_NOWAIT) attempt, and then ran it with bcache (which does a lot of arbitrary bio splitting) and verified that the rescuer was being invoked. Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo Reviewed-by: Muthukumar Ratty --- fs/bio.c | 116 +++++++++++++++++++++++++++++++++++++++++++- include/linux/bio.h | 9 ++++ 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index bb5768f59b32..73b544709945 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -297,6 +297,54 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. + * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(current->bio_list))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + + *current->bio_list = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -314,11 +362,27 @@ EXPORT_SYMBOL(bio_reset); * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * + * Note that when running under generic_make_request() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * generic_make_request() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * generic_make_request() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * generic_make_request() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * * RETURNS: * Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; @@ -336,7 +400,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) front_pad = 0; inline_vecs = nr_iovecs; } else { + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_WAIT; if that fails, we punt those bios we + * would be blocking to the rescuer workqueue before we retry + * with the original gfp_flags. + */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_WAIT; + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(bs->bio_pool, gfp_mask); + } + front_pad = bs->front_pad; inline_vecs = BIO_INLINE_VECS; } @@ -349,6 +443,12 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (nr_iovecs > inline_vecs) { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + } + if (unlikely(!bvl)) goto err_free; } else if (nr_iovecs) { @@ -1579,6 +1679,9 @@ static void biovec_free_pools(struct bio_set *bs) void bioset_free(struct bio_set *bs) { + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + if (bs->bio_pool) mempool_destroy(bs->bio_pool); @@ -1614,6 +1717,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) bs->front_pad = front_pad; + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) { kfree(bs); @@ -1624,9 +1731,14 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (!biovec_create_pools(bs, pool_size)) - return bs; + if (biovec_create_pools(bs, pool_size)) + goto bad; + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + + return bs; bad: bioset_free(bs); return NULL; diff --git a/include/linux/bio.h b/include/linux/bio.h index 93d3d17a300d..b31036ff779f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -513,6 +513,15 @@ struct bio_set { mempool_t *bio_integrity_pool; #endif mempool_t *bvec_pool; + + /* + * Deadlock avoidance for stacking block drivers: see comments in + * bio_alloc_bioset() for details + */ + spinlock_t rescue_lock; + struct bio_list rescue_list; + struct work_struct rescue_work; + struct workqueue_struct *rescue_workqueue; }; struct biovec_slab { From 6fda981cafbf908acd11e1e636fec50e99d56a47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2012 13:18:27 -0700 Subject: [PATCH 03/27] block: Fix a buffer overrun in bio_integrity_split() bio_integrity_split() seemed to be confusing pointers and arrays - bip_vec in bio_integrity_payload was an array appended to the end of the payload, so the bio_vecs in struct bio_pair should have come after the bio_integrity_payload they're for. Fix it by making bip_vec a pointer to the inline vecs - a later patch is going to make more use of this pointer. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. Petersen --- fs/bio-integrity.c | 5 +++-- include/linux/bio.h | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a3f28f331b2b..94fa1c562c0e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -112,6 +112,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, bip->bip_slab = idx; bip->bip_bio = bio; + bip->bip_vec = bip->bip_inline_vecs; bio->bi_integrity = bip; return bip; @@ -697,8 +698,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->iv1 = bip->bip_vec[0]; bp->iv2 = bip->bip_vec[0]; - bp->bip1.bip_vec[0] = bp->iv1; - bp->bip2.bip_vec[0] = bp->iv2; + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; bp->iv1.bv_len = sectors * bi->tuple_size; bp->iv2.bv_offset += sectors * bi->tuple_size; diff --git a/include/linux/bio.h b/include/linux/bio.h index b31036ff779f..81004fdcc277 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -182,7 +182,9 @@ struct bio_integrity_payload { unsigned short bip_idx; /* current bip_vec index */ struct work_struct bip_work; /* I/O completion */ - struct bio_vec bip_vec[0]; /* embedded bvec array */ + + struct bio_vec *bip_vec; + struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ From 9f060e2231ca96ca94f2ffcff730acd72606b280 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2012 15:29:33 -0700 Subject: [PATCH 04/27] block: Convert integrity to bvec_alloc_bs() This adds a pointer to the bvec array to struct bio_integrity_payload, instead of the bvecs always being inline; then the bvecs are allocated with bvec_alloc_bs(). Changed bvec_alloc_bs() and bvec_free_bs() to take a pointer to a mempool instead of the bioset, so that bio integrity can use a different mempool for its bvecs, and thus avoid a potential deadlock. This is eventually for immutable bio vecs - immutable bvecs aren't useful if we still have to copy them, hence the need for the pointer. Less code is always nice too, though. Also, bio_integrity_alloc() was using fs_bio_set if no bio_set was specified. This was wrong - using the bio_set doesn't protect us from memory allocation failures, because we just used kmalloc for the bio_integrity_payload. But it does introduce the possibility of deadlock, if for some reason we weren't supposed to be using fs_bio_set. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. Petersen --- fs/bio-integrity.c | 130 ++++++++++++++++---------------------------- fs/bio.c | 36 +++++------- include/linux/bio.h | 8 ++- 3 files changed, 67 insertions(+), 107 deletions(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 94fa1c562c0e..8c4c604c840d 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -27,48 +27,11 @@ #include #include -struct integrity_slab { - struct kmem_cache *slab; - unsigned short nr_vecs; - char name[8]; -}; - -#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } -struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { - IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), -}; -#undef IS +#define BIP_INLINE_VECS 4 +static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; -static inline unsigned int vecs_to_idx(unsigned int nr) -{ - switch (nr) { - case 1: - return 0; - case 2 ... 4: - return 1; - case 5 ... 16: - return 2; - case 17 ... 64: - return 3; - case 65 ... 128: - return 4; - case 129 ... BIO_MAX_PAGES: - return 5; - default: - BUG(); - } -} - -static inline int use_bip_pool(unsigned int idx) -{ - if (idx == BIOVEC_MAX_IDX) - return 1; - - return 0; -} - /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -84,38 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, unsigned int nr_vecs) { struct bio_integrity_payload *bip; - unsigned int idx = vecs_to_idx(nr_vecs); struct bio_set *bs = bio->bi_pool; + unsigned long idx = BIO_POOL_NONE; + unsigned inline_vecs; - if (!bs) - bs = fs_bio_set; - - BUG_ON(bio == NULL); - bip = NULL; - - /* Lower order allocations come straight from slab */ - if (!use_bip_pool(idx)) - bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); - - /* Use mempool if lower order alloc failed or max vecs were requested */ - if (bip == NULL) { - idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ + if (!bs) { + bip = kmalloc(sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * nr_vecs, gfp_mask); + inline_vecs = nr_vecs; + } else { bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - - if (unlikely(bip == NULL)) { - printk(KERN_ERR "%s: could not alloc bip\n", __func__); - return NULL; - } + inline_vecs = BIP_INLINE_VECS; } + if (unlikely(!bip)) + return NULL; + memset(bip, 0, sizeof(*bip)); + if (nr_vecs > inline_vecs) { + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + } else { + bip->bip_vec = bip->bip_inline_vecs; + } + bip->bip_slab = idx; bip->bip_bio = bio; - bip->bip_vec = bip->bip_inline_vecs; bio->bi_integrity = bip; return bip; +err: + mempool_free(bip, bs->bio_integrity_pool); + return NULL; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -131,20 +97,20 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_set *bs = bio->bi_pool; - if (!bs) - bs = fs_bio_set; - - BUG_ON(bip == NULL); - /* A cloned bio doesn't own the integrity metadata */ if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) && bip->bip_buf != NULL) kfree(bip->bip_buf); - if (use_bip_pool(bip->bip_slab)) + if (bs) { + if (bip->bip_slab != BIO_POOL_NONE) + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + mempool_free(bip, bs->bio_integrity_pool); - else - kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); + } else { + kfree(bip); + } bio->bi_integrity = NULL; } @@ -747,13 +713,14 @@ EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { - unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); - if (bs->bio_integrity_pool) return 0; - bs->bio_integrity_pool = - mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); + + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_integrity_pool) + return -1; if (!bs->bio_integrity_pool) return -1; @@ -766,13 +733,14 @@ void bioset_integrity_free(struct bio_set *bs) { if (bs->bio_integrity_pool) mempool_destroy(bs->bio_integrity_pool); + + if (bs->bvec_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); void __init bio_integrity_init(void) { - unsigned int i; - /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. @@ -782,14 +750,10 @@ void __init bio_integrity_init(void) if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); - for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { - unsigned int size; - - size = sizeof(struct bio_integrity_payload) - + bip_slab[i].nr_vecs * sizeof(struct bio_vec); - - bip_slab[i].slab = - kmem_cache_create(bip_slab[i].name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!bip_slab) + panic("Failed to create slab\n"); } diff --git a/fs/bio.c b/fs/bio.c index 73b544709945..40aa96eae99f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -160,12 +160,12 @@ unsigned int bvec_nr_vecs(unsigned short idx) return bvec_slabs[idx].nr_vecs; } -void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) { BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, bs->bvec_pool); + mempool_free(bv, pool); else { struct biovec_slab *bvs = bvec_slabs + idx; @@ -173,8 +173,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) } } -struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, - struct bio_set *bs) +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) { struct bio_vec *bvl; @@ -210,7 +210,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, */ if (*idx == BIOVEC_MAX_IDX) { fallback: - bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); @@ -253,7 +253,7 @@ static void bio_free(struct bio *bio) if (bs) { if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -442,11 +442,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); } if (unlikely(!bvl)) @@ -1661,20 +1661,11 @@ EXPORT_SYMBOL(bio_sector_offset); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. */ -static int biovec_create_pools(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); - if (!bs->bvec_pool) - return -ENOMEM; - - return 0; -} - -static void biovec_free_pools(struct bio_set *bs) -{ - mempool_destroy(bs->bvec_pool); + return mempool_create_slab_pool(pool_entries, bp->slab); } void bioset_free(struct bio_set *bs) @@ -1685,8 +1676,10 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + if (bs->bvec_pool) + mempool_destroy(bs->bvec_pool); + bioset_integrity_free(bs); - biovec_free_pools(bs); bio_put_slab(bs); kfree(bs); @@ -1731,7 +1724,8 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (biovec_create_pools(bs, pool_size)) + bs->bvec_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_pool) goto bad; bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); diff --git a/include/linux/bio.h b/include/linux/bio.h index 81004fdcc277..669b1cb18fee 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -213,6 +213,7 @@ extern void bio_pair_release(struct bio_pair *dbio); extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); +extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); @@ -288,8 +289,8 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, int, int, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); -extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); -extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); +extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); +extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); #ifdef CONFIG_BLK_CGROUP @@ -511,10 +512,11 @@ struct bio_set { unsigned int front_pad; mempool_t *bio_pool; + mempool_t *bvec_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; + mempool_t *bvec_integrity_pool; #endif - mempool_t *bvec_pool; /* * Deadlock avoidance for stacking block drivers: see comments in From 054bdf646e36c2f7dc1bf6bc6209dbbb5909164b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Sep 2012 13:17:55 -0700 Subject: [PATCH 05/27] block: Add bio_advance() This is prep work for immutable bio vecs; we first want to centralize where bvecs are modified. Next two patches convert some existing code to use this function. Signed-off-by: Kent Overstreet CC: Jens Axboe --- fs/bio.c | 41 +++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 2 ++ include/linux/blk_types.h | 2 ++ 3 files changed, 45 insertions(+) diff --git a/fs/bio.c b/fs/bio.c index 40aa96eae99f..7edc08d2246c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -752,6 +752,47 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, } EXPORT_SYMBOL(bio_add_page); +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio->bi_sector += bytes >> 9; + bio->bi_size -= bytes; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + return; + + while (bytes) { + if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { + WARN_ONCE(1, "bio idx %d >= vcnt %d\n", + bio->bi_idx, bio->bi_vcnt); + break; + } + + if (bytes >= bio_iovec(bio)->bv_len) { + bytes -= bio_iovec(bio)->bv_len; + bio->bi_idx++; + } else { + bio_iovec(bio)->bv_len -= bytes; + bio_iovec(bio)->bv_offset += bytes; + bytes = 0; + } + } +} +EXPORT_SYMBOL(bio_advance); + struct bio_map_data { struct bio_vec *iovecs; struct sg_iovec *sgvecs; diff --git a/include/linux/bio.h b/include/linux/bio.h index 669b1cb18fee..fcb4dba2d8ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -248,6 +248,8 @@ extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); +extern void bio_advance(struct bio *, unsigned); + extern void bio_init(struct bio *); extern void bio_reset(struct bio *); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e645..c178d25e588b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -197,6 +197,8 @@ enum rq_flag_bits { REQ_SECURE) #define REQ_CLONE_MASK REQ_COMMON_MASK +#define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) + /* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) From f79ea4161434b31e351658283b24e92c3e570142 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 20 Sep 2012 16:38:30 -0700 Subject: [PATCH 06/27] block: Refactor blk_update_request() Converts it to use bio_advance(), simplifying it quite a bit in the process. Note that req_bio_endio() now always calls bio_advance() - which means it always loops over the biovec, not just on partial completions. Don't expect it to affect performance, but worth noting. Tested it by forcing partial updates, and dumping before and after on various bio/bvec fields when doing a partial update. Signed-off-by: Kent Overstreet CC: Jens Axboe --- block/blk-core.c | 80 ++++++++---------------------------------------- 1 file changed, 12 insertions(+), 68 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 074b758efc42..86a1afeef606 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -158,20 +158,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio, else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - if (unlikely(nbytes > bio->bi_size)) { - printk(KERN_ERR "%s: want %u bytes done, %u left\n", - __func__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } - if (unlikely(rq->cmd_flags & REQ_QUIET)) set_bit(BIO_QUIET, &bio->bi_flags); - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); - - if (bio_integrity(bio)) - bio_integrity_advance(bio, nbytes); + bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) @@ -2252,8 +2242,7 @@ EXPORT_SYMBOL(blk_fetch_request); **/ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { - int total_bytes, bio_nbytes, next_idx = 0; - struct bio *bio; + int total_bytes; if (!req->bio) return false; @@ -2299,56 +2288,21 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) blk_account_io_completion(req, nr_bytes); - total_bytes = bio_nbytes = 0; - while ((bio = req->bio) != NULL) { - int nbytes; + total_bytes = 0; + while (req->bio) { + struct bio *bio = req->bio; + unsigned bio_bytes = min(bio->bi_size, nr_bytes); - if (nr_bytes >= bio->bi_size) { + if (bio_bytes == bio->bi_size) req->bio = bio->bi_next; - nbytes = bio->bi_size; - req_bio_endio(req, bio, nbytes, error); - next_idx = 0; - bio_nbytes = 0; - } else { - int idx = bio->bi_idx + next_idx; - if (unlikely(idx >= bio->bi_vcnt)) { - blk_dump_rq_flags(req, "__end_that"); - printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", - __func__, idx, bio->bi_vcnt); - break; - } + req_bio_endio(req, bio, bio_bytes, error); - nbytes = bio_iovec_idx(bio, idx)->bv_len; - BIO_BUG_ON(nbytes > bio->bi_size); + total_bytes += bio_bytes; + nr_bytes -= bio_bytes; - /* - * not a complete bvec done - */ - if (unlikely(nbytes > nr_bytes)) { - bio_nbytes += nr_bytes; - total_bytes += nr_bytes; - break; - } - - /* - * advance to the next vector - */ - next_idx++; - bio_nbytes += nbytes; - } - - total_bytes += nbytes; - nr_bytes -= nbytes; - - bio = req->bio; - if (bio) { - /* - * end more in this run, or just return 'not-done' - */ - if (unlikely(nr_bytes <= 0)) - break; - } + if (!nr_bytes) + break; } /* @@ -2364,16 +2318,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) return false; } - /* - * if the request wasn't completed, update state - */ - if (bio_nbytes) { - req_bio_endio(req, bio, bio_nbytes, error); - bio->bi_idx += next_idx; - bio_iovec(bio)->bv_offset += nr_bytes; - bio_iovec(bio)->bv_len -= nr_bytes; - } - req->__data_len -= total_bytes; req->buffer = bio_data(req->bio); From fb9e3534768c1daa0bd09dd00d1a8828e15079c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Sep 2012 21:41:20 -0700 Subject: [PATCH 07/27] md: Convert md_trim_bio() to use bio_advance() Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown Acked-by: NeilBrown --- drivers/md/md.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fcb878f88796..d323676580a9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -194,21 +194,12 @@ void md_trim_bio(struct bio *bio, int offset, int size) if (offset == 0 && size == bio->bi_size) return; - bio->bi_sector += offset; - bio->bi_size = size; - offset <<= 9; clear_bit(BIO_SEG_VALID, &bio->bi_flags); - while (bio->bi_idx < bio->bi_vcnt && - bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { - /* remove this whole bio_vec */ - offset -= bio->bi_io_vec[bio->bi_idx].bv_len; - bio->bi_idx++; - } - if (bio->bi_idx < bio->bi_vcnt) { - bio->bi_io_vec[bio->bi_idx].bv_offset += offset; - bio->bi_io_vec[bio->bi_idx].bv_len -= offset; - } + bio_advance(bio, offset << 9); + + bio->bi_size = size; + /* avoid any complications with bi_idx being non-zero*/ if (bio->bi_idx) { memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, From f73a1c7d117d07a96d89475066188a2b79e53c48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Sep 2012 15:05:12 -0700 Subject: [PATCH 08/27] block: Add bio_end_sector() Just a little convenience macro - main reason to add it now is preparing for immutable bio vecs, it'll reduce the size of the patch that puts bi_sector/bi_size/bi_idx into a struct bvec_iter. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Lars Ellenberg CC: Jiri Kosina CC: Alasdair Kergon CC: dm-devel@redhat.com CC: Neil Brown CC: Martin Schwidefsky CC: Heiko Carstens CC: linux-s390@vger.kernel.org CC: Chris Mason CC: Steven Whitehouse Acked-by: Steven Whitehouse --- block/blk-core.c | 2 +- block/cfq-iosched.c | 7 ++----- block/deadline-iosched.c | 2 +- drivers/block/brd.c | 3 +-- drivers/block/pktcdvd.c | 6 +++--- drivers/md/dm-stripe.c | 2 +- drivers/md/dm-verity.c | 2 +- drivers/md/faulty.c | 6 ++---- drivers/md/linear.c | 3 +-- drivers/md/raid1.c | 4 ++-- drivers/md/raid5.c | 14 +++++++------- drivers/s390/block/dcssblk.c | 3 +-- fs/btrfs/extent_io.c | 3 +-- fs/gfs2/lops.c | 2 +- include/linux/bio.h | 1 + 15 files changed, 26 insertions(+), 34 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 86a1afeef606..7236b826f4a1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1586,7 +1586,7 @@ static void handle_bad_sector(struct bio *bio) printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", bdevname(bio->bi_bdev, b), bio->bi_rw, - (unsigned long long)bio->bi_sector + bio_sectors(bio), + (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4f0ade74cfd0..d5cd3131c57a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2270,11 +2270,8 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) return NULL; cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); - if (cfqq) { - sector_t sector = bio->bi_sector + bio_sectors(bio); - - return elv_rb_find(&cfqq->sort_list, sector); - } + if (cfqq) + return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); return NULL; } diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 90037b5eb17f..ba19a3afab79 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -132,7 +132,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) * check for front merge */ if (dd->front_merges) { - sector_t sector = bio->bi_sector + bio_sectors(bio); + sector_t sector = bio_end_sector(bio); __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); if (__rq) { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 531ceb31d0ff..f1a29f8e9d33 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -334,8 +334,7 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) int err = -EIO; sector = bio->bi_sector; - if (sector + (bio->bi_size >> SECTOR_SHIFT) > - get_capacity(bdev->bd_disk)) + if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) goto out; if (unlikely(bio->bi_rw & REQ_DISCARD)) { diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 2e7de7a59bfc..26938e8e2fc3 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -901,7 +901,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) pd->iosched.successive_reads += bio->bi_size >> 10; else { pd->iosched.successive_reads = 0; - pd->iosched.last_write = bio->bi_sector + bio_sectors(bio); + pd->iosched.last_write = bio_end_sector(bio); } if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { if (pd->read_speed == pd->write_speed) { @@ -2454,7 +2454,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) zone = ZONE(bio->bi_sector, pd); VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, - (unsigned long long)(bio->bi_sector + bio_sectors(bio))); + (unsigned long long)bio_end_sector(bio)); /* Check if we have to split the bio */ { @@ -2462,7 +2462,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) sector_t last_zone; int first_sectors; - last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd); + last_zone = ZONE(bio_end_sector(bio) - 1, pd); if (last_zone != zone) { BUG_ON(last_zone != zone + pd->settings.size); first_sectors = last_zone - bio->bi_sector; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d8837d313f54..ea5e878a30b9 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -258,7 +258,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, sector_t begin, end; stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); - stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), + stripe_map_range_sector(sc, bio_end_sector(bio), target_stripe, &end); if (begin < end) { bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 6ad538375c3c..923115d08baa 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -472,7 +472,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) return -EIO; } - if ((bio->bi_sector + bio_sectors(bio)) >> + if (bio_end_sector(bio) >> (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { DMERR_LIMIT("io out of range"); return -EIO; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 5e7dc772f5de..3193aefe982b 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -185,8 +185,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) return; } - if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), - WRITE)) + if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE)) failit = 1; if (check_mode(conf, WritePersistent)) { add_sector(conf, bio->bi_sector, WritePersistent); @@ -196,8 +195,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) failit = 1; } else { /* read request */ - if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), - READ)) + if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ)) failit = 1; if (check_mode(conf, ReadTransient)) failit = 1; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 21014836bdbf..f03fabd2b37b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -317,8 +317,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) bio_io_error(bio); return; } - if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > - tmp_dev->end_sector)) { + if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) { /* This bio crosses a device boundary, so we have to * split it. */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fd86b372692d..4d8c2e0a6bad 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1018,7 +1018,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ if (bio_data_dir(bio) == WRITE && - bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && + bio_end_sector(bio) > mddev->suspend_lo && bio->bi_sector < mddev->suspend_hi) { /* As the suspend_* range is controlled by * userspace, we want an interruptible @@ -1029,7 +1029,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) flush_signals(current); prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || + if (bio_end_sector(bio) <= mddev->suspend_lo || bio->bi_sector >= mddev->suspend_hi) break; schedule(); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3ee2912889e7..68706970d217 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2384,11 +2384,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) + if (bio_end_sector(*bip) > bi->bi_sector) goto overlap; bip = & (*bip)->bi_next; } - if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) + if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) goto overlap; BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); @@ -2404,8 +2404,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && bi && bi->bi_sector <= sector; bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { - if (bi->bi_sector + (bi->bi_size>>9) >= sector) - sector = bi->bi_sector + (bi->bi_size>>9); + if (bio_end_sector(bi) >= sector) + sector = bio_end_sector(bi); } if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); @@ -3941,7 +3941,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 0, &dd_idx, NULL); - end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); + end_sector = bio_end_sector(align_bi); rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].replacement); if (!rdev || test_bit(Faulty, &rdev->flags) || @@ -4216,7 +4216,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); + last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@ -4679,7 +4679,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); - last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); + last_sector = bio_end_sector(raid_bio); for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS, diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index b6ad0de07930..12d08b4529e9 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -826,8 +826,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0) /* Request is not page-aligned. */ goto fail; - if (((bio->bi_size >> 9) + bio->bi_sector) - > get_capacity(bio->bi_bdev->bd_disk)) { + if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) { /* Request beyond end of DCSS segment. */ goto fail; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af6461..bed072aa461f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2527,8 +2527,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (old_compressed) contig = bio->bi_sector == sector; else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; + contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a5055977a214..5c37ef982390 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -300,7 +300,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) u64 nblk; if (bio) { - nblk = bio->bi_sector + bio_sectors(bio); + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; diff --git a/include/linux/bio.h b/include/linux/bio.h index fcb4dba2d8ea..20507eb7c979 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -67,6 +67,7 @@ #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) +#define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio))) static inline unsigned int bio_cur_bytes(struct bio *bio) { From aa8b57aa3d1c06ca53312294ee6dfc767ee3ddb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 5 Feb 2013 15:19:29 -0800 Subject: [PATCH 09/27] block: Use bio_sectors() more consistently Bunch of places in the code weren't using it where they could be - this'll reduce the size of the patch that puts bi_sector/bi_size/bi_idx into a struct bvec_iter. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: "Ed L. Cashin" CC: Nick Piggin CC: Jiri Kosina CC: Jim Paris CC: Geoff Levand CC: Alasdair Kergon CC: dm-devel@redhat.com CC: Neil Brown CC: Steven Rostedt Acked-by: Ed Cashin --- drivers/block/pktcdvd.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/raid0.c | 6 +++--- drivers/md/raid1.c | 17 ++++++++--------- drivers/md/raid10.c | 24 +++++++++++------------- drivers/md/raid5.c | 8 ++++---- fs/btrfs/volumes.c | 2 +- include/trace/events/block.h | 12 ++++++------ 8 files changed, 35 insertions(+), 38 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 26938e8e2fc3..2c27744b9ca6 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2433,7 +2433,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) cloned_bio->bi_bdev = pd->bdev; cloned_bio->bi_private = psd; cloned_bio->bi_end_io = pkt_end_io_read_cloned; - pd->stats.secs_r += bio->bi_size >> 9; + pd->stats.secs_r += bio_sectors(bio); pkt_queue_bio(pd, cloned_bio); return; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d053098c6a91..699b5be68d31 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -458,7 +458,7 @@ static void map_region(struct dm_io_region *io, struct mirror *m, { io->bdev = m->dev->bdev; io->sector = map_sector(m, bio); - io->count = bio->bi_size >> 9; + io->count = bio_sectors(bio); } static void hold_bio(struct mirror_set *ms, struct bio *bio) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 0505452de8d6..23a38afec351 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -502,11 +502,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, { if (likely(is_power_of_2(chunk_sects))) { return chunk_sects >= ((bio->bi_sector & (chunk_sects-1)) - + (bio->bi_size >> 9)); + + bio_sectors(bio)); } else{ sector_t sector = bio->bi_sector; return chunk_sects >= (sector_div(sector, chunk_sects) - + (bio->bi_size >> 9)); + + bio_sectors(bio)); } } @@ -567,7 +567,7 @@ bad_map: printk("md/raid0:%s: make_request bug: can't convert block across chunks" " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects / 2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); bio_io_error(bio); return; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4d8c2e0a6bad..f741c9fe25c8 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -267,7 +267,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio) (bio_data_dir(bio) == WRITE) ? "write" : "read", (unsigned long long) bio->bi_sector, (unsigned long long) bio->bi_sector + - (bio->bi_size >> 9) - 1); + bio_sectors(bio) - 1); call_bio_endio(r1_bio); } @@ -458,7 +458,7 @@ static void raid1_end_write_request(struct bio *bio, int error) " %llu-%llu\n", (unsigned long long) mbio->bi_sector, (unsigned long long) mbio->bi_sector + - (mbio->bi_size >> 9) - 1); + bio_sectors(mbio) - 1); call_bio_endio(r1_bio); } } @@ -1049,7 +1049,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; - r1_bio->sectors = bio->bi_size >> 9; + r1_bio->sectors = bio_sectors(bio); r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; @@ -1127,7 +1127,7 @@ read_again: r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector + sectors_handled; @@ -1329,14 +1329,14 @@ read_again: /* Mustn't call r1_bio_write_done before this next test, * as it could result in the bio being freed. */ - if (sectors_handled < (bio->bi_size >> 9)) { + if (sectors_handled < bio_sectors(bio)) { r1_bio_write_done(r1_bio); /* We need another r1_bio. It has already been counted * in bio->bi_phys_segments */ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = bio; - r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector + sectors_handled; @@ -1947,7 +1947,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) wbio->bi_rw = WRITE; wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); + md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); generic_make_request(wbio); } @@ -2284,8 +2284,7 @@ read_more: r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); r1_bio->master_bio = mbio; - r1_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; + r1_bio->sectors = bio_sectors(mbio) - sectors_handled; r1_bio->state = 0; set_bit(R1BIO_ReadError, &r1_bio->state); r1_bio->mddev = mddev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 77b562d18a90..5ee14ab16a05 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1169,7 +1169,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) /* If this request crosses a chunk boundary, we need to * split it. This will only happen for 1 PAGE (or less) requests. */ - if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) + if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio) > chunk_sects && (conf->geo.near_copies < conf->geo.raid_disks || conf->prev.near_copies < conf->prev.raid_disks))) { @@ -1209,7 +1209,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) bad_map: printk("md/raid10:%s: make_request bug: can't convert block across chunks" " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, - (unsigned long long)bio->bi_sector, bio->bi_size >> 10); + (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2); bio_io_error(bio); return; @@ -1224,7 +1224,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) */ wait_barrier(conf); - sectors = bio->bi_size >> 9; + sectors = bio_sectors(bio); while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && bio->bi_sector < conf->reshape_progress && bio->bi_sector + sectors > conf->reshape_progress) { @@ -1326,8 +1326,7 @@ read_again: r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = ((bio->bi_size >> 9) - - sectors_handled); + r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->state = 0; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_sector + sectors_handled; @@ -1569,7 +1568,7 @@ retry_write: * after checking if we need to go around again. */ - if (sectors_handled < (bio->bi_size >> 9)) { + if (sectors_handled < bio_sectors(bio)) { one_write_done(r10_bio); /* We need another r10_bio. It has already been counted * in bio->bi_phys_segments. @@ -1577,7 +1576,7 @@ retry_write: r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = bio; - r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; + r10_bio->sectors = bio_sectors(bio) - sectors_handled; r10_bio->mddev = mddev; r10_bio->sector = bio->bi_sector + sectors_handled; @@ -2103,7 +2102,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[i].devnum; atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); tbio->bi_sector += conf->mirrors[d].rdev->data_offset; tbio->bi_bdev = conf->mirrors[d].rdev->bdev; @@ -2128,7 +2127,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].replacement->bdev, - tbio->bi_size >> 9); + bio_sectors(tbio)); generic_make_request(tbio); } @@ -2254,13 +2253,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) wbio2 = r10_bio->devs[1].repl_bio; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); + md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); generic_make_request(wbio); } if (wbio2 && wbio2->bi_end_io) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); md_sync_acct(conf->mirrors[d].replacement->bdev, - wbio2->bi_size >> 9); + bio_sectors(wbio2)); generic_make_request(wbio2); } } @@ -2690,8 +2689,7 @@ read_more: r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); r10_bio->master_bio = mbio; - r10_bio->sectors = (mbio->bi_size >> 9) - - sectors_handled; + r10_bio->sectors = bio_sectors(mbio) - sectors_handled; r10_bio->state = 0; set_bit(R10BIO_ReadError, &r10_bio->state); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 68706970d217..4e0f87e462ce 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -90,7 +90,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) */ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) { - int sectors = bio->bi_size >> 9; + int sectors = bio_sectors(bio); if (bio->bi_sector + sectors < sector + STRIPE_SECTORS) return bio->bi_next; else @@ -3804,7 +3804,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); unsigned int chunk_sectors = mddev->chunk_sectors; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bio_sectors(bio); if (mddev->new_chunk_sectors < mddev->chunk_sectors) chunk_sectors = mddev->new_chunk_sectors; @@ -3894,7 +3894,7 @@ static int bio_fits_rdev(struct bio *bi) { struct request_queue *q = bdev_get_queue(bi->bi_bdev); - if ((bi->bi_size>>9) > queue_max_sectors(q)) + if (bio_sectors(bi) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); if (bi->bi_phys_segments > queue_max_segments(q)) @@ -3964,7 +3964,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); if (!bio_fits_rdev(align_bi) || - is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9, + is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi), &first_bad, &bad_sectors)) { /* too big in some way, or has a known bad block */ bio_put(align_bi); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5989a92236f7..d90e0485e01b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5166,7 +5166,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio, } prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if ((bio->bi_size >> 9) > max_sectors) + if (bio_sectors(bio) > max_sectors) return 0; if (!q->merge_bvec_fn) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 9961726523d0..5a28843725df 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -244,7 +244,7 @@ TRACE_EVENT(block_bio_bounce, __entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -281,7 +281,7 @@ TRACE_EVENT(block_bio_complete, __entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); __entry->error = error; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); ), @@ -309,7 +309,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -376,7 +376,7 @@ TRACE_EVENT(block_bio_queue, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -404,7 +404,7 @@ DECLARE_EVENT_CLASS(block_get_rq, TP_fast_assign( __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio ? bio->bi_sector : 0; - __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + __entry->nr_sector = bio ? bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, bio ? bio->bi_rw : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); @@ -580,7 +580,7 @@ TRACE_EVENT(block_bio_remap, TP_fast_assign( __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; - __entry->nr_sector = bio->bi_size >> 9; + __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); From 5b83636ae3c3b4f87d02a5929ad4dee831534db0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Sep 2012 15:20:38 -0700 Subject: [PATCH 10/27] block: Change bio_split() to respect the current value of bi_idx In the current code bio_split() won't be seeing partially completed bios so this doesn't change any behaviour, but this makes the code a bit clearer as to what bio_split() actually requires. The immediate purpose of the patch is removing unnecessary bi_idx references, but the end goal is to allow partial completed bios to be submitted, which along with immutable biovecs enables effecient bio splitting. Some of the callers were (double) checking that bios could be split, so update their checks too. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Lars Ellenberg CC: Neil Brown CC: Martin K. Petersen --- drivers/md/raid0.c | 3 +-- drivers/md/raid10.c | 3 +-- fs/bio-integrity.c | 4 ++-- fs/bio.c | 7 +++---- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 23a38afec351..fcf65e512cf5 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -527,8 +527,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) sector_t sector = bio->bi_sector; struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || - bio->bi_idx != 0) + if (bio_segments(bio) > 1) goto bad_map; /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 5ee14ab16a05..2e29df960bf5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1175,8 +1175,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || - bio->bi_idx != 0) + if (bio_segments(bio) > 1) goto bad_map; /* This is a one page bio that upper layers * refuse to split for us, so we need to split it. diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 8c4c604c840d..ca7b02dbf09d 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -661,8 +661,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->bio1.bi_integrity = &bp->bip1; bp->bio2.bi_integrity = &bp->bip2; - bp->iv1 = bip->bip_vec[0]; - bp->iv2 = bip->bip_vec[0]; + bp->iv1 = bip->bip_vec[bip->bip_idx]; + bp->iv2 = bip->bip_vec[bip->bip_idx]; bp->bip1.bip_vec = &bp->iv1; bp->bip2.bip_vec = &bp->iv2; diff --git a/fs/bio.c b/fs/bio.c index 7edc08d2246c..f1b4c1651089 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1620,8 +1620,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) trace_block_split(bdev_get_queue(bi->bi_bdev), bi, bi->bi_sector + first_sectors); - BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0); - BUG_ON(bi->bi_idx != 0); + BUG_ON(bio_segments(bi) > 1); atomic_set(&bp->cnt, 3); bp->error = 0; bp->bio1 = *bi; @@ -1631,8 +1630,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) bp->bio1.bi_size = first_sectors << 9; if (bi->bi_vcnt != 0) { - bp->bv1 = bi->bi_io_vec[0]; - bp->bv2 = bi->bi_io_vec[0]; + bp->bv1 = *bio_iovec(bi); + bp->bv2 = *bio_iovec(bi); if (bio_is_rw(bi)) { bp->bv2.bv_offset += first_sectors << 9; From 4f2ac93c175c4922bdddbfec6cad94b32cea0070 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Sep 2012 15:23:14 -0700 Subject: [PATCH 11/27] block: Remove bi_idx references For immutable bvecs, all bi_idx usage needs to be audited - so here we're removing all the unnecessary uses. Most of these are places where it was being initialized on a bio that was just allocated, a few others are conversions to standard macros. Signed-off-by: Kent Overstreet CC: Jens Axboe --- drivers/block/aoe/aoecmd.c | 2 +- drivers/block/floppy.c | 1 - drivers/md/dm-verity.c | 2 +- drivers/md/raid10.c | 1 - fs/buffer.c | 1 - fs/jfs/jfs_logmgr.c | 2 -- fs/logfs/dev_bdev.c | 5 ----- mm/page_io.c | 1 - 8 files changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 25ef5c014fca..8188ad131b97 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -927,7 +927,7 @@ bufinit(struct buf *buf, struct request *rq, struct bio *bio) buf->resid = bio->bi_size; buf->sector = bio->bi_sector; bio_pageinc(bio); - buf->bv = bv = &bio->bi_io_vec[bio->bi_idx]; + buf->bv = bio_iovec(bio); buf->bv_resid = bv->bv_len; WARN_ON(buf->bv_resid == 0); } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 2ddd64a9ffde..83232639034e 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3777,7 +3777,6 @@ static int __floppy_read_block_0(struct block_device *bdev) bio_vec.bv_len = size; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_idx = 0; bio.bi_size = size; bio.bi_bdev = bdev; bio.bi_sector = 0; diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 923115d08baa..4f06d9adf1ed 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -490,7 +490,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) bio->bi_end_io = verity_end_io; bio->bi_private = io; - io->io_vec_size = bio->bi_vcnt - bio->bi_idx; + io->io_vec_size = bio_segments(bio); if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) io->io_vec = io->io_vec_inline; else diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2e29df960bf5..6ffb6c08aec5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4386,7 +4386,6 @@ read_more: read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); read_bio->bi_flags |= 1 << BIO_UPTODATE; read_bio->bi_vcnt = 0; - read_bio->bi_idx = 0; read_bio->bi_size = 0; r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; diff --git a/fs/buffer.c b/fs/buffer.c index b4dcb34c9635..ecd3792ae0e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2979,7 +2979,6 @@ int submit_bh(int rw, struct buffer_head * bh) bio->bi_io_vec[0].bv_offset = bh_offset(bh); bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = bh->b_size; bio->bi_end_io = end_bio_bh_io_sync; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 2eb952c41a69..8ae5e350da43 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2004,7 +2004,6 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; @@ -2145,7 +2144,6 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_io_vec[0].bv_offset = bp->l_offset; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = LOGPSIZE; bio->bi_end_io = lbmIODone; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index e784a217b500..550475ca6a0e 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio_vec.bv_len = PAGE_SIZE; bio_vec.bv_offset = 0; bio.bi_vcnt = 1; - bio.bi_idx = 0; bio.bi_size = PAGE_SIZE; bio.bi_bdev = bdev; bio.bi_sector = page->index * (PAGE_SIZE >> 9); @@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, unlock_page(page); } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, if (i >= max_pages) { /* Block layer cannot split bios :( */ bio->bi_vcnt = i; - bio->bi_idx = 0; bio->bi_size = i * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; @@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_io_vec[i].bv_offset = 0; } bio->bi_vcnt = nr_pages; - bio->bi_idx = 0; bio->bi_size = nr_pages * PAGE_SIZE; bio->bi_bdev = super->s_bdev; bio->bi_sector = ofs >> 9; diff --git a/mm/page_io.c b/mm/page_io.c index 78eee32ee486..8d3c0c088105 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -35,7 +35,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; bio->bi_vcnt = 1; - bio->bi_idx = 0; bio->bi_size = PAGE_SIZE; bio->bi_end_io = end_io; } From 2f477877f8c4be18f054aeb7c4be8cc748cfe932 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Sep 2012 17:31:42 -0700 Subject: [PATCH 12/27] block: Remove some unnecessary bi_vcnt usage More prep work for immutable bvecs/effecient bio splitting - usage of bi_vcnt has to be auditing, so getting rid of all the unnecessary usage makes that easier. Plus, bio_segments() is really what this code wanted, as it respects the current value of bi_idx. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Eric Moore CC: "James E.J. Bottomley" CC: linux-scsi@vger.kernel.org --- drivers/message/fusion/mptsas.c | 6 +++--- drivers/scsi/libsas/sas_expander.c | 6 +++--- drivers/scsi/mpt2sas/mpt2sas_transport.c | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index fa43c391c8ed..2bb01546df0b 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -2235,10 +2235,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, } /* do we need to support multiple segments? */ - if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { + if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) { printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n", - ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req), - rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); + ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req), + bio_segments(rsp->bio), blk_rq_bytes(rsp)); return -EINVAL; } diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index aec2e0da5016..7af776737b40 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -2151,10 +2151,10 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, } /* do we need to support multiple segments? */ - if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { + if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) { printk("%s: multiple segments req %u %u, rsp %u %u\n", - __func__, req->bio->bi_vcnt, blk_rq_bytes(req), - rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); + __func__, bio_segments(req->bio), blk_rq_bytes(req), + bio_segments(rsp->bio), blk_rq_bytes(rsp)); return -EINVAL; } diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c index 8c2ffbe6af0f..193e7ae90c3b 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_transport.c +++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c @@ -1939,7 +1939,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, ioc->transport_cmds.status = MPT2_CMD_PENDING; /* Check if the request is split across multiple segments */ - if (req->bio->bi_vcnt > 1) { + if (bio_segments(req->bio) > 1) { u32 offset = 0; /* Allocate memory and copy the request */ @@ -1971,7 +1971,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, /* Check if the response needs to be populated across * multiple segments */ - if (rsp->bio->bi_vcnt > 1) { + if (bio_segments(rsp->bio) > 1) { pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp), &pci_dma_in); if (!pci_addr_in) { @@ -2038,7 +2038,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, sgl_flags = (MPI2_SGE_FLAGS_SIMPLE_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC); sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; - if (req->bio->bi_vcnt > 1) { + if (bio_segments(req->bio) > 1) { ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(req) - 4), pci_dma_out); } else { @@ -2054,7 +2054,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_END_OF_LIST); sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; - if (rsp->bio->bi_vcnt > 1) { + if (bio_segments(rsp->bio) > 1) { ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(rsp) + 4), pci_dma_in); } else { @@ -2099,7 +2099,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, le16_to_cpu(mpi_reply->ResponseDataLength); /* check if the resp needs to be copied from the allocated * pci mem */ - if (rsp->bio->bi_vcnt > 1) { + if (bio_segments(rsp->bio) > 1) { u32 offset = 0; u32 bytes_to_copy = le16_to_cpu(mpi_reply->ResponseDataLength); From 9e882242c6193ae6f416f2d8d8db0d9126bd996b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:41:12 -0700 Subject: [PATCH 13/27] block: Add submit_bio_wait(), remove from md Random cleanup - this code was duplicated and it's not really specific to md. Also added the ability to return the actual error code. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown Acked-by: Tejun Heo --- drivers/md/raid1.c | 19 ------------------- drivers/md/raid10.c | 19 ------------------- fs/bio.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 1 + 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f741c9fe25c8..800748d585ca 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2059,25 +2059,6 @@ static void fix_read_error(struct r1conf *conf, int read_disk, } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - static int narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6ffb6c08aec5..434586d43115 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2529,25 +2529,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - static int narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; diff --git a/fs/bio.c b/fs/bio.c index f1b4c1651089..4ce24ee5dcd0 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -752,6 +752,42 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, } EXPORT_SYMBOL(bio_add_page); +struct submit_bio_ret { + struct completion event; + int error; +}; + +static void submit_bio_wait_endio(struct bio *bio, int error) +{ + struct submit_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. + */ +int submit_bio_wait(int rw, struct bio *bio) +{ + struct submit_bio_ret ret; + + rw |= REQ_SYNC; + init_completion(&ret.event); + bio->bi_private = &ret; + bio->bi_end_io = submit_bio_wait_endio; + submit_bio(rw, bio); + wait_for_completion(&ret.event); + + return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + /** * bio_advance - increment/complete a bio by some number of bytes * @bio: bio to advance diff --git a/include/linux/bio.h b/include/linux/bio.h index 20507eb7c979..b20a9cd776dd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -249,6 +249,7 @@ extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); +extern int submit_bio_wait(int rw, struct bio *bio); extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *); From 8be185f2c9d54d6bc0bac1445227b67cb14c0b13 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Sep 2012 14:14:43 -0700 Subject: [PATCH 14/27] raid10: Use bio_reset() More prep work for immutable bio vecs, mainly getting rid of references to bi_idx. bio_reset was being open coded in a few places. The one in sync_request was a bit nontrivial to convert, so could use some extra eyeballs. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown Acked-by: NeilBrown --- drivers/md/raid10.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 434586d43115..e32e8b1042f8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2077,13 +2077,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) * First we need to fixup bv_offset, bv_len and * bi_vecs, as the read request might have corrupted these */ + bio_reset(tbio); + tbio->bi_vcnt = vcnt; tbio->bi_size = r10_bio->sectors << 9; - tbio->bi_idx = 0; - tbio->bi_phys_segments = 0; - tbio->bi_flags &= ~(BIO_POOL_MASK - 1); - tbio->bi_flags |= 1 << BIO_UPTODATE; - tbio->bi_next = NULL; tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; tbio->bi_sector = r10_bio->devs[i].addr; @@ -3090,6 +3087,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, } } bio = r10_bio->devs[0].bio; + bio_reset(bio); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; @@ -3114,6 +3112,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, rdev = mirror->rdev; if (!test_bit(In_sync, &rdev->flags)) { bio = r10_bio->devs[1].bio; + bio_reset(bio); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; @@ -3142,6 +3141,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, if (rdev == NULL || bio == NULL || test_bit(Faulty, &rdev->flags)) break; + bio_reset(bio); bio->bi_next = biolist; biolist = bio; bio->bi_private = r10_bio; @@ -3240,7 +3240,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[i].repl_bio->bi_end_io = NULL; bio = r10_bio->devs[i].bio; - bio->bi_end_io = NULL; + bio_reset(bio); clear_bit(BIO_UPTODATE, &bio->bi_flags); if (conf->mirrors[d].rdev == NULL || test_bit(Faulty, &conf->mirrors[d].rdev->flags)) @@ -3277,6 +3277,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; + bio_reset(bio); clear_bit(BIO_UPTODATE, &bio->bi_flags); sector = r10_bio->devs[i].addr; @@ -3310,17 +3311,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, } } - for (bio = biolist; bio ; bio=bio->bi_next) { - - bio->bi_flags &= ~(BIO_POOL_MASK - 1); - if (bio->bi_end_io) - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_size = 0; - } - nr_sectors = 0; if (sector_nr + max_sync < max_sector) max_sector = sector_nr + max_sync; @@ -4390,17 +4380,14 @@ read_more: } if (!rdev2 || test_bit(Faulty, &rdev2->flags)) continue; + + bio_reset(b); b->bi_bdev = rdev2->bdev; b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; b->bi_private = r10_bio; b->bi_end_io = end_reshape_write; b->bi_rw = WRITE; - b->bi_flags &= ~(BIO_POOL_MASK - 1); - b->bi_flags |= 1 << BIO_UPTODATE; b->bi_next = blist; - b->bi_vcnt = 0; - b->bi_idx = 0; - b->bi_size = 0; blist = b; } From 2aabaa65ad147bf8238a8a261b15a0c95e9e2afa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Sep 2012 11:26:12 -0700 Subject: [PATCH 15/27] raid1: use bio_reset() Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown --- drivers/md/raid1.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 800748d585ca..bb5ec7710c00 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1862,7 +1862,7 @@ static int process_checks(struct r1bio *r1_bio) struct bio *sbio = r1_bio->bios[i]; int size; - if (r1_bio->bios[i]->bi_end_io != end_sync_read) + if (sbio->bi_end_io != end_sync_read) continue; if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { @@ -1887,16 +1887,15 @@ static int process_checks(struct r1bio *r1_bio) continue; } /* fixup the bio for reuse */ + bio_reset(sbio); sbio->bi_vcnt = vcnt; sbio->bi_size = r1_bio->sectors << 9; - sbio->bi_idx = 0; - sbio->bi_phys_segments = 0; - sbio->bi_flags &= ~(BIO_POOL_MASK - 1); - sbio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bi_next = NULL; sbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; sbio->bi_bdev = conf->mirrors[i].rdev->bdev; + sbio->bi_end_io = end_sync_read; + sbio->bi_private = r1_bio; + size = sbio->bi_size; for (j = 0; j < vcnt ; j++) { struct bio_vec *bi; @@ -2439,18 +2438,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp for (i = 0; i < conf->raid_disks * 2; i++) { struct md_rdev *rdev; bio = r1_bio->bios[i]; - - /* take from bio_init */ - bio->bi_next = NULL; - bio->bi_flags &= ~(BIO_POOL_MASK-1); - bio->bi_flags |= 1 << BIO_UPTODATE; - bio->bi_rw = READ; - bio->bi_vcnt = 0; - bio->bi_idx = 0; - bio->bi_phys_segments = 0; - bio->bi_size = 0; - bio->bi_end_io = NULL; - bio->bi_private = NULL; + bio_reset(bio); rdev = rcu_dereference(conf->mirrors[i].rdev); if (rdev == NULL || From 2f6db2a7073452b123726e2baab1f37d511bf86d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 11 Sep 2012 12:26:38 -0700 Subject: [PATCH 16/27] raid5: use bio_reset() Had to shuffle the code around a bit (where bi_rw and bi_end_io were set), but shouldn't really be anything tricky here Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown --- drivers/md/raid5.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4e0f87e462ce..7bbd28546214 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -567,14 +567,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi = &sh->dev[i].req; rbi = &sh->dev[i].rreq; /* For writing to replacement */ - bi->bi_rw = rw; - rbi->bi_rw = rw; - if (rw & WRITE) { - bi->bi_end_io = raid5_end_write_request; - rbi->bi_end_io = raid5_end_write_request; - } else - bi->bi_end_io = raid5_end_read_request; - rcu_read_lock(); rrdev = rcu_dereference(conf->disks[i].replacement); smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */ @@ -649,7 +641,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(STRIPE_IO_STARTED, &sh->state); + bio_reset(bi); bi->bi_bdev = rdev->bdev; + bi->bi_rw = rw; + bi->bi_end_io = (rw & WRITE) + ? raid5_end_write_request + : raid5_end_read_request; + bi->bi_private = sh; + pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, bi->bi_rw, i); @@ -663,12 +662,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_FLUSH; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_idx = 0; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), @@ -683,7 +679,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) set_bit(STRIPE_IO_STARTED, &sh->state); + bio_reset(rbi); rbi->bi_bdev = rrdev->bdev; + rbi->bi_rw = rw; + BUG_ON(!(rw & WRITE)); + rbi->bi_end_io = raid5_end_write_request; + rbi->bi_private = sh; + pr_debug("%s: for %llu schedule op %ld on " "replacement disc %d\n", __func__, (unsigned long long)sh->sector, @@ -695,12 +697,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_sector = (sh->sector + rrdev->data_offset); - rbi->bi_flags = 1 << BIO_UPTODATE; - rbi->bi_idx = 0; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; - rbi->bi_next = NULL; trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); From b783863f68c26c5411c50002f98a047a40b94e8e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 15:17:11 -0700 Subject: [PATCH 17/27] raid1: Refactor narrow_write_error() to not use bi_idx More bi_idx removal. This code was just open coding bio_clone(). This could probably be further improved by using bio_advance() instead of skipping over null pages, but that'd be a larger rework. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown --- drivers/md/raid1.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index bb5ec7710c00..b36231866336 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2063,8 +2063,6 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; struct md_rdev *rdev = conf->mirrors[i].rdev; - int vcnt, idx; - struct bio_vec *vec; /* bio has the data to be written to device 'i' where * we just recently had a write error. @@ -2092,30 +2090,32 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) & ~(sector_t)(block_sectors - 1)) - sector; - if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { - vcnt = r1_bio->behind_page_count; - vec = r1_bio->behind_bvecs; - idx = 0; - while (vec[idx].bv_page == NULL) - idx++; - } else { - vcnt = r1_bio->master_bio->bi_vcnt; - vec = r1_bio->master_bio->bi_io_vec; - idx = r1_bio->master_bio->bi_idx; - } while (sect_to_write) { struct bio *wbio; if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors'*/ - wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); - memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); - wbio->bi_sector = r1_bio->sector; + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { + unsigned vcnt = r1_bio->behind_page_count; + struct bio_vec *vec = r1_bio->behind_bvecs; + + while (!vec->bv_page) { + vec++; + vcnt--; + } + + wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev); + memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec)); + + wbio->bi_vcnt = vcnt; + } else { + wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + } + wbio->bi_rw = WRITE; - wbio->bi_vcnt = vcnt; + wbio->bi_sector = r1_bio->sector; wbio->bi_size = r1_bio->sectors << 9; - wbio->bi_idx = idx; md_trim_bio(wbio, sector - r1_bio->sector, sectors); wbio->bi_sector += rdev->data_offset; From 16ac3d63e74f3d6e34e42d6e523b6a61de0020f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 13:57:51 -0700 Subject: [PATCH 18/27] block: Add bio_copy_data() This gets open coded quite a bit and it's tricky to get right, so make a generic version and convert some existing users over to it instead. Signed-off-by: Kent Overstreet CC: Jens Axboe --- fs/bio.c | 70 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 2 ++ 2 files changed, 72 insertions(+) diff --git a/fs/bio.c b/fs/bio.c index 4ce24ee5dcd0..e437f9aae67d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -829,6 +829,76 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bio_vec *src_bv, *dst_bv; + unsigned src_offset, dst_offset, bytes; + void *src_p, *dst_p; + + src_bv = bio_iovec(src); + dst_bv = bio_iovec(dst); + + src_offset = src_bv->bv_offset; + dst_offset = dst_bv->bv_offset; + + while (1) { + if (src_offset == src_bv->bv_offset + src_bv->bv_len) { + src_bv++; + if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) { + src = src->bi_next; + if (!src) + break; + + src_bv = bio_iovec(src); + } + + src_offset = src_bv->bv_offset; + } + + if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) { + dst_bv++; + if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) { + dst = dst->bi_next; + if (!dst) + break; + + dst_bv = bio_iovec(dst); + } + + dst_offset = dst_bv->bv_offset; + } + + bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset, + src_bv->bv_offset + src_bv->bv_len - src_offset); + + src_p = kmap_atomic(src_bv->bv_page); + dst_p = kmap_atomic(dst_bv->bv_page); + + memcpy(dst_p + dst_bv->bv_offset, + src_p + src_bv->bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + src_offset += bytes; + dst_offset += bytes; + } +} +EXPORT_SYMBOL(bio_copy_data); + struct bio_map_data { struct bio_vec *iovecs; struct sg_iovec *sgvecs; diff --git a/include/linux/bio.h b/include/linux/bio.h index b20a9cd776dd..90d36c65cb70 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -286,6 +286,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi) } #endif +extern void bio_copy_data(struct bio *dst, struct bio *src); + extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, From ffb25dc60ff14f90a581975307b0c1d07e1f362a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Sep 2012 06:44:57 -0700 Subject: [PATCH 19/27] pktcdvd: use bio_copy_data() Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Jiri Kosina --- drivers/block/pktcdvd.c | 79 +++++++---------------------------------- 1 file changed, 12 insertions(+), 67 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 2c27744b9ca6..783c96c89b75 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -947,31 +947,6 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que } } -/* - * Copy CD_FRAMESIZE bytes from src_bio into a destination page - */ -static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs) -{ - unsigned int copy_size = CD_FRAMESIZE; - - while (copy_size > 0) { - struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg); - void *vfrom = kmap_atomic(src_bvl->bv_page) + - src_bvl->bv_offset + offs; - void *vto = page_address(dst_page) + dst_offs; - int len = min_t(int, copy_size, src_bvl->bv_len - offs); - - BUG_ON(len < 0); - memcpy(vto, vfrom, len); - kunmap_atomic(vfrom); - - seg++; - offs = 0; - dst_offs += len; - copy_size -= len; - } -} - /* * Copy all data for this packet to pkt->pages[], so that * a) The number of required segments for the write bio is minimized, which @@ -1325,55 +1300,35 @@ try_next_bio: */ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) { - struct bio *bio; int f; - int frames_write; struct bio_vec *bvec = pkt->w_bio->bi_io_vec; + bio_reset(pkt->w_bio); + pkt->w_bio->bi_sector = pkt->sector; + pkt->w_bio->bi_bdev = pd->bdev; + pkt->w_bio->bi_end_io = pkt_end_io_packet_write; + pkt->w_bio->bi_private = pkt; + + /* XXX: locking? */ for (f = 0; f < pkt->frames; f++) { bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE; + if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) + BUG(); } + VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. */ - frames_write = 0; spin_lock(&pkt->lock); - bio_list_for_each(bio, &pkt->orig_bios) { - int segment = bio->bi_idx; - int src_offs = 0; - int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9); - int num_frames = bio->bi_size / CD_FRAMESIZE; - BUG_ON(first_frame < 0); - BUG_ON(first_frame + num_frames > pkt->frames); - for (f = first_frame; f < first_frame + num_frames; f++) { - struct bio_vec *src_bvl = bio_iovec_idx(bio, segment); + bio_copy_data(pkt->w_bio, pkt->orig_bios.head); - while (src_offs >= src_bvl->bv_len) { - src_offs -= src_bvl->bv_len; - segment++; - BUG_ON(segment >= bio->bi_vcnt); - src_bvl = bio_iovec_idx(bio, segment); - } - - if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) { - bvec[f].bv_page = src_bvl->bv_page; - bvec[f].bv_offset = src_bvl->bv_offset + src_offs; - } else { - pkt_copy_bio_data(bio, segment, src_offs, - bvec[f].bv_page, bvec[f].bv_offset); - } - src_offs += CD_FRAMESIZE; - frames_write++; - } - } pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n", - frames_write, (unsigned long long)pkt->sector); - BUG_ON(frames_write != pkt->write_size); + pkt->write_size, (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { pkt_make_local_copy(pkt, bvec); @@ -1383,16 +1338,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) } /* Start the write request */ - bio_reset(pkt->w_bio); - pkt->w_bio->bi_sector = pkt->sector; - pkt->w_bio->bi_bdev = pd->bdev; - pkt->w_bio->bi_end_io = pkt_end_io_packet_write; - pkt->w_bio->bi_private = pkt; - for (f = 0; f < pkt->frames; f++) - if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) - BUG(); - VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt); - atomic_set(&pkt->io_wait, 1); pkt->w_bio->bi_rw = WRITE; pkt_queue_bio(pd, pkt->w_bio); From ff8e0070d1a4d09cb462d3059956530fa67fd91d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Sep 2012 14:11:38 -0700 Subject: [PATCH 20/27] pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage In the short term this'll help with code auditing, and if this code ever gets used now it's converted :) Signed-off-by: Kent Overstreet CC: Jiri Kosina --- drivers/block/pktcdvd.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 783c96c89b75..11190424536a 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1156,16 +1156,15 @@ static int pkt_start_recovery(struct packet_data *pkt) new_sector = new_block * (CD_FRAMESIZE >> 9); pkt->sector = new_sector; + bio_reset(pkt->bio); + pkt->bio->bi_bdev = pd->bdev; + pkt->bio->bi_rw = REQ_WRITE; pkt->bio->bi_sector = new_sector; - pkt->bio->bi_next = NULL; - pkt->bio->bi_flags = 1 << BIO_UPTODATE; - pkt->bio->bi_idx = 0; + pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE; + pkt->bio->bi_vcnt = pkt->frames; - BUG_ON(pkt->bio->bi_rw != REQ_WRITE); - BUG_ON(pkt->bio->bi_vcnt != pkt->frames); - BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE); - BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write); - BUG_ON(pkt->bio->bi_private != pkt); + pkt->bio->bi_end_io = pkt_end_io_packet_write; + pkt->bio->bi_private = pkt; drop_super(sb); return 1; From d3b45c2a056c5df443d1ddf27fbfada65f234af1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 13:49:33 -0700 Subject: [PATCH 21/27] raid1: use bio_copy_data() This doesn't really delete any code _yet_, but once immutable bvecs are done we can just delete the rest of the code in that loop. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown --- drivers/md/raid1.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b36231866336..0a3988a25aab 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1906,10 +1906,9 @@ static int process_checks(struct r1bio *r1_bio) else bi->bv_len = size; size -= PAGE_SIZE; - memcpy(page_address(bi->bv_page), - page_address(pbio->bi_io_vec[j].bv_page), - PAGE_SIZE); } + + bio_copy_data(sbio, pbio); } return 0; } From 6bc454d150047fcfbf53346412e64cdf3bf61a79 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:30:37 -0700 Subject: [PATCH 22/27] bounce: Refactor __blk_queue_bounce to not use bi_io_vec A bunch of what __blk_queue_bounce() was doing was problematic for the immutable bvec work; this cleans that up and the code is quite a bit smaller, too. The __bio_for_each_segment() in copy_to_high_bio_irq() was changed because that one's looping over the original bio, not the bounce bio - a later patch renames __bio_for_each_segment() -> bio_for_each_segment_all(), and documents that bio_for_each_segment_all() is only for code that owns the bio. Signed-off-by: Kent Overstreet CC: Jens Axboe --- mm/bounce.c | 73 ++++++++++++++--------------------------------------- 1 file changed, 19 insertions(+), 54 deletions(-) diff --git a/mm/bounce.c b/mm/bounce.c index 5f8901768602..55f512af50c7 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) struct bio_vec *tovec, *fromvec; int i; - __bio_for_each_segment(tovec, to, i, 0) { + bio_for_each_segment(tovec, to, i) { fromvec = from->bi_io_vec + i; /* @@ -218,78 +218,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, mempool_t *pool, int force) { - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); + struct bio *bio; + int rw = bio_data_dir(*bio_orig); struct bio_vec *to, *from; + unsigned i; - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; + bio_for_each_segment(from, *bio_orig, i) + if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) + goto bounce; + + return; +bounce: + bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); + + bio_for_each_segment(to, bio, i) { + struct page *page = to->bv_page; - /* - * is destination page below bounce pfn? - */ if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) continue; - /* - * irk, bounce it - */ - if (!bio) { - unsigned int cnt = (*bio_orig)->bi_vcnt; - - bio = bio_alloc(GFP_NOIO, cnt); - memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec)); - } - - - to = bio->bi_io_vec + i; - - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; inc_zone_page_state(to->bv_page, NR_BOUNCE); + to->bv_page = mempool_alloc(pool, q->bounce_gfp); if (rw == WRITE) { char *vto, *vfrom; - flush_dcache_page(from->bv_page); + flush_dcache_page(page); + vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); + kunmap_atomic(vfrom); } } - /* - * no pages bounced - */ - if (!bio) - return; - trace_block_bio_bounce(q, *bio_orig); - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; if (pool == page_pool) { bio->bi_end_io = bounce_end_io_write; From d74c6d514fe314b8bdab58b487b25992291577ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Feb 2013 12:23:11 -0800 Subject: [PATCH 23/27] block: Add bio_for_each_segment_all() __bio_for_each_segment() iterates bvecs from the specified index instead of bio->bv_idx. Currently, the only usage is to walk all the bvecs after the bio has been advanced by specifying 0 index. For immutable bvecs, we need to split these apart; bio_for_each_segment() is going to have a different implementation. This will also help document the intent of code that's using it - bio_for_each_segment_all() is only legal to use for code that owns the bio. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Neil Brown CC: Boaz Harrosh --- drivers/block/rbd.c | 2 +- drivers/md/raid1.c | 2 +- fs/bio.c | 12 ++++++------ fs/exofs/ore.c | 2 +- fs/exofs/ore_raid.c | 2 +- include/linux/bio.h | 17 ++++++++++++++--- mm/bounce.c | 2 +- 7 files changed, 25 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6c81a4c040b9..11e179826b60 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -952,7 +952,7 @@ static struct bio *bio_clone_range(struct bio *bio_src, /* Find first affected segment... */ resid = offset; - __bio_for_each_segment(bv, bio_src, idx, 0) { + bio_for_each_segment(bv, bio_src, idx) { if (resid < bv->bv_len) break; resid -= bv->bv_len; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0a3988a25aab..853482015d3d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1291,7 +1291,7 @@ read_again: * know the original bi_idx, so we just free * them all */ - __bio_for_each_segment(bvec, mbio, j, 0) + bio_for_each_segment_all(bvec, mbio, j) bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); diff --git a/fs/bio.c b/fs/bio.c index e437f9aae67d..618f9044c414 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -961,7 +961,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int iov_idx = 0; unsigned int iov_off = 0; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = iovecs[i].bv_len; @@ -1143,7 +1143,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return bio; cleanup: if (!map_data) - bio_for_each_segment(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); @@ -1357,7 +1357,7 @@ static void __bio_unmap_user(struct bio *bio) /* * make sure we dirty pages we wrote to */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1463,7 +1463,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) int i; char *p = bmd->sgvecs[0].iov_base; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; @@ -1503,7 +1503,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (!reading) { void *p = data; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); memcpy(addr, p, bvec->bv_len); @@ -1789,7 +1789,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index, if (index >= bio->bi_idx) index = bio->bi_vcnt - 1; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { if (i == index) { if (offset > bv->bv_offset) sectors += (offset - bv->bv_offset) / sector_sz; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f936cb50dc0d..b74422888604 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio) struct bio_vec *bv; unsigned i; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index b963f38ac298..7682b970d0f1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) if (!bio) continue; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; SetPageUptodate(page); diff --git a/include/linux/bio.h b/include/linux/bio.h index 90d36c65cb70..be2efa09f9bf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -137,16 +137,27 @@ static inline int bio_has_allocated_vec(struct bio *bio) #define bio_io_error(bio) bio_endio((bio), -EIO) /* - * drivers should not use the __ version unless they _really_ want to - * run through the entire bio and not just pending pieces + * drivers should not use the __ version unless they _really_ know what + * they're doing */ #define __bio_for_each_segment(bvl, bio, i, start_idx) \ for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ i < (bio)->bi_vcnt; \ bvl++, i++) +/* + * drivers should _never_ use the all version - the bio may have been split + * before it got to the driver and the driver won't own all of it + */ +#define bio_for_each_segment_all(bvl, bio, i) \ + for (i = 0; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) + #define bio_for_each_segment(bvl, bio, i) \ - __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) + for (i = (bio)->bi_idx; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) /* * get a reference to a bio, so it won't disappear. the intended use is diff --git a/mm/bounce.c b/mm/bounce.c index 55f512af50c7..2ee1b6fef44a 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) /* * free up bounce indirect pages used */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { org_vec = bio_orig->bi_io_vec + i; if (bvec->bv_page == org_vec->bv_page) continue; From cb34e057ad22a1c2c6f2cb6cd1cbd05cc2f28f28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Sep 2012 15:22:02 -0700 Subject: [PATCH 24/27] block: Convert some code to bio_for_each_segment_all() More prep work for immutable bvecs: A few places in the code were either open coding or using the wrong version - fix. After we introduce the bvec iter, it'll no longer be possible to modify the biovec through bio_for_each_segment_all() - it doesn't increment a pointer to the current bvec, you pass in a struct bio_vec (not a pointer) which is updated with what the current biovec would be (taking into account bi_bvec_done and bi_size). So because of that it's more worthwhile to be consistent about bio_for_each_segment()/bio_for_each_segment_all() usage. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown CC: Alasdair Kergon CC: dm-devel@redhat.com CC: Alexander Viro --- drivers/md/dm-crypt.c | 3 +-- drivers/md/raid1.c | 10 +++------- fs/bio.c | 20 ++++++++++---------- fs/direct-io.c | 8 ++++---- mm/bounce.c | 2 +- 5 files changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 13c15480d940..6d2d41ae9e32 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -858,8 +858,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) unsigned int i; struct bio_vec *bv; - for (i = 0; i < clone->bi_vcnt; i++) { - bv = bio_iovec_idx(clone, i); + bio_for_each_segment_all(bv, clone, i) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, cc->page_pool); bv->bv_page = NULL; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 853482015d3d..a7ea954abe1d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -925,7 +925,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) if (unlikely(!bvecs)) return; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { bvecs[i] = *bvec; bvecs[i].bv_page = alloc_page(GFP_NOIO); if (unlikely(!bvecs[i].bv_page)) @@ -1284,12 +1284,8 @@ read_again: struct bio_vec *bvec; int j; - /* Yes, I really want the '__' version so that - * we clear any unused pointer in the io_vec, rather - * than leave them unchanged. This is important - * because when we come to free the pages, we won't - * know the original bi_idx, so we just free - * them all + /* + * We trimmed the bio, so _all is legit */ bio_for_each_segment_all(bvec, mbio, j) bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; diff --git a/fs/bio.c b/fs/bio.c index 618f9044c414..fe3aee90c988 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1548,11 +1548,11 @@ EXPORT_SYMBOL(bio_copy_kern); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page && !PageCompound(page)) set_page_dirty_lock(page); @@ -1561,11 +1561,11 @@ void bio_set_pages_dirty(struct bio *bio) static void bio_release_pages(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (page) put_page(page); @@ -1614,16 +1614,16 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct bio_vec *bvec; int nr_clean_pages = 0; int i; - for (i = 0; i < bio->bi_vcnt; i++) { - struct page *page = bvec[i].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (PageDirty(page) || PageCompound(page)) { page_cache_release(page); - bvec[i].bv_page = NULL; + bvec->bv_page = NULL; } else { nr_clean_pages++; } diff --git a/fs/direct-io.c b/fs/direct-io.c index f853263cf74f..38484b08a39a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -441,8 +441,8 @@ static struct bio *dio_await_one(struct dio *dio) static int dio_bio_complete(struct dio *dio, struct bio *bio) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); - struct bio_vec *bvec = bio->bi_io_vec; - int page_no; + struct bio_vec *bvec; + unsigned i; if (!uptodate) dio->io_error = -EIO; @@ -450,8 +450,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (dio->is_async && dio->rw == READ) { bio_check_pages_dirty(bio); /* transfers ownership */ } else { - for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { - struct page *page = bvec[page_no].bv_page; + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; if (dio->rw == READ && !PageCompound(page)) set_page_dirty_lock(page); diff --git a/mm/bounce.c b/mm/bounce.c index 2ee1b6fef44a..f5326b24d65d 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -231,7 +231,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, bounce: bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set); - bio_for_each_segment(to, bio, i) { + bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) From a07876064a0b73ab5ef1ebcf14b1cf0231c07858 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:03:28 -0700 Subject: [PATCH 25/27] block: Add bio_alloc_pages() More utility code to replace stuff that's getting open coded. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown --- drivers/md/raid1.c | 16 +++------------- fs/bio.c | 28 ++++++++++++++++++++++++++++ include/linux/bio.h | 1 + 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7ea954abe1d..aeb4e3f74791 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -92,7 +92,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; - struct page *page; struct r1bio *r1_bio; struct bio *bio; int i, j; @@ -122,14 +121,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) j = 1; while(j--) { bio = r1_bio->bios[j]; - for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; + bio->bi_vcnt = RESYNC_PAGES; - bio->bi_io_vec[i].bv_page = page; - bio->bi_vcnt = i+1; - } + if (bio_alloc_pages(bio, gfp_flags)) + goto out_free_bio; } /* If not user-requests, copy the page pointers to all bios */ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { @@ -143,11 +138,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; -out_free_pages: - for (j=0 ; j < pi->raid_disks; j++) - for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) - put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); - j = -1; out_free_bio: while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); diff --git a/fs/bio.c b/fs/bio.c index fe3aee90c988..e545a440d376 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -829,6 +829,34 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. + */ +int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, i) { + bv->bv_page = alloc_page(gfp_mask); + if (!bv->bv_page) { + while (--bv >= bio->bi_io_vec) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} +EXPORT_SYMBOL(bio_alloc_pages); + /** * bio_copy_data - copy contents of data buffers from one chain of bios to * another diff --git a/include/linux/bio.h b/include/linux/bio.h index be2efa09f9bf..e25378f2f408 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -298,6 +298,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); +extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); From a38352e0ac02dbbd4fa464dc22d1352b5fbd06fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 May 2012 13:03:11 -0700 Subject: [PATCH 26/27] block: Add an explicit bio flag for bios that own their bvec This is for the new bio splitting code. When we split a bio, if the split occured on a bvec boundry we reuse the bvec for the new bio. But that means bio_free() can't free it, hence the explicit flag. Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo --- fs/bio.c | 4 +++- include/linux/bio.h | 5 ----- include/linux/blk_types.h | 1 + 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index e545a440d376..9238a54b562c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -252,7 +252,7 @@ static void bio_free(struct bio *bio) __bio_free(bio); if (bs) { - if (bio_has_allocated_vec(bio)) + if (bio_flagged(bio, BIO_OWNS_VEC)) bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); /* @@ -451,6 +451,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (unlikely(!bvl)) goto err_free; + + bio->bi_flags |= 1 << BIO_OWNS_VEC; } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } diff --git a/include/linux/bio.h b/include/linux/bio.h index e25378f2f408..794bcd0c5039 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -85,11 +85,6 @@ static inline void *bio_data(struct bio *bio) return NULL; } -static inline int bio_has_allocated_vec(struct bio *bio) -{ - return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs; -} - /* * will die */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c178d25e588b..538289ffc704 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -117,6 +117,7 @@ struct bio { * BIO_POOL_IDX() */ #define BIO_RESET_BITS 12 +#define BIO_OWNS_VEC 12 /* bio_free() should free bvec */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) From 29ed7813ce5c4661261aeebddb1b8660e0860223 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Sep 2012 09:54:22 -0700 Subject: [PATCH 27/27] bio-integrity: Add explicit field for owner of bip_buf This was the only real user of BIO_CLONED, which didn't have very clear semantics. Convert to its own flag so we can get rid of BIO_CLONED. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. Petersen --- fs/bio-integrity.c | 5 ++--- include/linux/bio.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index ca7b02dbf09d..8fb42916d8a2 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -97,9 +97,7 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_set *bs = bio->bi_pool; - /* A cloned bio doesn't own the integrity metadata */ - if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) - && bip->bip_buf != NULL) + if (bip->bip_owns_buf) kfree(bip->bip_buf); if (bs) { @@ -386,6 +384,7 @@ int bio_integrity_prep(struct bio *bio) return -EIO; } + bip->bip_owns_buf = 1; bip->bip_buf = buf; bip->bip_size = len; bip->bip_sector = bio->bi_sector; diff --git a/include/linux/bio.h b/include/linux/bio.h index 794bcd0c5039..ef24466d8f82 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -187,6 +187,7 @@ struct bio_integrity_payload { unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_idx; /* current bip_vec index */ + unsigned bip_owns_buf:1; /* should free bip_buf */ struct work_struct bip_work; /* I/O completion */