[PATCH] md: write intent bitmap support for raid10

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
NeilBrown 2006-01-06 00:20:16 -08:00 committed by Linus Torvalds
parent b15c2e57f0
commit 6cce3b23f6
3 changed files with 171 additions and 26 deletions

View file

@ -714,9 +714,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
mddev->bitmap_file == NULL) {
if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
&& mddev->level != 10) {
/* FIXME use a better test */
printk(KERN_WARNING "md: bitmaps only support for raid1\n");
printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
return -EINVAL;
}
mddev->bitmap_offset = mddev->default_bitmap_offset;
@ -1037,8 +1038,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
mddev->bitmap_file == NULL ) {
if (mddev->level != 1) {
printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
&& mddev->level != 10) {
printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
return -EINVAL;
}
mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);

View file

@ -18,7 +18,9 @@
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "dm-bio-list.h"
#include <linux/raid/raid10.h>
#include <linux/raid/bitmap.h>
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
/*
* this branch is our 'one mirror IO has finished' event handler:
*/
if (!uptodate)
if (!uptodate) {
md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
else
/* an I/O failed, we can't clear the bitmap */
set_bit(R10BIO_Degraded, &r10_bio->state);
} else
/*
* Set R10BIO_Uptodate in our master bio, so that
* we will return a good error code for to the higher
@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
* already.
*/
if (atomic_dec_and_test(&r10_bio->remaining)) {
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
r10_bio->sectors,
!test_bit(R10BIO_Degraded, &r10_bio->state),
0);
md_write_end(r10_bio->mddev);
raid_end_bio_io(r10_bio);
}
@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
rcu_read_lock();
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
* device if no resync is going on (recovery is ok), or below
* the resync window. We take the first readable disk when
* above the resync window.
*/
if (conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync)) {
@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)
static void raid10_unplug(request_queue_t *q)
{
mddev_t *mddev = q->queuedata;
unplug_slaves(q->queuedata);
md_wakeup_thread(mddev->thread);
}
static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
*/
#define RESYNC_DEPTH 32
static void raise_barrier(conf_t *conf)
static void raise_barrier(conf_t *conf, int force)
{
BUG_ON(force && !conf->barrier);
spin_lock_irq(&conf->resync_lock);
/* Wait until no block IO is waiting */
wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
/* Wait until no block IO is waiting (unless 'force') */
wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
conf->resync_lock,
raid10_unplug(conf->mddev->queue));
@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
int i;
int chunk_sects = conf->chunk_mask + 1;
const int rw = bio_data_dir(bio);
struct bio_list bl;
unsigned long flags;
if (unlikely(bio_barrier(bio))) {
bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_sector;
r10_bio->state = 0;
if (rw == READ) {
/*
@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
!test_bit(Faulty, &rdev->flags)) {
atomic_inc(&rdev->nr_pending);
r10_bio->devs[i].bio = bio;
} else
} else {
r10_bio->devs[i].bio = NULL;
set_bit(R10BIO_Degraded, &r10_bio->state);
}
}
rcu_read_unlock();
atomic_set(&r10_bio->remaining, 1);
atomic_set(&r10_bio->remaining, 0);
bio_list_init(&bl);
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
int d = r10_bio->devs[i].devnum;
@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining);
generic_make_request(mbio);
bio_list_add(&bl, mbio);
}
if (atomic_dec_and_test(&r10_bio->remaining)) {
md_write_end(mddev);
raid_end_bio_io(r10_bio);
}
bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_merge(&conf->pending_bio_list, &bl);
blk_plug_device(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
return 0;
}
@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
if (!enough(conf))
return 0;
for (mirror=0; mirror < mddev->raid_disks; mirror++)
if (rdev->saved_raid_disk >= 0 &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
mirror = rdev->saved_raid_disk;
else
mirror = 0;
for ( ; mirror < mddev->raid_disks; mirror++)
if ( !(p=conf->mirrors+mirror)->rdev) {
blk_queue_stack_limits(mddev->queue,
@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
p->head_position = 0;
rdev->raid_disk = mirror;
found = 1;
if (rdev->saved_raid_disk != mirror)
conf->fullsync = 1;
rcu_assign_pointer(p->rdev, rdev);
break;
}
@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
for (;;) {
char b[BDEVNAME_SIZE];
spin_lock_irqsave(&conf->device_lock, flags);
if (conf->pending_bio_list.head) {
bio = bio_list_get(&conf->pending_bio_list);
blk_remove_plug(mddev->queue);
spin_unlock_irqrestore(&conf->device_lock, flags);
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
if (bitmap_unplug(mddev->bitmap) != 0)
printk("%s: bitmap file write failed!\n", mdname(mddev));
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
generic_make_request(bio);
bio = next;
}
unplug = 1;
continue;
}
if (list_empty(head))
break;
r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
sector_t max_sector, nr_sectors;
int disk;
int i;
int max_sync;
int sync_blocks;
sector_t sectors_skipped = 0;
int chunks_skipped = 0;
@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chucks (there can
* be several when recovering multiple devices).
* as we may have started syncing it but not finished.
* We can find the current address in
* mddev->curr_resync, but for recovery,
* we need to convert that to several
* virtual addresses.
*/
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
&sync_blocks, 1);
else for (i=0; i<conf->raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
bitmap_end_sync(mddev->bitmap, sect,
&sync_blocks, 1);
}
} else /* completed sync */
conf->fullsync = 0;
bitmap_close_sync(mddev->bitmap);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
*/
if (!go_faster && conf->nr_waiting)
msleep_interruptible(1000);
raise_barrier(conf);
conf->next_resync = sector_nr;
/* Again, very different code for resync and recovery.
* Both must result in an r10bio with a list of bios that
@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
* end_sync_write if we will want to write.
*/
max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* recovery... the complicated one */
int i, j, k;
@ -1451,13 +1523,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
for (i=0 ; i<conf->raid_disks; i++)
if (conf->mirrors[i].rdev &&
!test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
int still_degraded = 0;
/* want to reconstruct this device */
r10bio_t *rb2 = r10_bio;
sector_t sect = raid10_find_virt(conf, sector_nr, i);
int must_sync;
/* Unless we are doing a full sync, we only need
* to recover the block if it is set in the bitmap
*/
must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, 1);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
!conf->fullsync) {
/* yep, skip the sync_blocks here, but don't assume
* that there will never be anything to do here
*/
chunks_skipped = -1;
continue;
}
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
spin_lock_irq(&conf->resync_lock);
if (rb2) conf->barrier++;
spin_unlock_irq(&conf->resync_lock);
raise_barrier(conf, rb2 != NULL);
atomic_set(&r10_bio->remaining, 0);
r10_bio->master_bio = (struct bio*)rb2;
@ -1465,8 +1553,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
atomic_inc(&rb2->remaining);
r10_bio->mddev = mddev;
set_bit(R10BIO_IsRecover, &r10_bio->state);
r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
r10_bio->sector = sect;
raid10_find_phys(conf, r10_bio);
/* Need to check if this section will still be
* degraded
*/
for (j=0; j<conf->copies;j++) {
int d = r10_bio->devs[j].devnum;
if (conf->mirrors[d].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
still_degraded = 1;
}
must_sync = bitmap_start_sync(mddev->bitmap, sect,
&sync_blocks, still_degraded);
for (j=0; j<conf->copies;j++) {
int d = r10_bio->devs[j].devnum;
if (conf->mirrors[d].rdev &&
@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
} else {
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block */
*skipped = 1;
return sync_blocks + sectors_skipped;
}
if (sync_blocks < max_sync)
max_sync = sync_blocks;
r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
r10_bio->mddev = mddev;
atomic_set(&r10_bio->remaining, 0);
raise_barrier(conf, 0);
conf->next_resync = sector_nr;
r10_bio->master_bio = NULL;
r10_bio->sector = sector_nr;
@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
}
nr_sectors = 0;
if (sector_nr + max_sync < max_sector)
max_sector = sector_nr + max_sync;
do {
struct page *page;
int len = PAGE_SIZE;
@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
return 0;
}
static void raid10_quiesce(mddev_t *mddev, int state)
{
conf_t *conf = mddev_to_conf(mddev);
switch(state) {
case 1:
raise_barrier(conf, 0);
break;
case 0:
lower_barrier(conf);
break;
}
if (mddev->thread) {
if (mddev->bitmap)
mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
else
mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
md_wakeup_thread(mddev->thread);
}
}
static mdk_personality_t raid10_personality =
{
@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
.hot_remove_disk= raid10_remove_disk,
.spare_active = raid10_spare_active,
.sync_request = sync_request,
.quiesce = raid10_quiesce,
};
static int __init raid_init(void)

View file

@ -35,13 +35,19 @@ struct r10_private_data_s {
sector_t chunk_mask;
struct list_head retry_list;
/* for use when syncing mirrors: */
/* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list;
spinlock_t resync_lock;
int nr_pending;
int nr_waiting;
int barrier;
sector_t next_resync;
int fullsync; /* set to 1 if a full sync is needed,
* (fresh device added).
* Cleared when a sync completes.
*/
wait_queue_head_t wait_barrier;
@ -100,4 +106,5 @@ struct r10bio_s {
#define R10BIO_Uptodate 0
#define R10BIO_IsSync 1
#define R10BIO_IsRecover 2
#define R10BIO_Degraded 3
#endif