md/raid5: detect and handle replacements during recovery.

During recovery we want to write to the replacement but not
the original.  So we have two new flags
 - R5_NeedReplace if this stripe has a replacement that needs to
   be written at some stage
 - R5_WantReplace if NeedReplace, and the data is available, and
   a 'sync' has been requested on this stripe.

We also distinguish between 'sync and replace' which need to read
all other devices, and 'replace' which only needs to read the
devices being replaced.

Note that during resync we always write to any replacement device.
It might not need to be written to, but as we don't read to compare,
we have to write to be sure.

Signed-off-by: NeilBrown <neilb@suse.de>
This commit is contained in:
NeilBrown 2011-12-23 10:17:53 +11:00
parent 977df36255
commit 9a3e1101b8
2 changed files with 106 additions and 30 deletions

View file

@ -503,6 +503,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
for (i = disks; i--; ) { for (i = disks; i--; ) {
int rw; int rw;
int replace_only = 0;
struct bio *bi, *rbi; struct bio *bi, *rbi;
struct md_rdev *rdev, *rrdev = NULL; struct md_rdev *rdev, *rrdev = NULL;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) { if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
@ -512,7 +513,11 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rw = WRITE; rw = WRITE;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ; rw = READ;
else else if (test_and_clear_bit(R5_WantReplace,
&sh->dev[i].flags)) {
rw = WRITE;
replace_only = 1;
} else
continue; continue;
bi = &sh->dev[i].req; bi = &sh->dev[i].req;
@ -528,10 +533,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
rcu_read_lock(); rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev); rdev = rcu_dereference(conf->disks[i].rdev);
if (rw & WRITE) rrdev = rcu_dereference(conf->disks[i].replacement);
rrdev = rcu_dereference(conf->disks[i].replacement); if (rw & WRITE) {
else if (test_bit(R5_ReadRepl, &sh->dev[i].flags)) if (replace_only)
rdev = rcu_dereference(conf->disks[i].replacement); rdev = NULL;
} else {
if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
rdev = rrdev;
rrdev = NULL;
}
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL; rdev = NULL;
@ -575,7 +585,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
} }
if (rdev) { if (rdev) {
if (s->syncing || s->expanding || s->expanded) if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rdev->bdev, STRIPE_SECTORS); md_sync_acct(rdev->bdev, STRIPE_SECTORS);
set_bit(STRIPE_IO_STARTED, &sh->state); set_bit(STRIPE_IO_STARTED, &sh->state);
@ -597,7 +608,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
generic_make_request(bi); generic_make_request(bi);
} }
if (rrdev) { if (rrdev) {
if (s->syncing || s->expanding || s->expanded) if (s->syncing || s->expanding || s->expanded
|| s->replacing)
md_sync_acct(rrdev->bdev, STRIPE_SECTORS); md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
set_bit(STRIPE_IO_STARTED, &sh->state); set_bit(STRIPE_IO_STARTED, &sh->state);
@ -2440,8 +2452,9 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
md_done_sync(conf->mddev, STRIPE_SECTORS, 0); md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
clear_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_SYNCING, &sh->state);
s->syncing = 0; s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair. /* There is nothing more to do for sync/check/repair.
* For recover we need to record a bad block on all * For recover/replace we need to record a bad block on all
* non-sync devices, or abort the recovery * non-sync devices, or abort the recovery
*/ */
if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
@ -2451,12 +2464,18 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
*/ */
for (i = 0; i < conf->raid_disks; i++) { for (i = 0; i < conf->raid_disks; i++) {
struct md_rdev *rdev = conf->disks[i].rdev; struct md_rdev *rdev = conf->disks[i].rdev;
if (!rdev if (rdev
|| test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags)
|| test_bit(In_sync, &rdev->flags)) && !test_bit(In_sync, &rdev->flags)
continue; && !rdev_set_badblocks(rdev, sh->sector,
if (!rdev_set_badblocks(rdev, sh->sector, STRIPE_SECTORS, 0))
STRIPE_SECTORS, 0)) abort = 1;
rdev = conf->disks[i].replacement;
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& !rdev_set_badblocks(rdev, sh->sector,
STRIPE_SECTORS, 0))
abort = 1; abort = 1;
} }
if (abort) { if (abort) {
@ -2465,6 +2484,22 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
} }
} }
static int want_replace(struct stripe_head *sh, int disk_idx)
{
struct md_rdev *rdev;
int rv = 0;
/* Doing recovery so rcu locking not required */
rdev = sh->raid_conf->disks[disk_idx].replacement;
if (rdev
&& !test_bit(Faulty, &rdev->flags)
&& !test_bit(In_sync, &rdev->flags)
&& (rdev->recovery_offset <= sh->sector
|| rdev->mddev->recovery_cp <= sh->sector))
rv = 1;
return rv;
}
/* fetch_block - checks the given member device to see if its data needs /* fetch_block - checks the given member device to see if its data needs
* to be read or computed to satisfy a request. * to be read or computed to satisfy a request.
* *
@ -2484,6 +2519,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(dev->toread || (dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
s->syncing || s->expanding || s->syncing || s->expanding ||
(s->replacing && want_replace(sh, disk_idx)) ||
(s->failed >= 1 && fdev[0]->toread) || (s->failed >= 1 && fdev[0]->toread) ||
(s->failed >= 2 && fdev[1]->toread) || (s->failed >= 2 && fdev[1]->toread) ||
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
@ -3037,22 +3073,18 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
} }
} }
/* /*
* handle_stripe - do things to a stripe. * handle_stripe - do things to a stripe.
* *
* We lock the stripe and then examine the state of various bits * We lock the stripe by setting STRIPE_ACTIVE and then examine the
* to see what needs to be done. * state of various bits to see what needs to be done.
* Possible results: * Possible results:
* return some read request which now have data * return some read requests which now have data
* return some write requests which are safely on disc * return some write requests which are safely on storage
* schedule a read on some buffers * schedule a read on some buffers
* schedule a write of some buffers * schedule a write of some buffers
* return confirmation of parity correctness * return confirmation of parity correctness
* *
* buffers are taken off read_list or write_list, and bh_cache buffers
* get BH_Lock set before the stripe lock is released.
*
*/ */
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
@ -3061,10 +3093,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
int disks = sh->disks; int disks = sh->disks;
struct r5dev *dev; struct r5dev *dev;
int i; int i;
int do_recovery = 0;
memset(s, 0, sizeof(*s)); memset(s, 0, sizeof(*s));
s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
s->failed_num[0] = -1; s->failed_num[0] = -1;
@ -3082,7 +3114,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
dev = &sh->dev[i]; dev = &sh->dev[i];
pr_debug("check %d: state 0x%lx read %p write %p written %p\n", pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
i, dev->flags, dev->toread, dev->towrite, dev->written); i, dev->flags,
dev->toread, dev->towrite, dev->written);
/* maybe we can reply to a read /* maybe we can reply to a read
* *
* new wantfill requests are only permitted while * new wantfill requests are only permitted while
@ -3123,6 +3156,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
&first_bad, &bad_sectors)) &first_bad, &bad_sectors))
set_bit(R5_ReadRepl, &dev->flags); set_bit(R5_ReadRepl, &dev->flags);
else { else {
if (rdev)
set_bit(R5_NeedReplace, &dev->flags);
rdev = rcu_dereference(conf->disks[i].rdev); rdev = rcu_dereference(conf->disks[i].rdev);
clear_bit(R5_ReadRepl, &dev->flags); clear_bit(R5_ReadRepl, &dev->flags);
} }
@ -3210,9 +3245,25 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (s->failed < 2) if (s->failed < 2)
s->failed_num[s->failed] = i; s->failed_num[s->failed] = i;
s->failed++; s->failed++;
if (rdev && !test_bit(Faulty, &rdev->flags))
do_recovery = 1;
} }
} }
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
/* If there is a failed device being replaced,
* we must be recovering.
* else if we are after recovery_cp, we must be syncing
* else we can only be replacing
* sync and recovery both need to read all devices, and so
* use the same flag.
*/
if (do_recovery ||
sh->sector >= conf->mddev->recovery_cp)
s->syncing = 1;
else
s->replacing = 1;
}
rcu_read_unlock(); rcu_read_unlock();
} }
@ -3254,7 +3305,7 @@ static void handle_stripe(struct stripe_head *sh)
if (unlikely(s.blocked_rdev)) { if (unlikely(s.blocked_rdev)) {
if (s.syncing || s.expanding || s.expanded || if (s.syncing || s.expanding || s.expanded ||
s.to_write || s.written) { s.replacing || s.to_write || s.written) {
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
goto finish; goto finish;
} }
@ -3280,7 +3331,7 @@ static void handle_stripe(struct stripe_head *sh)
sh->reconstruct_state = 0; sh->reconstruct_state = 0;
if (s.to_read+s.to_write+s.written) if (s.to_read+s.to_write+s.written)
handle_failed_stripe(conf, sh, &s, disks, &s.return_bi); handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
if (s.syncing) if (s.syncing + s.replacing)
handle_failed_sync(conf, sh, &s); handle_failed_sync(conf, sh, &s);
} }
@ -3311,7 +3362,9 @@ static void handle_stripe(struct stripe_head *sh)
*/ */
if (s.to_read || s.non_overwrite if (s.to_read || s.non_overwrite
|| (conf->level == 6 && s.to_write && s.failed) || (conf->level == 6 && s.to_write && s.failed)
|| (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) || (s.syncing && (s.uptodate + s.compute < disks))
|| s.replacing
|| s.expanding)
handle_stripe_fill(sh, &s, disks); handle_stripe_fill(sh, &s, disks);
/* Now we check to see if any write operations have recently /* Now we check to see if any write operations have recently
@ -3373,7 +3426,20 @@ static void handle_stripe(struct stripe_head *sh)
handle_parity_checks5(conf, sh, &s, disks); handle_parity_checks5(conf, sh, &s, disks);
} }
if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { if (s.replacing && s.locked == 0
&& !test_bit(STRIPE_INSYNC, &sh->state)) {
/* Write out to replacement devices where possible */
for (i = 0; i < conf->raid_disks; i++)
if (test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
set_bit(R5_WantReplace, &sh->dev[i].flags);
set_bit(R5_LOCKED, &sh->dev[i].flags);
s.locked++;
}
set_bit(STRIPE_INSYNC, &sh->state);
}
if ((s.syncing || s.replacing) && s.locked == 0 &&
test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS, 1); md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
clear_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_SYNCING, &sh->state);
} }
@ -4262,7 +4328,6 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
} }
bitmap_cond_end_sync(mddev->bitmap, sector_nr); bitmap_cond_end_sync(mddev->bitmap, sector_nr);
sh = get_active_stripe(conf, sector_nr, 0, 1, 0); sh = get_active_stripe(conf, sector_nr, 0, 1, 0);

View file

@ -242,7 +242,13 @@ struct stripe_head {
* for handle_stripe. * for handle_stripe.
*/ */
struct stripe_head_state { struct stripe_head_state {
int syncing, expanding, expanded; /* 'syncing' means that we need to read all devices, either
* to check/correct parity, or to reconstruct a missing device.
* 'replacing' means we are replacing one or more drives and
* the source is valid at this point so we don't need to
* read all devices, just the replacement targets.
*/
int syncing, expanding, expanded, replacing;
int locked, uptodate, to_read, to_write, failed, written; int locked, uptodate, to_read, to_write, failed, written;
int to_fill, compute, req_compute, non_overwrite; int to_fill, compute, req_compute, non_overwrite;
int failed_num[2]; int failed_num[2];
@ -284,6 +290,11 @@ enum r5dev_flags {
R5_ReadRepl, /* Will/did read from replacement rather than orig */ R5_ReadRepl, /* Will/did read from replacement rather than orig */
R5_MadeGoodRepl,/* A bad block on the replacement device has been R5_MadeGoodRepl,/* A bad block on the replacement device has been
* fixed by writing to it */ * fixed by writing to it */
R5_NeedReplace, /* This device has a replacement which is not
* up-to-date at this stripe. */
R5_WantReplace, /* We need to update the replacement, we have read
* data in, and now is a good time to write it out.
*/
}; };
/* /*