md/raid1: handle merge_bvec_fn in member devices.

Currently we don't honour merge_bvec_fn in member devices so if there
is one, we force all requests to be single-page at most.
This is not ideal.

So create a raid1 merge_bvec_fn to check that function in children
as well.

This introduces a small problem.  There is no locking around calls
the ->merge_bvec_fn and subsequent calls to ->make_request.  So a
device added between these could end up getting a request which
violates its merge_bvec_fn.

Currently the best we can do is synchronize_sched().  This will work
providing no preemption happens.  If there is is preemption, we just
have to hope that new devices are largely consistent with old devices.

Signed-off-by: NeilBrown <neilb@suse.de>
This commit is contained in:
NeilBrown 2012-03-19 12:46:39 +11:00
parent 050b66152f
commit 6b740b8d79

View File

@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
rdev = rcu_dereference(conf->mirrors[disk].rdev);
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
|| test_bit(Unmerged, &rdev->flags)
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
static int raid1_mergeable_bvec(struct request_queue *q,
struct bvec_merge_data *bvm,
struct bio_vec *biovec)
{
struct mddev *mddev = q->queuedata;
struct r1conf *conf = mddev->private;
sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
int max = biovec->bv_len;
if (mddev->merge_check_needed) {
int disk;
rcu_read_lock();
for (disk = 0; disk < conf->raid_disks * 2; disk++) {
struct md_rdev *rdev = rcu_dereference(
conf->mirrors[disk].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags)) {
struct request_queue *q =
bdev_get_queue(rdev->bdev);
if (q->merge_bvec_fn) {
bvm->bi_sector = sector +
rdev->data_offset;
bvm->bi_bdev = rdev->bdev;
max = min(max, q->merge_bvec_fn(
q, bvm, biovec));
}
}
}
rcu_read_unlock();
}
return max;
}
int md_raid1_congested(struct mddev *mddev, int bits)
{
struct r1conf *conf = mddev->private;
@ -1015,7 +1049,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
break;
}
r1_bio->bios[i] = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags)) {
if (!rdev || test_bit(Faulty, &rdev->flags)
|| test_bit(Unmerged, &rdev->flags)) {
if (i < conf->raid_disks)
set_bit(R1BIO_Degraded, &r1_bio->state);
continue;
@ -1335,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
struct mirror_info *p;
int first = 0;
int last = conf->raid_disks - 1;
struct request_queue *q = bdev_get_queue(rdev->bdev);
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
@ -1342,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
if (q->merge_bvec_fn) {
set_bit(Unmerged, &rdev->flags);
mddev->merge_check_needed = 1;
}
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must
* never risk violating it, so limit
* ->max_segments to one lying with a single
* page, as a one page request is never in
* violation.
*/
if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
p->head_position = 0;
rdev->raid_disk = mirror;
@ -1383,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
}
if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
/* Some requests might not have seen this new
* merge_bvec_fn. We must wait for them to complete
* before merging the device fully.
* First we make sure any code which has tested
* our function has submitted the request, then
* we wait for all outstanding requests to complete.
*/
synchronize_sched();
raise_barrier(conf);
lower_barrier(conf);
clear_bit(Unmerged, &rdev->flags);
}
md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
@ -2627,15 +2670,6 @@ static int run(struct mddev *mddev)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
/* as we don't honour merge_bvec_fn, we must never risk
* violating it, so limit ->max_segments to 1 lying within
* a single page, as a one page request is never in violation.
*/
if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
blk_queue_max_segments(mddev->queue, 1);
blk_queue_segment_boundary(mddev->queue,
PAGE_CACHE_SIZE - 1);
}
}
mddev->degraded = 0;
@ -2669,6 +2703,7 @@ static int run(struct mddev *mddev)
if (mddev->queue) {
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
}
return md_integrity_register(mddev);
}