aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--drivers/md/bitmap.c5
-rw-r--r--drivers/md/md.c99
-rw-r--r--drivers/md/raid1.c134
-rw-r--r--include/linux/raid/md.h1
-rw-r--r--include/linux/raid/md_k.h8
-rw-r--r--include/linux/raid/raid1.h4
6 files changed, 189 insertions, 62 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 220273e81ed..51315302a85 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -301,7 +301,7 @@ static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wai
page);
if (wait)
- wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+ md_super_wait(mddev);
return 0;
}
@@ -828,8 +828,7 @@ int bitmap_unplug(struct bitmap *bitmap)
wake_up_process(bitmap->writeback_daemon->tsk));
spin_unlock_irq(&bitmap->write_lock);
} else
- wait_event(bitmap->mddev->sb_wait,
- atomic_read(&bitmap->mddev->pending_writes)==0);
+ md_super_wait(bitmap->mddev);
}
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index caa4add00c1..199016932de 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -330,18 +330,46 @@ static void free_disk_sb(mdk_rdev_t * rdev)
static int super_written(struct bio *bio, unsigned int bytes_done, int error)
{
mdk_rdev_t *rdev = bio->bi_private;
+ mddev_t *mddev = rdev->mddev;
if (bio->bi_size)
return 1;
if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
- md_error(rdev->mddev, rdev);
+ md_error(mddev, rdev);
- if (atomic_dec_and_test(&rdev->mddev->pending_writes))
- wake_up(&rdev->mddev->sb_wait);
+ if (atomic_dec_and_test(&mddev->pending_writes))
+ wake_up(&mddev->sb_wait);
bio_put(bio);
return 0;
}
+static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
+{
+ struct bio *bio2 = bio->bi_private;
+ mdk_rdev_t *rdev = bio2->bi_private;
+ mddev_t *mddev = rdev->mddev;
+ if (bio->bi_size)
+ return 1;
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ error == -EOPNOTSUPP) {
+ unsigned long flags;
+ /* barriers don't appear to be supported :-( */
+ set_bit(BarriersNotsupp, &rdev->flags);
+ mddev->barriers_work = 0;
+ spin_lock_irqsave(&mddev->write_lock, flags);
+ bio2->bi_next = mddev->biolist;
+ mddev->biolist = bio2;
+ spin_unlock_irqrestore(&mddev->write_lock, flags);
+ wake_up(&mddev->sb_wait);
+ bio_put(bio);
+ return 0;
+ }
+ bio_put(bio2);
+ bio->bi_private = rdev;
+ return super_written(bio, bytes_done, error);
+}
+
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page)
{
@@ -350,16 +378,54 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
* and decrement it on completion, waking up sb_wait
* if zero is reached.
* If an error occurred, call md_error
+ *
+ * As we might need to resubmit the request if BIO_RW_BARRIER
+ * causes ENOTSUPP, we allocate a spare bio...
*/
struct bio *bio = bio_alloc(GFP_NOIO, 1);
+ int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
bio->bi_bdev = rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
+ bio->bi_rw = rw;
+
atomic_inc(&mddev->pending_writes);
- submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio);
+ if (!test_bit(BarriersNotsupp, &rdev->flags)) {
+ struct bio *rbio;
+ rw |= (1<<BIO_RW_BARRIER);
+ rbio = bio_clone(bio, GFP_NOIO);
+ rbio->bi_private = bio;
+ rbio->bi_end_io = super_written_barrier;
+ submit_bio(rw, rbio);
+ } else
+ submit_bio(rw, bio);
+}
+
+void md_super_wait(mddev_t *mddev)
+{
+ /* wait for all superblock writes that were scheduled to complete.
+ * if any had to be retried (due to BARRIER problems), retry them
+ */
+ DEFINE_WAIT(wq);
+ for(;;) {
+ prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&mddev->pending_writes)==0)
+ break;
+ while (mddev->biolist) {
+ struct bio *bio;
+ spin_lock_irq(&mddev->write_lock);
+ bio = mddev->biolist;
+ mddev->biolist = bio->bi_next ;
+ bio->bi_next = NULL;
+ spin_unlock_irq(&mddev->write_lock);
+ submit_bio(bio->bi_rw, bio);
+ }
+ schedule();
+ }
+ finish_wait(&mddev->sb_wait, &wq);
}
static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
@@ -1382,7 +1448,7 @@ static void md_update_sb(mddev_t * mddev)
int sync_req;
repeat:
- spin_lock(&mddev->write_lock);
+ spin_lock_irq(&mddev->write_lock);
sync_req = mddev->in_sync;
mddev->utime = get_seconds();
mddev->events ++;
@@ -1405,11 +1471,11 @@ repeat:
*/
if (!mddev->persistent) {
mddev->sb_dirty = 0;
- spin_unlock(&mddev->write_lock);
+ spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
return;
}
- spin_unlock(&mddev->write_lock);
+ spin_unlock_irq(&mddev->write_lock);
dprintk(KERN_INFO
"md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1437,17 +1503,17 @@ repeat:
/* only need to write one superblock... */
break;
}
- wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+ md_super_wait(mddev);
/* if there was a failure, sb_dirty was set to 1, and we re-write super */
- spin_lock(&mddev->write_lock);
+ spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
/* have to write it out again */
- spin_unlock(&mddev->write_lock);
+ spin_unlock_irq(&mddev->write_lock);
goto repeat;
}
mddev->sb_dirty = 0;
- spin_unlock(&mddev->write_lock);
+ spin_unlock_irq(&mddev->write_lock);
wake_up(&mddev->sb_wait);
}
@@ -1989,6 +2055,7 @@ static int do_md_run(mddev_t * mddev)
mddev->recovery = 0;
mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+ mddev->barriers_work = 1;
/* before we start the array running, initialise the bitmap */
err = bitmap_create(mddev);
@@ -2107,7 +2174,7 @@ static int do_md_stop(mddev_t * mddev, int ro)
mddev->ro = 1;
} else {
bitmap_flush(mddev);
- wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+ md_super_wait(mddev);
if (mddev->ro)
set_disk_ro(disk, 0);
blk_queue_make_request(mddev->queue, md_fail_request);
@@ -3796,13 +3863,13 @@ void md_write_start(mddev_t *mddev, struct bio *bi)
atomic_inc(&mddev->writes_pending);
if (mddev->in_sync) {
- spin_lock(&mddev->write_lock);
+ spin_lock_irq(&mddev->write_lock);
if (mddev->in_sync) {
mddev->in_sync = 0;
mddev->sb_dirty = 1;
md_wakeup_thread(mddev->thread);
}
- spin_unlock(&mddev->write_lock);
+ spin_unlock_irq(&mddev->write_lock);
}
wait_event(mddev->sb_wait, mddev->sb_dirty==0);
}
@@ -4112,7 +4179,7 @@ void md_check_recovery(mddev_t *mddev)
if (mddev_trylock(mddev)==0) {
int spares =0;
- spin_lock(&mddev->write_lock);
+ spin_lock_irq(&mddev->write_lock);
if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
!mddev->in_sync && mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
@@ -4120,7 +4187,7 @@ void md_check_recovery(mddev_t *mddev)
}
if (mddev->safemode == 1)
mddev->safemode = 0;
- spin_unlock(&mddev->write_lock);
+ spin_unlock_irq(&mddev->write_lock);
if (mddev->sb_dirty)
md_update_sb(mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb6b866c28f..1cbf51fbd43 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
{
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
- int mirror, behind;
+ int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
conf_t *conf = mddev_to_conf(r1_bio->mddev);
if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
if (r1_bio->bios[mirror] == bio)
break;
- /*
- * this branch is our 'one mirror IO has finished' event handler:
- */
- if (!uptodate) {
- md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
- /* an I/O failed, we can't clear the bitmap */
- set_bit(R1BIO_Degraded, &r1_bio->state);
- } else
+ if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
+ set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
+ set_bit(R1BIO_BarrierRetry, &r1_bio->state);
+ r1_bio->mddev->barriers_work = 0;
+ } else {
/*
- * Set R1BIO_Uptodate in our master bio, so that
- * we will return a good error code for to the higher
- * levels even if IO on some other mirrored buffer fails.
- *
- * The 'master' represents the composite IO operation to
- * user-side. So if something waits for IO, then it will
- * wait for the 'master' bio.
+ * this branch is our 'one mirror IO has finished' event handler:
*/
- set_bit(R1BIO_Uptodate, &r1_bio->state);
-
- update_head_pos(mirror, r1_bio);
-
- behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
- if (behind) {
- if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
- atomic_dec(&r1_bio->behind_remaining);
-
- /* In behind mode, we ACK the master bio once the I/O has safely
- * reached all non-writemostly disks. Setting the Returned bit
- * ensures that this gets done only once -- we don't ever want to
- * return -EIO here, instead we'll wait */
-
- if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
- test_bit(R1BIO_Uptodate, &r1_bio->state)) {
- /* Maybe we can return now */
- if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
- struct bio *mbio = r1_bio->master_bio;
- PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
- (unsigned long long) mbio->bi_sector,
- (unsigned long long) mbio->bi_sector +
- (mbio->bi_size >> 9) - 1);
- bio_endio(mbio, mbio->bi_size, 0);
+ r1_bio->bios[mirror] = NULL;
+ bio_put(bio);
+ if (!uptodate) {
+ md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ } else
+ /*
+ * Set R1BIO_Uptodate in our master bio, so that
+ * we will return a good error code for to the higher
+ * levels even if IO on some other mirrored buffer fails.
+ *
+ * The 'master' represents the composite IO operation to
+ * user-side. So if something waits for IO, then it will
+ * wait for the 'master' bio.
+ */
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ update_head_pos(mirror, r1_bio);
+
+ if (behind) {
+ if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+ atomic_dec(&r1_bio->behind_remaining);
+
+ /* In behind mode, we ACK the master bio once the I/O has safely
+ * reached all non-writemostly disks. Setting the Returned bit
+ * ensures that this gets done only once -- we don't ever want to
+ * return -EIO here, instead we'll wait */
+
+ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+ test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /* Maybe we can return now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ struct bio *mbio = r1_bio->master_bio;
+ PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+ (unsigned long long) mbio->bi_sector,
+ (unsigned long long) mbio->bi_sector +
+ (mbio->bi_size >> 9) - 1);
+ bio_endio(mbio, mbio->bi_size, 0);
+ }
}
}
}
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
* already.
*/
if (atomic_dec_and_test(&r1_bio->remaining)) {
+ if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+ reschedule_retry(r1_bio);
+ /* Don't dec_pending yet, we want to hold
+ * the reference over the retry
+ */
+ return 0;
+ }
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
/* free extra copy of the data pages */
+/* FIXME bio has been freed!!! */
int i = bio->bi_vcnt;
while (i--)
__free_page(bio->bi_io_vec[i].bv_page);
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
struct bio_list bl;
struct page **behind_pages = NULL;
const int rw = bio_data_dir(bio);
+ int do_barriers;
- if (unlikely(bio_barrier(bio))) {
+ if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
return 0;
}
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
atomic_set(&r1_bio->remaining, 0);
atomic_set(&r1_bio->behind_remaining, 0);
+ do_barriers = bio->bi_rw & BIO_RW_BARRIER;
+ if (do_barriers)
+ set_bit(R1BIO_Barrier, &r1_bio->state);
+
bio_list_init(&bl);
for (i = 0; i < disks; i++) {
struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request;
- mbio->bi_rw = WRITE;
+ mbio->bi_rw = WRITE | do_barriers;
mbio->bi_private = r1_bio;
if (behind_pages) {
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
sync_request_write(mddev, r1_bio);
unplug = 1;
+ } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+ /* some requests in the r1bio were BIO_RW_BARRIER
+ * requests which failed with -ENOTSUPP. Hohumm..
+ * Better resubmit without the barrier.
+ * We know which devices to resubmit for, because
+ * all others have had their bios[] entry cleared.
+ */
+ int i;
+ clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
+ clear_bit(R1BIO_Barrier, &r1_bio->state);
+ for (i=0; i < conf->raid_disks; i++)
+ if (r1_bio->bios[i]) {
+ struct bio_vec *bvec;
+ int j;
+
+ bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+ /* copy pages from the failed bio, as
+ * this might be a write-behind device */
+ __bio_for_each_segment(bvec, bio, j, 0)
+ bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
+ bio_put(r1_bio->bios[i]);
+ bio->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ bio->bi_end_io = raid1_end_write_request;
+ bio->bi_rw = WRITE;
+ bio->bi_private = r1_bio;
+ r1_bio->bios[i] = bio;
+ generic_make_request(bio);
+ }
} else {
int disk;
bio = r1_bio->bios[r1_bio->read_disk];
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 91467a3c4a5..13e7c4b6236 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -89,6 +89,7 @@ extern void md_print_devices (void);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page);
+extern void md_super_wait(mddev_t *mddev);
extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
struct page *page, int rw);
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 11629f92180..d5854c2b272 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -122,6 +122,7 @@ struct mdk_rdev_s
#define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */
+#define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */
int desc_nr; /* descriptor index in the superblock */
int raid_disk; /* role of device in array */
@@ -210,6 +211,13 @@ struct mddev_s
int degraded; /* whether md should consider
* adding a spare
*/
+ int barriers_work; /* initialised to true, cleared as soon
+ * as a barrier request to slave
+ * fails. Only supported
+ */
+ struct bio *biolist; /* bios that need to be retried
+ * because BIO_RW_BARRIER is not supported
+ */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index 60e19b66754..292b98f2b40 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -110,7 +110,9 @@ struct r1bio_s {
#define R1BIO_Uptodate 0
#define R1BIO_IsSync 1
#define R1BIO_Degraded 2
-#define R1BIO_BehindIO 3
+#define R1BIO_BehindIO 3
+#define R1BIO_Barrier 4
+#define R1BIO_BarrierRetry 5
/* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when