From 7b24fc4d7eb611da367dea3aad45473050aacd6c Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 4 Jan 2009 02:43:38 -0500
Subject: [PATCH 01/14] block: Don't verify integrity metadata on read error

If we get an I/O error on a read request there is no point in doing a
verify pass on the integrity buffer.  Adjust the completion path
accordingly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c  | 25 +++++++++++++++----------
 include/linux/bio.h |  1 -
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 77ebc3c263d6..8396d741f804 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -465,7 +465,7 @@ static int bio_integrity_verify(struct bio *bio)
 
 		if (ret) {
 			kunmap_atomic(kaddr, KM_USER0);
-			break;
+			return ret;
 		}
 
 		sectors = bv->bv_len / bi->sector_size;
@@ -493,18 +493,13 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 	struct bio_integrity_payload *bip =
 		container_of(work, struct bio_integrity_payload, bip_work);
 	struct bio *bio = bip->bip_bio;
-	int error = bip->bip_error;
+	int error;
 
-	if (bio_integrity_verify(bio)) {
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-		error = -EIO;
-	}
+	error = bio_integrity_verify(bio);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-
-	if (bio->bi_end_io)
-		bio->bi_end_io(bio, error);
+	bio_endio(bio, error);
 }
 
 /**
@@ -525,7 +520,17 @@ void bio_integrity_endio(struct bio *bio, int error)
 
 	BUG_ON(bip->bip_bio != bio);
 
-	bip->bip_error = error;
+	/* In case of an I/O error there is no point in verifying the
+	 * integrity metadata.  Restore original bio end_io handler
+	 * and run it.
+	 */
+	if (error) {
+		bio->bi_end_io = bip->bip_end_io;
+		bio_endio(bio, error);
+
+		return;
+	}
+
 	INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
 	queue_work(kintegrityd_wq, &bip->bip_work);
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 18462c5b8fff..18fc4a281a7b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -312,7 +312,6 @@ struct bio_integrity_payload {
 	void			*bip_buf;	/* generated integrity data */
 	bio_end_io_t		*bip_end_io;	/* saved I/O completion fn */
 
-	int			bip_error;	/* saved I/O error */
 	unsigned int		bip_size;
 
 	unsigned short		bip_pool;	/* pool the ivec came from */

From 8ae372e3bb4acaca37ffa2ce54f4cf8dd60a94fa Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 4 Jan 2009 02:43:39 -0500
Subject: [PATCH 02/14] block: Remove obsolete BUG_ON

Now that bio_vecs are no longer cleared in bvec_alloc_bs() the following
BUG_ON must go.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8396d741f804..549b0144da11 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 
 	iv = bip_vec_idx(bip, bip->bip_vcnt);
 	BUG_ON(iv == NULL);
-	BUG_ON(iv->bv_page != NULL);
 
 	iv->bv_page = page;
 	iv->bv_len = len;

From 322316385dde5cd879e682bcb598c56d0659fb60 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 4 Jan 2009 02:43:40 -0500
Subject: [PATCH 03/14] block: Allow empty integrity profile

Allow a block device to allocate and register an integrity profile
without providing a template.  This allows DM to preallocate a profile
to avoid deadlocks during table reconfiguration.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-integrity.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 61a8e2f8fdd0..91fa8e06b6a5 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -309,24 +309,24 @@ static struct kobj_type integrity_ktype = {
 /**
  * blk_integrity_register - Register a gendisk as being integrity-capable
  * @disk:	struct gendisk pointer to make integrity-aware
- * @template:	integrity profile
+ * @template:	optional integrity profile to register
  *
  * Description: When a device needs to advertise itself as being able
  * to send/receive integrity metadata it must use this function to
  * register the capability with the block layer.  The template is a
  * blk_integrity struct with values appropriate for the underlying
- * hardware.  See Documentation/block/data-integrity.txt.
+ * hardware.  If template is NULL the new profile is allocated but
+ * not filled out. See Documentation/block/data-integrity.txt.
  */
 int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 {
 	struct blk_integrity *bi;
 
 	BUG_ON(disk == NULL);
-	BUG_ON(template == NULL);
 
 	if (disk->integrity == NULL) {
 		bi = kmem_cache_alloc(integrity_cachep,
-						GFP_KERNEL | __GFP_ZERO);
+				      GFP_KERNEL | __GFP_ZERO);
 		if (!bi)
 			return -1;
 
@@ -346,13 +346,16 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 		bi = disk->integrity;
 
 	/* Use the provided profile as template */
-	bi->name = template->name;
-	bi->generate_fn = template->generate_fn;
-	bi->verify_fn = template->verify_fn;
-	bi->tuple_size = template->tuple_size;
-	bi->set_tag_fn = template->set_tag_fn;
-	bi->get_tag_fn = template->get_tag_fn;
-	bi->tag_size = template->tag_size;
+	if (template != NULL) {
+		bi->name = template->name;
+		bi->generate_fn = template->generate_fn;
+		bi->verify_fn = template->verify_fn;
+		bi->tuple_size = template->tuple_size;
+		bi->set_tag_fn = template->set_tag_fn;
+		bi->get_tag_fn = template->get_tag_fn;
+		bi->tag_size = template->tag_size;
+	} else
+		bi->name = "unsupported";
 
 	return 0;
 }

From f48fc4d32e24c0b6a18aad30305d819bcc68c049 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 5 Jan 2009 10:17:25 +0100
Subject: [PATCH 04/14] block: get rid of the manual directory counting in
 blktrace

It can result in a stuck blktrace system, if --kill is used.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c | 72 ++++++++++++++----------------------------------
 1 file changed, 21 insertions(+), 51 deletions(-)

diff --git a/block/blktrace.c b/block/blktrace.c
index b0a2cae886db..39cc3bfe56e4 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -187,59 +187,12 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
-static unsigned int root_users;
-
-static inline void blk_remove_root(void)
-{
-	if (blk_tree_root) {
-		debugfs_remove(blk_tree_root);
-		blk_tree_root = NULL;
-	}
-}
-
-static void blk_remove_tree(struct dentry *dir)
-{
-	mutex_lock(&blk_tree_mutex);
-	debugfs_remove(dir);
-	if (--root_users == 0)
-		blk_remove_root();
-	mutex_unlock(&blk_tree_mutex);
-}
-
-static struct dentry *blk_create_tree(const char *blk_name)
-{
-	struct dentry *dir = NULL;
-	int created = 0;
-
-	mutex_lock(&blk_tree_mutex);
-
-	if (!blk_tree_root) {
-		blk_tree_root = debugfs_create_dir("block", NULL);
-		if (!blk_tree_root)
-			goto err;
-		created = 1;
-	}
-
-	dir = debugfs_create_dir(blk_name, blk_tree_root);
-	if (dir)
-		root_users++;
-	else {
-		/* Delete root only if we created it */
-		if (created)
-			blk_remove_root();
-	}
-
-err:
-	mutex_unlock(&blk_tree_mutex);
-	return dir;
-}
 
 static void blk_trace_cleanup(struct blk_trace *bt)
 {
-	relay_close(bt->rchan);
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
-	blk_remove_tree(bt->dir);
+	relay_close(bt->rchan);
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
@@ -346,7 +299,18 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
 
 static int blk_remove_buf_file_callback(struct dentry *dentry)
 {
+	struct dentry *parent = dentry->d_parent;
 	debugfs_remove(dentry);
+
+	/*
+	* this will fail for all but the last file, but that is ok. what we
+	* care about is the top level buts->name directory going away, when
+	* the last trace file is gone. Then we don't have to rmdir() that
+	* manually on trace stop, so it nicely solves the issue with
+	* force killing of running traces.
+	*/
+
+	debugfs_remove(parent);
 	return 0;
 }
 
@@ -404,7 +368,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		goto err;
 
 	ret = -ENOENT;
-	dir = blk_create_tree(buts->name);
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			return -ENOMEM;
+	}
+
+	dir = debugfs_create_dir(buts->name, blk_tree_root);
+
 	if (!dir)
 		goto err;
 
@@ -458,8 +430,6 @@ probe_err:
 	atomic_dec(&blk_probes_ref);
 	mutex_unlock(&blk_probe_mutex);
 err:
-	if (dir)
-		blk_remove_tree(dir);
 	if (bt) {
 		if (bt->msg_file)
 			debugfs_remove(bt->msg_file);

From 16642eb68216d8e0e136a99e514e9166e7125838 Mon Sep 17 00:00:00 2001
From: Alberto Bertogli <albertito@blitiri.com.ar>
Date: Mon, 5 Jan 2009 10:18:53 +0100
Subject: [PATCH 05/14] Fix small typo in bio.h's documentation

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 18fc4a281a7b..5175aa3103c6 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -144,7 +144,7 @@ struct bio {
  * bit 1 -- rw-ahead when set
  * bit 2 -- barrier
  *	Insert a serialization point in the IO queue, forcing previously
- *	submitted IO to be completed before this oen is issued.
+ *	submitted IO to be completed before this one is issued.
  * bit 3 -- synchronous I/O hint: the block layer will unplug immediately
  *	Note that this does NOT indicate that the IO itself is sync, just
  *	that the block layer will not postpone issue of this IO by plugging.

From 1308835ffffe6d61ad1f48c5c381c9cc47f683ec Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Wed, 7 Jan 2009 12:22:39 +0100
Subject: [PATCH 06/14] block: export SSD/non-rotational queue flag through
 sysfs

For some devices (i.e. CFA ATA) we can't reliably detect whether
the device is of rotational or non-rotational type so we need to
leave the final decision about this setting to the user-space.

As a bonus do a minor CodingStyle fixup in queue_nomerges_store().

Suggested-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-sysfs.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index a29cb788e408..b538ab8dbbd1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -130,6 +130,27 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
 
+static ssize_t queue_nonrot_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(!blk_queue_nonrot(q), page);
+}
+
+static ssize_t queue_nonrot_store(struct request_queue *q, const char *page,
+				  size_t count)
+{
+	unsigned long nm;
+	ssize_t ret = queue_var_store(&nm, page, count);
+
+	spin_lock_irq(q->queue_lock);
+	if (nm)
+		queue_flag_clear(QUEUE_FLAG_NONROT, q);
+	else
+		queue_flag_set(QUEUE_FLAG_NONROT, q);
+	spin_unlock_irq(q->queue_lock);
+
+	return ret;
+}
+
 static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(blk_queue_nomerges(q), page);
@@ -146,8 +167,8 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 		queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 	else
 		queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
-
 	spin_unlock_irq(q->queue_lock);
+
 	return ret;
 }
 
@@ -210,6 +231,12 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = {
 	.show = queue_hw_sector_size_show,
 };
 
+static struct queue_sysfs_entry queue_nonrot_entry = {
+	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_nonrot_show,
+	.store = queue_nonrot_store,
+};
+
 static struct queue_sysfs_entry queue_nomerges_entry = {
 	.attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nomerges_show,
@@ -229,6 +256,7 @@ static struct attribute *default_attrs[] = {
 	&queue_max_sectors_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
+	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
 	NULL,

From 213d9417fec62ef4c3675621b9364a667954d4dd Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 6 Jan 2009 09:16:05 +0100
Subject: [PATCH 07/14] block: seperate bio/request unplug and sync bits

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       |  5 ++++-
 include/linux/bio.h    | 18 +++++++++++-------
 include/linux/blkdev.h |  2 ++
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a824e49c0d0a..9e2e86fb78b8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1125,6 +1125,8 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
+	if (bio_unplug(bio))
+		req->cmd_flags |= REQ_UNPLUG;
 	if (bio_rw_meta(bio))
 		req->cmd_flags |= REQ_RW_META;
 
@@ -1141,6 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	int el_ret, nr_sectors;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
+	const int unplug = bio_unplug(bio);
 	int rw_flags;
 
 	nr_sectors = bio_sectors(bio);
@@ -1244,7 +1247,7 @@ get_rq:
 		blk_plug_device(q);
 	add_request(q, req);
 out:
-	if (sync || blk_queue_nonrot(q))
+	if (unplug || blk_queue_nonrot(q))
 		__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
 	return 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5175aa3103c6..f53568c5852e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -163,12 +163,15 @@ struct bio {
 #define BIO_RW		0	/* Must match RW in req flags (blkdev.h) */
 #define BIO_RW_AHEAD	1	/* Must match FAILFAST in req flags */
 #define BIO_RW_BARRIER	2
-#define BIO_RW_SYNC	3
-#define BIO_RW_META	4
-#define BIO_RW_DISCARD	5
-#define BIO_RW_FAILFAST_DEV		6
-#define BIO_RW_FAILFAST_TRANSPORT	7
-#define BIO_RW_FAILFAST_DRIVER		8
+#define BIO_RW_SYNCIO	3
+#define BIO_RW_UNPLUG	4
+#define BIO_RW_META	5
+#define BIO_RW_DISCARD	6
+#define BIO_RW_FAILFAST_DEV		7
+#define BIO_RW_FAILFAST_TRANSPORT	8
+#define BIO_RW_FAILFAST_DRIVER		9
+
+#define BIO_RW_SYNC	(BIO_RW_SYNCIO | BIO_RW_UNPLUG)
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -194,7 +197,8 @@ struct bio {
 #define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
-#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
+#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNCIO))
+#define bio_unplug(bio)		((bio)->bi_rw & (1 << BIO_RW_UNPLUG))
 #define bio_failfast_dev(bio)	((bio)->bi_rw &	(1 << BIO_RW_FAILFAST_DEV))
 #define bio_failfast_transport(bio)	\
 	((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 044467ef7b11..75426e4b8cdf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -108,6 +108,7 @@ enum rq_flag_bits {
 	__REQ_RW_META,		/* metadata io request */
 	__REQ_COPY_USER,	/* contains copies of user pages */
 	__REQ_INTEGRITY,	/* integrity metadata has been remapped */
+	__REQ_UNPLUG,		/* unplug queue on submission */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -134,6 +135,7 @@ enum rq_flag_bits {
 #define REQ_RW_META	(1 << __REQ_RW_META)
 #define REQ_COPY_USER	(1 << __REQ_COPY_USER)
 #define REQ_INTEGRITY	(1 << __REQ_INTEGRITY)
+#define REQ_UNPLUG	(1 << __REQ_UNPLUG)
 
 #define BLK_MAX_CDB	16
 

From 1dfa17f4ab8543d82caf4d36636b93916a18f456 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 6 Jan 2009 09:21:49 +0100
Subject: [PATCH 08/14] block: add bio_rw_flagged() for testing bio->bi_rw

The existing functions for checking bio->bi_rw are badly named. So lets
mirror what we do for bio->bi_flags testing, use a properly named
function so that it's immediately obvious what is being tested.

Maintain compatability names for the old macros, eventually we'll get
rid of these.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/bio.h | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index f53568c5852e..0942765cf8c0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -173,6 +173,24 @@ struct bio {
 
 #define BIO_RW_SYNC	(BIO_RW_SYNCIO | BIO_RW_UNPLUG)
 
+#define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
+
+/*
+ * Old defines, these should eventually be replaced by direct usage of
+ * bio_rw_flagged()
+ */
+#define bio_barrier(bio)	bio_rw_flagged(bio, BIO_RW_BARRIER)
+#define bio_sync(bio)		bio_rw_flagged(bio, BIO_RW_SYNCIO)
+#define bio_unplug(bio)		bio_rw_flagged(bio, BIO_RW_UNPLUG)
+#define bio_failfast_dev(bio)	bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV)
+#define bio_failfast_transport(bio)	\
+		bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT)
+#define bio_failfast_driver(bio) 	\
+		bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER)
+#define bio_rw_ahead(bio)	bio_rw_flagged(bio, BIO_RW_AHEAD)
+#define bio_rw_meta(bio)	bio_rw_flagged(bio, BIO_RW_META)
+#define bio_discard(bio)	bio_rw_flagged(bio, BIO_RW_DISCARD)
+
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
  */
@@ -196,16 +214,6 @@ struct bio {
 #define bio_offset(bio)		bio_iovec((bio))->bv_offset
 #define bio_segments(bio)	((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
-#define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
-#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNCIO))
-#define bio_unplug(bio)		((bio)->bi_rw & (1 << BIO_RW_UNPLUG))
-#define bio_failfast_dev(bio)	((bio)->bi_rw &	(1 << BIO_RW_FAILFAST_DEV))
-#define bio_failfast_transport(bio)	\
-	((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT))
-#define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER))
-#define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
-#define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
-#define bio_discard(bio)	((bio)->bi_rw & (1 << BIO_RW_DISCARD))
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)

From dbdac9b71dff5d27885f82eb2cfca310861fdf9e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 13 Jan 2009 15:27:32 +0100
Subject: [PATCH 09/14] block: Fix documentation for blkdev_issue_flush()

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 8eba4e43bb0c..f7dae57e6cab 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -302,7 +302,7 @@ static void bio_end_empty_barrier(struct bio *bio, int err)
  * Description:
  *    Issue a flush for the block device in question. Caller can supply
  *    room for storing the error offset in case of a flush error, if they
- *    wish to.  Caller must run wait_for_completion() on its own.
+ *    wish to.
  */
 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 {

From cec0707e40ae25794b5a2de7b7f03c51961f80d9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 13 Jan 2009 15:28:32 +0100
Subject: [PATCH 10/14] block: silently error an unsupported barrier bio

This fixes a "regression" from 2.6.28, where the barrier probes that file
systems may do would trigger additional end request warnings in dmesg.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 9e2e86fb78b8..ae75c047f45d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1451,6 +1451,11 @@ static inline void __generic_make_request(struct bio *bio)
 			err = -EOPNOTSUPP;
 			goto end_io;
 		}
+		if (bio_barrier(bio) && bio_has_data(bio) &&
+		    (q->next_ordered == QUEUE_ORDERED_NONE)) {
+			err = -EOPNOTSUPP;
+			goto end_io;
+		}
 
 		ret = q->make_request_fn(q, bio);
 	} while (ret);

From a229fc61ef0ee3c30fd193beee0eeb87410227f1 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 19 Jan 2009 10:37:38 +0100
Subject: [PATCH 11/14] include/linux: Add bsg.h to the Kernel exported headers

bsg.h in current form is perfectly suitable for user-mode
consumption. It is needed together with scsi/sg.h for applications
that want to interface with the bsg driver.

Currently the few projects that use it would copy it over into
the projects. But that is not acceptable for projects that need
to provide source and devel packages for distros.

This should also be submitted to stable 2.6.28 and 2.6.27 since bsg had
a stable API since these Kernels and distro users will need the header
for these kernels a swell

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
CC: stable@kernel.org
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/Kbuild | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 12e9a2957caf..2124c063a7ef 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -41,6 +41,7 @@ header-y += baycom.h
 header-y += bfs_fs.h
 header-y += blkpg.h
 header-y += bpqether.h
+header-y += bsg.h
 header-y += can.h
 header-y += cdk.h
 header-y += chio.h

From 7598909e3ee2a08726276d6415b69dadb52d0d76 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Tue, 27 Jan 2009 09:29:24 +0100
Subject: [PATCH 12/14] Mark mandatory elevator functions in the biodoc.txt

biodoc.txt mentions that elevator functions marked with * are mandatory, but
no function is marked with *. Mark the 3 functions which should be
implemented by any io scheduler.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/block/biodoc.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 5d2480d33b43..ecad6ee75705 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -954,14 +954,14 @@ elevator_allow_merge_fn		called whenever the block layer determines
 				results in some sort of conflict internally,
 				this hook allows it to do that.
 
-elevator_dispatch_fn		fills the dispatch queue with ready requests.
+elevator_dispatch_fn*		fills the dispatch queue with ready requests.
 				I/O schedulers are free to postpone requests by
 				not filling the dispatch queue unless @force
 				is non-zero.  Once dispatched, I/O schedulers
 				are not allowed to manipulate the requests -
 				they belong to generic dispatch queue.
 
-elevator_add_req_fn		called to add a new request into the scheduler
+elevator_add_req_fn*		called to add a new request into the scheduler
 
 elevator_queue_empty_fn		returns true if the merge queue is empty.
 				Drivers shouldn't use this, but rather check
@@ -991,7 +991,7 @@ elevator_activate_req_fn	Called when device driver first sees a request.
 elevator_deactivate_req_fn	Called when device driver decides to delay
 				a request by requeueing it.
 
-elevator_init_fn
+elevator_init_fn*
 elevator_exit_fn		Allocate and free any elevator specific storage
 				for a queue.
 

From bc58ba9468d94d62c56ab9b47173583ec140b165 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 23 Jan 2009 10:54:44 +0100
Subject: [PATCH 13/14] block: add sysfs file for controlling io stats
 accounting

This allows us to turn off disk stat accounting completely, for the cases
where the 0.5-1% reduction in system time is important.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 90 +++++++++++++++++++++++++-----------------
 block/blk-sysfs.c      | 28 +++++++++++++
 include/linux/blkdev.h |  6 +++
 3 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ae75c047f45d..ca69f3d94100 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,11 +64,12 @@ static struct workqueue_struct *kblockd_workqueue;
 
 static void drive_stat_acct(struct request *rq, int new_io)
 {
+	struct gendisk *disk = rq->rq_disk;
 	struct hd_struct *part;
 	int rw = rq_data_dir(rq);
 	int cpu;
 
-	if (!blk_fs_request(rq) || !rq->rq_disk)
+	if (!blk_fs_request(rq) || !disk || !blk_queue_io_stat(disk->queue))
 		return;
 
 	cpu = part_stat_lock();
@@ -599,8 +600,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	q->request_fn		= rfn;
 	q->prep_rq_fn		= NULL;
 	q->unplug_fn		= generic_unplug_device;
-	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER |
-				   1 << QUEUE_FLAG_STACKABLE);
+	q->queue_flags		= QUEUE_FLAG_DEFAULT;
 	q->queue_lock		= lock;
 
 	blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);
@@ -1663,6 +1663,55 @@ void blkdev_dequeue_request(struct request *req)
 }
 EXPORT_SYMBOL(blkdev_dequeue_request);
 
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+	struct gendisk *disk = req->rq_disk;
+
+	if (!disk || !blk_queue_io_stat(disk->queue))
+		return;
+
+	if (blk_fs_request(req)) {
+		const int rw = rq_data_dir(req);
+		struct hd_struct *part;
+		int cpu;
+
+		cpu = part_stat_lock();
+		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+		part_stat_unlock();
+	}
+}
+
+static void blk_account_io_done(struct request *req)
+{
+	struct gendisk *disk = req->rq_disk;
+
+	if (!disk || !blk_queue_io_stat(disk->queue))
+		return;
+
+	/*
+	 * Account IO completion.  bar_rq isn't accounted as a normal
+	 * IO on queueing nor completion.  Accounting the containing
+	 * request is enough.
+	 */
+	if (blk_fs_request(req) && req != &req->q->bar_rq) {
+		unsigned long duration = jiffies - req->start_time;
+		const int rw = rq_data_dir(req);
+		struct hd_struct *part;
+		int cpu;
+
+		cpu = part_stat_lock();
+		part = disk_map_sector_rcu(disk, req->sector);
+
+		part_stat_inc(cpu, part, ios[rw]);
+		part_stat_add(cpu, part, ticks[rw], duration);
+		part_round_stats(cpu, part);
+		part_dec_in_flight(part);
+
+		part_stat_unlock();
+	}
+}
+
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
@@ -1698,16 +1747,7 @@ static int __end_that_request_first(struct request *req, int error,
 				(unsigned long long)req->sector);
 	}
 
-	if (blk_fs_request(req) && req->rq_disk) {
-		const int rw = rq_data_dir(req);
-		struct hd_struct *part;
-		int cpu;
-
-		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, req->sector);
-		part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9);
-		part_stat_unlock();
-	}
+	blk_account_io_completion(req, nr_bytes);
 
 	total_bytes = bio_nbytes = 0;
 	while ((bio = req->bio) != NULL) {
@@ -1787,8 +1827,6 @@ static int __end_that_request_first(struct request *req, int error,
  */
 static void end_that_request_last(struct request *req, int error)
 {
-	struct gendisk *disk = req->rq_disk;
-
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
 
@@ -1800,27 +1838,7 @@ static void end_that_request_last(struct request *req, int error)
 
 	blk_delete_timer(req);
 
-	/*
-	 * Account IO completion.  bar_rq isn't accounted as a normal
-	 * IO on queueing nor completion.  Accounting the containing
-	 * request is enough.
-	 */
-	if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
-		unsigned long duration = jiffies - req->start_time;
-		const int rw = rq_data_dir(req);
-		struct hd_struct *part;
-		int cpu;
-
-		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(disk, req->sector);
-
-		part_stat_inc(cpu, part, ios[rw]);
-		part_stat_add(cpu, part, ticks[rw], duration);
-		part_round_stats(cpu, part);
-		part_dec_in_flight(part);
-
-		part_stat_unlock();
-	}
+	blk_account_io_done(req);
 
 	if (req->end_io)
 		req->end_io(req, error);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b538ab8dbbd1..e29ddfc73cf4 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -197,6 +197,27 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 	return ret;
 }
 
+static ssize_t queue_iostats_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(blk_queue_io_stat(q), page);
+}
+
+static ssize_t queue_iostats_store(struct request_queue *q, const char *page,
+				   size_t count)
+{
+	unsigned long stats;
+	ssize_t ret = queue_var_store(&stats, page, count);
+
+	spin_lock_irq(q->queue_lock);
+	if (stats)
+		queue_flag_set(QUEUE_FLAG_IO_STAT, q);
+	else
+		queue_flag_clear(QUEUE_FLAG_IO_STAT, q);
+	spin_unlock_irq(q->queue_lock);
+
+	return ret;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -249,6 +270,12 @@ static struct queue_sysfs_entry queue_rq_affinity_entry = {
 	.store = queue_rq_affinity_store,
 };
 
+static struct queue_sysfs_entry queue_iostats_entry = {
+	.attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_iostats_show,
+	.store = queue_iostats_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -259,6 +286,7 @@ static struct attribute *default_attrs[] = {
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
+	&queue_iostats_entry.attr,
 	NULL,
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 75426e4b8cdf..d08c4b8219a6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -451,6 +451,11 @@ struct request_queue
 #define QUEUE_FLAG_STACKABLE   13	/* supports request stacking */
 #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
+#define QUEUE_FLAG_IO_STAT     15	/* do IO stats */
+
+#define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
+				 (1 << QUEUE_FLAG_CLUSTER) |		\
+				  1 << QUEUE_FLAG_STACKABLE)
 
 static inline int queue_is_locked(struct request_queue *q)
 {
@@ -567,6 +572,7 @@ enum {
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
+#define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
 #define blk_queue_flushing(q)	((q)->ordseq)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)

From 3a9a3f6cc55418dd1525e636dccbbe13c394f652 Mon Sep 17 00:00:00 2001
From: Divyesh Shah <dpshah@google.com>
Date: Fri, 30 Jan 2009 12:46:41 +0100
Subject: [PATCH 14/14] cfq-iosched: Allow RT requests to pre-empt ongoing BE
 timeslice

This patch adds the ability to pre-empt an ongoing BE timeslice when a RT
request is waiting for the current timeslice to complete. This reduces the
wait time to disk for RT requests from an upper bound of 4 (current value
of cfq_quantum) to 1 disk request.

Applied Jens' suggeested changes to avoid the rb lookup and use !cfq_class_rt()
and retested.

Latency(secs) for the RT task when doing sequential reads from 10G file.
                       | only RT | RT + BE | RT + BE + this patch
small (512 byte) reads | 143     | 163     | 145
large (1Mb) reads      | 142     | 158     | 146

Signed-off-by: Divyesh Shah <dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/cfq-iosched.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e8525fa72823..664ebfd092ec 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -84,6 +84,11 @@ struct cfq_data {
 	 */
 	struct cfq_rb_root service_tree;
 	unsigned int busy_queues;
+	/*
+	 * Used to track any pending rt requests so we can pre-empt current
+	 * non-RT cfqq in service when this value is non-zero.
+	 */
+	unsigned int busy_rt_queues;
 
 	int rq_in_driver;
 	int sync_flight;
@@ -562,6 +567,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
+	if (cfq_class_rt(cfqq))
+		cfqd->busy_rt_queues++;
 
 	cfq_resort_rr_list(cfqd, cfqq);
 }
@@ -581,6 +588,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
+	if (cfq_class_rt(cfqq))
+		cfqd->busy_rt_queues--;
 }
 
 /*
@@ -1004,6 +1013,20 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 	if (cfq_slice_used(cfqq))
 		goto expire;
 
+	/*
+	 * If we have a RT cfqq waiting, then we pre-empt the current non-rt
+	 * cfqq.
+	 */
+	if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
+		/*
+		 * We simulate this as cfqq timed out so that it gets to bank
+		 * the remaining of its time slice.
+		 */
+		cfq_log_cfqq(cfqd, cfqq, "preempt");
+		cfq_slice_expired(cfqd, 1);
+		goto new_queue;
+	}
+
 	/*
 	 * The active queue has requests and isn't expired, allow it to
 	 * dispatch.
@@ -1067,6 +1090,13 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		if (RB_EMPTY_ROOT(&cfqq->sort_list))
 			break;
 
+		/*
+		 * If there is a non-empty RT cfqq waiting for current
+		 * cfqq's timeslice to complete, pre-empt this cfqq
+		 */
+		if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues)
+			break;
+
 	} while (dispatched < max_dispatch);
 
 	/*
@@ -1801,6 +1831,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	if (rq_is_meta(rq) && !cfqq->meta_pending)
 		return 1;
 
+	/*
+	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+	 */
+	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
+		return 1;
+
 	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
 		return 0;
 
@@ -1870,7 +1906,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		/*
 		 * not the active queue - expire current slice if it is
 		 * idle and has expired it's mean thinktime or this new queue
-		 * has some old slice time left and is of higher priority
+		 * has some old slice time left and is of higher priority or
+		 * this new queue is RT and the current one is BE
 		 */
 		cfq_preempt_queue(cfqd, cfqq);
 		cfq_mark_cfqq_must_dispatch(cfqq);