From ffbd517d5a8c8e93ddd11046434fb029f3df73aa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 20 Apr 2009 15:50:09 -0400
Subject: Btrfs: use WRITE_SYNC for synchronous writes

Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for
writes we plan on waiting on in the near future.  This patch
mirrors recent changes in other filesystems and the generic code to
use WRITE_SYNC when WB_SYNC_ALL is passed and to use WRITE_SYNC for
other latency critical writes.

Btrfs uses async worker threads for checksumming before the write is done,
and then again to actually submit the bios.  The bio submission code just
runs a per-device list of bios that need to be sent down the pipe.

This list is split into low priority and high priority lists so the
WRITE_SYNC IO happens first.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92caa8035f3..fec18b43c2c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2095,10 +2095,10 @@ static int write_dev_supers(struct btrfs_device *device,
 				device->barriers = 0;
 				get_bh(bh);
 				lock_buffer(bh);
-				ret = submit_bh(WRITE, bh);
+				ret = submit_bh(WRITE_SYNC, bh);
 			}
 		} else {
-			ret = submit_bh(WRITE, bh);
+			ret = submit_bh(WRITE_SYNC, bh);
 		}
 
 		if (!ret && wait) {
-- 
cgit v1.2.3


From d313d7a31a752c88f7288692bd98e66d0789779b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 20 Apr 2009 15:50:09 -0400
Subject: Btrfs: add a priority queue to the async thread helpers

Btrfs is using WRITE_SYNC_PLUG to send down synchronous IOs with a
higher priority.  But, the checksumming helper threads prevent it
from being fully effective.

There are two problems.  First, a big queue of pending checksumming
will delay the synchronous IO behind other lower priority writes.  Second,
the checksumming uses an ordered async work queue.  The ordering makes sure
that IOs are sent to the block layer in the same order they are sent
to the checksumming threads.  Usually this gives us less seeky IO.

But, when we start mixing IO priorities, the lower priority IO can delay
the higher priority IO.

This patch solves both problems by adding a high priority list to the async
helper threads, and a new btrfs_set_work_high_prio(), which is used
to make put a new async work item onto the higher priority list.

The ordering is still done on high priority IO, but all of the high
priority bios are ordered separately from the low priority bios.  This
ordering is purely an IO optimization, it is not involved in data
or metadata integrity.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fec18b43c2c..a6b83744b05 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio_flags = bio_flags;
 
 	atomic_inc(&fs_info->nr_async_submits);
+
+	if (rw & (1 << BIO_RW_SYNCIO))
+		btrfs_set_work_high_prio(&async->work);
+
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
 	int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
 	}
+
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
-- 
cgit v1.2.3


From 97e728d4353f38c87bf0804cdfd79a9b13fc2c3e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 21 Apr 2009 17:40:57 -0400
Subject: Btrfs: try to keep a healthy ratio of metadata vs data block groups

This patch makes the chunk allocator keep a good ratio of metadata vs data
block groups.  By default for every 8 data block groups, we'll allocate 1
metadata chunk, or about 12% of the disk will be allocated for metadata.  This
can be changed by specifying the metadata_ratio mount option.

This is simply the number of data block groups that have to be allocated to
force a metadata chunk allocation.  By making sure we allocate metadata chunks
more often, we are less likely to get into situations where the whole disk
has been allocated as data block groups.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6b83744b05..44c94d808e2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1604,6 +1604,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+	fs_info->metadata_ratio = 8;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
-- 
cgit v1.2.3


From b7967db75a38df4891b22efe1b0969b9357eb946 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Apr 2009 07:29:04 -0400
Subject: Btrfs: remove #if 0 code

Btrfs had some old code sitting around under #if 0, this drops it.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 48 +-----------------------------------------------
 1 file changed, 1 insertion(+), 47 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 44c94d808e2..77f9a3b824b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -584,18 +584,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 		btrfs_set_work_high_prio(&async->work);
 
 	btrfs_queue_worker(&fs_info->workers, &async->work);
-#if 0
-	int limit = btrfs_async_submit_limit(fs_info);
-	if (atomic_read(&fs_info->nr_async_submits) > limit) {
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_submits) < limit),
-			   HZ/10);
 
-		wait_event_timeout(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_bios) < limit),
-			   HZ/10);
-	}
-#endif
 	while (atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
@@ -770,27 +759,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	}
 }
 
-#if 0
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct buffer_head *bh;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct buffer_head *head;
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	head = page_buffers(page);
-	bh = head;
-	do {
-		if (buffer_dirty(bh))
-			csum_tree_block(root, bh, 0);
-		bh = bh->b_this_page;
-	} while (bh != head);
-	return block_write_full_page(page, btree_get_block, wbc);
-}
-#endif
-
 static struct address_space_operations btree_aops = {
 	.readpage	= btree_readpage,
 	.writepage	= btree_writepage,
@@ -1278,11 +1246,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	int ret = 0;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
-#if 0
-	if ((bdi_bits & (1 << BDI_write_congested)) &&
-	    btrfs_congested_async(info, 0))
-		return 1;
-#endif
+
 	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
@@ -2334,16 +2298,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
-#if 0
-	while (!list_empty(&fs_info->hashers)) {
-		struct btrfs_hasher *hasher;
-		hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
-				    hashers);
-		list_del(&hasher->hashers);
-		crypto_free_hash(&fs_info->hash_tfm);
-		kfree(hasher);
-	}
-#endif
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-- 
cgit v1.2.3


From 193f284d4985db0370a8a1bbdfb20df548cf9ffb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Apr 2009 07:29:05 -0400
Subject: Btrfs: ratelimit IO error printks

Btrfs has printks for various IO errors, including bad checksums and
mismatches between what we expect the block headers to contain and what
we actually find on the disk.

Longer term we need a real reporting mechanism for this, but for now
printk is going to have to do.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 77f9a3b824b..aa0c259b9c2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			memcpy(&found, result, csum_size);
 
 			read_extent_buffer(buf, &val, 0, csum_size);
-			printk(KERN_INFO "btrfs: %s checksum verify failed "
-			       "on %llu wanted %X found %X level %d\n",
-			       root->fs_info->sb->s_id,
-			       buf->start, val, found, btrfs_header_level(buf));
+			if (printk_ratelimit()) {
+				printk(KERN_INFO "btrfs: %s checksum verify "
+				       "failed on %llu wanted %X found %X "
+				       "level %d\n",
+				       root->fs_info->sb->s_id,
+				       (unsigned long long)buf->start, val, found,
+				       btrfs_header_level(buf));
+			}
 			if (result != (char *)&inline_result)
 				kfree(result);
 			return 1;
@@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	printk("parent transid verify failed on %llu wanted %llu found %llu\n",
-	       (unsigned long long)eb->start,
-	       (unsigned long long)parent_transid,
-	       (unsigned long long)btrfs_header_generation(eb));
+	if (printk_ratelimit()) {
+		printk("parent transid verify failed on %llu wanted %llu "
+		       "found %llu\n",
+		       (unsigned long long)eb->start,
+		       (unsigned long long)parent_transid,
+		       (unsigned long long)btrfs_header_generation(eb));
+	}
 	ret = 1;
 	clear_extent_buffer_uptodate(io_tree, eb);
 out:
@@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
-		       (unsigned long long)found_start,
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad tree block start "
+			       "%llu %llu\n",
+			       (unsigned long long)found_start,
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
@@ -429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
-		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
-		       (unsigned long long)eb->start);
+		if (printk_ratelimit()) {
+			printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+			       (unsigned long long)eb->start);
+		}
 		ret = -EIO;
 		goto err;
 	}
-- 
cgit v1.2.3


From 21380931eb4da4e29ac663d0221581282cbba208 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 21 Apr 2009 12:38:29 -0700
Subject: Btrfs: Fix a bunch of printk() warnings.

Just happened to notice a bunch of %llu vs u64 warnings.  Here's a patch
to cast them all.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/disk-io.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aa0c259b9c2..0ff16d3331d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1671,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (features) {
 		printk(KERN_ERR "BTRFS: couldn't mount because of "
 		       "unsupported optional features (%Lx).\n",
-		       features);
+		       (unsigned long long)features);
 		err = -EINVAL;
 		goto fail_iput;
 	}
@@ -1681,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!(sb->s_flags & MS_RDONLY) && features) {
 		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
 		       "unsupported option features (%Lx).\n",
-		       features);
+		       (unsigned long long)features);
 		err = -EINVAL;
 		goto fail_iput;
 	}
@@ -2273,7 +2273,7 @@ int close_ctree(struct btrfs_root *root)
 
 	if (fs_info->delalloc_bytes) {
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-		       fs_info->delalloc_bytes);
+		       (unsigned long long)fs_info->delalloc_bytes);
 	}
 	if (fs_info->total_ref_cache_size) {
 		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-- 
cgit v1.2.3