From f65cb45cba63f249458b669aa67069eabc37b2f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 13:40:44 +0100 Subject: perfcounters: flush on setuid exec Pavel Machek pointed out that performance counters should be flushed when crossing protection domains on setuid execution. Reported-by: Pavel Machek Acked-by: Pavel Machek Signed-off-by: Ingo Molnar --- fs/exec.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index ec5df9a3831..d5165d899a4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1017,6 +1018,13 @@ int flush_old_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } + /* + * Flush performance counters when crossing a + * security domain: + */ + if (!get_dumpable(current->mm)) + perf_counter_exit_task(current); + /* An exec changes our domain. We are no longer part of the thread group */ -- cgit v1.2.3 From 9c83633ad38138855181af6936e8ac570ef7e2cb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Apr 2009 14:48:16 +0300 Subject: missing unlock in jfs_quota_write() We should unlock &inode->i_mutex on the error path. This bug was in ext2_quota_write(). I sent a patch to them today as well. Found by smatch (http://repo.or.cz/w/smatch.git). Compile tested. regards, dan carpenter Signed-off-by: Dan Carpenter Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6f21adf9479..d9b0e92b360 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -720,8 +720,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; -- cgit v1.2.3 From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:30 +0200 Subject: perf_counter: track task-comm data Similar to the mmap data stream, add one that tracks the task COMM field, so that the userspace reporting knows what to call a task. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.127422406@chello.nl> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e015c0b5a08..bf47ed0278f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -951,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); + perf_counter_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) -- cgit v1.2.3 From 79ffab34391933ee3b95dac7f25c0478fa2f8f1e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 May 2009 15:13:42 -0400 Subject: ext4: Properly initialize the buffer_head state These struct buffer_heads are allocated on the stack (and hence are initialized with stack garbage). They are only used to call a get_blocks() function, so that's mostly OK, but b_state must be initialized to be 0 so we don't have any unexpected BH_* flags set by accident, such as BH_Unwritten or BH_Delay. 
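As an aside, the pattern this patch enforces fits in a few lines. The sketch below is illustrative only (the helper name is made up, not part of the patch); it shows how an on-stack buffer_head should be prepared before being handed to a get_blocks()-style callback:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Illustrative sketch: an on-stack buffer_head starts out as stack
 * garbage, so the fields a get_block() implementation inspects must be
 * cleared explicitly.  A stale BH_Unwritten or BH_Delay bit would send
 * the mapping code down the wrong path.
 */
static int map_one_block(struct inode *inode, sector_t iblock,
			 get_block_t *get_block)
{
	struct buffer_head map_bh;

	map_bh.b_state = 0;			/* no stale BH_* flags */
	map_bh.b_size = 1 << inode->i_blkbits;	/* ask for one block */

	return get_block(inode, iblock, &map_bh, 0 /* no create */);
}
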
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 15 ++++++++++++++- fs/mpage.c | 6 ++++-- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e3a55eb8b26..a953214f282 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3150,6 +3150,7 @@ retry: ret = PTR_ERR(handle); break; } + map_bh.b_state = 0; ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a9ffd528dd..d7ad0bb73cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2055,7 +2055,20 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if ((mpd->b_state & (1 << BH_Mapped)) && !(mpd->b_state & (1 << BH_Delay))) return 0; - new.b_state = mpd->b_state; + /* + * We need to make sure the BH_Delay flag is passed down to + * ext4_da_get_block_write(), since it calls + * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag. + * This flag causes ext4_get_blocks_wrap() to call + * ext4_da_update_reserve_space() if the passed buffer head + * has the BH_Delay flag set. In the future, once we clean up + * the interfaces to ext4_get_blocks_wrap(), we should pass in + * a separate flag which requests that the delayed allocation + * statistics should be updated, instead of depending on the + * state information getting passed down via the map_bh's + * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag. + */ + new.b_state = mpd->b_state & (1 << BH_Delay); new.b_blocknr = 0; new.b_size = mpd->b_size; next = mpd->b_blocknr; diff --git a/fs/mpage.c b/fs/mpage.c index 680ba60863f..42381bd6543 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, lru); @@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block) struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block); if (bio) -- cgit v1.2.3 From 8fb0e342481c4d80040670fec915f0b9c7c6499a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 12 May 2009 16:22:37 -0400 Subject: vfs: Add BUG_ON for delayed and unwritten flags in submit_bh() The BH_Delay and BH_Unwritten flags should never leak out to submit_bh(). So add some BUG_ON() checks to submit_bh so we can get a stack trace and determine how and why this might have happened. (Note that only XFS and ext4 use these buffer head flags, and XFS does not use submit_bh(). So this patch should only modify behavior for ext4.) 
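For context, the invariant behind these assertions is that delayed/unwritten state must be resolved into a real mapping before a buffer reaches the block layer. Below is a minimal, illustrative sketch of a caller honoring that contract (the helper name and surrounding logic are assumptions, not part of the patch):

#include <linux/fs.h>
#include <linux/buffer_head.h>

/*
 * Illustrative sketch: BH_Delay and BH_Unwritten are filesystem-internal
 * states.  They must be resolved (block allocated, extent converted) and
 * the buffer marked mapped before submit_bh() may see it; the new
 * BUG_ON()s catch any leak of these states into the block layer.
 */
static void resolve_and_submit(struct buffer_head *bh, sector_t pblock,
			       struct block_device *bdev)
{
	if (buffer_delay(bh) || buffer_unwritten(bh)) {
		bh->b_blocknr = pblock;		/* now backed by a real block */
		bh->b_bdev = bdev;
		clear_buffer_delay(bh);
		clear_buffer_unwritten(bh);
		set_buffer_mapped(bh);
	}

	lock_buffer(bh);
	get_bh(bh);
	bh->b_end_io = end_buffer_write_sync;
	submit_bh(WRITE, bh);
}
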
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org --- fs/buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb..ad011290022 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2933,6 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); + BUG_ON(buffer_delay(bh)); + BUG_ON(buffer_unwritten(bh)); /* * Mask in barrier bit for a write (could be either a WRITE or a -- cgit v1.2.3 From 29fa89d088941d79765d60f22d5ccdd6b8696e11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 12 May 2009 16:30:27 -0400 Subject: ext4: Mark the unwritten buffer_head as mapped during write_begin Setting BH_Unwritten buffer_heads as BH_Mapped avoids multiple (unnecessary) calls to get_block() during the call to the write(2) system call. Setting BH_Unwritten buffer heads as BH_Mapped requires that the writepages() functions can handle BH_Unwritten buffer_heads. After this commit, things work as follows: ext4_ext_get_block() returns unmapped, unwritten, buffer head when called with create = 0 for prealloc space. This makes sure we handle the read path and non-delayed allocation case correctly. Even though the buffer head is marked unmapped we have valid b_blocknr and b_bdev values in the buffer_head. ext4_da_get_block_prep() called for block resrevation will now return mapped, unwritten, new buffer_head for prealloc space. This avoids multiple calls to get_block() for write to same offset. By making such buffers as BH_New, we also assure that sub-block zeroing of buffered writes happens correctly. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 +-- fs/ext4/inode.c | 82 +++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 54 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a953214f282..ea5c47608ce 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2872,6 +2872,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (create == EXT4_CREATE_UNINITIALIZED_EXT) goto out; if (!create) { + if (allocated > max_blocks) + allocated = max_blocks; /* * We have blocks reserved already. We * return allocated blocks so that delalloc @@ -2879,8 +2881,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * the buffer head will be unmapped so that * a read from the block returns 0s. 
*/ - if (allocated > max_blocks) - allocated = max_blocks; set_buffer_unwritten(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7ad0bb73cd..96f3366f59f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1852,7 +1852,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) * @logical - first logical block to start assignment with * * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay + * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten */ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *exbh) @@ -1902,16 +1902,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, do { if (cur_logical >= logical + blocks) break; - if (buffer_delay(bh)) { - bh->b_blocknr = pblock; - clear_buffer_delay(bh); - bh->b_bdev = inode->i_sb->s_bdev; - } else if (buffer_unwritten(bh)) { - bh->b_blocknr = pblock; - clear_buffer_unwritten(bh); - set_buffer_mapped(bh); - set_buffer_new(bh); - bh->b_bdev = inode->i_sb->s_bdev; + + if (buffer_delay(bh) || + buffer_unwritten(bh)) { + + BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); + + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } else { + /* + * unwritten already should have + * blocknr assigned. Verify that + */ + clear_buffer_unwritten(bh); + BUG_ON(bh->b_blocknr != pblock); + } + } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -2053,7 +2061,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * We consider only non-mapped and non-allocated blocks */ if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay))) + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten))) return 0; /* * We need to make sure the BH_Delay flag is passed down to @@ -2205,6 +2214,17 @@ flush_it: return; } +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation. + * We also need to consider unwritten buffer as unmapped. + */ + return (!buffer_mapped(bh) || buffer_delay(bh) || + buffer_unwritten(bh)) && buffer_dirty(bh); +} + /* * __mpage_da_writepage - finds extent of pages and blocks * @@ -2289,8 +2309,7 @@ static int __mpage_da_writepage(struct page *page, * Otherwise we won't make progress * with the page in ext4_da_writepage */ - if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { + if (ext4_bh_unmapped_or_delay(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, bh->b_size, bh->b_state); @@ -2318,6 +2337,14 @@ static int __mpage_da_writepage(struct page *page, /* * this is a special callback for ->write_begin() only * it's intention is to return mapped block or reserve space + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. + * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. 
+ * */ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2353,28 +2380,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, set_buffer_delay(bh_result); } else if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); - /* - * With sub-block writes into unwritten extents - * we also need to mark the buffer as new so that - * the unwritten parts of the buffer gets correctly zeroed. - */ - if (buffer_unwritten(bh_result)) + if (buffer_unwritten(bh_result)) { + /* A delayed write to unwritten bh should + * be marked new and mapped. Mapped ensures + * that we don't do get_block multiple times + * when we write to the same offset and new + * ensures that we do proper zero out for + * partial write. + */ set_buffer_new(bh_result); + set_buffer_mapped(bh_result); + } ret = 0; } return ret; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - /* - * unmapped buffer is possible for holes. - * delay buffer is possible with delayed allocation - */ - return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); -} - static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -2828,7 +2850,7 @@ static int ext4_da_should_update_i_disksize(struct page *page, for (i = 0; i < idx; i++) bh = bh->b_this_page; - if (!buffer_mapped(bh) || (buffer_delay(bh))) + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } -- cgit v1.2.3 From c5ca7c7636fa689a9746b6032f83aa7fffec31c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Apr 2009 22:48:48 -0400 Subject: ext4: Fallback to vmalloc if kmalloc can't allocate s_flex_groups array For very large filesystems, the s_flex_groups array can get quite big. For example, a filesystem that can be resized up to 16TB will have 8192 flex groups (assuming the default flex_bg size of 16), so the array is 96k, which is *very* marginal for kmalloc(). On the other hand, a 160GB filesystem without the resize_inode feature will only require 960 bytes. So we try to allocate the array first using kmalloc(), and if that fails, we'll try to use vmalloc() instead. 
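The allocate-with-fallback logic and its matching free can be captured in a pair of small helpers. The sketch below is illustrative only; the patch open-codes the same steps at each call site rather than introducing helpers like these:

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/string.h>

/*
 * Sketch: try the cheap, physically contiguous allocation first and fall
 * back to vmalloc() for the rare huge-filesystem case.  The free side
 * must check which allocator actually satisfied the request.
 */
static void *alloc_flex_array(size_t size)
{
	void *p = kzalloc(size, GFP_KERNEL);

	if (!p) {
		p = vmalloc(size);
		if (p)
			memset(p, 0, size);	/* vmalloc() does not zero */
	}
	return p;
}

static void free_flex_array(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}

Later kernels grew kvmalloc()/kvfree() for exactly this pattern.
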
Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e6f22..3f4475daa66 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -586,7 +587,10 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_flex_groups); + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1620,6 +1624,7 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; + size_t size; int i; if (!sbi->s_es->s_log_groups_per_flex) { @@ -1634,8 +1639,13 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - sbi->s_flex_groups = kzalloc(flex_group_count * - sizeof(struct flex_groups), GFP_KERNEL); + size = flex_group_count * sizeof(struct flex_groups); + sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + sbi->s_flex_groups = vmalloc(size); + if (sbi->s_flex_groups) + memset(sbi->s_flex_groups, 0, size); + } if (sbi->s_flex_groups == NULL) { printk(KERN_ERR "EXT4-fs: not enough memory for " "%u flex groups\n", flex_group_count); @@ -2842,6 +2852,12 @@ failed_mount4: sbi->s_journal = NULL; } failed_mount3: + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + } percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); -- cgit v1.2.3 From f7c439504ccba0cca43271e651013ab97a221c62 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 23:31:59 -0400 Subject: ext4: Use is_power_of_2() for clarity Signed-off-by: Robert P. J. Day Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3f4475daa66..3e509bc647e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1483,7 +1483,7 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; - if (option & (option - 1)) { + if (!is_power_of_2(option)) { printk(KERN_ERR "EXT4-fs: inode_readahead_blks" " must be a power of 2\n"); return 0; @@ -2101,8 +2101,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, if (parse_strtoul(buf, 0x40000000, &t)) return -EINVAL; - /* inode_readahead_blks must be a power of 2 */ - if (t & (t-1)) + if (!is_power_of_2(t)) return -EINVAL; sbi->s_inode_readahead_blks = t; -- cgit v1.2.3 From e2d670523c6c4ccb0fca9f3ab1b8f066d9aa57d6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 00:33:44 -0400 Subject: ext4: Simplify ext4_commit_super()'s function signature The ext4_commit_super() function took both a struct super_block * and a struct ext4_super_block *, but the struct ext4_super_block can be derived from the struct super_block. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3e509bc647e..ad4c9be4abd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -54,8 +54,7 @@ static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync); +static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static void ext4_clear_journal_err(struct super_block *sb, @@ -306,7 +305,7 @@ static void ext4_handle_error(struct super_block *sb) printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); @@ -448,7 +447,7 @@ __acquires(bitlock) if (test_opt(sb, ERRORS_CONT)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 0); + ext4_commit_super(sb, 0); return; } ext4_unlock_group(sb, grp); @@ -577,7 +576,7 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); @@ -1596,7 +1595,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", @@ -2655,7 +2654,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (test_opt(sb, ERRORS_PANIC)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); goto failed_mount4; } } @@ -3132,15 +3131,15 @@ static int ext4_load_journal(struct super_block *sb, sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. 
*/ - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } return 0; } -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync) +static int ext4_commit_super(struct super_block *sb, int sync) { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; @@ -3212,7 +3211,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); sb->s_dirt = 0; - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } unlock_super(sb); @@ -3253,7 +3252,7 @@ static void ext4_clear_journal_err(struct super_block *sb, EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); } @@ -3293,7 +3292,7 @@ static void ext4_write_super(struct super_block *sb) BUG(); sb->s_dirt = 0; } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + ext4_commit_super(sb, 1); } } @@ -3312,7 +3311,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) target); } } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); + ext4_commit_super(sb, wait); } return ret; } @@ -3345,7 +3344,7 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + error = ext4_commit_super(sb, 1); if (error) goto out; } @@ -3365,7 +3364,7 @@ static int ext4_unfreeze(struct super_block *sb) lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + ext4_commit_super(sb, 1); unlock_super(sb); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } @@ -3520,7 +3519,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } if (sbi->s_journal == NULL) - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA /* Release old quota file names */ -- cgit v1.2.3 From 7234ab2a55e77784b44cf2d862136d9e41b8d98a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 30 Apr 2009 21:24:04 -0400 Subject: ext4: Fix and simplify s_dirt handling The s_dirt flag wasn't completely handled correctly, but it didn't really matter when journalling was enabled. It turns out that when ext4 runs without a journal, we don't clear s_dirt in places where we should have, with the result that the high-level write_super() function was writing the superblock when it wasn't necessary. So we fix this by making ext4_commit_super() clear the s_dirt flag, and removing many of the other places where s_dirt is manipulated. When journalling is enabled, the s_dirt flag might be left set more often, but s_dirt really doesn't matter when journalling is enabled. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad4c9be4abd..7c7a08af120 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3128,7 +3128,6 @@ static int ext4_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. 
*/ ext4_commit_super(sb, 1); @@ -3168,7 +3167,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); - + sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { @@ -3210,7 +3209,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - sb->s_dirt = 0; ext4_commit_super(sb, 1); } unlock_super(sb); @@ -3271,10 +3269,8 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - sb->s_dirt = 0; + if (journal) ret = ext4_journal_force_commit(journal); - } return ret; } @@ -3282,15 +3278,13 @@ int ext4_force_commit(struct super_block *sb) /* * Ext4 always journals updates to the superblock itself, so we don't * have to propagate any other updates to the superblock on disk at this - * point. (We can probably nuke this function altogether, and remove - * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) + * point if the journalling is enabled. */ static void ext4_write_super(struct super_block *sb) { if (EXT4_SB(sb)->s_journal) { if (mutex_trylock(&sb->s_lock) != 0) BUG(); - sb->s_dirt = 0; } else { ext4_commit_super(sb, 1); } @@ -3302,7 +3296,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); - sb->s_dirt = 0; if (EXT4_SB(sb)->s_journal) { if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { @@ -3324,7 +3317,6 @@ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal; - sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { journal = EXT4_SB(sb)->s_journal; -- cgit v1.2.3 From 9ca92389c5312a51e819c15c762f0abdc7f3129b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 12:52:25 -0400 Subject: ext4: Use separate super_operations structure for no_journal filesystems By using a separate super_operations structure for filesystems that have and don't have journals, we can simply ext4_write_super() --- which is only needed when no journal is present --- and ext4_freeze(), ext4_unfreeze(), and ext4_sync_fs(), which are only needed when the journal is present. 
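The mechanism is a mount-time choice between two operations tables, so methods that only make sense in one mode simply never appear in the other. A generic, illustrative sketch (the names here are placeholders, not the real ext4 symbols):

#include <linux/fs.h>

/* Table used when a journal is present: superblock changes ride along
 * with journalled metadata, so no ->write_super is needed. */
extern const struct super_operations my_journal_sops;

/* Table used without a journal: ->write_super flushes the superblock
 * directly, and the journal-only methods are absent. */
extern const struct super_operations my_nojournal_sops;

static void pick_super_ops(struct super_block *sb, bool has_journal)
{
	/* Decide once at mount time instead of testing for a journal
	 * inside every method. */
	sb->s_op = has_journal ? &my_journal_sops : &my_nojournal_sops;
}
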
Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 108 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7c7a08af120..68c3a44c4a9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -995,7 +995,6 @@ static const struct super_operations ext4_sops = { .dirty_inode = ext4_dirty_inode, .delete_inode = ext4_delete_inode, .put_super = ext4_put_super, - .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, @@ -1010,6 +1009,25 @@ static const struct super_operations ext4_sops = { .bdev_try_to_free_page = bdev_try_to_free_page, }; +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .delete_inode = ext4_delete_inode, + .write_super = ext4_write_super, + .put_super = ext4_put_super, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .clear_inode = ext4_clear_inode, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, @@ -2615,7 +2633,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* * set up enough so that it can read an inode */ - sb->s_op = &ext4_sops; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -3275,19 +3297,9 @@ int ext4_force_commit(struct super_block *sb) return ret; } -/* - * Ext4 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this - * point if the journalling is enabled. - */ static void ext4_write_super(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal) { - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - } else { - ext4_commit_super(sb, 1); - } + ext4_commit_super(sb, 1); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -3296,15 +3308,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); - if (EXT4_SB(sb)->s_journal) { - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, - &target)) { - if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, - target); - } - } else { - ext4_commit_super(sb, wait); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); } return ret; } @@ -3318,32 +3324,31 @@ static int ext4_freeze(struct super_block *sb) int error = 0; journal_t *journal; - if (!(sb->s_flags & MS_RDONLY)) { - journal = EXT4_SB(sb)->s_journal; + if (sb->s_flags & MS_RDONLY) + return 0; - if (journal) { - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + journal = EXT4_SB(sb)->s_journal; - /* - * We don't want to clear needs_recovery flag when we - * failed to flush the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; - } + /* Now we set up the journal barrier. 
*/ + jbd2_journal_lock_updates(journal); - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, 1); - if (error) - goto out; + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) { + out: + jbd2_journal_unlock_updates(journal); + return error; } + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); + if (error) + goto out; return 0; -out: - jbd2_journal_unlock_updates(journal); - return error; } /* @@ -3352,14 +3357,15 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { - lock_super(sb); - /* Reser the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, 1); - unlock_super(sb); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } + if (sb->s_flags & MS_RDONLY) + return 0; + + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + unlock_super(sb); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return 0; } -- cgit v1.2.3 From 8df9675f8b498d0bfa1f0b5b06f56bf1ff366dd5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 08:50:38 -0400 Subject: ext4: Avoid races caused by on-line resizing and SMP memory reordering Ext4's on-line resizing adds a new block group and then, only at the last step adjusts s_groups_count. However, it's possible on SMP systems that another CPU could see the updated the s_group_count and not see the newly initialized data structures for the just-added block group. For this reason, it's important to insert a SMP read barrier after reading s_groups_count and before reading any (for example) the new block group descriptors allowed by the increased value of s_groups_count. Unfortunately, we rather blatently violate this locking protocol documented in fs/ext4/resize.c. Fortunately, (1) on-line resizes happen relatively rarely, and (2) it seems rare that the filesystem code will immediately try to use just-added block group before any memory ordering issues resolve themselves. So apparently problems here are relatively hard to hit, since ext3 has been vulnerable to the same issue for years with no one apparently complaining. 
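The reader side of the protocol is exactly the helper this patch introduces; it is shown here in isolation, with the pairing rule spelled out, as a reference (the writer side lives in the on-line resize path):

/*
 * Publication protocol for s_groups_count (see the comments in
 * fs/ext4/resize.c): the resize path fully initializes the new group's
 * structures, issues smp_wmb(), and only then bumps s_groups_count.
 * Readers pair that with an smp_rmb() after loading the count, before
 * touching any group the count covers.
 */
static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;

	smp_rmb();		/* pairs with smp_wmb() in resize */
	return ngroups;
}
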
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 15 +++++++-------- fs/ext4/ext4.h | 12 ++++++++++++ fs/ext4/ialloc.c | 40 +++++++++++++++++++--------------------- fs/ext4/inode.c | 7 ++++--- fs/ext4/mballoc.c | 45 ++++++++++++++++++++++++--------------------- fs/ext4/super.c | 3 +-- 6 files changed, 67 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 53c72ad8587..a5ba039850c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -88,6 +88,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { int bit, bit_max; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned free_blocks, group_blocks; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -123,7 +124,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, bit_max += ext4_bg_num_gdb(sb, block_group); } - if (block_group == sbi->s_groups_count - 1) { + if (block_group == ngroups - 1) { /* * Even though mke2fs always initialize first and last group * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need @@ -131,7 +132,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ group_blocks = ext4_blocks_count(sbi->s_es) - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); + (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -205,18 +206,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, { unsigned int group_desc; unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (block_group >= sbi->s_groups_count) { + if (block_group >= ngroups) { ext4_error(sb, "ext4_get_group_desc", "block_group >= groups_count - " "block_group = %u, groups_count = %u", - block_group, sbi->s_groups_count); + block_group, ngroups); return NULL; } - smp_rmb(); group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); @@ -665,7 +666,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; @@ -677,7 +678,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count = 0; gdp = NULL; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) @@ -700,7 +700,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) return bitmap_count; #else desc_count = 0; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d0f15ef56de..02ec44bf38e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1228,6 +1228,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, return grp_info[indexv][indexh]; } +/* + * Reading s_groups_count requires using smp_rmb() afterwards. 
See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18e0a08a6b..55ba419ca00 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -316,7 +316,7 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent, ext4_group_t *best_group) { - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned int freei, avefreei; struct ext4_group_desc *desc, *best_desc = NULL; ext4_group_t group; @@ -353,7 +353,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); int flex_size = ext4_flex_bg_size(sbi); ext4_group_t best_flex = parent_fbg_group; int blocks_per_flex = sbi->s_blocks_per_group * flex_size; @@ -362,7 +362,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, ext4_group_t n_fbg_groups; ext4_group_t i; - n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + n_fbg_groups = (ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; find_close_to_parent: @@ -478,20 +478,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i, grp, g; + ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + ngroups = real_ngroups; if (flex_size > 1) { - ngroups = (ngroups + flex_size - 1) >> + ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } @@ -543,7 +544,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, */ grp *= flex_size; for (i = 0; i < flex_size; i++) { - if (grp+i >= sbi->s_groups_count) + if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { @@ -583,7 +584,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, } fallback: - ngroups = sbi->s_groups_count; + ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; @@ -613,9 +614,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; - ext4_group_t i, last; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* @@ -799,11 +799,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) 
struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; - ext4_group_t group = 0; + ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; - struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; @@ -818,15 +817,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); - es = sbi->s_es; if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); @@ -856,7 +854,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -917,7 +915,7 @@ repeat_in_this_group: * group descriptor metadata has not yet been updated. * So we just go onto the next blockgroup. */ - if (++group == sbi->s_groups_count) + if (++group == ngroups) group = 0; } err = -ENOSPC; @@ -1158,7 +1156,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; @@ -1168,7 +1166,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) desc_count = 0; bitmap_count = 0; gdp = NULL; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1190,7 +1188,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) return desc_count; #else desc_count = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1205,9 +1203,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 96f3366f59f..4e7f363e303 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4965,7 +4965,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) */ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) { - int groups, gdpblocks; + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; int idxblocks; int ret = 0; @@ -4992,8 +4993,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) groups += nrblocks; gdpblocks = groups; - if (groups > EXT4_SB(inode->i_sb)->s_groups_count) - groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > ngroups) + groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f871677a798..c3af9e6b666 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block 
*sb, static int ext4_mb_init_cache(struct page *page, char *incore) { + ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; @@ -757,6 +758,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) inode = page->mapping->host; sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; blocks_per_page = PAGE_CACHE_SIZE / blocksize; @@ -780,7 +782,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) for (i = 0; i < groups_per_page; i++) { struct ext4_group_desc *desc; - if (first_group + i >= EXT4_SB(sb)->s_groups_count) + if (first_group + i >= ngroups) break; err = -EIO; @@ -852,7 +854,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct ext4_group_info *grinfo; group = (first_block + i) >> 1; - if (group >= EXT4_SB(sb)->s_groups_count) + if (group >= ngroups) break; /* @@ -1788,6 +1790,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) int block, pnum; int blocks_per_page; int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t first_group; struct ext4_group_info *grp; @@ -1807,7 +1810,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) /* read all groups the page covers into the cache */ for (i = 0; i < groups_per_page; i++) { - if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + if ((first_group + i) >= ngroups) break; grp = ext4_get_group_info(sb, first_group + i); /* take all groups write allocation @@ -1945,8 +1948,7 @@ err: static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t group; - ext4_group_t i; + ext4_group_t ngroups, group, i; int cr; int err = 0; int bsbits; @@ -1957,6 +1959,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) sb = ac->ac_sb; sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -2017,11 +2020,11 @@ repeat: */ group = ac->ac_g_ex.fe_group; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { + for (i = 0; i < ngroups; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; - if (group == EXT4_SB(sb)->s_groups_count) + if (group == ngroups) group = 0; /* quick check to skip empty groups */ @@ -2315,12 +2318,10 @@ static struct file_operations ext4_mb_seq_history_fops = { static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; - group = *pos + 1; return (void *) ((unsigned long) group); } @@ -2328,11 +2329,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ++*pos; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); @@ -2587,6 +2587,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) static int ext4_mb_init_backend(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2598,7 +2599,7 @@ static int 
ext4_mb_init_backend(struct super_block *sb) struct ext4_group_desc *desc; /* This is the number of blocks used by GDT */ - num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); /* @@ -2644,7 +2645,7 @@ static int ext4_mb_init_backend(struct super_block *sb) for (i = 0; i < num_meta_group_infos; i++) { if ((i + 1) == num_meta_group_infos) metalen = sizeof(*meta_group_info) * - (sbi->s_groups_count - + (ngroups - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { @@ -2655,7 +2656,7 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_group_info[i] = meta_group_info; } - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR @@ -2781,13 +2782,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) int ext4_mb_release(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_group_info) { - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); @@ -2797,7 +2799,7 @@ int ext4_mb_release(struct super_block *sb) ext4_unlock_group(sb, i); kfree(grinfo); } - num_meta_group_infos = (sbi->s_groups_count + + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) @@ -4121,7 +4123,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - ext4_group_t i; + ext4_group_t ngroups, i; printk(KERN_ERR "EXT4-fs: Can't allocate:" " Allocation context details:\n"); @@ -4145,7 +4147,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, ac->ac_found); printk(KERN_ERR "EXT4-fs: groups: \n"); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; @@ -4469,13 +4472,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", sb->s_id, needed); - for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { + for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 68c3a44c4a9..fcd7b24c6df 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3557,9 +3557,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { - ext4_group_t ngroups = sbi->s_groups_count, i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - smp_rmb(); /* * Compute the overhead (FS structures). 
This is constant -- cgit v1.2.3 From 114e9fc90703bd6aac0229fb559e97caa6c49770 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 15:48:07 -0400 Subject: ext4: Remove outdated comment about lock_super() ext4_fill_super() is no longer called by read_super(), and it is no longer called with the superblock locked. The unlock_super()/lock_super() is no longer present, so this comment is entirely superfluous. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fcd7b24c6df..e3b35f26d5f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2828,14 +2828,6 @@ no_journal: goto failed_mount4; }; - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; -- cgit v1.2.3 From a63c9eb2ce6f5028da90f282798232c4f398ceb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 01:59:42 -0400 Subject: ext4: ext4_mark_recovery_complete() doesn't need to use lock_super The function ext4_mark_recovery_complete() is called from two call paths: either (a) while mounting the filesystem, in which case there's no danger of any other CPU calling write_super() until the mount is completed, and (b) while remounting the filesystem read-write, in which case the fs core has already locked the superblock. This also allows us to take out a very vile unlock_super()/lock_super() pair in ext4_remount(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e3b35f26d5f..45d0ada9bfc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3219,13 +3219,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); } - unlock_super(sb); out: jbd2_journal_unlock_updates(journal); @@ -3436,15 +3434,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - if (sbi->s_journal) { - unlock_super(sb); + if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); - lock_super(sb); - } } else { int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, -- cgit v1.2.3 From 3b9d4ed26680771295d904a6b83e88e620780893 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 22:54:04 -0400 Subject: ext4: Replace lock/unlock_super() with an explicit lock for the orphan list Use a separate lock to protect the orphan list, so we can stop overloading the use of lock_super(). 
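The shape of the change is a standard lock-splitting refactor: the orphan list gets its own mutex, initialized at mount time, so the coarse superblock lock no longer serializes unrelated paths. A condensed, illustrative sketch with generic names (not the actual ext4 code):

#include <linux/list.h>
#include <linux/mutex.h>

struct orphan_state {
	struct list_head s_orphan;	/* unlinked-but-open inodes */
	struct mutex s_orphan_lock;	/* protects only this list */
};

static void orphan_state_init(struct orphan_state *s)
{
	INIT_LIST_HEAD(&s->s_orphan);
	mutex_init(&s->s_orphan_lock);
}

static void orphan_add(struct orphan_state *s, struct list_head *entry)
{
	mutex_lock(&s->s_orphan_lock);	/* was: lock_super(sb) */
	if (list_empty(entry))
		list_add(entry, &s->s_orphan);
	mutex_unlock(&s->s_orphan_lock);
}

static void orphan_del(struct orphan_state *s, struct list_head *entry)
{
	mutex_lock(&s->s_orphan_lock);
	if (!list_empty(entry))
		list_del_init(entry);
	mutex_unlock(&s->s_orphan_lock);
}
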
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 1 + fs/ext4/namei.c | 20 +++++++++++--------- fs/ext4/super.c | 1 + 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 57b71fefbcc..4bda2f75d42 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -71,6 +71,7 @@ struct ext4_sb_info { struct inode *s_journal_inode; struct journal_s *s_journal; struct list_head s_orphan; + struct mutex s_orphan_lock; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 22098e1cd08..8018e49a728 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_orphan_lock); if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; @@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on lock_super()), so race with ext4_link() which might bump + * here (on s_orphan_lock), so race with ext4_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); ext4_std_error(inode->i_sb, err); return err; } @@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) out_err: ext4_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 45d0ada9bfc..7f43fde9554 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2645,6 +2645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->dq_op = &ext4_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); sb->s_root = NULL; -- cgit v1.2.3 From 32ed5058ce90024efcd811254b4b1de0468099df Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 22:53:39 -0400 Subject: ext4: Replace lock/unlock_super() with an explicit lock for resizing Use a separate lock to protect s_groups_count and the other block group descriptors which get changed via an on-line resize operation, so we can stop overloading the use of lock_super(). 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 1 + fs/ext4/resize.c | 35 ++++++++++++++++++----------------- fs/ext4/super.c | 1 + 3 files changed, 20 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 4bda2f75d42..2d36223d5f5 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -72,6 +72,7 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; + struct mutex s_resize_lock; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 546c7dd869e..e8ded13b5cb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -193,7 +193,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -302,7 +302,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -643,11 +643,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -809,7 +810,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -840,7 +841,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. 
@@ -948,7 +949,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sb->s_dirt = 1; exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -986,7 +987,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); o_groups_count = EXT4_SB(sb)->s_groups_count; @@ -1056,11 +1057,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_resize_lock); if (o_blocks_count != ext4_blocks_count(es)) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1070,14 +1071,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); sb->s_dirt = 1; - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f43fde9554..1fbf0906ae2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2646,6 +2646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; -- cgit v1.2.3 From db2dbb12dc47a50c7a4c5678f526014063e486f6 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Apr 2009 14:08:13 +0200 Subject: block: implement blkdev_readpages Doing a proper block dev ->readpages() speeds up the crazy dump(8) approach of using interleaved process IO. 
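The win comes from batched readahead: once the block device has a ->readpages() method, a readahead request can be issued as a few large multi-page reads instead of one request per page. A rough way to exercise that path from user space is the Linux readahead(2) call; the device path below is only an example, and whether a given request actually reaches ->readpages() depends on the kernel's readahead heuristics.

/* build: cc readahead_demo.c -o readahead_demo */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *dev = argc > 1 ? argv[1] : "/dev/sda";      /* example path */
        int fd = open(dev, O_RDONLY);

        if (fd < 0) {
                perror(dev);
                return 1;
        }
        /* Populate the page cache for the first 64 MiB of the device. */
        if (readahead(fd, 0, 64 << 20) != 0)
                perror("readahead");
        close(fd);
        return 0;
}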
Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index f45dbc18dd1..a85fe310fc6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -331,6 +331,12 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } +static int blkdev_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); +} + static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1399,6 +1405,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, + .readpages = blkdev_readpages, .writepage = blkdev_writepage, .sync_page = block_sync_page, .write_begin = blkdev_write_begin, -- cgit v1.2.3 From 75507efb1372b6acf1aa6bf00ebd49ce196fd994 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 12:58:36 -0400 Subject: ext4: Don't avoid using BLOCK_UNINIT block groups in mballoc By avoiding the use of not-yet-used block groups (i.e., block groups with the BLOCK_UNINIT flag), mballoc had a tendency to create large files with large non-contiguous gaps. In addition avoiding the use of new block groups had a tendency to push regular file data into the first block group in a flex_bg group, which slows down the speed of e2fsck pass 2, since it has a tendency to seek much more. For example: Before Patch After Patch Time in seconds Time in seconds Real / User/ Sys MB/s Real / User/ Sys MB/s Pass 1 8.52 / 2.21 / 0.46 20.43 8.84 / 4.97 / 1.11 19.68 Pass 2 21.16 / 1.02 / 1.86 11.30 6.54 / 1.77 / 1.78 36.39 Pass 3 0.01 / 0.00 / 0.00 139.00 0.01 / 0.01 / 0.00 128.90 Pass 4 0.16 / 0.15 / 0.00 0.00 0.17 / 0.17 / 0.00 0.00 Pass 5 2.52 / 1.99 / 0.09 0.79 2.31 / 1.78 / 0.06 0.86 Total 32.40 / 5.11 / 2.49 12.81 17.99 / 8.75 / 2.98 23.01 This was on a sample 80 gig root filesystem which was approximately 50% full. Note the improved e2fsck pass 2 performance, by over a factor of 3, due to a decreased number of seeks. (The total amount of I/O in pass 2 was unchanged; the layout of the directory blocks was simply much better from e2fsck's's perspective.) Other changes as a result of this patch on this sample filesystem: Before Patch After Patch # of non-contig files 762 779 # of non-contig directories 571 570 # of BLOCK_UNINIT bg's 307 293 # of INODE_UNINIT bg's 503 503 Out of 640 block groups, of which 333 were in use, this patch caused an extra 14 block groups to be utilized. The number of non-contiguous files did go up slightly, but when measured against the 99.9% of the files (603,154) which were contiguously allocated, this is pretty insignificant. 
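For reference, contiguity counts like the ones above can be reproduced with filefrag(8), or with a bare-bones FIBMAP walk such as the sketch below (root only, one ioctl per block, and the program name and output format are only illustrative):

/* build: cc fibmap_frag.c -o fibmap_frag ; run as root: ./fibmap_frag FILE */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <linux/fs.h>           /* FIBMAP, FIGETBSZ */

int main(int argc, char **argv)
{
        int fd, bsz, prev = 0, frags = 0;
        unsigned long i, nblocks;
        struct stat st;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || fstat(fd, &st) < 0 || ioctl(fd, FIGETBSZ, &bsz) < 0) {
                perror(argv[1]);
                return 1;
        }
        nblocks = (st.st_size + bsz - 1) / bsz;
        for (i = 0; i < nblocks; i++) {
                int blk = i;

                if (ioctl(fd, FIBMAP, &blk) < 0) {
                        perror("FIBMAP");
                        return 1;
                }
                /* Count a new fragment whenever a mapped block does not
                 * directly follow the previously mapped block. */
                if (blk && blk != prev + 1)
                        frags++;
                if (blk)
                        prev = blk;
        }
        printf("%s: %d fragment(s) across %lu block(s)\n", argv[1], frags, nblocks);
        close(fd);
        return 0;
}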
Signed-off-by: "Theodore Ts'o" Signed-off-by: Andreas Dilger --- fs/ext4/mballoc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c3af9e6b666..dbd47eac13e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1728,7 +1728,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, unsigned free, fragments; unsigned i, bits; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); - struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); @@ -1744,10 +1743,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); - /* If this group is uninitialized, skip it initially */ - desc = ext4_get_group_desc(ac->ac_sb, group, NULL); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) - return 0; /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && @@ -2067,9 +2062,7 @@ repeat: ac->ac_groups_scanned++; desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0 || (desc->bg_flags & - cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && - ac->ac_2order != 0)) + if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) -- cgit v1.2.3 From d444c3c38189b3f18337a213855ac1c07af4e2d9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 13:44:33 -0400 Subject: ext4: Move the ext4_i.h header file into ext4.h There is no longer a reason for a separate ext4_i.h header file, so move it into ext4.h just to make life easier for developers to find the relevant data structures and typedefs. Should also speed up compiles slightly, too. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 122 +++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/ext4_i.h | 140 ------------------------------------------------------- 2 files changed, 121 insertions(+), 141 deletions(-) delete mode 100644 fs/ext4/ext4_i.h (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 02ec44bf38e..ba57d669cb6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -21,7 +21,10 @@ #include #include #include -#include "ext4_i.h" +#include +#include +#include +#include /* * The fourth extended filesystem constants/structures @@ -46,6 +49,19 @@ #define ext4_debug(f, a...) do {} while (0) #endif +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + + /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -515,6 +531,110 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ +/* + * storage for cached extent + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + __u32 i_state; /* Dynamic state flags for ext4 */ + + ext4_lblk_t i_dir_start_lookup; +#ifdef CONFIG_EXT4_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode jinode; + + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. 
+ */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + + /* on-disk additional length */ + __u16 i_extra_isize; + + spinlock_t i_block_reservation_lock; +}; + /* * File system states */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h deleted file mode 100644 index 4ce2187123a..00000000000 --- a/fs/ext4/ext4_i.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * ext4_i.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_i.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_I -#define _EXT4_I - -#include -#include -#include -#include - -/* data type for block offset of block group */ -typedef int ext4_grpblk_t; - -/* data type for filesystem-wide blocks number */ -typedef unsigned long long ext4_fsblk_t; - -/* data type for file logical block number */ -typedef __u32 ext4_lblk_t; - -/* data type for block group number */ -typedef unsigned int ext4_group_t; - -/* - * storage for cached extent - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; -}; - -/* - * fourth extended file system inode data in memory - */ -struct ext4_inode_info { - __le32 i_data[15]; /* unconverted */ - __u32 i_flags; - ext4_fsblk_t i_file_acl; - __u32 i_dtime; - - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to - * place a file's data blocks near its inode block, and new inodes - * near to their parent directory's inode. - */ - ext4_group_t i_block_group; - __u32 i_state; /* Dynamic state flags for ext4 */ - - ext4_lblk_t i_dir_start_lookup; -#ifdef CONFIG_EXT4_FS_XATTR - /* - * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention - * between readers of EAs and writers of regular file data, so - * instead we synchronize on xattr_sem when reading or changing - * EAs. - */ - struct rw_semaphore xattr_sem; -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif - - struct list_head i_orphan; /* unlinked but open inodes */ - - /* - * i_disksize keeps track of what the inode size is ON DISK, not - * in memory. During truncate, i_size is set to the new size by - * the VFS prior to calling ext4_truncate(), but the filesystem won't - * set i_disksize to 0 until the truncate is actually under way. - * - * The intent is that i_disksize always represents the blocks which - * are used by this file. This allows recovery to restart truncate - * on orphans if we crash during truncate. We actually write i_disksize - * into the on-disk inode when writing inodes out, instead of i_size. - * - * The only time when i_disksize and i_size may be different is when - * a truncate is in progress. The only things which change i_disksize - * are ext4_get_block (growth) and ext4_truncate (shrinkth). 
- */ - loff_t i_disksize; - - /* - * i_data_sem is for serialising ext4_truncate() against - * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's - * data tree are chopped off during truncate. We can't do that in - * ext4 because whenever we perform intermediate commits during - * truncate, the inode and all the metadata blocks *must* be in a - * consistent state which allows truncation of the orphans to restart - * during recovery. Hence we must fix the get_block-vs-truncate race - * by other means, so we have i_data_sem. - */ - struct rw_semaphore i_data_sem; - struct inode vfs_inode; - struct jbd2_inode jinode; - - struct ext4_ext_cache i_cached_extent; - /* - * File creation time. Its function is same as that of - * struct timespec i_{a,c,m}time in the generic inode. - */ - struct timespec i_crtime; - - /* mballoc */ - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; - - /* ialloc */ - ext4_group_t i_last_alloc_group; - - /* allocation reservation info for delalloc */ - unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; - - /* on-disk additional length */ - __u16 i_extra_isize; - - spinlock_t i_block_reservation_lock; -}; - -#endif /* _EXT4_I */ -- cgit v1.2.3 From ca0faba0e8ac844dc0279825eb8db876b5962ea5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 3 May 2009 16:33:44 -0400 Subject: ext4: Move the ext4_sb.h header file into ext4.h There is no longer a reason for a separate ext4_sb.h header file, so move it into ext4.h just to make life easier for developers to find the relevant data structures and typedefs. Should also speed up compiles slightly, too. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 144 +++++++++++++++++++++++++++++++++++++++++++++-- fs/ext4/ext4_sb.h | 163 ------------------------------------------------------ 2 files changed, 140 insertions(+), 167 deletions(-) delete mode 100644 fs/ext4/ext4_sb.h (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ba57d669cb6..af3c906e705 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -25,6 +25,10 @@ #include #include #include +#include +#include +#include +#include /* * The fourth extended filesystem constants/structures @@ -195,9 +199,6 @@ struct flex_groups { #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ -#ifdef __KERNEL__ -#include "ext4_sb.h" -#endif /* * Macro-instructions used to manage group descriptors */ @@ -809,6 +810,136 @@ struct ext4_super_block { }; #ifdef __KERNEL__ +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ 
+ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned long s_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* Journaling */ + struct inode *s_journal_inode; + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + spinlock_t s_md_lock; + tid_t s_last_transaction; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + + /* stats for buddy allocator */ + spinlock_t s_mb_pa_lock; + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + unsigned int s_log_groups_per_flex; + struct flex_groups 
*s_flex_groups; +}; + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; @@ -824,7 +955,6 @@ static inline struct timespec ext4_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -833,6 +963,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } + +static inline spinlock_t * +sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h deleted file mode 100644 index 2d36223d5f5..00000000000 --- a/fs/ext4/ext4_sb.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * ext4_sb.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_sb.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_SB -#define _EXT4_SB - -#ifdef __KERNEL__ -#include -#include -#include -#include -#endif -#include - -/* - * fourth extended-fs super-block data in memory - */ -struct ext4_sb_info { - unsigned long s_desc_size; /* Size of a group descriptor in bytes */ - unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_blocks_per_group;/* Number of blocks in a group */ - unsigned long s_inodes_per_group;/* Number of inodes in a group */ - unsigned long s_itb_per_group; /* Number of inode table blocks per group */ - unsigned long s_gdb_count; /* Number of group descriptor blocks */ - unsigned long s_desc_per_block; /* Number of group descriptors per block */ - ext4_group_t s_groups_count; /* Number of groups in the fs */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ - loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ - struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ - struct buffer_head **s_group_desc; - unsigned long s_mount_opt; - ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; - unsigned short s_mount_state; - unsigned short s_pad; - int s_addr_per_block_bits; - int s_desc_per_block_bits; - int s_inode_size; - int s_first_ino; - unsigned int s_inode_readahead_blks; - spinlock_t s_next_gen_lock; - u32 s_next_generation; - u32 s_hash_seed[4]; - int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock *s_blockgroup_lock; - struct proc_dir_entry *s_proc; - struct kobject s_kobj; - struct completion s_kobj_unregister; - - /* Journaling */ - struct inode *s_journal_inode; - struct journal_s *s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; - struct mutex s_resize_lock; - unsigned long s_commit_interval; - u32 s_max_batch_time; - u32 s_min_batch_time; - struct block_device *journal_bdev; -#ifdef 
CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif -#ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -#endif - unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - -#ifdef EXTENTS_STATS - /* ext4 extents stats */ - unsigned long s_ext_min; - unsigned long s_ext_max; - unsigned long s_depth_max; - spinlock_t s_ext_stats_lock; - unsigned long s_ext_blocks; - unsigned long s_ext_extents; -#endif - - /* for buddy allocator */ - struct ext4_group_info ***s_group_info; - struct inode *s_buddy_cache; - long s_blocks_reserved; - spinlock_t s_reserve_lock; - spinlock_t s_md_lock; - tid_t s_last_transaction; - unsigned short *s_mb_offsets; - unsigned int *s_mb_maxs; - - /* tunables */ - unsigned long s_stripe; - unsigned int s_mb_stream_request; - unsigned int s_mb_max_to_scan; - unsigned int s_mb_min_to_scan; - unsigned int s_mb_stats; - unsigned int s_mb_order2_reqs; - unsigned int s_mb_group_prealloc; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; - - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; - int s_mb_history_cur; - int s_mb_history_max; - int s_mb_history_num; - spinlock_t s_mb_history_lock; - int s_mb_history_filter; - - /* stats for buddy allocator */ - spinlock_t s_mb_pa_lock; - atomic_t s_bal_reqs; /* number of reqs with len > 1 */ - atomic_t s_bal_success; /* we found long enough chunks */ - atomic_t s_bal_allocated; /* in blocks */ - atomic_t s_bal_ex_scanned; /* total extents scanned */ - atomic_t s_bal_goals; /* goal hits */ - atomic_t s_bal_breaks; /* too long searches */ - atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; - atomic_t s_mb_lost_chunks; - atomic_t s_mb_preallocated; - atomic_t s_mb_discarded; - - /* locality groups */ - struct ext4_locality_group *s_locality_groups; - - /* for write statistics */ - unsigned long s_sectors_written_start; - u64 s_kbytes_written; - - unsigned int s_log_groups_per_flex; - struct flex_groups *s_flex_groups; -}; - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} - -#endif /* _EXT4_SB */ -- cgit v1.2.3 From 596397b77c895d0fa3674f579c94ad5ea88ef01d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 13:49:15 -0400 Subject: ext4: Move fs/ext4/namei.h into ext4.h The fs/ext4/namei.h header file had only a single function declaration, and should have never been a standalone file. Move it into ext4.h, where should have been from the beginning. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/namei.c | 1 - fs/ext4/namei.h | 8 -------- fs/ext4/super.c | 1 - 4 files changed, 1 insertion(+), 10 deletions(-) delete mode 100644 fs/ext4/namei.h (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index af3c906e705..d9c5251d082 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1594,6 +1594,7 @@ extern const struct file_operations ext4_file_operations; /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8018e49a728..c9690b250e5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -37,7 +37,6 @@ #include "ext4.h" #include "ext4_jbd2.h" -#include "namei.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h deleted file mode 100644 index 5e4dfff36a0..00000000000 --- a/fs/ext4/namei.h +++ /dev/null @@ -1,8 +0,0 @@ -/* linux/fs/ext4/namei.h - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks - * -*/ - -extern struct dentry *ext4_get_parent(struct dentry *child); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1fbf0906ae2..d79e1c428b4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,7 +46,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "namei.h" #include "group.h" struct proc_dir_entry *ext4_proc_root; -- cgit v1.2.3 From bb23c20a851a5038b255a3c0d0aa56093c1da3f8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 19:44:44 -0400 Subject: ext4: Move fs/ext4/group.h into ext4.h Move the function prototypes in group.h into ext4.h so they are all defined in one place. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 1 - fs/ext4/ext4.h | 17 +++++++++++++++++ fs/ext4/group.h | 29 ----------------------------- fs/ext4/ialloc.c | 1 - fs/ext4/mballoc.h | 1 - fs/ext4/resize.c | 1 - fs/ext4/super.c | 1 - 7 files changed, 17 insertions(+), 34 deletions(-) delete mode 100644 fs/ext4/group.h (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a5ba039850c..92f557d957d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -19,7 +19,6 @@ #include #include "ext4.h" #include "ext4_jbd2.h" -#include "group.h" #include "mballoc.h" /* diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d9c5251d082..5973f3261b0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1270,6 +1270,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -1294,6 +1302,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); /* mballoc.c */ extern long ext4_mb_stats; @@ -1417,6 +1430,10 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { diff --git a/fs/ext4/group.h b/fs/ext4/group.h deleted file mode 100644 index c2c0a8d06d0..00000000000 --- a/fs/ext4/group.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * linux/fs/ext4/group.h - * - * Copyright (C) 2007 Cluster File Systems, Inc - * - * Author: Andreas Dilger - */ - -#ifndef _LINUX_EXT4_GROUP_H -#define _LINUX_EXT4_GROUP_H - -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -#endif /* 
_LINUX_EXT4_GROUP_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 55ba419ca00..916d05c881c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" /* * ialloc.c contains the inodes allocation and deallocation routines diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dd9e6cd5f6c..75e34f69215 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,7 +23,6 @@ #include #include "ext4_jbd2.h" #include "ext4.h" -#include "group.h" /* * with AGGRESSIVE_CHECK allocator runs consistency checks over diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e8ded13b5cb..27eb289eea3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -15,7 +15,6 @@ #include #include "ext4_jbd2.h" -#include "group.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d79e1c428b4..7903f20c807 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,7 +46,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; -- cgit v1.2.3 From f40339031b04279c3fdde7ac5fe97db33b2a7694 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 1 May 2009 20:27:20 -0400 Subject: ext4: Make the length of the mb_history file tunable In memory-constrained systems with many partitions, the ~68K for each partition for the mb_history buffer can be excessive. This patch adds a new mount option, mb_history_length, as well as a way of setting the default via a module parameter (or via a sysfs parameter in /sys/module/ext4/parameter/default_mb_history_length). If the mb_history_length is set to zero, the mb_history facility is disabled entirely. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 13 +++++++------ fs/ext4/super.c | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbd47eac13e..df75855ae6f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2413,7 +2413,8 @@ static void ext4_mb_history_release(struct super_block *sb) if (sbi->s_proc != NULL) { remove_proc_entry("mb_groups", sbi->s_proc); - remove_proc_entry("mb_history", sbi->s_proc); + if (sbi->s_mb_history_max) + remove_proc_entry("mb_history", sbi->s_proc); } kfree(sbi->s_mb_history); } @@ -2424,17 +2425,17 @@ static void ext4_mb_history_init(struct super_block *sb) int i; if (sbi->s_proc != NULL) { - proc_create_data("mb_history", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_history_fops, sb); + if (sbi->s_mb_history_max) + proc_create_data("mb_history", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_history_fops, sb); proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); } - sbi->s_mb_history_max = 1000; sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kzalloc(i, GFP_KERNEL); + sbi->s_mb_history = i ? 
kzalloc(i, GFP_KERNEL) : NULL; /* if we can't allocate history, then we simple won't use it */ } @@ -2444,7 +2445,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_mb_history h; - if (unlikely(sbi->s_mb_history == NULL)) + if (sbi->s_mb_history == NULL) return; if (!(ac->ac_op & sbi->s_mb_history_filter)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7903f20c807..39223a52bc7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -47,6 +47,13 @@ #include "xattr.h" #include "acl.h" +static int default_mb_history_length = 1000; + +module_param_named(default_mb_history_length, default_mb_history_length, + int, 0644); +MODULE_PARM_DESC(default_mb_history_length, + "Default number of entries saved for mb_history"); + struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; @@ -1042,7 +1049,7 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, + Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, @@ -1088,6 +1095,7 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1329,6 +1337,13 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; + case Opt_mb_history_length: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_mb_history_max = option; + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2345,6 +2360,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); -- cgit v1.2.3 From abc8746eb91fb01e8d411896f80f7687c0d8372e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 2 May 2009 22:54:32 -0400 Subject: ext4: hook fiemap operation for directories Add fiemap callback for directories Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c9690b250e5..f2bc160463b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2534,6 +2534,7 @@ const struct inode_operations ext4_dir_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext4_permission, + .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { -- cgit v1.2.3 From 19ba0559f9ce104171ab16706893ce01f03ef116 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 May 2009 18:12:05 -0400 Subject: vfs: Enable FS_IOC_FIEMAP and FIGETBSZ for all filetypes The fiemap and get_blk_size ioctls should be enabled even for directories. So move it outisde file_ioctl. 
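Together with the ext4 .fiemap hook for directories above, this makes the following kind of user-space query work on a directory descriptor as well as on a regular file. This is a minimal sketch: error handling is trimmed, the extent array size of 8 is arbitrary, and whether FS_IOC_FIEMAP succeeds on a directory still depends on the underlying file system.

/* build: cc fiemap_demo.c -o fiemap_demo ; run: ./fiemap_demo /path/to/dir */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FIGETBSZ, FS_IOC_FIEMAP */
#include <linux/fiemap.h>       /* struct fiemap, struct fiemap_extent */

int main(int argc, char **argv)
{
        struct fiemap *fm;
        int fd, bsz;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);   /* a directory is fine here, too */
        if (fd < 0) {
                perror(argv[1]);
                return 1;
        }
        if (ioctl(fd, FIGETBSZ, &bsz) == 0)
                printf("block size: %d\n", bsz);

        fm = calloc(1, sizeof(*fm) + 8 * sizeof(struct fiemap_extent));
        fm->fm_length = ~0ULL;          /* map the whole file or directory */
        fm->fm_extent_count = 8;        /* room for up to 8 extents */
        if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
                printf("mapped extents: %u\n", fm->fm_mapped_extents);
        else
                perror("FS_IOC_FIEMAP");

        free(fm);
        close(fd);
        return 0;
}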
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ioctl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 82d9c42b8ba..286f38dfc6c 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd, switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); - case FS_IOC_FIEMAP: - return ioctl_fiemap(filp, arg); - case FIGETBSZ: - return put_user(inode->i_sb->s_blocksize, p); case FIONREAD: return put_user(i_size_read(inode) - filp->f_pos, p); } @@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); + + case FIGETBSZ: + { + struct inode *inode = filp->f_path.dentry->d_inode; + int __user *p = (int __user *)arg; + return put_user(inode->i_sb->s_blocksize, p); + } + default: if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); -- cgit v1.2.3 From c9877b205f6ce7943bb95281342f4001cc1c00ec Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 1 May 2009 23:32:06 -0400 Subject: ext4: fix for fiemap last-block test Carl Henrik Lunde reported and debugged this; the test for the last allocated block was comparing bytes to blocks in this test: if (logical + length - 1 == EXT_MAX_BLOCK || ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) flags |= FIEMAP_EXTENT_LAST; so any extent which ended right at 4G was stopping the extent walk. Just replacing these values with the extent block & length should fix it. Also give blksize_bits a saner type, and reverse the order of the tests to make the more likely case tested first. Signed-off-by: Eric Sandeen Reported-by: Carl Henrik Lunde Tested-by: Carl Henrik Lunde Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea5c47608ce..5f7295287de 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3196,7 +3196,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, void *data) { struct fiemap_extent_info *fieinfo = data; - unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; __u64 logical; __u64 physical; __u64 length; @@ -3243,8 +3243,8 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, * * XXX this might miss a single-block extent at EXT_MAX_BLOCK */ - if (logical + length - 1 == EXT_MAX_BLOCK || - ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) + if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) flags |= FIEMAP_EXTENT_LAST; error = fiemap_fill_next_extent(fieinfo, logical, physical, -- cgit v1.2.3 From eefd7f03b86b8a319890e7fac5a6fcc7f8694b76 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 2 May 2009 19:05:37 -0400 Subject: ext4: fix the length returned by fiemap for an unallocated extent If the file's blocks have not yet been allocated because of delayed allocation, the length of the extent returned by fiemap is incorrect. This commit fixes this bug. 
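The fix in the hunk below clamps the reported length of a final delayed-allocation extent to i_size rounded up to the file system block size, using the expression (x + bs - 1) & ~(bs - 1), which assumes a power-of-two block size. A stand-alone check of that arithmetic with made-up numbers:

/* build: cc roundup_demo.c -o roundup_demo */
#include <stdio.h>

/* Round a byte count up to a block boundary; bs must be a power of two. */
static unsigned long long roundup_block(unsigned long long bytes,
                                        unsigned long long bs)
{
        return (bytes + bs - 1) & ~(bs - 1);
}

int main(void)
{
        /* 5000 bytes of not-yet-allocated data on a 4096-byte-block file
         * system spans two blocks, so fiemap should report 8192 bytes. */
        printf("%llu\n", roundup_block(5000, 4096));    /* prints 8192 */
        return 0;
}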
Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5f7295287de..4fec6b74638 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3244,8 +3244,15 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, * XXX this might miss a single-block extent at EXT_MAX_BLOCK */ if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || - newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { + loff_t size = i_size_read(inode); + loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); + flags |= FIEMAP_EXTENT_LAST; + if ((flags & FIEMAP_EXTENT_DELALLOC) && + logical+length > size) + length = (size - logical + bs - 1) & ~(bs-1); + } error = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); -- cgit v1.2.3 From 955ce5f5be67dfe0d1d096b543af33fe8a1ce3dd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 2 May 2009 20:35:09 -0400 Subject: ext4: Convert ext4_lock_group to use sb_bgl_lock We have sb_bgl_lock() and ext4_group_info.bb_state bit spinlock to protech group information. The later is only used within mballoc code. Consolidate them to use sb_bgl_lock(). This makes the mballoc.c code much simpler and also avoid confusion with two locks protecting same info. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 12 ++++----- fs/ext4/ext4.h | 26 +++++++------------ fs/ext4/ialloc.c | 29 ++++++++++----------- fs/ext4/mballoc.c | 78 +++++++++++++++++++------------------------------------ fs/ext4/super.c | 6 ++--- 5 files changed, 59 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92f557d957d..e2126d70dff 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, down_write(&grp->alloc_sem); for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { ext4_error(sb, __func__, "bit already cleared for block %llu", @@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, blocks_freed++; } } - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/ext4.h 
b/fs/ext4/ext4.h index 5973f3261b0..149e02dc360 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -963,12 +963,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1568,33 +1562,31 @@ struct ext4_group_info { }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spin_lock(ext4_group_lock_ptr(sb, group)); } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); + spin_unlock(ext4_group_lock_ptr(sb, group)); } static inline int ext4_is_group_locked(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); + return spin_is_locked(ext4_group_lock_ptr(sb, group)); } /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 916d05c881c..82f7d1d7eae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -122,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -246,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. 
*/ - spin_lock(sb_bgl_lock(sbi, block_group)); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); - spin_unlock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), + bit, bitmap_bh->b_data); if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); @@ -260,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (fatal) goto error_return; if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { @@ -276,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); @@ -707,10 +706,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, /* * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's sb_bgl_lock + * is uninit we need to take the groups's ext4_group_lock * and clear the uninit flag. The inode bitmap update * and group desc uninit flag clear should be done - * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * after holding ext4_group_lock so that ext4_read_inode_bitmap * doesn't race with the ext4_claim_inode */ static int ext4_claim_inode(struct super_block *sb, @@ -721,7 +720,7 @@ static int ext4_claim_inode(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; @@ -730,7 +729,7 @@ static int ext4_claim_inode(struct super_block *sb, ino++; if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); ext4_error(sb, __func__, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, @@ -779,7 +778,7 @@ static int ext4_claim_inode(struct super_block *sb, } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); return retval; } @@ -935,7 +934,7 @@ got: } free = 0; - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { free = ext4_free_blocks_after_init(sb, group, gdp); @@ -944,7 +943,7 @@ got: gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); /* Don't need to dirty bitmap block if we didn't change it */ if (free) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index df75855ae6f..e76459cedcd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr) ext4_set_bit(bit, addr); } -static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit_atomic(lock, bit, addr); -} - static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } -static inline void 
mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit_atomic(lock, bit, addr); -} - static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -803,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore) unlock_buffer(bh[i]); continue; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_lock_group(sb, first_group + i); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); unlock_buffer(bh[i]); continue; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); if (buffer_uptodate(bh[i])) { /* * if not uninit if bh is uptodate, @@ -1080,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) return 0; } -static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1093,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_clear_bit_atomic(lock, cur, bm); - else - mb_clear_bit(cur, bm); + mb_clear_bit(cur, bm); cur++; } } -static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1114,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_set_bit_atomic(lock, cur, bm); - else - mb_set_bit(cur, bm); + mb_set_bit(cur, bm); cur++; } } @@ -1332,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; } - mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), - EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -2756,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) return 0; } -/* need to called with ext4 group lock (ext4_lock_group) */ +/* need to called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; @@ -2993,14 +2974,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. 
*/ - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), - bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; @@ -3010,9 +2994,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - mb_set_bits(NULL, bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -3022,7 +3004,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -3455,7 +3438,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * the function goes through all block freed in the group * but not yet committed and marks them used in in-core bitmap. * buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with the ext4 group lock held */ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3469,9 +3452,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, entry->start_blk, - entry->count); + mb_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3480,7 +3461,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with ext4 group lock held */ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3512,8 +3493,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, start, len); + mb_set_bits(bitmap, start, len); preallocated += len; count++; } @@ -4856,29 +4836,25 @@ do_more: new_entry->group = block_group; new_entry->count = count; new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count); ext4_mb_free_metadata(handle, &e4b, new_entry); - ext4_unlock_group(sb, block_group); } else { - ext4_lock_group(sb, block_group); /* need to update group_info->bb_free and bitmap * with group lock held. 
generate_buddy look at * them with group lock_held */ - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); - ext4_unlock_group(sb, block_group); } - spin_lock(sb_bgl_lock(sbi, block_group)); ret = ext4_free_blks_count(sb, gdp) + count; ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, count); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 39223a52bc7..dc34ed3d132 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1784,18 +1784,18 @@ static int ext4_check_descriptors(struct super_block *sb) "(block %llu)!\n", i, inode_table); return 0; } - spin_lock(sb_bgl_lock(sbi, i)); + ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); return 0; } } - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } -- cgit v1.2.3 From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:22 +0100 Subject: CRED: Rename cred_exec_mutex to reflect that it's a guard against ptrace Rename cred_exec_mutex to reflect that it's a guard against foreign intervention on a process's credential state, such as is made by ptrace(). The attachment of a debugger to a process affects execve()'s calculation of the new credential state - _and_ also setprocattr()'s calculation of that state. 
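As an aside for readers following the locking rationale, here is a minimal sketch of the pattern in plain pthreads (all names are illustrative stand-ins, not the kernel implementation): the exec-style path and the foreign-attach path contend on the same guard mutex, so an attach can never land between the credential calculation and its commit.

    #include <pthread.h>
    #include <stdbool.h>

    /* Illustrative stand-ins only; not kernel types or kernel code. */
    struct toy_task {
            pthread_mutex_t cred_guard;     /* plays the role of cred_guard_mutex */
            bool debugger_attached;         /* state both paths look at           */
            unsigned int creds;             /* state being recalculated           */
    };

    /* execve()/setprocattr()-like path: the check and the commit sit in
     * one critical section, so the decision is made on stable input. */
    static void toy_change_creds(struct toy_task *t, unsigned int wanted)
    {
            pthread_mutex_lock(&t->cred_guard);
            t->creds = t->debugger_attached ? (wanted & 0x00ffu) : wanted;
            pthread_mutex_unlock(&t->cred_guard);
    }

    /* ptrace_attach()-like path: takes the same guard, so it cannot run
     * between the check and the commit above. */
    static void toy_attach(struct toy_task *t)
    {
            pthread_mutex_lock(&t->cred_guard);
            t->debugger_attached = true;
            pthread_mutex_unlock(&t->cred_guard);
    }
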
Signed-off-by: David Howells Signed-off-by: James Morris --- fs/compat.c | 6 +++--- fs/exec.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 681ed81e6be..bb2a9b2e817 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1573,7 +1573,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/exec.c b/fs/exec.c index 639177b0eea..998e856c307 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1045,7 +1045,7 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - /* cred_exec_mutex must be held at least to this point to prevent + /* cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked */ @@ -1055,7 +1055,7 @@ EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program - * - the caller must hold current->cred_exec_mutex to protect against + * - the caller must hold current->cred_guard_mutex to protect against * PTRACE_ATTACH */ int check_unsafe_exec(struct linux_binprm *bprm) @@ -1297,7 +1297,7 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1360,7 +1360,7 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1383,7 +1383,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); -- cgit v1.2.3 From 107db7c7dd137aeb7361b8c2606ac936c0be58ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:27 +0100 Subject: CRED: Guard the setprocattr security hook against ptrace Guard the setprocattr security hook against ptrace by taking the target task's cred_guard_mutex around it. The problem is that setprocattr() may otherwise note the lack of a debugger, and then perform an action on that basis whilst letting a debugger attach between the two points. Holding cred_guard_mutex across the test and the action prevents ptrace_attach() from doing that. 
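For context, "setprocattr" is the hook behind writes to /proc/<pid>/attr/*. A hedged userspace sketch of the operation now serialized by the target's cred_guard_mutex follows; the attribute file is real, but the value written depends entirely on the active LSM, so treat it purely as an example.

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    /* Writing a security attribute of the current task; the kernel routes
     * this through proc_pid_attr_write() -> security_setprocattr(), which
     * this patch wraps in the task's cred_guard_mutex. */
    static int set_own_attr(const char *value)
    {
            int fd = open("/proc/self/attr/current", O_WRONLY);
            ssize_t n;

            if (fd < 0)
                    return -1;
            n = write(fd, value, strlen(value));
            close(fd);
            return n < 0 ? -1 : 0;
    }
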
Signed-off-by: David Howells Signed-off-by: James Morris --- fs/proc/base.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index fb45615943c..23342e188a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (copy_from_user(page, buf, count)) goto out_free; + /* Guard against adverse ptrace interaction */ + length = mutex_lock_interruptible(&task->cred_guard_mutex); + if (length < 0) + goto out_free; + length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, (void*)page, count); + mutex_unlock(&task->cred_guard_mutex); out_free: free_page((unsigned long) page); out: -- cgit v1.2.3 From c3a4d78c580de4edc9ef0f7c59812fb02ceb037f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:37 +0900 Subject: block: add rq->resid_len rq->data_len served two purposes - the length of data buffer on issue and the residual count on completion. This duality creates some headaches. First of all, block layer and low level drivers can't really determine what rq->data_len contains while a request is executing. It could be the total request length or it coulde be anything else one of the lower layers is using to keep track of residual count. This complicates things because blk_rq_bytes() and thus [__]blk_end_request_all() relies on rq->data_len for PC commands. Drivers which want to report residual count should first cache the total request length, update rq->data_len and then complete the request with the cached data length. Secondly, it makes requests default to reporting full residual count, ie. reporting that no data transfer occurred. The residual count is an exception not the norm; however, the driver should clear rq->data_len to zero to signify the normal cases while leaving it alone means no data transfer occurred at all. This reverse default behavior complicates code unnecessarily and renders block PC on some drivers (ide-tape/floppy) unuseable. This patch adds rq->resid_len which is used only for residual count. While at it, remove now unnecessasry blk_rq_bytes() caching in ide_pc_intr() as rq->data_len is not changed anymore. Boaz : spotted missing conversion in osd Sergei : spotted too early conversion to blk_rq_bytes() in ide-tape [ Impact: cleanup residual count handling, report 0 resid by default ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Cc: Mike Miller Cc: Eric Moore Cc: Alan Stern Cc: FUJITA Tomonori Cc: Doug Gilbert Cc: Mike Miller Cc: Eric Moore Cc: Darrick J. Wong Cc: Pete Zaitcev Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- fs/exofs/osd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c index b249ae97fb1..06ca92672eb 100644 --- a/fs/exofs/osd.c +++ b/fs/exofs/osd.c @@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) /* FIXME: should be include in osd_sense_info */ if (in_resid) - *in_resid = or->in.req ? or->in.req->data_len : 0; + *in_resid = or->in.req ? or->in.req->resid_len : 0; if (out_resid) - *out_resid = or->out.req ? or->out.req->data_len : 0; + *out_resid = or->out.req ? 
or->out.req->resid_len : 0; return ret; } -- cgit v1.2.3 From c969f58ca43fc403c75f5d3da4cf1e21de7afaa0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 7 Apr 2009 14:13:01 +0100 Subject: GFS2: Update the rw flags After Jens recent updates: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=a1f242524c3c1f5d40f1c9c343427e34d1aadd6e et al. this is a patch to bring gfs2 uptodate with the core code. Also I've managed to squash another call to ll_rw_block() along the way. There is still one part of the GFS2 I/O paths which are not correctly annotated and that is due to the sharing of the writeback code between the data and metadata address spaces. I would like to change that too, but this patch is still worth doing on its own, I think. Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 6 +++--- fs/gfs2/lops.c | 14 ++++++++------ fs/gfs2/meta_io.c | 38 +++++++++++++++++++++++++++----------- 3 files changed, 38 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 98918a75641..aa62cf5976e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -120,7 +120,7 @@ __acquires(&sdp->sd_log_lock) lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } else { unlock_buffer(bh); brelse(bh); @@ -604,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); + submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -664,7 +664,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) lock_buffer(bh); if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } else { unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 80e4f5f898b..00315f50fa4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "gfs2.h" #include "incore.h" @@ -189,7 +191,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_log_unlock(sdp); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); n = 0; @@ -199,7 +201,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); lock_buffer(bd2->bd_bh); bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); if (++n >= num) break; @@ -341,7 +343,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); bh = gfs2_log_get_buf(sdp); mh = (struct gfs2_meta_header *)bh->b_data; @@ -358,7 +360,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } static void revoke_lo_before_scan(struct gfs2_jdesc *jd, @@ -560,7 +562,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, ptr = bh_log_ptr(bh); get_bh(bh); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); while(!list_empty(list)) { bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); @@ -586,7 +588,7 @@ 
static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, } else { bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); } - submit_bh(WRITE, bh1); + submit_bh(WRITE_SYNC_PLUG, bh1); gfs2_log_lock(sdp); ptr += 2; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 8d6f13256b2..75b2aec06f8 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,16 +201,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { - *bhp = gfs2_getbuf(gl, blkno, CREATE); - if (!buffer_uptodate(*bhp)) { - ll_rw_block(READ_META, 1, bhp); - if (flags & DIO_WAIT) { - int error = gfs2_meta_wait(gl->gl_sbd, *bhp); - if (error) { - brelse(*bhp); - return error; - } - } + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *bh; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); + if (!(flags & DIO_WAIT)) + return 0; + + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + return -EIO; } return 0; @@ -404,7 +420,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_META, 1, &first_bh); + ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); dblock++; extlen--; -- cgit v1.2.3 From 4a0f9a321a113392b448e477018311d14fba2b34 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 20 Apr 2009 08:16:26 +0100 Subject: GFS2: Optimise writepage for metadata This adds a GFS2 specific writepage for metadata, rather than continuing to use the VFS function. As a result we now tag all our metadata I/O with the correct flag so that blktraces will now be less confusing. Also, the generic function was checking for a number of corner cases which cannot happen on the metadata address spaces so that this should be faster too. Signed-off-by: Steven Whitehouse --- fs/gfs2/meta_io.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 75b2aec06f8..78a5f431266 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -33,17 +33,65 @@ #include "util.h" #include "ops_address.h" -static int aspace_get_block(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { - gfs2_assert_warn(inode->i_sb->s_fs_info, 0); - return -EOPNOTSUPP; -} + int err; + struct buffer_head *bh, *head; + int nr_underway = 0; + int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE)); + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from pdflush and kswapd + * activity, but those code paths have their own higher-level + * throttling. 
+ */ + if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); -static int gfs2_aspace_writepage(struct page *page, - struct writeback_control *wbc) -{ - return block_write_full_page(page, aspace_get_block, wbc); + err = 0; + if (nr_underway == 0) + end_page_writeback(page); + + return err; } static const struct address_space_operations aspace_aops = { -- cgit v1.2.3 From 48bf2b1711dc498494e77705c415ee46bb508fd9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 29 Apr 2009 13:59:35 +0100 Subject: GFS2: Something nonlinear this way comes! For some reason GFS2 has been missing support for non-linear mappings. This patch fixes that, and also avoids taking any locks for mmap in the O_NOATIME case. In fact we don't actually need to take the lock here at all - just doing file_accessed() would be enough, but we have to take the lock eventually and this helps it hit disk (and thus be seen by other nodes) faster. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_file.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 5d82e91887e..0ee7bd287c5 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -425,33 +425,36 @@ static struct vm_operations_struct gfs2_vm_ops = { .page_mkwrite = gfs2_page_mkwrite, }; - /** * gfs2_mmap - * @file: The file to map * @vma: The VMA which described the mapping * - * Returns: 0 or error code + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 */ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - return error; - } + if (!(file->f_flags & O_NOATIME)) { + struct gfs2_holder i_gh; + int error; + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + file_accessed(file); + if (error == 0) + gfs2_glock_dq_uninit(&i_gh); + } vma->vm_ops = &gfs2_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; - gfs2_glock_dq_uninit(&i_gh); - - return error; + return 0; } /** -- cgit v1.2.3 From 7c77f0b3f9208c339a4b40737bb2cb0f0319bb8d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:35 +0200 Subject: splice: implement pipe to pipe splicing Allow splice(2) to work when both the input and the output is a pipe. Based on the impementation of the tee(2) syscall, but instead of duplicating the buffer references move the buffers from the input pipe to the output pipe. Moving the whole buffer only succeeds if the full length of the buffer is spliced. 
Otherwise duplicate the buffer, just like tee(2), set the length of the output buffer and advance the offset on the input buffer. Since splice is operating on two pipes, special care needs to be taken with locking to prevent AN ABBA deadlock. Again this is done similarly to the tee(2) syscall, first preparing the input and output pipes so there's data to consume and space for that data, and then doing the move operation while holding both locks. If other processes are doing I/O on the same pipes parallel to the splice, then by the time both inodes are locked there might be no buffers left to move, or no space to move them to. In this case retry the whole operation, including the preparation phase. This could lead to starvation, but I'm not sure if that's serious enough to worry about. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 151 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 666953d59a3..e405cf552f5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1112,6 +1112,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); /* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a @@ -1132,12 +1135,32 @@ static long do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { - struct pipe_inode_info *pipe; + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; loff_t offset, *off; long ret; - pipe = pipe_info(in->f_path.dentry->d_inode); - if (pipe) { + ipipe = pipe_info(in->f_path.dentry->d_inode); + opipe = pipe_info(out->f_path.dentry->d_inode); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... */ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { @@ -1149,7 +1172,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - ret = do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(ipipe, out, off, len, flags); if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) ret = -EFAULT; @@ -1157,8 +1180,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, return ret; } - pipe = pipe_info(out->f_path.dentry->d_inode); - if (pipe) { + if (opipe) { if (off_out) return -ESPIPE; if (off_in) { @@ -1170,7 +1192,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - ret = do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, opipe, len, flags); if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) ret = -EFAULT; @@ -1511,7 +1533,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. 
*/ -static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1549,7 +1571,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ -static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1586,6 +1608,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return ret; } +/* + * Splice contents of ipipe to opipe. + */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, nbuf; + bool input_wakeup = false; + + +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (!ipipe->nrbufs && !ipipe->writers) + break; + + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + /* Already processed some buffers, break */ + if (ret) + break; + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } + + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + obuf = opipe->bufs + nbuf; + + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + ret += obuf->len; + len -= obuf->len; + } while (len); + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + if (input_wakeup) + wakeup_pipe_writers(ipipe); + + return ret; +} + /* * Link contents of ipipe to opipe. */ @@ -1690,9 +1830,9 @@ static long do_tee(struct file *in, struct file *out, size_t len, * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. 
*/ - ret = link_ipipe_prep(ipipe, flags); + ret = ipipe_prep(ipipe, flags); if (!ret) { - ret = link_opipe_prep(opipe, flags); + ret = opipe_prep(opipe, flags); if (!ret) ret = link_pipe(ipipe, opipe, len, flags); } -- cgit v1.2.3 From 6818173bd658439b83896a2a7586f64ab51bf29c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:36 +0200 Subject: splice: implement default splice_read method If f_op->splice_read() is not implemented, fall back to a plain read. Use vfs_readv() to read into previously allocated pages. This will allow splice and functions using splice, such as the loop device, to work on all filesystems. This includes "direct_io" files in fuse which bypass the page cache. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/coda/file.c | 9 +++-- fs/pipe.c | 14 +++++++ fs/read_write.c | 7 +--- fs/splice.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 136 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/coda/file.c b/fs/coda/file.c index 6a347fbc998..ffd42815fda 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); struct coda_file_info *cfi; struct file *host_file; @@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->splice_read) - return -EINVAL; + splice_read = host_file->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; - return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); + return splice_read(host_file, ppos, pipe, count, flags); } static ssize_t diff --git a/fs/pipe.c b/fs/pipe.c index 13414ec45b8..f7dd21ad85a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info, return 0; } +/** + * generic_pipe_buf_release - put a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + * + * Description: + * This function releases a reference to @buf. 
+ */ +void generic_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); +} + static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = generic_pipe_buf_map, diff --git a/fs/read_write.c b/fs/read_write.c index 9d1e76bb9ee..6c8c55dec2b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto out; if (!(in_file->f_mode & FMODE_READ)) goto fput_in; - retval = -EINVAL; - in_inode = in_file->f_path.dentry->d_inode; - if (!in_inode) - goto fput_in; - if (!in_file->f_op || !in_file->f_op->splice_read) - goto fput_in; retval = -ESPIPE; if (!ppos) ppos = &in_file->f_pos; @@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; if (!out_file->f_op || !out_file->f_op->sendpage) goto fput_out; + in_inode = in_file->f_path.dentry->d_inode; out_inode = out_file->f_path.dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval < 0) diff --git a/fs/splice.c b/fs/splice.c index e405cf552f5..3bd9cb21b38 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -507,9 +507,116 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, return ret; } - EXPORT_SYMBOL(generic_file_splice_read); +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + pgoff_t index; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + struct page *page; + + page = alloc_page(GFP_HIGHUSER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) kmap(page); + vec[i].iov_len = this_len; + pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, *ppos); + if (res < 0) + goto err; + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + this_len = min_t(size_t, vec[i].iov_len, res); + partial[i].offset = 0; + partial[i].len = this_len; + if (!this_len) { + __free_page(pages[i]); + pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = 
splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + __free_page(pages[i]); + } + return error; +} +EXPORT_SYMBOL(default_file_splice_read); + /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. @@ -933,11 +1040,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); int ret; - if (unlikely(!in->f_op || !in->f_op->splice_read)) - return -EINVAL; - if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -945,7 +1051,11 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - return in->f_op->splice_read(in, ppos, pipe, len, flags); + splice_read = in->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); } /** -- cgit v1.2.3 From 0b0a47f5c4a30b58432e20ae1706a27baea91a88 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:37 +0200 Subject: splice: implement default splice_write method If f_op->splice_write() is not implemented, fall back to a plain write. Use vfs_writev() to write from the pipe buffers. This will allow splice on all filesystems and file types. This includes "direct_io" files in fuse which bypass the page cache. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 3bd9cb21b38..eefd96b1d7f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -535,6 +535,21 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } +static ssize_t kernel_writev(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t *ppos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos); + set_fs(old_fs); + + return res; +} + ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -988,6 +1003,122 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); +static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n) +{ + return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS]; +} + +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret = 0; + ssize_t total_len = 0; + int do_wakeup = 0; + + pipe_lock(pipe); + while (len) { + struct pipe_buffer *buf; + void *data[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + unsigned int nr_pages = 0; + unsigned int write_len = 0; + unsigned int now_len = len; + unsigned int this_len; + int i; + + BUG_ON(pipe->nrbufs > PIPE_BUFFERS); + for (i = 0; i < pipe->nrbufs && now_len; i++) { + buf = nth_pipe_buf(pipe, i); + + ret = buf->ops->confirm(pipe, buf); + if (ret) + break; + + data[i] = buf->ops->map(pipe, buf, 0); + this_len = min(buf->len, now_len); + vec[i].iov_base = (void __user *) data[i] + buf->offset; + vec[i].iov_len = this_len; + now_len -= 
this_len; + write_len += this_len; + nr_pages++; + } + + if (nr_pages) { + ret = kernel_writev(out, vec, nr_pages, ppos); + if (ret == 0) + ret = -EIO; + if (ret > 0) { + len -= ret; + total_len += ret; + } + } + + for (i = 0; i < nr_pages; i++) { + buf = nth_pipe_buf(pipe, i); + buf->ops->unmap(pipe, buf, data[i]); + + if (ret > 0) { + this_len = min_t(unsigned, vec[i].iov_len, ret); + buf->offset += this_len; + buf->len -= this_len; + ret -= this_len; + } + } + + if (ret < 0) + break; + + while (pipe->nrbufs) { + const struct pipe_buf_operations *ops; + + buf = nth_pipe_buf(pipe, 0); + if (buf->len) + break; + + ops = buf->ops; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS; + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; + } + + if (pipe->nrbufs) + continue; + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (total_len) + break; + } + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + wakeup_pipe_writers(pipe); + do_wakeup = 0; + } + + pipe_wait(pipe); + } + pipe_unlock(pipe); + + if (do_wakeup) + wakeup_pipe_writers(pipe); + + return total_len ? total_len : ret; +} + /** * generic_splice_sendpage - splice data from a pipe to a socket * @pipe: pipe to splice from @@ -1015,11 +1146,10 @@ EXPORT_SYMBOL(generic_splice_sendpage); static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); int ret; - if (unlikely(!out->f_op || !out->f_op->splice_write)) - return -EINVAL; - if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; @@ -1030,7 +1160,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, ppos, len, flags); + splice_write = out->f_op->splice_write; + if (!splice_write) + splice_write = default_file_splice_write; + + return splice_write(pipe, out, ppos, len, flags); } /* -- cgit v1.2.3 From bc8e67409ccdcff72c3f1656b1fb1aad7ff396db Mon Sep 17 00:00:00 2001 From: Vincent Minet Date: Fri, 15 May 2009 08:33:18 -0400 Subject: ext4: Fix spinlock assertions on UP systems On UP systems without DEBUG_SPINLOCK, ext4_is_group_locked always fails which triggers a BUG_ON() call. This patch fixes it by using assert_spin_locked instead. 
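To make the substitution concrete, here is a kernel-style sketch with an illustrative lock (not taken from ext4): per this patch's rationale, an open-coded BUG_ON(!spin_is_locked(...)) misfires on UP kernels built without CONFIG_DEBUG_SPINLOCK, because the lock carries no state there and spin_is_locked() always returns 0, whereas assert_spin_locked() stays safe in that configuration.

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(demo_lock);      /* illustrative lock, not ext4's group lock */

    static void demo_needs_lock_held(void)
    {
            /*
             * Wrong: BUG_ON(!spin_is_locked(&demo_lock)) would trigger on
             * UP without CONFIG_DEBUG_SPINLOCK even when the caller really
             * holds demo_lock, since spin_is_locked() evaluates to 0 there.
             */
            assert_spin_locked(&demo_lock); /* right: harmless on UP, checked with debugging on */

            /* ... work that requires demo_lock to be held ... */
    }
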
Signed-off-by: Vincent Minet Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ------ fs/ext4/mballoc.c | 10 +++++----- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 149e02dc360..89190ae671f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1583,12 +1583,6 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - return spin_is_locked(ext4_group_lock_ptr(sb, group)); -} - /* * Inodes and files operations */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e76459cedcd..541bd9adffa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -436,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; @@ -460,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); @@ -1115,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct super_block *sb = e4b->bd_sb; BUG_ON(first + count > (sb->s_blocksize << 3)); - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1196,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, int ord; void *buddy; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, order, &max); @@ -1260,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); -- cgit v1.2.3 From f888e652d758bfe0c04c209b72a05972daeba386 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 00:21:29 -0400 Subject: ext4: Simplify function signature for ext4_da_get_block_write() The function ext4_da_get_block_write() is called in exactly one write, and the last argument, create, is always 1. Remove it to simplify the code slightly. 
Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4e7f363e303..476d843610a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2000,7 +2000,7 @@ static void ext4_print_free_blocks(struct inode *inode) #define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + struct buffer_head *bh_result) { int ret; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -2010,7 +2010,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); + bh_result, 1, 0, EXT4_DELALLOC_RSVED); if (ret <= 0) return ret; @@ -2088,7 +2088,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (!new.b_size) return 0; - err = ext4_da_get_block_write(mpd->inode, next, &new, 1); + err = ext4_da_get_block_write(mpd->inode, next, &new); if (err) { /* * If get block returns with error we simply -- cgit v1.2.3 From e4d996ca806e93dddb5d76c0d3d859b494c559f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 00:25:28 -0400 Subject: ext4: Rename ext4_get_blocks_handle() to be ext4_ind_get_blocks() The static function ext4_get_blocks_handle() is badly named. Of *course* it takes a handle. Since its counterpart for extent-based file is ext4_ext_get_blocks(), rename it to be ext4_ind_get_blocks(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 476d843610a..f758e8021d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -914,7 +914,7 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) */ -static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, +static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, int create, int extend_disksize) @@ -1129,7 +1129,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * mapped. * * If file type is extents based, it will call ext4_ext_get_blocks(), - * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocate. 
@@ -1160,8 +1160,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, bh, 0, 0); } else { - retval = ext4_get_blocks_handle(handle, - inode, block, max_blocks, bh, 0, 0); + retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, + bh, 0, 0); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -1215,7 +1215,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, bh, create, extend_disksize); } else { - retval = ext4_get_blocks_handle(handle, inode, block, + retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, bh, create, extend_disksize); if (retval > 0 && buffer_new(bh)) { @@ -1297,7 +1297,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_get_blocks_wrap(handle, inode, block, 1, &dummy, create, 1, 0); /* - * ext4_get_blocks_handle() returns number of blocks + * ext4_get_blocks_wrap() returns number of blocks * mapped. 0 in case of a HOLE. */ if (err > 0) { -- cgit v1.2.3 From 7537d81aa7b7cd31b0caeac8091456e93d96fa8d Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Tue, 12 May 2009 11:16:20 -0500 Subject: GFS2: Fix timestamps on write This patch copies the timestamps from the vfs inode into gfs2 and syncs it to the disk inode during writes. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_address.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index a6dde1751e1..e5664210f0d 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -781,10 +781,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, unlock_page(page); page_cache_release(page); - if (inode->i_size < to) { - i_size_write(inode, to); - ip->i_disksize = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); + if (copied) { + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_disksize = inode->i_size; + } + gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -824,7 +826,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_dinode *di; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; @@ -847,11 +848,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { - di = (struct gfs2_dinode *)dibh->b_data; - ip->i_disksize = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); + if (ret > 0) { + if (inode->i_size > ip->i_disksize) + ip->i_disksize = inode->i_size; + gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); } -- cgit v1.2.3 From 12b7ac176831df1aa58a787e67c3e5d698b30163 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:57:44 -0400 Subject: ext4: Rename ext4_get_blocks_wrap() to be ext4_get_blocks() Another function rename for clarity's sake. The _wrap prefix simply confuses people, and didn't add much people trying to follow the code paths. 
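To show the renamed helper in context, a hedged sketch modeled on the ext4_readdir() hunk below (the wrapper function is invented for illustration and assumes the ext4 headers for ext4_lblk_t/ext4_fsblk_t): with a NULL handle and create == 0 the call is a pure lookup, and clearing b_state on the stack-allocated buffer_head mirrors what the dir.c hunk does.

    /* Lookup-only use of ext4_get_blocks(); illustrative wrapper, not ext4 code. */
    static ext4_fsblk_t lookup_one_block(struct inode *inode, ext4_lblk_t blk)
    {
            struct buffer_head map_bh;
            int err;

            map_bh.b_state = 0;     /* no stale BH_* flags from stack garbage */
            err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0);
            if (err <= 0)
                    return 0;       /* hole (0) or error (<0) */
            return map_bh.b_blocknr;        /* block is mapped */
    }
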
Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 3 +-- fs/ext4/ext4.h | 8 ++++---- fs/ext4/extents.c | 6 +++--- fs/ext4/inode.c | 35 +++++++++++++++++------------------ 4 files changed, 25 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b64789929a6..052d6378f99 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, - 0, 0, 0); + err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89190ae671f..5dc8368e46b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1617,10 +1617,10 @@ extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); -extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, - sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int create, - int extend_disksize, int flag); +extern int ext4_get_blocks(handle_t *handle, struct inode *inode, + sector_t block, unsigned int max_blocks, + struct buffer_head *bh, int create, + int extend_disksize, int flag); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4fec6b74638..7e7d02dd273 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3151,9 +3151,9 @@ retry: break; } map_bh.b_state = 0; - ret = ext4_get_blocks_wrap(handle, inode, block, - max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f758e8021d1..a9a9b9b77e8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1121,7 +1121,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) } /* - * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks @@ -1142,9 +1142,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * * It returns the error in case of allocation failure. 
*/ -int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned int max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) +int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, + unsigned int max_blocks, struct buffer_head *bh, + int create, int extend_disksize, int flag) { int retval; @@ -1268,8 +1268,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock, started = 1; } - ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0, 0); + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, + create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1294,10 +1294,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1, 0); + err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0); /* - * ext4_get_blocks_wrap() returns number of blocks + * ext4_get_blocks() returns number of blocks * mapped. 0 in case of a HOLE. */ if (err > 0) { @@ -2009,8 +2008,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, 1, 0, EXT4_DELALLOC_RSVED); + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, + bh_result, 1, 0, EXT4_DELALLOC_RSVED); if (ret <= 0) return ret; @@ -2067,11 +2066,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) /* * We need to make sure the BH_Delay flag is passed down to * ext4_da_get_block_write(), since it calls - * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag. - * This flag causes ext4_get_blocks_wrap() to call + * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag. + * This flag causes ext4_get_blocks() to call * ext4_da_update_reserve_space() if the passed buffer head * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks_wrap(), we should pass in + * the interfaces to ext4_get_blocks(), we should pass in * a separate flag which requests that the delayed allocation * statistics should be updated, instead of depending on the * state information getting passed down via the map_bh's @@ -2363,7 +2362,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0, 0, 0); if ((ret == 0) && !buffer_delay(bh_result)) { /* the block isn't (pre)allocated yet, let's reserve space */ /* @@ -2407,8 +2406,8 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -5034,7 +5033,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. 
+ * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. -- cgit v1.2.3 From c21770573319922e3f3fcb331cfaa290c49f1c81 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:58:52 -0400 Subject: ext4: Define a new set of flags for ext4_get_blocks() The functions ext4_get_blocks(), ext4_ext_get_blocks(), and ext4_ind_get_blocks() used an ad-hoc set of integer variables used as boolean flags passed in as arguments. Use a single flags parameter and a setandard set of bitfield flags instead. This saves space on the call stack, and it also makes the code a bit more understandable. Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 2 +- fs/ext4/ext4.h | 22 ++++++++++++++------- fs/ext4/extents.c | 22 ++++++++++----------- fs/ext4/inode.c | 57 +++++++++++++++++++++++++++++-------------------------- 4 files changed, 57 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 052d6378f99..9dc93168e26 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -131,7 +131,7 @@ static int ext4_readdir(struct file *filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0); + err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5dc8368e46b..17feb4ac633 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -314,10 +314,20 @@ struct ext4_new_group_data { }; /* - * Following is used by preallocation code to tell get_blocks() that we - * want uninitialzed extents. + * Flags used by ext4_get_blocks() */ -#define EXT4_CREATE_UNINITIALIZED_EXT 2 + /* Allocate any needed blocks and/or convert an unitialized + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 1 + /* Request the creation of an unitialized extent */ +#define EXT4_GET_BLOCKS_UNINIT_EXT 2 +#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Update the ext4_inode_info i_disksize field */ +#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 4 + /* Caller is from the delayed allocation writeout path, + so the filesystem blocks have already been accounted for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 8 /* * ioctl commands @@ -1610,8 +1620,7 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, - struct buffer_head *bh_result, - int create, int extend_disksize); + struct buffer_head *bh_result, int flags); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); @@ -1619,8 +1628,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int create, - int extend_disksize, int flag); + struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e7d02dd273..27c383c7b43 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2784,7 +2784,7 @@ fix_extent_len: int 
ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; @@ -2803,7 +2803,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, cache_type = ext4_ext_in_cache(inode, iblock, &newex); if (cache_type) { if (cache_type == EXT4_EXT_CACHE_GAP) { - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and * user doesn't want to allocate it @@ -2869,9 +2869,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (create == EXT4_CREATE_UNINITIALIZED_EXT) + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) goto out; - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { if (allocated > max_blocks) allocated = max_blocks; /* @@ -2903,7 +2903,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * put just found gap into cache to speed up * subsequent requests @@ -2932,10 +2932,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * EXT_UNINIT_MAX_LEN. */ if (max_blocks > EXT_INIT_MAX_LEN && - create != EXT4_CREATE_UNINITIALIZED_EXT) + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_INIT_MAX_LEN; else if (max_blocks > EXT_UNINIT_MAX_LEN && - create == EXT4_CREATE_UNINITIALIZED_EXT) + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ @@ -2966,7 +2966,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) { @@ -2983,7 +2983,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (extend_disksize) { + if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) { disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -2994,7 +2994,7 @@ outnew: set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ - if (create != EXT4_CREATE_UNINITIALIZED_EXT) + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); out: @@ -3153,7 +3153,7 @@ retry: map_bh.b_state = 0; ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a9a9b9b77e8..8b7564dfacd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -917,7 +917,7 @@ err_out: static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -934,7 +934,7 @@ static int 
ext4_ind_get_blocks(handle_t *handle, struct inode *inode, J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); - J_ASSERT(handle != NULL || create == 0); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, iblock, offsets, &blocks_to_boundary); @@ -963,7 +963,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, } /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) goto cleanup; /* @@ -1002,7 +1002,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize) { + if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) { disksize = ((loff_t) iblock + count) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -1144,7 +1144,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) */ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) + int flags) { int retval; @@ -1158,15 +1158,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, down_read((&EXT4_I(inode)->i_data_sem)); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + bh, 0); } else { retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + bh, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* If it is only a block(s) look up */ - if (!create) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* @@ -1205,7 +1205,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * let the underlying get_block() function know to * avoid double accounting */ - if (flag) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate @@ -1213,10 +1213,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, */ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, create, extend_disksize); + bh, flags); } else { retval = ext4_ind_get_blocks(handle, inode, block, - max_blocks, bh, create, extend_disksize); + max_blocks, bh, flags); if (retval > 0 && buffer_new(bh)) { /* @@ -1229,7 +1229,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } } - if (flag) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { EXT4_I(inode)->i_delalloc_reserved_flag = 0; /* * Update reserved blocks/metadata blocks @@ -1269,7 +1269,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create, 0, 0); + create ? 
EXT4_GET_BLOCKS_CREATE : 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1288,16 +1288,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, { struct buffer_head dummy; int fatal = 0, err; + int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE; J_ASSERT(handle != NULL || create == 0); dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0); + if (create) + flags |= EXT4_GET_BLOCKS_CREATE; + err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); /* - * ext4_get_blocks() returns number of blocks - * mapped. 0 in case of a HOLE. + * ext4_get_blocks() returns number of blocks mapped. 0 in + * case of a HOLE. */ if (err > 0) { if (err > 1) @@ -1997,7 +2000,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -#define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result) { @@ -2009,7 +2011,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); ret = ext4_get_blocks(handle, inode, iblock, max_blocks, - bh_result, 1, 0, EXT4_DELALLOC_RSVED); + bh_result, EXT4_GET_BLOCKS_CREATE| + EXT4_GET_BLOCKS_DELALLOC_RESERVE); if (ret <= 0) return ret; @@ -2065,16 +2068,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) return 0; /* * We need to make sure the BH_Delay flag is passed down to - * ext4_da_get_block_write(), since it calls - * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag. - * This flag causes ext4_get_blocks() to call + * ext4_da_get_block_write(), since it calls ext4_get_blocks() + * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. This flag + * causes ext4_get_blocks() to call * ext4_da_update_reserve_space() if the passed buffer head * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks(), we should pass in - * a separate flag which requests that the delayed allocation + * the interfaces to ext4_get_blocks(), we should pass in a + * separate flag which requests that the delayed allocation * statistics should be updated, instead of depending on the * state information getting passed down via the map_bh's - * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag. + * state bitmasks plus the magic + * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. */ new.b_state = mpd->b_state & (1 << BH_Delay); new.b_blocknr = 0; @@ -2362,7 +2366,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. 
*/ - ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); if ((ret == 0) && !buffer_delay(bh_result)) { /* the block isn't (pre)allocated yet, let's reserve space */ /* @@ -2406,8 +2410,7 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ - ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; -- cgit v1.2.3 From b920c75502cb2c48654ef196d647c8eb81ab608a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:54:29 -0400 Subject: ext4: Add documentation to the ext4_*get_block* functions This adds more documentation to various internal functions in fs/ext4/inode.c, most notably ext4_ind_get_blocks(), ext4_da_get_block_write(), ext4_da_get_block_prep(), ext4_normal_get_block_write(). In addition, the static function ext4_normal_get_block_write() has been renamed noalloc_get_block_write(), since it is used in many places far beyond ext4_normal_writepage(). Plenty of warnings have been added to the noalloc_get_block_write() function, since the way it is used is amazingly fragile. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 86 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8b7564dfacd..fd5f27a9b81 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -892,6 +892,10 @@ err_out: } /* + * The ext4_ind_get_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_get_blocks(). + * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is @@ -909,10 +913,11 @@ err_out: * return = 0, if plain lookup failed. * return < 0, error case. * - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. */ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, @@ -1152,8 +1157,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, clear_buffer_unwritten(bh); /* - * Try to see if we can get the block without requesting - * for new file system block. + * Try to see if we can get the block without requesting a new + * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { @@ -2000,6 +2005,12 @@ static void ext4_print_free_blocks(struct inode *inode) return; } +/* + * This function is used by mpage_da_map_blocks(). We separate it out + * as a separate function just to make life easier, and because + * mpage_da_map_blocks() used to be a generic function that took a + * get_block_t. 
+ */ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result) { @@ -2031,8 +2042,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, /* * Update on-disk size along with block allocation we don't - * use 'extend_disksize' as size may change within already - * allocated block -bzzz + * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change + * within already allocated block -bzzz */ disksize = ((loff_t) iblock + ret) << inode->i_blkbits; if (disksize > i_size_read(inode)) @@ -2338,8 +2349,9 @@ static int __mpage_da_writepage(struct page *page, } /* - * this is a special callback for ->write_begin() only - * it's intention is to return mapped block or reserve space + * This is a special get_blocks_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly @@ -2347,7 +2359,6 @@ static int __mpage_da_writepage(struct page *page, * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. - * */ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2400,7 +2411,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } -static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, +/* + * This function is used as a standard get_block_t calback function + * when there is no desire to allocate any blocks. It is used as a + * callback function for block_prepare_write(), nobh_writepage(), and + * block_write_full_page(). These functions should only try to map a + * single block at a time. + * + * Since this function doesn't do block allocations even if the caller + * requests it by passing in create=1, it is critically important that + * any caller checks to make sure that any buffer heads are returned + * by this function are either all already mapped or marked for + * delayed allocation before calling nobh_writepage() or + * block_write_full_page(). Otherwise, b_blocknr could be left + * unitialized, and the page write functions will be taken by + * surprise. + */ +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; @@ -2419,10 +2446,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, } /* - * get called vi ext4_da_writepages after taking page lock (have journal handle) - * get called via journal_submit_inode_data_buffers (no journal handle) - * get called via shrink_page_list via pdflush (no journal handle) - * or grab_page_cache when doing write_begin (have journal handle) + * This function can get called via... + * - ext4_da_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via pdflush (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) */ static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) @@ -2473,7 +2501,7 @@ static int ext4_da_writepage(struct page *page, * do block allocation here. 
*/ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); /* check whether all are mapped and non delay */ @@ -2498,11 +2526,10 @@ static int ext4_da_writepage(struct page *page, } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + ret = nobh_writepage(page, noalloc_get_block_write, wbc); else - ret = block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + ret = block_write_full_page(page, noalloc_get_block_write, + wbc); return ret; } @@ -2814,7 +2841,7 @@ retry: *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_da_get_block_prep); + ext4_da_get_block_prep); if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); @@ -3122,12 +3149,10 @@ static int __ext4_normal_writepage(struct page *page, struct inode *inode = page->mapping->host; if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, - ext4_normal_get_block_write, wbc); + return nobh_writepage(page, noalloc_get_block_write, wbc); else - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } static int ext4_normal_writepage(struct page *page, @@ -3179,7 +3204,7 @@ static int __ext4_journalled_writepage(struct page *page, int err; ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (ret != 0) goto out_unlock; @@ -3264,9 +3289,8 @@ static int ext4_journalled_writepage(struct page *page, * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } no_write: redirty_page_for_writepage(wbc, page); -- cgit v1.2.3 From a2dc52b5d1d8cc280b3e795abf1c80ac8c49f30c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 13:51:29 -0400 Subject: ext4: Add BUG_ON debugging checks to noalloc_get_block_write() Enforce that noalloc_get_block_write() is only called to map one block at a time, and that it always is successful in finding a mapping for given an inode's logical block block number if it is called with create == 1. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fd5f27a9b81..e6113c3a126 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2433,11 +2433,14 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + /* * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); + BUG_ON(create && ret == 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; -- cgit v1.2.3 From 4f23122858a27ba97444b9b37a066d83edebd4c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 May 2009 08:35:35 +0200 Subject: splice: fix repeated kmap()'s in default_file_splice_read() We cannot reliably map more than one page at the time, or we risk deadlocking. Just allocate the pages from low mem instead. 
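The reasoning behind the change can be illustrated with a minimal sketch (kernel context assumed; the helper name fill_lowmem_vec() and its nr argument are invented for illustration, and unwinding of already-allocated pages is left out). Pages allocated with GFP_USER never come from highmem, so page_address() always yields a valid kernel virtual address and no per-page kmap()/kunmap() pairing is needed; it was holding several kmap() mappings at once that made the old code deadlock-prone:

static int fill_lowmem_vec(struct iovec *vec, struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		/* GFP_USER does not include __GFP_HIGHMEM: lowmem only */
		struct page *page = alloc_page(GFP_USER);

		if (!page)
			return -ENOMEM;
		pages[i] = page;
		/* lowmem pages have a permanent kernel mapping */
		vec[i].iov_base = (void __user *) page_address(page);
		vec[i].iov_len = PAGE_SIZE;
	}
	return 0;
}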
Reported-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index eefd96b1d7f..c5e3c79b95a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -580,13 +580,13 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { struct page *page; - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_USER); error = -ENOMEM; if (!page) goto err; this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); - vec[i].iov_base = (void __user *) kmap(page); + vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; pages[i] = page; spd.nr_pages++; @@ -604,7 +604,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { - kunmap(pages[i]); this_len = min_t(size_t, vec[i].iov_len, res); partial[i].offset = 0; partial[i].len = this_len; @@ -624,10 +623,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, return res; err: - for (i = 0; i < spd.nr_pages; i++) { - kunmap(pages[i]); + for (i = 0; i < spd.nr_pages; i++) __free_page(pages[i]); - } + return error; } EXPORT_SYMBOL(default_file_splice_read); -- cgit v1.2.3 From a1c0643ff9f360a30644f6e3cd643ca2a5083aea Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 10:56:52 +0100 Subject: GFS2: Move journal live test at transaction start There seems little point grabbing the transaction glock only to have to release it again if the journal isn't live. This moves the test earlier to avoid grabbing the lock when we don't need it in the first place. Signed-off-by: Steven Whitehouse --- fs/gfs2/trans.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 053752d4b27..4ef0e9fa354 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, BUG_ON(current->journal_info); BUG_ON(blocks == 0 && revokes == 0); + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); if (!tr) return -ENOMEM; @@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (error) goto fail_holder_uninit; - if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - tr->tr_t_gh.gh_flags |= GL_NOCACHE; - error = -EROFS; - goto fail_gunlock; - } - error = gfs2_log_reserve(sdp, tr->tr_reserved); if (error) goto fail_gunlock; -- cgit v1.2.3 From 48c2b613616235d7c97fda5982f50100a6c79166 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 14:49:48 +0100 Subject: GFS2: Add commit= mount option It has always been possible to adjust the gfs2 log commit interval, but only from the sysfs interface. This adds a mount option, commit=, which will be familar to ext3 users. The sysfs interface continues to be available as well, although this might be removed in the future. Also this patch cleans up some duplicated structures in the GFS2 sysfs code. 
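As a usage example (device and mount point are placeholders), mounting with "mount -t gfs2 -o commit=30 /dev/foo /mnt/gfs2" would set the log flush interval to 30 seconds; leaving the option out keeps the 60 second default set in fill_super(), and gfs2_show_options() only emits commit= when the current value differs from that default.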
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/mount.c | 10 ++++++ fs/gfs2/ops_fstype.c | 4 ++- fs/gfs2/ops_super.c | 13 +++++++- fs/gfs2/sys.c | 92 +++++++++++++++++++--------------------------------- 5 files changed, 60 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 399d1b97804..65f438e9537 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -418,6 +418,7 @@ struct gfs2_args { unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ + int ar_commit; /* Commit interval */ }; struct gfs2_tune { diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index f7e8527a21e..947af151fa2 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -45,6 +45,7 @@ enum { Opt_meta, Opt_discard, Opt_nodiscard, + Opt_commit, Opt_err, }; @@ -73,6 +74,7 @@ static const match_table_t tokens = { {Opt_meta, "meta"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, {Opt_err, NULL} }; @@ -89,6 +91,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) char *o; int token; substring_t tmp[MAX_OPT_ARGS]; + int rv; /* Split the options into tokens with the "," character and process them */ @@ -173,6 +176,13 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) case Opt_nodiscard: args->ar_discard = 0; break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + fs_info(sdp, "commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; case Opt_err: default: fs_info(sdp, "invalid mount option: %s\n", o); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473ea75..7981fbc9fc3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -55,7 +55,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) spin_lock_init(>->gt_spin); gt->gt_incore_log_blocks = 1024; - gt->gt_log_flush_secs = 60; gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; @@ -1165,6 +1164,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; + sdp->sd_args.ar_commit = 60; error = gfs2_mount_args(sdp, &sdp->sd_args, data); if (error) { @@ -1191,6 +1191,8 @@ static int fill_super(struct super_block *sb, void *data, int silent) GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; + error = init_names(sdp, silent); if (error) goto fail; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 458019569dc..0677a837856 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -436,8 +436,12 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; int error; + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_log_flush_secs; + spin_unlock(>->gt_spin); error = gfs2_mount_args(sdp, &args, data); if (error) return error; @@ -473,6 +477,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sb->s_flags |= MS_POSIXACL; else sb->s_flags &= ~MS_POSIXACL; + spin_lock(>->gt_spin); + gt->gt_log_flush_secs = args.ar_commit; + spin_unlock(>->gt_spin); + return 0; } @@ -550,6 +558,7 @@ static int 
gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; + int lfsecs; if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) seq_printf(s, ",meta"); @@ -610,7 +619,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (args->ar_discard) seq_printf(s, ",discard"); - + lfsecs = sdp->sd_tune.gt_log_flush_secs; + if (lfsecs != 60) + seq_printf(s, ",commit=%d", lfsecs); return 0; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7655f5025fe..d53b22edc98 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -26,6 +26,36 @@ #include "util.h" #include "glops.h" +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; + static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { return snprintf(buf, PAGE_SIZE, "%u:%u\n", @@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len return len; } -struct gfs2_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; #define GFS2_ATTR(name, mode, show, store) \ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) @@ -246,49 +271,21 @@ static struct attribute *gfs2_attrs[] = { NULL, }; -static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->show ? a->show(sdp, buf) : 0; -} - -static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->store ? 
a->store(sdp, buf, len) : len; -} - -static struct sysfs_ops gfs2_attr_ops = { - .show = gfs2_attr_show, - .store = gfs2_attr_store, -}; - static struct kobj_type gfs2_ktype = { .default_attrs = gfs2_attrs, .sysfs_ops = &gfs2_attr_ops, }; -static struct kset *gfs2_kset; - /* * display struct lm_lockstruct fields */ -struct lockstruct_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - #define LOCKSTRUCT_ATTR(name, fmt) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ } \ -static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) +static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name) LOCKSTRUCT_ATTR(jid, "%u\n"); LOCKSTRUCT_ATTR(first, "%u\n"); @@ -401,14 +398,8 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_recover_jid_status); } -struct gdlm_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *sdp, char *); - ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t); -}; - #define GDLM_ATTR(_name,_mode,_show,_store) \ -static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) +static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); @@ -434,21 +425,12 @@ static struct attribute *lock_module_attrs[] = { NULL, }; -/* - * display struct gfs2_args fields - */ - -struct args_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - #define ARGS_ATTR(name, fmt) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ } \ -static struct args_attr args_attr_##name = __ATTR_RO(name) +static struct gfs2_attr args_attr_##name = __ATTR_RO(name) ARGS_ATTR(lockproto, "%s\n"); ARGS_ATTR(locktable, "%s\n"); @@ -531,14 +513,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, return len; } -struct tune_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; - #define TUNE_ATTR_3(name, show, store) \ -static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) +static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) #define TUNE_ATTR_2(name, store) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -- cgit v1.2.3 From 9582d41135c0d362f04ed6bf3dc8d693a7eafee2 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 15:06:25 +0100 Subject: GFS2: Remove a couple of unused sysfs entries These two tunables are pointless and would never need to be changed anyway. There is also a race between them and umount as the deamons which they refer to might have gone away. The easiest way to fix the race is to remove the interface. 
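Concretely, the race comes from the TUNE_ATTR_DAEMON store handlers removed below: each one ends by calling wake_up_process() on sdp->sd_recoverd_process or sdp->sd_logd_process, and during umount those kernel threads may already have been stopped, so a sysfs write arriving at the wrong moment would poke a task that no longer exists.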
Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d53b22edc98..894bf773ec9 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -530,15 +530,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -#define TUNE_ATTR_DAEMON(name, process) \ -static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ -{ \ - ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \ - wake_up_process(sdp->sd_##process); \ - return r; \ -} \ -TUNE_ATTR_2(name, name##_store) - TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); @@ -550,8 +541,6 @@ TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); -TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); -TUNE_ATTR_DAEMON(logd_secs, logd_process); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { @@ -565,8 +554,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_quota_simul_sync.attr, &tune_attr_stall_secs.attr, &tune_attr_statfs_quantum.attr, - &tune_attr_recoverd_secs.attr, - &tune_attr_logd_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, NULL, -- cgit v1.2.3 From 77f6bf57ba9d2c50173536dbfdacdab27cb867ca Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 May 2009 09:49:44 +0200 Subject: splice: fix error return code fs/splice.c: In function 'default_file_splice_read': fs/splice.c:566: warning: 'error' may be used uninitialized in this function which is sort-of true. The code will in fact return -ENOMEM instead of the kernel_readv() return value. Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index c5e3c79b95a..41179c0a655 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -595,8 +595,10 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, } res = kernel_readv(in, vec, spd.nr_pages, *ppos); - if (res < 0) + if (res < 0) { + error = res; goto err; + } error = 0; if (!res) -- cgit v1.2.3 From 2fa3cdfb319055fd8b25abdafa413e16f00ad493 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 09:29:45 -0400 Subject: ext4: Merge ext4_da_get_block_write() into mpage_da_map_blocks() The static function ext4_da_get_block_write() was only used by mpage_da_map_blocks(). So to simplify the code, merge that function into mpage_da_map_blocks(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 110 ++++++++++++++++++++++---------------------------------- 1 file changed, 43 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e6113c3a126..bfe50a22363 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2005,57 +2005,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -/* - * This function is used by mpage_da_map_blocks(). We separate it out - * as a separate function just to make life easier, and because - * mpage_da_map_blocks() used to be a generic function that took a - * get_block_t. 
- */ -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, - bh_result, EXT4_GET_BLOCKS_CREATE| - EXT4_GET_BLOCKS_DELALLOC_RESERVE); - if (ret <= 0) - return ret; - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered mode. Don't - * update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation we don't - * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change - * within already allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - return 0; -} - /* * mpage_da_map_blocks - go through given space * @@ -2066,9 +2015,12 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, */ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - int err = 0; + int err, blks; struct buffer_head new; - sector_t next; + sector_t next = mpd->b_blocknr; + unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; + loff_t disksize = EXT4_I(mpd->inode)->i_disksize; + handle_t *handle = NULL; /* * We consider only non-mapped and non-allocated blocks @@ -2077,6 +2029,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) !(mpd->b_state & (1 << BH_Delay)) && !(mpd->b_state & (1 << BH_Unwritten))) return 0; + + /* + * If we didn't accumulate anything to write simply return + */ + if (!mpd->b_size) + return 0; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + /* * We need to make sure the BH_Delay flag is passed down to * ext4_da_get_block_write(), since it calls ext4_get_blocks() @@ -2092,18 +2054,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. */ new.b_state = mpd->b_state & (1 << BH_Delay); - new.b_blocknr = 0; - new.b_size = mpd->b_size; - next = mpd->b_blocknr; - /* - * If we didn't accumulate anything - * to write simply return - */ - if (!new.b_size) - return 0; - - err = ext4_da_get_block_write(mpd->inode, next, &new); - if (err) { + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, + &new, EXT4_GET_BLOCKS_CREATE| + EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (blks < 0) { + err = blks; /* * If get block returns with error we simply * return. 
Later writepage will redirty the page and @@ -2136,12 +2091,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (err == -ENOSPC) { ext4_print_free_blocks(mpd->inode); } - /* invlaidate all the pages */ + /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, mpd->b_size >> mpd->inode->i_blkbits); return err; } - BUG_ON(new.b_size == 0); + BUG_ON(blks == 0); + + new.b_size = (blks << mpd->inode->i_blkbits); if (buffer_new(&new)) __unmap_underlying_blocks(mpd->inode, &new); @@ -2154,6 +2111,25 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) (mpd->b_state & (1 << BH_Unwritten))) mpage_put_bnr_to_bhs(mpd, next, &new); + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + return err; + } + + /* + * Update on-disk size along with block allocation we don't + * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; + if (disksize > i_size_read(mpd->inode)) + disksize = i_size_read(mpd->inode); + if (disksize > EXT4_I(mpd->inode)->i_disksize) { + ext4_update_i_disksize(mpd->inode, disksize); + return ext4_mark_inode_dirty(handle, mpd->inode); + } + return 0; } -- cgit v1.2.3 From 2ac3b6e00acb46406c993d57921f86a594aafe08 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 13:57:08 -0400 Subject: ext4: Clean up ext4_get_blocks() so it does not depend on bh_result->b_state The ext4_get_blocks() function was depending on the value of bh_result->b_state as an input parameter to decide whether or not update the delalloc accounting statistics by calling ext4_da_update_reserve_space(). We now use a separate flag, EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE, to requests this update, so that all callers of ext4_get_blocks() can clear map_bh.b_state before calling ext4_get_blocks() without worrying about any consistency issues. 
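A minimal sketch of the calling convention this establishes, modelled on mpage_da_map_blocks() in the patch below (the wrapper function and its is_delalloc argument are hypothetical; the flag names and the ext4_get_blocks() signature are the ones introduced in this series):

static int da_map_range(handle_t *handle, struct inode *inode, sector_t next,
			unsigned int max_blocks, int is_delalloc)
{
	struct buffer_head new;
	int flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_DELALLOC_RESERVE;

	/* b_state can simply be zeroed; no BH_Delay bit has to survive */
	new.b_state = 0;
	new.b_blocknr = 0;
	new.b_size = max_blocks << inode->i_blkbits;

	/* ask for the delalloc accounting update explicitly instead */
	if (is_delalloc)
		flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;

	return ext4_get_blocks(handle, inode, next, max_blocks, &new, flags);
}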
Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 15 ++++++++++----- fs/ext4/inode.c | 56 +++++++++++++++++++++++++++++++------------------------- 2 files changed, 41 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17feb4ac633..d164f1294e5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -318,16 +318,21 @@ struct ext4_new_group_data { */ /* Allocate any needed blocks and/or convert an unitialized extent to be an initialized ext4 */ -#define EXT4_GET_BLOCKS_CREATE 1 +#define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 2 +#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Update the ext4_inode_info i_disksize field */ -#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 4 +#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 0x0004 /* Caller is from the delayed allocation writeout path, - so the filesystem blocks have already been accounted for */ -#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 8 + so set the magic i_delalloc_reserve_flag after taking the + inode allocation semaphore for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0008 + /* Call ext4_da_update_reserve_space() after successfully + allocating the blocks */ +#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0010 + /* * ioctl commands diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bfe50a22363..d7b7480682b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1234,16 +1234,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks - * after successful block allocation - * which were deferred till now - */ - if ((retval > 0) && buffer_delay(bh)) - ext4_da_update_reserve_space(inode, retval); - } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) + ext4_da_update_reserve_space(inode, retval); up_write((&EXT4_I(inode)->i_data_sem)); return retval; @@ -2015,7 +2014,7 @@ static void ext4_print_free_blocks(struct inode *inode) */ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - int err, blks; + int err, blks, get_blocks_flags; struct buffer_head new; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; @@ -2040,23 +2039,30 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) BUG_ON(!handle); /* - * We need to make sure the BH_Delay flag is passed down to - * ext4_da_get_block_write(), since it calls ext4_get_blocks() - * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. This flag - * causes ext4_get_blocks() to call - * ext4_da_update_reserve_space() if the passed buffer head - * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks(), we should pass in a - * separate flag which requests that the delayed allocation - * statistics should be updated, instead of depending on the - * state information getting passed down via the map_bh's - * state bitmasks plus the magic - * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. 
+ * Call ext4_get_blocks() to allocate any delayed allocation + * blocks, or to convert an uninitialized extent to be + * initialized (in the case where we have written into + * one or more preallocated blocks). + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to + * indicate that we are on the delayed allocation path. This + * affects functions in many different parts of the allocation + * call path. This flag exists primarily because we don't + * want to change *many* call functions, so ext4_get_blocks() + * will set the magic i_delalloc_reserved_flag once the + * inode's allocation semaphore is taken. + * + * If the blocks in questions were delalloc blocks, set + * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting + * variables are updated after the blocks have been allocated. */ - new.b_state = mpd->b_state & (1 << BH_Delay); + new.b_state = 0; + get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (mpd->b_state & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, - &new, EXT4_GET_BLOCKS_CREATE| - EXT4_GET_BLOCKS_DELALLOC_RESERVE); + &new, get_blocks_flags); if (blks < 0) { err = blks; /* -- cgit v1.2.3 From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: sched, timers: cleanup avenrun users avenrun is an rough estimate so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Cleanup the users. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- fs/proc/loadavg.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bca39cf99e..1afa4dd4cae 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -12,20 +12,14 @@ static int loadavg_proc_show(struct seq_file *m, void *v) { - int a, b, c; - unsigned long seq; + unsigned long avnrun[3]; - do { - seq = read_seqbegin(&xtime_lock); - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - } while (read_seqretry(&xtime_lock, seq)); + get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, task_active_pid_ns(current)->last_pid); return 0; -- cgit v1.2.3 From 6fd058f7791087648c683eb8572edf3be3c4c23c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 17 May 2009 15:38:01 -0400 Subject: ext4: Add a comprehensive block validity check to ext4_get_blocks() To catch filesystem bugs or corruption which could lead to the filesystem getting severly damaged, this patch adds a facility for tracking all of the filesystem metadata blocks by contiguous regions in a red-black tree. This allows quick searching of the tree to locate extents which might overlap with filesystem metadata blocks. 
This facility is also used by the multi-block allocator to assure that it is not allocating blocks out of the system zone, as well as by the routines used when reading indirect blocks and extents information from disk to make sure their contents are valid. Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 4 +- fs/ext4/block_validity.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 11 +++ fs/ext4/extents.c | 22 +---- fs/ext4/inode.c | 47 +++++++-- fs/ext4/mballoc.c | 11 +-- fs/ext4/super.c | 32 ++++++- 7 files changed, 332 insertions(+), 39 deletions(-) create mode 100644 fs/ext4/block_validity.c (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index a8ff003a00f..8a34710ecf4 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -5,8 +5,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 00000000000..50784ef0756 --- /dev/null +++ b/fs/ext4/block_validity.c @@ -0,0 +1,244 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init init_ext4_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, + SLAB_RECLAIM_ACCOUNT); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void exit_ext4_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. 
+ */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? 
"" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + sbi->s_gdb_count + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; + struct rb_node *parent; + struct ext4_system_zone *entry; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + entry = rb_entry(n, struct ext4_system_zone, node); + kmem_cache_free(ext4_system_zone_cachep, entry); + if (!parent) + EXT4_SB(sb)->system_blks.rb_node = NULL; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + EXT4_SB(sb)->system_blks.rb_node = NULL; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. 
+ */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else + return 0; + } + return 1; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d164f1294e5..4311cc85b53 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -696,6 +696,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -887,6 +888,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ @@ -1618,6 +1620,15 @@ extern struct dentry *ext4_get_parent(struct dentry *child); extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init init_ext4_system_zone(void); +extern void exit_ext4_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); + /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 27c383c7b43..d04b779b780 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext), valid_block; + ext4_fsblk_t block = ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - ((block + len) > ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t block = idx_pblock(ext_idx); - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - (block >= ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); } static int ext4_valid_extent_entries(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7b7480682b..dadd3f995db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,20 
+372,21 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, struct inode *inode, - __le32 *p, unsigned int max) { - - unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); + __le32 *p, unsigned int max) +{ __le32 *bref = p; + unsigned int blk; + while (bref < p+max) { - if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { ext4_error(inode->i_sb, function, - "block reference %u >= max (%u) " - "in inode #%lu, offset=%d", - le32_to_cpu(*bref), maxblocks, - inode->i_ino, (int)(bref-p)); + "invalid block reference %u " + "in inode #%lu", blk, inode->i_ino); return -EIO; } - bref++; } return 0; } @@ -1125,6 +1126,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) ext4_discard_preallocations(inode); } +static int check_block_validity(struct inode *inode, sector_t logical, + sector_t phys, int len) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { + ext4_error(inode->i_sb, "check_block_validity", + "inode #%lu logical block %llu mapped to %llu " + "(size %d)", inode->i_ino, + (unsigned long long) logical, + (unsigned long long) phys, len); + WARN_ON(1); + return -EIO; + } + return 0; +} + /* * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -1170,6 +1186,13 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } up_read((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } + /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; @@ -1245,6 +1268,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, ext4_da_update_reserve_space(inode, retval); up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } return retval; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 541bd9adffa..ed8482e22c0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2961,15 +2961,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + le32_to_cpu(es->s_first_data_block); len = ac->ac_b_ex.fe_len; - if (in_range(ext4_block_bitmap(sb, gdp), block, len) || - in_range(ext4_inode_bitmap(sb, gdp), block, len) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(block + len - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, __func__, - "Allocating block %llu in system zone of %d group\n", - block, ac->ac_b_ex.fe_group); + "Allocating blocks %llu-%llu which overlap " + "fs metadata\n", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dc34ed3d132..600b7ad699b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -568,6 +568,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); @@ -1055,6 +1056,7 @@ enum { Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1114,6 +1116,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_auto_da_alloc, "auto_da_alloc=%u"}, @@ -1508,6 +1512,12 @@ set_qf_format: case Opt_delalloc: set_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_block_validity: + set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; + case Opt_noblock_validity: + clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) return 0; @@ -2826,6 +2836,13 @@ no_journal: } else if (test_opt(sb, DELALLOC)) printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + err = ext4_setup_system_zone(sb); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initialize system " + "zone (%d)\n", err); + goto failed_mount4; + } + ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { @@ -2875,6 +2892,7 @@ cantfind_ext4: failed_mount4: printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -3515,6 +3533,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } } + ext4_setup_system_zone(sb); if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); @@ -3927,13 +3946,16 @@ static int __init init_ext4_fs(void) { int err; + err = init_ext4_system_zone(); + if (err) + return err; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - return -ENOMEM; + goto out4; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) - return err; + goto out3; err = init_ext4_xattr(); if (err) @@ -3958,6 +3980,11 @@ out1: exit_ext4_xattr(); out2: exit_ext4_mballoc(); +out3: + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +out4: + exit_ext4_system_zone(); return err; } @@ -3972,6 +3999,7 @@ static void __exit exit_ext4_fs(void) exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); + exit_ext4_system_zone(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -- cgit v1.2.3 From 0568c518937ee3a9b6a94d18bae9c150fe5d6832 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 17 May 2009 23:31:23 -0400 Subject: ext4: down i_data_sem only for read when walking tree for fiemap Not sure why I put this in as down_write originally; all we are doing is walking the tree, nothing will change under us and concurrent reads should be no problem. 
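This follows the locking rule documented for ext4_ind_get_blocks() earlier in the series: i_data_sem is taken with down_read() when no filesystem blocks are being allocated and with down_write() only when they are, and the fiemap extent walk is purely a lookup.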
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d04b779b780..d4e99e96fdd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3312,10 +3312,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_write(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_write(&EXT4_I(inode)->i_data_sem); + up_read(&EXT4_I(inode)->i_data_sem); } return error; -- cgit v1.2.3 From f68301656b5f5d2de104f2687add6beeb8f3c3b9 Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:44 -0400 Subject: ext4: Fix memory leak in ext4_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 600b7ad699b..eca6c057b11 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2924,6 +2924,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; -- cgit v1.2.3 From de5ce037304f2c88a319b1c3b808ab0c4c618c1c Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:47 -0400 Subject: ext3: Fix memory leak in ext3_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext3/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe504c..d8b73d4abe3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2021,6 +2021,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; -- cgit v1.2.3 From 0f7ee7c17241915fdaff49d1a36f5aafd80a7dce Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:51 -0400 Subject: ext2: Fix memory leak in ext2_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext2/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5c4afe65224..e3c748faf2d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1093,6 +1093,7 @@ failed_mount: brelse(bh); failed_sbi: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return ret; } -- cgit v1.2.3 From fe64d517df0970a68417184a12fcd4ba0589cc28 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 May 2009 10:01:18 +0100 Subject: GFS2: Umount recovery race fix This patch fixes a race condition where we can receive recovery requests part way through processing a umount. This was causing problems since the recovery thread had already gone away. Looking in more detail at the recovery code, it was really trying to implement a slight variation on a work queue, and that happens to align nicely with the recently introduced slow-work subsystem. As a result I've updated the code to use slow-work, rather than its own home grown variety of work queue. 
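For anyone not familiar with the slow-work subsystem used below: the object embeds a struct slow_work, supplies get_ref/put_ref/execute operations, and is queued with slow_work_enqueue(); slow_work_register_user() must have been called once beforehand (this patch does that in init_gfs2_fs()). A rough sketch of the pattern with invented names, mirroring how the recovery code now uses it:

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/bitops.h>
#include <linux/slow-work.h>

struct example_job {
        struct slow_work work;
        unsigned long flags;
#define EXAMPLE_BUSY 1                  /* plays the role of JDF_RECOVERY */
};

static int example_get_ref(struct slow_work *work)
{
        struct example_job *job = container_of(work, struct example_job, work);

        /* refuse a second enqueue while the job is pending or running */
        if (test_and_set_bit(EXAMPLE_BUSY, &job->flags))
                return -EBUSY;
        return 0;
}

static void example_put_ref(struct slow_work *work)
{
        struct example_job *job = container_of(work, struct example_job, work);

        clear_bit(EXAMPLE_BUSY, &job->flags);
}

static void example_execute(struct slow_work *work)
{
        /* the long-running work itself goes here */
}

static struct slow_work_ops example_ops = {
        .get_ref = example_get_ref,
        .put_ref = example_put_ref,
        .execute = example_execute,
};

static void example_setup(struct example_job *job)
{
        slow_work_init(&job->work, &example_ops);   /* once, at object creation */
}

static int example_submit(struct example_job *job)
{
        return slow_work_enqueue(&job->work);       /* runs ->execute() later */
}
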
When using the wait_on_bit() function, I noticed that the wait function that was supplied as an argument was appearing in the WCHAN field, so I've updated the function names in order to produce more meaningful output. Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 1 + fs/gfs2/glock.c | 21 +++++++++-- fs/gfs2/incore.h | 14 +++---- fs/gfs2/main.c | 8 ++++ fs/gfs2/ops_fstype.c | 20 +++------- fs/gfs2/ops_super.c | 25 ++++++++++++- fs/gfs2/recovery.c | 102 +++++++++++++++++---------------------------------- fs/gfs2/recovery.h | 2 +- fs/gfs2/sys.c | 53 +++++++++++++------------- 9 files changed, 122 insertions(+), 124 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 3a981b7f64c..cad957cdb1e 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -7,6 +7,7 @@ config GFS2_FS select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 + select SLOW_WORK help A cluster filesystem. diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ff498109048..2bf62bcc518 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -796,22 +796,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) gh->gh_ip = 0; } -static int just_schedule(void *word) +/** + * gfs2_glock_holder_wait + * @word: unused + * + * This function and gfs2_glock_demote_wait both show up in the WCHAN + * field. Thus I've separated these otherwise identical functions in + * order to be more informative to the user. + */ + +static int gfs2_glock_holder_wait(void *word) { schedule(); return 0; } +static int gfs2_glock_demote_wait(void *word) +{ + schedule(); + return 0; +} + static void wait_on_holder(struct gfs2_holder *gh) { might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); } static void wait_on_demote(struct gfs2_glock *gl) { might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 65f438e9537..0060e9564bb 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -376,11 +377,11 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; - + struct slow_work jd_work; struct inode *jd_inode; + unsigned long jd_flags; +#define JDF_RECOVERY 1 unsigned int jd_jid; - int jd_dirty; - unsigned int jd_blocks; }; @@ -390,9 +391,6 @@ struct gfs2_statfs_change_host { s64 sc_dinodes; }; -#define GFS2_GLOCKD_DEFAULT 1 -#define GFS2_GLOCKD_MAX 16 - #define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF #define GFS2_QUOTA_OFF 0 #define GFS2_QUOTA_ACCOUNT 1 @@ -427,7 +425,6 @@ struct gfs2_tune { unsigned int gt_incore_log_blocks; unsigned int gt_log_flush_secs; - unsigned int gt_recoverd_secs; unsigned int gt_logd_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ @@ -448,6 +445,7 @@ enum { SDF_JOURNAL_LIVE = 1, SDF_SHUTDOWN = 2, SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, }; #define GFS2_FSNAME_LEN 256 @@ -494,7 +492,6 @@ struct lm_lockstruct { unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid; int ls_recover_jid_done; int ls_recover_jid_status; }; @@ -583,7 +580,6 @@ struct gfs2_sbd { /* Daemon stuff */ - struct task_struct *sd_recoverd_process; struct task_struct *sd_logd_process; struct task_struct *sd_quotad_process; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 
a6892ed0840..eacd78a5d08 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; + error = slow_work_register_user(); + if (error) + goto fail_slow; + gfs2_register_debugfs(); printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); return 0; +fail_slow: + unregister_filesystem(&gfs2meta_fs_type); fail_unregister: unregister_filesystem(&gfs2_fs_type); fail: @@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); + slow_work_unregister_user(); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7981fbc9fc3..2cd1164c88d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -55,7 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) spin_lock_init(>->gt_spin); gt->gt_incore_log_blocks = 1024; - gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; @@ -675,6 +675,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); + slow_work_init(&jd->jd_work, &gfs2_recover_ops); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) @@ -700,14 +701,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = sdp->sd_master_dir->d_inode; struct gfs2_holder ji_gh; - struct task_struct *p; struct gfs2_inode *ip; int jindex = 1; int error = 0; if (undo) { jindex = 0; - goto fail_recoverd; + goto fail_jinode_gh; } sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); @@ -800,18 +800,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) gfs2_glock_dq_uninit(&ji_gh); jindex = 0; - p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start recoverd thread: %d\n", error); - goto fail_jinode_gh; - } - sdp->sd_recoverd_process = p; - return 0; -fail_recoverd: - kthread_stop(sdp->sd_recoverd_process); fail_jinode_gh: if (!sdp->sd_args.ar_spectator) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); @@ -1172,8 +1162,10 @@ static int fill_super(struct super_block *sb, void *data, int silent) goto fail; } - if (sdp->sd_args.ar_spectator) + if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 0677a837856..a3c2272e7ca 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -121,6 +121,12 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + /** * gfs2_put_super - Unmount the filesystem * @sb: The VFS superblock @@ -131,6 +137,7 @@ static void gfs2_put_super(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; + struct gfs2_jdesc *jd; /* Unfreeze the filesystem, if we need to */ @@ -139,9 +146,25 @@ static void gfs2_put_super(struct super_block *sb) gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); mutex_unlock(&sdp->sd_freeze_lock); + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, 
&sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); - kthread_stop(sdp->sd_recoverd_process); if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 247e8f7d6b3..59d2695509d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include "gfs2.h" #include "incore.h" @@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } -/** - * gfs2_recover_journal - recover a given journal - * @jd: the struct gfs2_jdesc describing the journal - * - * Acquire the journal's lock, check to see if the journal is clean, and - * do recovery if necessary. - * - * Returns: errno - */ +static int gfs2_recover_get_ref(struct slow_work *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + return 0; +} -int gfs2_recover_journal(struct gfs2_jdesc *jd) +static void gfs2_recover_put_ref(struct slow_work *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); +} + +static void gfs2_recover_work(struct slow_work *work) { + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; @@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) gfs2_glock_dq_uninit(&j_gh); fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); - return 0; + return; fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); @@ -584,70 +590,28 @@ fail_gunlock_j: fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); - return error; } -static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd; - int found = 0; - - spin_lock(&sdp->sd_jindex_spin); +struct slow_work_ops gfs2_recover_ops = { + .get_ref = gfs2_recover_get_ref, + .put_ref = gfs2_recover_put_ref, + .execute = gfs2_recover_work, +}; - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_dirty) { - jd->jd_dirty = 0; - found = 1; - break; - } - } - spin_unlock(&sdp->sd_jindex_spin); - - if (!found) - jd = NULL; - return jd; -} - -/** - * gfs2_check_journals - Recover any dirty journals - * @sdp: the filesystem - * - */ - -static void gfs2_check_journals(struct gfs2_sbd *sdp) +static int gfs2_recovery_wait(void *word) { - struct gfs2_jdesc *jd; - - for (;;) { - jd = gfs2_jdesc_find_dirty(sdp); - if (!jd) - break; - - if (jd != sdp->sd_jdesc) - gfs2_recover_journal(jd); - } + schedule(); + return 0; } -/** - * gfs2_recoverd - Recover dead machine's journals - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_recoverd(void *data) +int gfs2_recover_journal(struct gfs2_jdesc *jd) { - struct gfs2_sbd *sdp = data; - unsigned long t; - - while (!kthread_should_stop()) { - gfs2_check_journals(sdp); - t = 
gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - + int rv; + rv = slow_work_enqueue(&jd->jd_work); + if (rv) + return rv; + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); return 0; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index a8218ea15b5..1616ac22569 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -extern int gfs2_recoverd(void *data); +extern struct slow_work_ops gfs2_recover_ops; #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 894bf773ec9..9f6d48b75fd 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -356,34 +356,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_first_done); } -static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%d\n", ls->ls_recover_jid); -} - -static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + unsigned jid; struct gfs2_jdesc *jd; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + goto out; + rv = -EBUSY; + if (sdp->sd_jdesc->jd_jid == jid) + goto out; + rv = -ENOENT; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { if (jd->jd_jid != jid) continue; - jd->jd_dirty = 1; + rv = slow_work_enqueue(&jd->jd_work); break; } +out: spin_unlock(&sdp->sd_jindex_spin); -} - -static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ls->ls_recover_jid = simple_strtol(buf, NULL, 0); - gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid); - if (sdp->sd_recoverd_process) - wake_up_process(sdp->sd_recoverd_process); - return len; + return rv ? 
rv : len; } static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) @@ -401,15 +400,15 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, lkid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0644, recover_show, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0200, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); static struct attribute *lock_module_attrs[] = { &gdlm_attr_proto_name.attr, -- cgit v1.2.3 From 4fc981ef9e7c0953d5c4896ce088b19c50cb018f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 May 2009 18:33:06 +0900 Subject: bio: always copy back data for copied kernel requests When a read bio_copy_kern() request fails, the content of the bounce buffer is not copied back. However, as request failure doesn't necessarily mean complete failure, the buffer state can be useful. This behavior is also inconsistent with the user map counterpart and causes the subtle difference between bounced and unbounced IO causes confusion. This patch makes bio_copy_kern_endio() ignore @err and always copy back data on request completion. Signed-off-by: Tejun Heo Cc: Boaz Harrosh Cc: James Bottomley Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 7bbc98f0eda..ee3bc67833d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1198,7 +1198,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; - if (read && !err) + if (read) memcpy(p, addr, len); __free_page(bvec->bv_page); -- cgit v1.2.3 From b2858d7d1639c04ca3c54988d76c5f7300b76f1c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2009 11:37:46 +0200 Subject: splice: fix kmaps in default_file_splice_write() Unfortunately multiple kmap() within a single thread are deadlockable, so writing out multiple buffers with writev() isn't possible. Change the implementation so that it does a separate write() for each buffer. This actually simplifies the code a lot since the splice_from_pipe() helper can be used. This limitation is caused by HIGHMEM pages, and so only affects a subset of architectures and configurations. In the future it may be worth to implement default_file_splice_write() in a more efficient way on configs that allow it. 
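Some background on the deadlock: roughly, on HIGHMEM configurations kmap() draws from a small shared pool of kernel mappings, so a thread that already holds one mapping and then sleeps inside kmap() waiting for another can stall indefinitely once the pool is exhausted by other threads doing the same. The safe pattern, which the new write_pipe_buf() actor follows via the pipe buffer map/unmap operations, is to hold at most one mapping at a time. An illustrative sketch (invented helper, not from this patch):

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/string.h>

/*
 * Illustrative only: copy a set of possibly-highmem pages while never
 * holding more than one kernel mapping at a time.
 */
static void example_copy_pages(struct page **pages, int nr, char *dst)
{
        int i;

        for (i = 0; i < nr; i++) {
                void *src = kmap(pages[i]);     /* one mapping held */

                memcpy(dst + i * PAGE_SIZE, src, PAGE_SIZE);
                kunmap(pages[i]);               /* drop it before the next kmap() */
        }
}
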
Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 130 ++++++++++-------------------------------------------------- 1 file changed, 22 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 41179c0a655..73766d24f97 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -535,8 +535,8 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } -static ssize_t kernel_writev(struct file *file, const struct iovec *vec, - unsigned long vlen, loff_t *ppos) +static ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) { mm_segment_t old_fs; ssize_t res; @@ -544,7 +544,7 @@ static ssize_t kernel_writev(struct file *file, const struct iovec *vec, old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos); + res = vfs_write(file, (const char __user *)buf, count, &pos); set_fs(old_fs); return res; @@ -1003,120 +1003,34 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); -static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n) +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { - return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS]; + int ret; + void *data; + + ret = buf->ops->confirm(pipe, buf); + if (ret) + return ret; + + data = buf->ops->map(pipe, buf, 0); + ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + buf->ops->unmap(pipe, buf, data); + + return ret; } static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - ssize_t ret = 0; - ssize_t total_len = 0; - int do_wakeup = 0; - - pipe_lock(pipe); - while (len) { - struct pipe_buffer *buf; - void *data[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; - unsigned int nr_pages = 0; - unsigned int write_len = 0; - unsigned int now_len = len; - unsigned int this_len; - int i; - - BUG_ON(pipe->nrbufs > PIPE_BUFFERS); - for (i = 0; i < pipe->nrbufs && now_len; i++) { - buf = nth_pipe_buf(pipe, i); - - ret = buf->ops->confirm(pipe, buf); - if (ret) - break; - - data[i] = buf->ops->map(pipe, buf, 0); - this_len = min(buf->len, now_len); - vec[i].iov_base = (void __user *) data[i] + buf->offset; - vec[i].iov_len = this_len; - now_len -= this_len; - write_len += this_len; - nr_pages++; - } - - if (nr_pages) { - ret = kernel_writev(out, vec, nr_pages, ppos); - if (ret == 0) - ret = -EIO; - if (ret > 0) { - len -= ret; - total_len += ret; - } - } - - for (i = 0; i < nr_pages; i++) { - buf = nth_pipe_buf(pipe, i); - buf->ops->unmap(pipe, buf, data[i]); - - if (ret > 0) { - this_len = min_t(unsigned, vec[i].iov_len, ret); - buf->offset += this_len; - buf->len -= this_len; - ret -= this_len; - } - } - - if (ret < 0) - break; - - while (pipe->nrbufs) { - const struct pipe_buf_operations *ops; - - buf = nth_pipe_buf(pipe, 0); - if (buf->len) - break; - - ops = buf->ops; - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS; - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } - - if (pipe->nrbufs) - continue; - if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (total_len) - break; - } - - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } - - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - if (do_wakeup) 
{ - wakeup_pipe_writers(pipe); - do_wakeup = 0; - } - - pipe_wait(pipe); - } - pipe_unlock(pipe); + ssize_t ret; - if (do_wakeup) - wakeup_pipe_writers(pipe); + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; - return total_len ? total_len : ret; + return ret; } /** -- cgit v1.2.3 From ef9e8b14a5c1d0afbaf12b4c3b271188ddfc52a4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 May 2009 14:25:16 +0100 Subject: GFS2: Don't warn when delete inode fails on ro filesystem If the filesystem is read-only, then we expect that delete inode will fail, so there is no need to warn about it. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index a3c2272e7ca..2fd1dcbcc5b 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -714,7 +714,7 @@ out_unlock: gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); - if (error && error != GLR_TRYFAILED) + if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_delete_inode: %d\n", error); out: truncate_inode_pages(&inode->i_data, 0); -- cgit v1.2.3 From 09010978345e8883003bf411bb99753710eb5a3a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 20 May 2009 10:48:47 +0100 Subject: GFS2: Improve resource group error handling This patch improves the error handling in the case where we discover that the summary information in the resource group doesn't match the bitmap information while in the process of allocating blocks. Originally this resulted in a kernel bug, but this patch changes that so that we return -EIO and print some messages explaining what went wrong, and how to fix it. We also remember locally not to try and allocate from the same rgrp again, so that a subsequent allocation in a different rgrp should succeed. 
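The shape of the change, for reference: when the bitmap and the summary information disagree, log the problem, flag the resource group so subsequent allocations skip it, and hand -EIO back to the caller rather than asserting. A schematic version with invented names (the real code also dumps the rgrp and keeps the counters consistent):

#include <linux/kernel.h>
#include <linux/errno.h>

#define EXAMPLE_GRP_ERROR 0x1           /* plays the role of GFS2_RDF_ERROR */

struct example_grp {
        unsigned long long addr;
        unsigned int flags;
        unsigned int free;
};

static int example_alloc(struct example_grp *grp, unsigned int want)
{
        if (grp->flags & EXAMPLE_GRP_ERROR)
                return -ENOSPC;         /* known-bad group, caller tries another */

        if (grp->free < want) {         /* summary and bitmap disagree */
                printk(KERN_WARNING "group %llu has an error, disabling it\n",
                       grp->addr);
                grp->flags |= EXAMPLE_GRP_ERROR;
                return -EIO;            /* report the corruption, do not BUG() */
        }

        grp->free -= want;
        return 0;
}
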
Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 9 +++++++-- fs/gfs2/dir.c | 11 +++++++++-- fs/gfs2/eattr.c | 14 +++++++++++--- fs/gfs2/glops.c | 20 +------------------ fs/gfs2/incore.h | 7 ++++--- fs/gfs2/rgrp.c | 58 +++++++++++++++++++++++++++++++++++++++++--------------- fs/gfs2/rgrp.h | 47 +++++++++++++++++++++++---------------------- 7 files changed, 99 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3a5d3f883e1..253e1a39f84 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) and write it out to disk */ unsigned int n = 1; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + goto out_brelse; if (isdir) { gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); error = gfs2_dir_get_new_buffer(ip, block, &bh); @@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, blks = dblks + iblks; i = sheight; do { + int error; n = blks - alloced; - bn = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return error; alloced += n; if (state != ALLOC_DATA || gfs2_is_jdata(ip)) gfs2_trans_add_unrevoke(sdp, bn, n); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index aef4d0c0674..297d7e5ceba 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, { struct gfs2_inode *ip = GFS2_I(inode); unsigned int n = 1; - u64 bn = gfs2_alloc_block(ip, &n); - struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); + u64 bn; + int error; + struct buffer_head *bh; struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "", .len = 0, .hash = 0 }; + + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); if (!bh) return NULL; + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); gfs2_trans_add_bh(ip->i_gl, bh, 1); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 899763aed21..07ea9529add 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) struct gfs2_ea_header *ea; unsigned int n = 1; u64 block; + int error; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, *bhp, 1); @@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, struct gfs2_ea_request *er) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; ea->ea_data_len = cpu_to_be32(er->er_data_len); ea->ea_name_len = er->er_name_len; @@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, bh, 1); @@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, } else { u64 blk; unsigned int n = 1; - blk = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &blk, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); 
gfs2_trans_add_bh(ip->i_gl, indbh, 1); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 70f87f43afa..d5e4ab155ca 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -309,24 +309,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) gfs2_rgrp_bh_put(gh->gh_gl->gl_object); } -/** - * rgrp_go_dump - print out an rgrp - * @seq: The iterator - * @gl: The glock in question - * - */ - -static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) -{ - const struct gfs2_rgrpd *rgd = gl->gl_object; - if (rgd == NULL) - return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", - (unsigned long long)rgd->rd_addr, rgd->rd_flags, - rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); - return 0; -} - /** * trans_go_sync - promote/demote the transaction glock * @gl: the glock @@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, - .go_dump = rgrp_go_dump, + .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_min_hold_time = HZ / 5, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0060e9564bb..de50d86fec1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -92,9 +92,10 @@ struct gfs2_rgrpd { unsigned int rd_bh_count; u32 rd_last_alloc; unsigned char rd_flags; -#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ -#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ -#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ }; enum gfs2_state_bits { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 565038243fa..fbacf09ee34 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -701,10 +701,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) u32 rg_flags; rg_flags = be32_to_cpu(str->rg_flags); - if (rg_flags & GFS2_RGF_NOALLOC) - rgd->rd_flags |= GFS2_RDF_NOALLOC; - else - rgd->rd_flags &= ~GFS2_RDF_NOALLOC; + rg_flags &= ~GFS2_RDF_MASK; rgd->rd_free = be32_to_cpu(str->rg_free); rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); @@ -713,11 +710,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrp *str = buf; - u32 rg_flags = 0; - if (rgd->rd_flags & GFS2_RDF_NOALLOC) - rg_flags |= GFS2_RGF_NOALLOC; - str->rg_flags = cpu_to_be32(rg_flags); + str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); str->rg_free = cpu_to_be32(rgd->rd_free); str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); str->__pad = cpu_to_be32(0); @@ -942,7 +936,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) struct gfs2_sbd *sdp = rgd->rd_sbd; int ret = 0; - if (rgd->rd_flags & GFS2_RDF_NOALLOC) + if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; spin_lock(&sdp->sd_rindex_spin); @@ -1435,13 +1429,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, } /** - * gfs2_alloc_block - Allocate a block + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question + * + */ + +int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_rgrpd *rgd = gl->gl_object; + if (rgd == NULL) + return 0; + gfs2_print_dbg(seq, " R: n:%llu f:%02x 
b:%u/%u i:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); + return 0; +} + +/** + * gfs2_alloc_block - Allocate one or more blocks * @ip: the inode to allocate the block for + * @bn: Used to return the starting block number + * @n: requested number of blocks/extent length (value/result) * - * Returns: the allocated block + * Returns: 0 or error */ -u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) +int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; @@ -1457,7 +1471,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) goal = rgd->rd_last_alloc; blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); - BUG_ON(blk == BFITNOENT); + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; @@ -1469,7 +1486,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); brelse(dibh); } - gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); + if (rgd->rd_free < *n) + goto rgrp_error; + rgd->rd_free -= *n; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); @@ -1484,7 +1503,16 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) rgd->rd_free_clone -= *n; spin_unlock(&sdp->sd_rindex_spin); - return block; + *bn = block; + return 0; + +rgrp_error: + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; + return -EIO; } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3181c7e624b..1e76ff0f3e0 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -14,22 +14,22 @@ struct gfs2_rgrpd; struct gfs2_sbd; struct gfs2_holder; -void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); -struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); static inline void gfs2_alloc_put(struct gfs2_inode *ip) { BUG_ON(ip->i_alloc == NULL); @@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line); +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, + unsigned int line); #define gfs2_inplace_reserve(ip) \ gfs2_inplace_reserve_i((ip), 
__FILE__, __LINE__) -void gfs2_inplace_release(struct gfs2_inode *ip); +extern void gfs2_inplace_release(struct gfs2_inode *ip); -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); +extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); -u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); -u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); +extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); +extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); -void gfs2_unlink_di(struct inode *inode); +extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); struct gfs2_rgrp_list { unsigned int rl_rgrps; @@ -61,10 +61,11 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, - u64 block); -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); -void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); -u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); #endif /* __RGRP_DOT_H__ */ -- cgit v1.2.3 From 60a0b8f93664621a07b93273fc8ebc29590c62f5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 21 May 2009 12:23:12 +0100 Subject: GFS2: Add a rgrp bitmap full flag During block allocation, it is useful to know if sections of disk are full on a finer grained basis than a single resource group. This can make a performance difference when resource groups have larger numbers of bitmap blocks, since we no longer have to search them all block by block in each individual bitmap. The full flag is set on a per-bitmap basis when it has been searched and found to have no free space. It is then skipped in subsequent searches until the flag is reset. The resetting occurs if we have to drop the glock on the resource group for any reason, or if we deallocate some blocks within that resource group and thus free up some space. 
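Put another way, each bitmap block gains a cached negative result: once a scan of that block finds nothing free, GBF_FULL is set and later scans skip it until the hint is invalidated (the glock is dropped or blocks are freed). Schematically, with invented names:

#include <linux/bitops.h>

#define EXAMPLE_FULL 0  /* bit in ->flags, analogous to GBF_FULL */

struct example_seg {
        unsigned long flags;
        /* bitmap data would live here */
};

/* Real bitmap scan elided; return a free offset, or -1 if none found. */
static int example_scan_seg(struct example_seg *seg)
{
        return -1;
}

static int example_find_free(struct example_seg *segs, int nr)
{
        int i, off;

        for (i = 0; i < nr; i++) {
                if (test_bit(EXAMPLE_FULL, &segs[i].flags))
                        continue;                       /* skip segments known to be full */
                off = example_scan_seg(&segs[i]);
                if (off >= 0)
                        return off;
                set_bit(EXAMPLE_FULL, &segs[i].flags);  /* cache the negative result */
        }
        return -1;
}

/* Called when blocks are freed in the segment or its lock is dropped. */
static void example_seg_changed(struct example_seg *seg)
{
        clear_bit(EXAMPLE_FULL, &seg->flags);
}
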
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 3 +++ fs/gfs2/rgrp.c | 77 ++++++++++++++++++++++++++++++++++---------------------- 2 files changed, 50 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index de50d86fec1..dd87379b61e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -64,9 +64,12 @@ struct gfs2_log_element { const struct gfs2_log_operations *le_ops; }; +#define GBF_FULL 1 + struct gfs2_bitmap { struct buffer_head *bi_bh; char *bi_clone; + unsigned long bi_flags; u32 bi_offset; u32 bi_start; u32 bi_len; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index fbacf09ee34..23637b9d1c7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -442,6 +442,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; + bi->bi_flags = 0; /* small rgrp; bitmap stored completely in header block */ if (length == 1) { bytes = bytes_left; @@ -769,6 +770,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + for (x = 0; x < length; x++) + clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= GFS2_RDF_UPTODATE; } @@ -897,6 +900,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) continue; if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); + clear_bit(GBF_FULL, &bi->bi_flags); memcpy(bi->bi_clone + bi->bi_offset, bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); } @@ -1309,30 +1313,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, { struct gfs2_bitmap *bi = NULL; const u32 length = rgd->rd_length; - u32 blk = 0; + u32 blk = BFITNOENT; unsigned int buf, x; const unsigned int elen = *n; - const u8 *buffer; + const u8 *buffer = NULL; *n = 0; /* Find bitmap block that contains bits for goal block */ for (buf = 0; buf < length; buf++) { bi = rgd->rd_bits + buf; - if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; + /* Convert scope of "goal" from rgrp-wide to within found bit block */ + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { + goal -= bi->bi_start * GFS2_NBBY; + goto do_search; + } } + buf = 0; + goal = 0; - gfs2_assert(rgd->rd_sbd, buf < length); - - /* Convert scope of "goal" from rgrp-wide to within found bit block */ - goal -= bi->bi_start * GFS2_NBBY; - +do_search: /* Search (up to entire) bitmap in this rgrp for allocatable block. "x <= length", instead of "x < length", because we typically start the search in the middle of a bit block, but if we can't find an allocatable block anywhere else, we want to be able wrap around and search in the first part of our first-searched bit block. */ for (x = 0; x <= length; x++) { + bi = rgd->rd_bits + buf; + + if (test_bit(GBF_FULL, &bi->bi_flags) && + (old_state == GFS2_BLKST_FREE)) + goto skip; + /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. 
*/ buffer = bi->bi_bh->b_data + bi->bi_offset; @@ -1343,33 +1354,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, if (blk != BFITNOENT) break; + if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) + set_bit(GBF_FULL, &bi->bi_flags); + /* Try next bitmap block (wrap back to rgrp header if at end) */ - buf = (buf + 1) % length; - bi = rgd->rd_bits + buf; +skip: + buf++; + buf %= length; goal = 0; } - if (blk != BFITNOENT && old_state != new_state) { - *n = 1; - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + if (blk == BFITNOENT) + return blk; + *n = 1; + if (old_state == new_state) + goto out; + + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi->bi_len, blk, new_state); + goal = blk; + while (*n < elen) { + goal++; + if (goal >= (bi->bi_len * GFS2_NBBY)) + break; + if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != + GFS2_BLKST_FREE) + break; gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi->bi_len, blk, new_state); - goal = blk; - while (*n < elen) { - goal++; - if (goal >= (bi->bi_len * GFS2_NBBY)) - break; - if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != - GFS2_BLKST_FREE) - break; - gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, - bi->bi_offset, bi->bi_len, goal, - new_state); - (*n)++; - } + bi->bi_len, goal, new_state); + (*n)++; } - - return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; +out: + return (bi->bi_start * GFS2_NBBY) + blk; } /** -- cgit v1.2.3 From 1ce97e564b628bee30b8dbb64e5e653a484308f6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 21 May 2009 15:18:19 +0100 Subject: GFS2: Be more aggressive in reclaiming unlinked inodes This patch increases the frequency with which gfs2 looks for unlinked, but still allocated inodes. Its the equivalent operation to ext3's orphan list, but done with bitmaps in the resource groups. This also fixes a bug where a field in the rgrp was too small. 
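The "field too small" part refers to rd_flags: the internal flag bits were moved into the top nibble by the earlier rgrp error handling patch (GFS2_RDF_CHECK is 0x10000000 and up), which cannot be represented in the unsigned char the field used to be, hence the change to u32 below. A tiny standalone C illustration of the truncation (userspace, not kernel code):

#include <stdio.h>

#define RDF_CHECK 0x10000000u   /* same value as GFS2_RDF_CHECK */

int main(void)
{
        unsigned char too_small  = RDF_CHECK;   /* truncates to 0: the flag is silently lost */
        unsigned int  wide_enough = RDF_CHECK;  /* keeps the bit */

        printf("too_small=%#x wide_enough=%#x\n", too_small, wide_enough);
        return 0;
}
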
Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 2 +- fs/gfs2/rgrp.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index dd87379b61e..225347fbff3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -94,7 +94,7 @@ struct gfs2_rgrpd { struct gfs2_sbd *rd_sbd; unsigned int rd_bh_count; u32 rd_last_alloc; - unsigned char rd_flags; + u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 23637b9d1c7..ee3d5c1876a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -581,7 +581,6 @@ static int read_rindex_entry(struct gfs2_inode *ip, rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; - rgd->rd_flags |= GFS2_RDF_CHECK; return error; } @@ -703,6 +702,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) rg_flags = be32_to_cpu(str->rg_flags); rg_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= rg_flags; rgd->rd_free = be32_to_cpu(str->rg_free); rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); @@ -773,7 +774,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); - rgd->rd_flags |= GFS2_RDF_UPTODATE; + rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); } spin_lock(&sdp->sd_rindex_spin); -- cgit v1.2.3 From b9fc745db833bbf74b4988493b8cd902a84c9415 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 19 May 2009 13:25:57 -0400 Subject: integrity: path_check update - Add support in ima_path_check() for integrity checking without incrementing the counts. (Required for nfsd.) 
- rename and export opencount_get to ima_counts_get - replace ima_shm_check calls with ima_counts_get - export ima_path_check Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/exec.c | 5 +++-- fs/namei.c | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 998e856c307..618d6d1e2c5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -130,7 +130,8 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; - error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN, + IMA_COUNT_UPDATE); if (error) goto exit; @@ -680,7 +681,7 @@ struct file *open_exec(const char *name) err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; - err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN, IMA_COUNT_UPDATE); if (err) goto out_path_put; diff --git a/fs/namei.c b/fs/namei.c index 78f253cd2d4..b05a2b1dea6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); if (!err) - err = ima_path_check(&nd->path, MAY_EXEC); + err = ima_path_check(&nd->path, MAY_EXEC, + IMA_COUNT_UPDATE); if (err) break; @@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag) return error; error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + IMA_COUNT_UPDATE); if (error) return error; /* -- cgit v1.2.3 From c9d9ac525a0285a5b5ad9c3f9aa8b7c1753e6121 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 19 May 2009 13:25:58 -0400 Subject: integrity: move ima_counts_get Based on discussion on lkml (Andrew Morton and Eric Paris), move ima_counts_get down a layer into shmem/hugetlb__file_setup(). Resolves drm shmem_file_setup() usage case as well. HD comment: I still think you're doing this at the wrong level, but recognize that you probably won't be persuaded until a few more users of alloc_file() emerge, all wanting your ima_counts_get(). Resolving GEM's shmem_file_setup() is an improvement, so I'll say Acked-by: Hugh Dickins Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/hugetlbfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 153d9681192..ccc62de96df 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -997,6 +998,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) &hugetlbfs_file_operations); if (!file) goto out_dentry; /* inode is already attached */ + ima_counts_get(file); return file; -- cgit v1.2.3 From b1e71b0622974953e46a284aa986504a90869a9b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:01:55 +0100 Subject: GFS2: Clean up some file names This patch renames the ops_*.c files which have no counterpart without the ops_ prefix in order to shorten the name and make it more readable. In addition, ops_address.h (which was very small) is moved into inode.h and inode.h is cleaned up by adding extern where required. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/Makefile | 2 +- fs/gfs2/aops.c | 1145 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/bmap.c | 1 - fs/gfs2/dentry.c | 114 +++++ fs/gfs2/export.c | 285 ++++++++++++ fs/gfs2/file.c | 765 +++++++++++++++++++++++++++++++++ fs/gfs2/inode.c | 1 - fs/gfs2/inode.h | 57 +-- fs/gfs2/meta_io.c | 1 - fs/gfs2/ops_address.c | 1146 ------------------------------------------------- fs/gfs2/ops_address.h | 23 - fs/gfs2/ops_dentry.c | 114 ----- fs/gfs2/ops_export.c | 285 ------------ fs/gfs2/ops_file.c | 766 --------------------------------- fs/gfs2/quota.c | 1 - fs/gfs2/rgrp.c | 1 - 16 files changed, 2343 insertions(+), 2364 deletions(-) create mode 100644 fs/gfs2/aops.c create mode 100644 fs/gfs2/dentry.c create mode 100644 fs/gfs2/export.c create mode 100644 fs/gfs2/file.c delete mode 100644 fs/gfs2/ops_address.c delete mode 100644 fs/gfs2/ops_address.h delete mode 100644 fs/gfs2/ops_dentry.c delete mode 100644 fs/gfs2/ops_export.c delete mode 100644 fs/gfs2/ops_file.c (limited to 'fs') diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index a851ea4bdf7..4f7332c7682 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ - mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ + mount.o aops.o dentry.o export.o file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c new file mode 100644 index 00000000000..03ebb439ace --- /dev/null +++ b/fs/gfs2/aops.c @@ -0,0 +1,1145 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "super.h" +#include "util.h" +#include "glops.h" + + +static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) +{ + struct buffer_head *head = page_buffers(page); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + unsigned int start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from || start >= to) + continue; + if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); + gfs2_trans_add_bh(ip->i_gl, bh, 0); + } +} + +/** + * gfs2_get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int error; + + error = gfs2_block_map(inode, lblock, bh_result, 0); + if (error) + return error; + if (!buffer_mapped(bh_result)) + return -EIO; + return 0; +} + +static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + return gfs2_block_map(inode, lblock, bh_result, 0); +} + +/** + * gfs2_writepage_common - Common bits of writepage + * @page: The page to be written + * @wbc: The writeback control + * + * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. + */ + +static int gfs2_writepage_common(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (current->journal_info) + goto redirty; + /* Is the page fully outside i_size? 
(truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + goto out; + } + return 1; +redirty: + redirty_page_for_writepage(wbc, page); +out: + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepage - Write page for writeback mappings + * @page: The page + * @wbc: The writeback control + * + */ + +static int gfs2_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); + if (ret == -EAGAIN) + ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); + return ret; +} + +/** + * gfs2_ordered_writepage - Write page for ordered data files + * @page: The page to write + * @wbc: The writeback control + * + */ + +static int gfs2_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * __gfs2_jdata_writepage - The core of jdata writepage + * @page: The page to write + * @wbc: The writeback control + * + * This is shared between writepage and writepages and implements the + * core of the writepage operation. If a transaction is required then + * PageChecked will have been set and the transaction will have + * already been started before this is called. + */ + +static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (PageChecked(page)) { + ClearPageChecked(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + } + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_jdata_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + */ + +static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + int ret; + int done_trans = 0; + + if (PageChecked(page)) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out_ignore; + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) + goto out_ignore; + done_trans = 1; + } + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); + if (done_trans) + gfs2_trans_end(sdp); + return ret; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * For the data=writeback case we can already ignore buffer heads + * and write whole extents at once. This is a big reduction in the + * number of I/O requests we send and the bmap calls we make in this case. 
+ */ +static int gfs2_writeback_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); +} + +/** + * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * @mapping: The mapping + * @wbc: The writeback control + * @writepage: The writepage function to call for each page + * @pvec: The vector of pages + * @nr_pages: The number of pages to write + * + * Returns: non-zero if loop should terminate, zero otherwise + */ + +static int gfs2_write_jdata_pagevec(struct address_space *mapping, + struct writeback_control *wbc, + struct pagevec *pvec, + int nr_pages, pgoff_t end) +{ + struct inode *inode = mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset = i_size & (PAGE_CACHE_SIZE-1); + unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); + struct backing_dev_info *bdi = mapping->backing_dev_info; + int i; + int ret; + + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); + if (ret < 0) + return ret; + + for(i = 0; i < nr_pages; i++) { + struct page *page = pvec->pages[i]; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + ret = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + /* Is the page fully outside i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + continue; + } + + ret = __gfs2_jdata_writepage(page, wbc); + + if (ret || (--(wbc->nr_to_write) <= 0)) + ret = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + ret = 1; + } + + } + gfs2_trans_end(sdp); + return ret; +} + +/** + * gfs2_write_cache_jdata - Like write_cache_pages but different + * @mapping: The mapping to write + * @wbc: The writeback control + * @writepage: The writepage function to call + * @data: The data to pass to writepage + * + * The reason that we use our own function here is that we need to + * start transactions before we grab page locks. This allows us + * to get the ordering right. 
+ */ + +static int gfs2_write_cache_jdata(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } + +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + scanned = 1; + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (ret) + done = 1; + if (ret > 0) + ret = 0; + + pagevec_release(&pvec); + cond_resched(); + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + + +/** + * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: The writeback control + * + */ + +static int gfs2_jdata_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + int ret; + + ret = gfs2_write_cache_jdata(mapping, wbc); + if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { + gfs2_log_flush(sdp, ip->i_gl); + ret = gfs2_write_cache_jdata(mapping, wbc); + } + return ret; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ + +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + void *kaddr; + int error; + + /* + * Due to the order of unstuffing files and ->fault(), we can be + * asked for a zero page in the case of a stuffed file being extended, + * so we need to supply one here. It doesn't happen often. + */ + if (unlikely(page->index)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), + ip->i_disksize); + memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + brelse(dibh); + SetPageUptodate(page); + + return 0; +} + + +/** + * __gfs2_readpage - readpage + * @file: The file to read a page for + * @page: The page to read + * + * This is the core of gfs2's readpage. Its used by the internal file + * reading code as in that case we already hold the glock. Also its + * called by gfs2_readpage() once the required lock has been granted. 
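For context, the "internal file reading code" referred to above is gfs2_internal_read(), shown a little further on, which hands this function straight to the page cache. A minimal sketch of that calling pattern (the sketch_ name is hypothetical and error handling is trimmed):

static int sketch_read_one_page(struct address_space *mapping, pgoff_t index,
                                void *buf)
{
        struct page *page;
        void *kaddr;

        /* The caller must already hold the inode glock in the shared state. */
        page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
        if (IS_ERR(page))
                return PTR_ERR(page);

        kaddr = kmap_atomic(page, KM_USER0);
        memcpy(buf, kaddr, PAGE_CACHE_SIZE);
        kunmap_atomic(kaddr, KM_USER0);
        page_cache_release(page);
        return 0;
}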
+ * + */ + +static int __gfs2_readpage(void *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + int error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, page); + unlock_page(page); + } else { + error = mpage_readpage(page, gfs2_block_map); + } + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return error; +} + +/** + * gfs2_readpage - read a page of a file + * @file: The file to read + * @page: The page of the file + * + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder gh; + int error; + + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) + goto out; + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else + unlock_page(page); + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); + return error; +} + +/** + * gfs2_internal_read - read an internal file + * @ip: The gfs2 inode + * @ra_state: The readahead state (or NULL for no readahead) + * @buf: The buffer to fill + * @pos: The file position + * @size: The amount to read + * + */ + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + unsigned long index = *pos / PAGE_CACHE_SIZE; + unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); + unsigned copied = 0; + unsigned amt; + struct page *page; + void *p; + + do { + amt = size - copied; + if (offset + size > PAGE_CACHE_SIZE) + amt = PAGE_CACHE_SIZE - offset; + page = read_cache_page(mapping, index, __gfs2_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + p = kmap_atomic(page, KM_USER0); + memcpy(buf + copied, p + offset, amt); + kunmap_atomic(p, KM_USER0); + mark_page_accessed(page); + page_cache_release(page); + copied += amt; + index++; + offset = 0; + } while(copied < size); + (*pos) += size; + return size; +} + +/** + * gfs2_readpages - Read a bunch of pages at once + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore any things + * which are slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. Its + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We don't handle stuffed files here we let readpage do the honours. + * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. 
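The BH_Boundary point in note 4 deserves a short illustration, since nothing in this file sets the flag directly: a get_block-style callback marks the buffer when the block it just mapped is the last contiguous one it knows about, which makes the mpage code submit the BIO it has accumulated rather than stall while the next extent is looked up. A hypothetical sketch (the sketch_ helpers are stand-ins; the real decision lives inside gfs2_block_map()):

/* Stand-in predicate: true when @lblock ends the extent just mapped. */
static bool sketch_last_block_of_extent(struct inode *inode, sector_t lblock)
{
        return false;
}

static int sketch_get_block_for_read(struct inode *inode, sector_t lblock,
                                     struct buffer_head *bh, int create)
{
        int ret = gfs2_block_map(inode, lblock, bh, 0);

        if (ret == 0 && buffer_mapped(bh) &&
            sketch_last_block_of_extent(inode, lblock))
                set_buffer_boundary(bh); /* submit the current BIO early */
        return ret;
}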
+ */ + +static int gfs2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (unlikely(ret)) + goto out_uninit; + if (!gfs2_is_stuffed(ip)) + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = -EIO; + return ret; +} + +/** + * gfs2_write_begin - Begin to write to a file + * @file: The file to write to + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) + * + * Returns: errno + */ + +static int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + int alloc_required; + int error = 0; + struct gfs2_alloc *al; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct page *page; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); + if (error) + goto out_unlock; + + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? 
data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = -ENOMEM; + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); + *pagep = page; + if (unlikely(!page)) + goto out_endtrans; + + if (gfs2_is_stuffed(ip)) { + error = 0; + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) { + error = stuffed_readpage(ip, page); + } + goto out; + } + +prepare_write: + error = block_prepare_write(page, from, to, gfs2_block_map); +out: + if (error == 0) + return 0; + + page_cache_release(page); + if (pos + len > ip->i_inode.i_size) + vmtruncate(&ip->i_inode, ip->i_inode.i_size); +out_endtrans: + gfs2_trans_end(sdp); +out_trans_fail: + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +static void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + u64 fs_total, new_free; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + + spin_lock(&sdp->sd_statfs_spin); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); +} + +/** + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. 
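The layout being described is the same one gfs2_write_begin() above checks before deciding to unstuff: inline data sits in the dinode block immediately after the struct gfs2_dinode header. Two hypothetical helpers (not in the patch) state the rule compactly:

/* Maximum number of data bytes a stuffed (inline) file can hold. */
static inline unsigned int sketch_stuffed_capacity(struct gfs2_sbd *sdp)
{
        return sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
}

/* Where a stuffed file's data begins inside the dinode block. */
static inline void *sketch_stuffed_data(struct buffer_head *dibh)
{
        return dibh->b_data + sizeof(struct gfs2_dinode);
}

A write that would push pos + len past sketch_stuffed_capacity() is what triggers gfs2_unstuff_dinode() in gfs2_write_begin(), and gfs2_stuffed_write_end() below copies into sketch_stuffed_data() at the write offset.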
+ * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (copied) { + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_disksize = inode->i_size; + } + gfs2_dinode_out(ip, di); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + + brelse(dibh); + gfs2_trans_end(sdp); + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/** + * gfs2_write_end + * @file: The file to write to + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. + * + * Returns: errno + */ + +static int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = ip->i_alloc; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; + + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret > 0) { + if (inode->i_size > ip->i_disksize) + ip->i_disksize = inode->i_size; + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + + brelse(dibh); + gfs2_trans_end(sdp); +failed: + if (al) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return ret; +} + +/** + * gfs2_set_page_dirty - Page dirtying function + * @page: The page to dirty + * + * Returns: 1 if it dirtyed the page, or 0 otherwise + */ + +static int gfs2_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_buffers(page); +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode 
*ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + lock_buffer(bh); + gfs2_log_lock(sdp); + clear_buffer_dirty(bh); + bd = bh->b_private; + if (bd) { + if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + gfs2_remove_from_journal(bh, current->journal_info, 0); + } + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void gfs2_invalidatepage(struct page *page, unsigned long offset) +{ + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct buffer_head *bh, *head; + unsigned long pos = 0; + + BUG_ON(!PageLocked(page)); + if (offset == 0) + ClearPageChecked(page); + if (!page_has_buffers(page)) + goto out; + + bh = head = page_buffers(page); + do { + if (offset <= pos) + gfs2_discard(sdp, bh); + pos += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +out: + if (offset == 0) + try_to_release_page(page, 0); +} + +/** + * gfs2_ok_for_dio - check that dio is valid on this file + * @ip: The inode + * @rw: READ or WRITE + * @offset: The offset at which we are reading or writing + * + * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) + * 1 (to accept the i/o request) + */ +static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) +{ + /* + * Should we return an error here? I can't see that O_DIRECT for + * a stuffed file makes any sense. For now we'll silently fall + * back to buffered I/O + */ + if (gfs2_is_stuffed(ip)) + return 0; + + if (offset >= i_size_read(&ip->i_inode)) + return 0; + return 1; +} + + + +static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int rv; + + /* + * Deferred lock, even if its a write, since we do no allocation + * on this path. All we need change is atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately have the option of only flushing a range like + * the VFS does. + */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); + if (rv) + return rv; + rv = gfs2_ok_for_dio(ip, rw, offset); + if (rv != 1) + goto out; /* dio not valid, fall back to buffered i/o */ + + rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + gfs2_get_block_direct, NULL); +out: + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + return rv; +} + +/** + * gfs2_releasepage - free the metadata associated with a page + * @page: the page that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Call try_to_free_buffers() if the buffers in this page can be + * released. 
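Condensing the test that the function below applies to every buffer on the page (the sketch_ helper is not part of the patch), a buffer may only be stripped when nothing else still owns it:

static bool sketch_buffer_releasable(struct buffer_head *bh)
{
        struct gfs2_bufdata *bd = bh->b_private;

        if (atomic_read(&bh->b_count))  /* someone still holds a reference */
                return false;
        if (bd && bd->bd_ail)           /* still waiting on an AIL list */
                return false;
        return true;                    /* safe to detach and free */
}

Only when every buffer head on the page passes this check does gfs2_releasepage() go on to detach the bufdata structures and call try_to_free_buffers().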
+ * + * Returns: 0 + */ + +int gfs2_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct inode *aspace = page->mapping->host; + struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + + if (!page_has_buffers(page)) + return 0; + + gfs2_log_lock(sdp); + head = bh = page_buffers(page); + do { + if (atomic_read(&bh->b_count)) + goto cannot_release; + bd = bh->b_private; + if (bd && bd->bd_ail) + goto cannot_release; + gfs2_assert_warn(sdp, !buffer_pinned(bh)); + gfs2_assert_warn(sdp, !buffer_dirty(bh)); + bh = bh->b_this_page; + } while(bh != head); + gfs2_log_unlock(sdp); + + head = bh = page_buffers(page); + do { + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); + if (!list_empty(&bd->bd_le.le_list)) { + if (!buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + bd = NULL; + } + if (bd) + bd->bd_bh = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + if (bd) + kmem_cache_free(gfs2_bufdata_cachep, bd); + + bh = bh->b_this_page; + } while (bh != head); + + return try_to_free_buffers(page); +cannot_release: + gfs2_log_unlock(sdp); + return 0; +} + +static const struct address_space_operations gfs2_writeback_aops = { + .writepage = gfs2_writeback_writepage, + .writepages = gfs2_writeback_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations gfs2_ordered_aops = { + .writepage = gfs2_ordered_writepage, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations gfs2_jdata_aops = { + .writepage = gfs2_jdata_writepage, + .writepages = gfs2_jdata_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +void gfs2_set_aops(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (gfs2_is_writeback(ip)) + inode->i_mapping->a_ops = &gfs2_writeback_aops; + else if (gfs2_is_ordered(ip)) + inode->i_mapping->a_ops = &gfs2_ordered_aops; + else if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else + BUG(); +} + diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 253e1a39f84..1153a078920 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -25,7 +25,6 @@ #include "trans.h" #include "dir.h" #include "util.h" -#include "ops_address.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k * block is 512, so __u16 is fine for that. 
It saves stack space to diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c new file mode 100644 index 00000000000..022c66cd560 --- /dev/null +++ b/fs/gfs2/dentry.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "super.h" +#include "util.h" +#include "inode.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @nd: + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. + * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent = dget_parent(dentry); + struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); + struct gfs2_inode *dip = GFS2_I(parent->d_inode); + struct inode *inode = dentry->d_inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip = NULL; + int error; + int had_lock = 0; + + if (inode) { + if (is_bad_inode(inode)) + goto invalid; + ip = GFS2_I(inode); + } + + if (sdp->sd_args.ar_localcaching) + goto valid; + + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + } + + error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); + switch (error) { + case 0: + if (!inode) + goto invalid_gunlock; + break; + case -ENOENT: + if (!inode) + goto valid_gunlock; + goto invalid_gunlock; + default: + goto fail_gunlock; + } + +valid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +valid: + dput(parent); + return 1; + +invalid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +invalid: + if (inode && S_ISDIR(inode->i_mode)) { + if (have_submounts(dentry)) + goto valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + dput(parent); + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&d_gh); +fail: + dput(parent); + return 0; +} + +static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +const struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, +}; + diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c new file mode 100644 index 00000000000..9200ef22171 --- /dev/null +++ b/fs/gfs2/export.c @@ -0,0 +1,285 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "super.h" +#include "rgrp.h" +#include "util.h" + +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 8 +#define GFS2_OLD_FH_SIZE 10 + +static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, + int connectable) +{ + __be32 *fh = (__force __be32 *)p; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (*len < GFS2_SMALL_FH_SIZE || + (connectable && *len < GFS2_LARGE_FH_SIZE)) + return 255; + + fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[2] = cpu_to_be32(ip->i_no_addr >> 32); + fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_SMALL_FH_SIZE; + + if (!connectable || inode == sb->s_root->d_inode) + return *len; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + ip = GFS2_I(inode); + igrab(inode); + spin_unlock(&dentry->d_lock); + + fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[6] = cpu_to_be32(ip->i_no_addr >> 32); + fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_LARGE_FH_SIZE; + + iput(inode); + + return *len; +} + +struct get_name_filldir { + struct gfs2_inum_host inum; + char *name; +}; + +static int get_name_filldir(void *opaque, const char *name, int length, + loff_t offset, u64 inum, unsigned int type) +{ + struct get_name_filldir *gnfd = opaque; + + if (inum != gnfd->inum.no_addr) + return 0; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return 1; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = parent->d_inode; + struct inode *inode = child->d_inode; + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd; + struct gfs2_holder gh; + u64 offset = 0; + int error; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum.no_addr = ip->i_no_addr; + gnfd.inum.no_formal_ino = ip->i_no_formal_ino; + gnfd.name = name; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + struct qstr dotdot; + struct dentry *dentry; + + /* + * XXX(hch): it would be a good idea to keep this around as a + * static variable. + */ + gfs2_str2qstr(&dotdot, ".."); + + dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); + if (!IS_ERR(dentry)) + dentry->d_op = &gfs2_dops; + return dentry; +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, + struct gfs2_inum_host *inum) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_holder i_gh, ri_gh, rgd_gh; + struct gfs2_rgrpd *rgd; + struct inode *inode; + struct dentry *dentry; + int error; + + /* System files? 
*/ + + inode = gfs2_ilookup(sb, inum->no_addr); + if (inode) { + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + goto out_inode; + } + + error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return ERR_PTR(error); + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + error = -ESTALE; + if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) + goto fail_rgd; + + gfs2_glock_dq_uninit(&rgd_gh); + gfs2_glock_dq_uninit(&ri_gh); + + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, + inum->no_addr, + 0, 0); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto fail; + } + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) { + iput(inode); + goto fail; + } + + /* Pick up the works we bypass in gfs2_inode_lookup */ + if (inode->i_state & I_NEW) + gfs2_set_iop(inode); + + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + goto fail; + } + + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { + iput(inode); + goto fail; + } + + gfs2_glock_dq_uninit(&i_gh); + +out_inode: + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) + dentry->d_op = &gfs2_dops; + return dentry; + +fail_rgd: + gfs2_glock_dq_uninit(&rgd_gh); + +fail_rindex: + gfs2_glock_dq_uninit(&ri_gh); + +fail: + gfs2_glock_dq_uninit(&i_gh); + return ERR_PTR(error); +} + +static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host this; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_SMALL_FH_SIZE: + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this.no_formal_ino |= be32_to_cpu(fh[1]); + this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + this.no_addr |= be32_to_cpu(fh[3]); + return gfs2_get_dentry(sb, &this); + default: + return NULL; + } +} + +static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host parent; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + return gfs2_get_dentry(sb, &parent); + default: + return NULL; + } +} + +const struct export_operations gfs2_export_ops = { + .encode_fh = gfs2_encode_fh, + .fh_to_dentry = gfs2_fh_to_dentry, + .fh_to_parent = gfs2_fh_to_parent, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, +}; + diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c new file mode 100644 index 00000000000..73b6f552f06 --- /dev/null +++ b/fs/gfs2/file.c @@ -0,0 +1,765 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "eaops.h" + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. + * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + if (origin == 2) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = generic_file_llseek_unlocked(file, offset, origin); + gfs2_glock_dq_uninit(&i_gh); + } + } else + error = generic_file_llseek_unlocked(file, offset, origin); + + return error; +} + +/** + * gfs2_readdir - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + u64 offset = file->f_pos; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + return error; + } + + error = gfs2_dir_read(dir, &offset, dirent, filldir); + + gfs2_glock_dq_uninit(&d_gh); + + file->f_pos = offset; + + return error; +} + +/** + * fsflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between fsflags values and + * GFS2's own flags values. 
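Because fsflags_cvt() is the pivot for both flag ioctls below, a standalone illustration may help; this is ordinary userspace C with made-up flag values, not kernel code from the patch:

#include <stdint.h>
#include <stdio.h>

/* Same algorithm as fsflags_cvt(): for each set bit n in val, OR in table[n]. */
static uint32_t cvt(const uint32_t *table, uint32_t val)
{
        uint32_t res = 0;

        while (val) {
                if (val & 1)
                        res |= *table;
                table++;
                val >>= 1;
        }
        return res;
}

int main(void)
{
        /* Hypothetical target flags, indexed by source bit number. */
        uint32_t table[32] = { [3] = 0x10, [7] = 0x80 };

        /* Source bits 3 and 7 set: expect 0x10 | 0x80 = 0x90. */
        printf("0x%x\n", (unsigned)cvt(table, (1u << 3) | (1u << 7)));
        return 0;
}

This is also why two tables are kept, fsflags_to_gfs2[] and gfs2_to_fsflags[]: each direction of the conversion needs a table indexed by the source namespace's bit numbers.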
+ * + * Returns: the converted flags + */ +static u32 fsflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} + +static const u32 fsflags_to_gfs2[32] = { + [3] = GFS2_DIF_SYNC, + [4] = GFS2_DIF_IMMUTABLE, + [5] = GFS2_DIF_APPENDONLY, + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, +}; + +static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_Sync] = FS_SYNC_FL, + [gfs2fl_Immutable] = FS_IMMUTABLE_FL, + [gfs2fl_AppendOnly] = FS_APPEND_FL, + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (error) + return error; + + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; + if (put_user(fsflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); + return error; +} + +void gfs2_set_inode_flags(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; + + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) + flags |= S_NOATIME; + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * gfs2_set_flags - set flags on an inode + * @inode: The inode + * @flags: The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = mnt_want_write(filp->f_path.mnt); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + + flags = ip->i_diskflags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = gfs2_permission(inode, MAY_WRITE); + if (error) + goto out; + } + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + } + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error 
= gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_diskflags = new_flags; + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write(filp->f_path.mnt); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + u32 fsflags, gfsflags; + if (get_user(fsflags, ptr)) + return -EFAULT; + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); + if (!S_ISDIR(inode->i_mode)) { + if (gfsflags & GFS2_DIF_INHERIT_JDATA) + gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~0); + } + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FS_IOC_GETFLAGS: + return gfs2_get_flags(filp, (u32 __user *)arg); + case FS_IOC_SETFLAGS: + return gfs2_set_flags(filp, (u32 __user *)arg); + } + return -ENOTTY; +} + +/** + * gfs2_allocate_page_backing - Use bmap to allocate blocks + * @page: The (locked) page to allocate backing for + * + * We try to allocate all the blocks required for the page in + * one go. This might fail for various reasons, so we keep + * trying until all the blocks to back this page are allocated. + * If some of the blocks are already allocated, thats ok too. + */ + +static int gfs2_allocate_page_backing(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head bh; + unsigned long size = PAGE_CACHE_SIZE; + u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(inode, lblock, &bh, 1); + if (!buffer_mapped(&bh)) + return -EIO; + size -= bh.b_size; + lblock += (bh.b_size >> inode->i_blkbits); + } while(size > 0); + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vma: The virtual memory area + * @page: The page which is about to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned long last_index; + u64 pos = page->index << PAGE_CACHE_SHIFT; + unsigned int data_blocks, ind_blocks, rblocks; + int alloc_required = 0; + struct gfs2_holder gh; + struct gfs2_alloc *al; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out; + + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); + if (ret || !alloc_required) + goto out_unlock; + ret = -ENOMEM; + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_unlock; + + ret = gfs2_quota_lock_check(ip); + if (ret) + goto out_alloc_put; + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip); + if (ret) + goto out_quota_unlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? 
data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + ret = gfs2_trans_begin(sdp, rblocks, 0); + if (ret) + goto out_trans_fail; + + lock_page(page); + ret = -EINVAL; + last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; + if (page->index > last_index) + goto out_unlock_page; + ret = 0; + if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) + goto out_unlock_page; + if (gfs2_is_stuffed(ip)) { + ret = gfs2_unstuff_dinode(ip, page); + if (ret) + goto out_unlock_page; + } + ret = gfs2_allocate_page_backing(page); + +out_unlock_page: + unlock_page(page); + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else if (ret) + ret = VM_FAULT_SIGBUS; + return ret; +} + +static struct vm_operations_struct gfs2_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = gfs2_page_mkwrite, +}; + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(file->f_flags & O_NOATIME)) { + struct gfs2_holder i_gh; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + file_accessed(file); + if (error == 0) + gfs2_glock_dq_uninit(&i_gh); + } + vma->vm_ops = &gfs2_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + struct gfs2_file *fp; + int error; + + fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + + if (S_ISREG(ip->i_inode.i_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + goto fail; + + if (!(file->f_flags & O_LARGEFILE) && + ip->i_disksize > MAX_NON_LFS) { + error = -EOVERFLOW; + goto fail_gunlock; + } + + gfs2_glock_dq_uninit(&i_gh); + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + file->private_data = NULL; + kfree(fp); + return error; +} + +/** + * gfs2_close - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_close(struct inode *inode, struct file *file) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_file *fp; + + fp = file->private_data; + file->private_data = NULL; + + if (gfs2_assert_warn(sdp, fp)) + return -EIO; + + kfree(fp); + + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry (we ignore this) + * @dentry: the dentry that points to the inode to sync + * + * The VFS will flush "normal" data for us. We only need to worry + * about metadata here. 
For journaled data, we just do a log flush + * as we can't avoid it. Otherwise we can just bale out if datasync + * is set. For stuffed inodes we must flush the log in order to + * ensure that all data is on disk. + * + * The call to write_inode_now() is there to write back metadata and + * the inode itself. It does also try and write the data, but thats + * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite() + * for us. + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); + int ret = 0; + + if (gfs2_is_jdata(GFS2_I(inode))) { + gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); + return 0; + } + + if (sync_state != 0) { + if (!datasync) + ret = write_inode_now(inode, 0); + + if (gfs2_is_stuffed(GFS2_I(inode))) + gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); + } + + return ret; +} + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + +/** + * gfs2_setlease - acquire/release a file lease + * @file: the file pointer + * @arg: lease type + * @fl: file lock + * + * We don't currently have a way to enforce a lease across the whole + * cluster; until we do, disable leases (by just returning -EINVAL), + * unless the administrator has requested purely local locking. + * + * Returns: errno + */ + +static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) +{ + return -EINVAL; +} + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode)) + return -ENOLCK; + + if (cmd == F_CANCELLK) { + /* Hack: */ + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + if (IS_GETLK(cmd)) + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = (IS_SETLKW(cmd) ? 
0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_reinit(state, flags, fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); + if (error) + goto out; + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + } + error = gfs2_glock_nq(fl_gh); + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) + gfs2_glock_dq_uninit(fl_gh); + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode)) + return -ENOLCK; + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = gfs2_setlease, +}; + +const struct file_operations gfs2_dir_fops = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, +}; + +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = generic_setlease, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, +}; + diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5a31d426116..c03a1a384e7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -30,7 +30,6 @@ #include "inode.h" #include "log.h" #include "meta_io.h" -#include "ops_address.h" #include "quota.h" #include "rgrp.h" #include "trans.h" diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c30be2b6658..2c3ec072d60 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -11,8 +11,16 @@ #define __INODE_DOT_H__ 
#include +#include +#include #include "util.h" +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_set_aops(struct inode *inode); + static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { return !ip->i_height; @@ -73,30 +81,31 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, } -void gfs2_set_iop(struct inode *inode); -struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int skip_freeing); -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); - -int gfs2_inode_refresh(struct gfs2_inode *ip); - -int gfs2_dinode_dealloc(struct gfs2_inode *inode); -int gfs2_change_nlink(struct gfs2_inode *ip, int diff); -struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root); -struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, - unsigned int mode, dev_t dev); -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip); -int gfs2_permission(struct inode *inode, int mask); -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); -struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); -void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); -void gfs2_dinode_print(const struct gfs2_inode *ip); +extern void gfs2_set_iop(struct inode *inode); +extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + int skip_freeing); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); + +extern int gfs2_inode_refresh(struct gfs2_inode *ip); + +extern int gfs2_dinode_dealloc(struct gfs2_inode *inode); +extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff); +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern struct inode *gfs2_createi(struct gfs2_holder *ghs, + const struct qstr *name, + unsigned int mode, dev_t dev); +extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip); +extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip); +extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); +extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 78a5f431266..cb8d7a93d5e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -31,7 +31,6 @@ #include "rgrp.h" #include "trans.h" #include "util.h" -#include "ops_address.h" static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c deleted file mode 100644 index e5664210f0d..00000000000 --- a/fs/gfs2/ops_address.c +++ /dev/null @@ -1,1146 +0,0 @@ -/* - * Copyright (C) 
Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "bmap.h" -#include "glock.h" -#include "inode.h" -#include "log.h" -#include "meta_io.h" -#include "ops_address.h" -#include "quota.h" -#include "trans.h" -#include "rgrp.h" -#include "super.h" -#include "util.h" -#include "glops.h" - - -static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int to) -{ - struct buffer_head *head = page_buffers(page); - unsigned int bsize = head->b_size; - struct buffer_head *bh; - unsigned int start, end; - - for (bh = head, start = 0; bh != head || !start; - bh = bh->b_this_page, start = end) { - end = start + bsize; - if (end <= from || start >= to) - continue; - if (gfs2_is_jdata(ip)) - set_buffer_uptodate(bh); - gfs2_trans_add_bh(ip->i_gl, bh, 0); - } -} - -/** - * gfs2_get_block_noalloc - Fills in a buffer head with details about a block - * @inode: The inode - * @lblock: The block number to look up - * @bh_result: The buffer head to return the result in - * @create: Non-zero if we may add block to the file - * - * Returns: errno - */ - -static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - int error; - - error = gfs2_block_map(inode, lblock, bh_result, 0); - if (error) - return error; - if (!buffer_mapped(bh_result)) - return -EIO; - return 0; -} - -static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - return gfs2_block_map(inode, lblock, bh_result, 0); -} - -/** - * gfs2_writepage_common - Common bits of writepage - * @page: The page to be written - * @wbc: The writeback control - * - * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. - */ - -static int gfs2_writepage_common(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; - - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) - goto out; - if (current->journal_info) - goto redirty; - /* Is the page fully outside i_size? 
(truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - goto out; - } - return 1; -redirty: - redirty_page_for_writepage(wbc, page); -out: - unlock_page(page); - return 0; -} - -/** - * gfs2_writeback_writepage - Write page for writeback mappings - * @page: The page - * @wbc: The writeback control - * - */ - -static int gfs2_writeback_writepage(struct page *page, - struct writeback_control *wbc) -{ - int ret; - - ret = gfs2_writepage_common(page, wbc); - if (ret <= 0) - return ret; - - ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); - if (ret == -EAGAIN) - ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); - return ret; -} - -/** - * gfs2_ordered_writepage - Write page for ordered data files - * @page: The page to write - * @wbc: The writeback control - * - */ - -static int gfs2_ordered_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - int ret; - - ret = gfs2_writepage_common(page, wbc); - if (ret <= 0) - return ret; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); -} - -/** - * __gfs2_jdata_writepage - The core of jdata writepage - * @page: The page to write - * @wbc: The writeback control - * - * This is shared between writepage and writepages and implements the - * core of the writepage operation. If a transaction is required then - * PageChecked will have been set and the transaction will have - * already been started before this is called. - */ - -static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - - if (PageChecked(page)) { - ClearPageChecked(page); - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); - } - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); -} - -/** - * gfs2_jdata_writepage - Write complete page - * @page: Page to write - * - * Returns: errno - * - */ - -static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_sbd *sdp = GFS2_SB(inode); - int ret; - int done_trans = 0; - - if (PageChecked(page)) { - if (wbc->sync_mode != WB_SYNC_ALL) - goto out_ignore; - ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (ret) - goto out_ignore; - done_trans = 1; - } - ret = gfs2_writepage_common(page, wbc); - if (ret > 0) - ret = __gfs2_jdata_writepage(page, wbc); - if (done_trans) - gfs2_trans_end(sdp); - return ret; - -out_ignore: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - -/** - * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk - * @mapping: The mapping to write - * @wbc: Write-back control - * - * For the data=writeback case we can already ignore buffer heads - * and write whole extents at once. This is a big reduction in the - * number of I/O requests we send and the bmap calls we make in this case. 
- */ -static int gfs2_writeback_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); -} - -/** - * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages - * @mapping: The mapping - * @wbc: The writeback control - * @writepage: The writepage function to call for each page - * @pvec: The vector of pages - * @nr_pages: The number of pages to write - * - * Returns: non-zero if loop should terminate, zero otherwise - */ - -static int gfs2_write_jdata_pagevec(struct address_space *mapping, - struct writeback_control *wbc, - struct pagevec *pvec, - int nr_pages, pgoff_t end) -{ - struct inode *inode = mapping->host; - struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset = i_size & (PAGE_CACHE_SIZE-1); - unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); - struct backing_dev_info *bdi = mapping->backing_dev_info; - int i; - int ret; - - ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); - if (ret < 0) - return ret; - - for(i = 0; i < nr_pages; i++) { - struct page *page = pvec->pages[i]; - - lock_page(page); - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - continue; - } - - if (!wbc->range_cyclic && page->index > end) { - ret = 1; - unlock_page(page); - continue; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; - } - - /* Is the page fully outside i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - continue; - } - - ret = __gfs2_jdata_writepage(page, wbc); - - if (ret || (--(wbc->nr_to_write) <= 0)) - ret = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - ret = 1; - } - - } - gfs2_trans_end(sdp); - return ret; -} - -/** - * gfs2_write_cache_jdata - Like write_cache_pages but different - * @mapping: The mapping to write - * @wbc: The writeback control - * @writepage: The writepage function to call - * @data: The data to pass to writepage - * - * The reason that we use our own function here is that we need to - * start transactions before we grab page locks. This allows us - * to get the ordering right. 
- */ - -static int gfs2_write_cache_jdata(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct backing_dev_info *bdi = mapping->backing_dev_info; - int ret = 0; - int done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t index; - pgoff_t end; - int scanned = 0; - int range_whole = 0; - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - scanned = 1; - } - -retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - scanned = 1; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); - if (ret) - done = 1; - if (ret > 0) - ret = 0; - - pagevec_release(&pvec); - cond_resched(); - } - - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = 1; - index = 0; - goto retry; - } - - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; - return ret; -} - - -/** - * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk - * @mapping: The mapping to write - * @wbc: The writeback control - * - */ - -static int gfs2_jdata_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - int ret; - - ret = gfs2_write_cache_jdata(mapping, wbc); - if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { - gfs2_log_flush(sdp, ip->i_gl); - ret = gfs2_write_cache_jdata(mapping, wbc); - } - return ret; -} - -/** - * stuffed_readpage - Fill in a Linux page with stuffed file data - * @ip: the inode - * @page: the page - * - * Returns: errno - */ - -static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) -{ - struct buffer_head *dibh; - void *kaddr; - int error; - - /* - * Due to the order of unstuffing files and ->fault(), we can be - * asked for a zero page in the case of a stuffed file being extended, - * so we need to supply one here. It doesn't happen often. - */ - if (unlikely(page->index)) { - zero_user(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); - return 0; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - kaddr = kmap_atomic(page, KM_USER0); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page); - brelse(dibh); - SetPageUptodate(page); - - return 0; -} - - -/** - * __gfs2_readpage - readpage - * @file: The file to read a page for - * @page: The page to read - * - * This is the core of gfs2's readpage. Its used by the internal file - * reading code as in that case we already hold the glock. Also its - * called by gfs2_readpage() once the required lock has been granted. 
- * - */ - -static int __gfs2_readpage(void *file, struct page *page) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - int error; - - if (gfs2_is_stuffed(ip)) { - error = stuffed_readpage(ip, page); - unlock_page(page); - } else { - error = mpage_readpage(page, gfs2_block_map); - } - - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - return -EIO; - - return error; -} - -/** - * gfs2_readpage - read a page of a file - * @file: The file to read - * @page: The page of the file - * - * This deals with the locking required. We have to unlock and - * relock the page in order to get the locking in the right - * order. - */ - -static int gfs2_readpage(struct file *file, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_holder gh; - int error; - - unlock_page(page); - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - error = gfs2_glock_nq(&gh); - if (unlikely(error)) - goto out; - error = AOP_TRUNCATED_PAGE; - lock_page(page); - if (page->mapping == mapping && !PageUptodate(page)) - error = __gfs2_readpage(file, page); - else - unlock_page(page); - gfs2_glock_dq(&gh); -out: - gfs2_holder_uninit(&gh); - if (error && error != AOP_TRUNCATED_PAGE) - lock_page(page); - return error; -} - -/** - * gfs2_internal_read - read an internal file - * @ip: The gfs2 inode - * @ra_state: The readahead state (or NULL for no readahead) - * @buf: The buffer to fill - * @pos: The file position - * @size: The amount to read - * - */ - -int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size) -{ - struct address_space *mapping = ip->i_inode.i_mapping; - unsigned long index = *pos / PAGE_CACHE_SIZE; - unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); - unsigned copied = 0; - unsigned amt; - struct page *page; - void *p; - - do { - amt = size - copied; - if (offset + size > PAGE_CACHE_SIZE) - amt = PAGE_CACHE_SIZE - offset; - page = read_cache_page(mapping, index, __gfs2_readpage, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - p = kmap_atomic(page, KM_USER0); - memcpy(buf + copied, p + offset, amt); - kunmap_atomic(p, KM_USER0); - mark_page_accessed(page); - page_cache_release(page); - copied += amt; - index++; - offset = 0; - } while(copied < size); - (*pos) += size; - return size; -} - -/** - * gfs2_readpages - Read a bunch of pages at once - * - * Some notes: - * 1. This is only for readahead, so we can simply ignore any things - * which are slightly inconvenient (such as locking conflicts between - * the page lock and the glock) and return having done no I/O. Its - * obviously not something we'd want to do on too regular a basis. - * Any I/O we ignore at this time will be done via readpage later. - * 2. We don't handle stuffed files here we let readpage do the honours. - * 3. mpage_readpages() does most of the heavy lifting in the common case. - * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. 
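
Looking back at gfs2_internal_read() above: it splits the file position into a page index plus an in-page offset and clamps each copy to the end of the current page. The sketch below walks the same arithmetic in userspace; PAGE_SIZE_ASSUMED is an illustrative stand-in for PAGE_CACHE_SIZE, and the function is a simplified model rather than the kernel routine.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_ASSUMED 4096UL /* illustrative stand-in for PAGE_CACHE_SIZE */

/*
 * Model of the copy loop in gfs2_internal_read(): derive the starting
 * page index and in-page offset from the file position, clamp each copy
 * to the end of the current page, then continue with offset reset to 0.
 */
static void split_read(uint64_t pos, unsigned int size)
{
	uint64_t index = pos / PAGE_SIZE_ASSUMED;
	unsigned int offset = pos & (PAGE_SIZE_ASSUMED - 1);
	unsigned int copied = 0;

	while (copied < size) {
		unsigned int amt = size - copied;

		if (offset + amt > PAGE_SIZE_ASSUMED)
			amt = PAGE_SIZE_ASSUMED - offset;
		printf("page %llu: copy %u bytes starting at offset %u\n",
		       (unsigned long long)index, amt, offset);
		copied += amt;
		index++;
		offset = 0;
	}
}

int main(void)
{
	split_read(4000, 300); /* crosses a page boundary: 96 bytes, then 204 */
	return 0;
}
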
- */ - -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_holder gh; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) - goto out_uninit; - if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); - gfs2_glock_dq(&gh); -out_uninit: - gfs2_holder_uninit(&gh); - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = -EIO; - return ret; -} - -/** - * gfs2_write_begin - Begin to write to a file - * @file: The file to write to - * @mapping: The mapping in which to write - * @pos: The file offset at which to start writing - * @len: Length of the write - * @flags: Various flags - * @pagep: Pointer to return the page - * @fsdata: Pointer to return fs data (unused by GFS2) - * - * Returns: errno - */ - -static int gfs2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - int alloc_required; - int error = 0; - struct gfs2_alloc *al; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); - unsigned to = from + len; - struct page *page; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); - if (error) - goto out_unlock; - - if (alloc_required || gfs2_is_jdata(ip)) - gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - - if (alloc_required) { - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) - goto out_qunlock; - } - - rblocks = RES_DINODE + ind_blocks; - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? 
data_blocks : 1; - if (ind_blocks || data_blocks) - rblocks += RES_STATFS + RES_QUOTA; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = -ENOMEM; - flags |= AOP_FLAG_NOFS; - page = grab_cache_page_write_begin(mapping, index, flags); - *pagep = page; - if (unlikely(!page)) - goto out_endtrans; - - if (gfs2_is_stuffed(ip)) { - error = 0; - if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { - error = gfs2_unstuff_dinode(ip, page); - if (error == 0) - goto prepare_write; - } else if (!PageUptodate(page)) { - error = stuffed_readpage(ip, page); - } - goto out; - } - -prepare_write: - error = block_prepare_write(page, from, to, gfs2_block_map); -out: - if (error == 0) - return 0; - - page_cache_release(page); - if (pos + len > ip->i_inode.i_size) - vmtruncate(&ip->i_inode, ip->i_inode.i_size); -out_endtrans: - gfs2_trans_end(sdp); -out_trans_fail: - if (alloc_required) { - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); - } -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - -/** - * adjust_fs_space - Adjusts the free space available due to gfs2_grow - * @inode: the rindex inode - */ -static void adjust_fs_space(struct inode *inode) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - u64 fs_total, new_free; - - /* Total up the file system space, according to the latest rindex. */ - fs_total = gfs2_ri_total(sdp); - - spin_lock(&sdp->sd_statfs_spin); - if (fs_total > (m_sc->sc_total + l_sc->sc_total)) - new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); - else - new_free = 0; - spin_unlock(&sdp->sd_statfs_spin); - fs_warn(sdp, "File system extended by %llu blocks.\n", - (unsigned long long)new_free); - gfs2_statfs_change(sdp, new_free, new_free, 0); -} - -/** - * gfs2_stuffed_write_end - Write end for stuffed files - * @inode: The inode - * @dibh: The buffer_head containing the on-disk inode - * @pos: The file position - * @len: The length of the write - * @copied: How much was actually copied by the VFS - * @page: The page - * - * This copies the data from the page into the inode block after - * the inode data structure itself. 
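
For context on the stuffed case above: a stuffed file keeps its data in the dinode block directly after the on-disk inode header, which is why gfs2_write_begin() unstuffs once pos + len would pass sb_bsize - sizeof(struct gfs2_dinode). Below is a minimal sketch of that capacity check, assuming a 4096-byte block and using an illustrative 232-byte header size in place of the real sizeof(struct gfs2_dinode).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE_ASSUMED 4096UL /* stands in for sdp->sd_sb.sb_bsize */
#define DINODE_HDR_ASSUMED 232UL  /* illustrative stand-in for sizeof(struct gfs2_dinode) */

/*
 * A stuffed file keeps its data in the dinode block right after the
 * on-disk inode header, so a write can stay stuffed only while it ends
 * at or before blocksize minus the header size; otherwise the inode
 * must be unstuffed first (the test used by gfs2_write_begin()).
 */
static bool write_fits_stuffed(uint64_t pos, unsigned int len)
{
	return pos + len <= BLOCK_SIZE_ASSUMED - DINODE_HDR_ASSUMED;
}

int main(void)
{
	printf("%d\n", write_fits_stuffed(0, 3864)); /* 1: exactly fills the stuffed area */
	printf("%d\n", write_fits_stuffed(0, 3865)); /* 0: would need unstuffing */
	return 0;
}
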
- * - * Returns: errno - */ -static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, - loff_t pos, unsigned len, unsigned copied, - struct page *page) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - u64 to = pos + copied; - void *kaddr; - unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - - BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buf + pos, kaddr + pos, copied); - memset(kaddr + pos + copied, 0, len - copied); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - - if (!PageUptodate(page)) - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); - - if (copied) { - if (inode->i_size < to) { - i_size_write(inode, to); - ip->i_disksize = inode->i_size; - } - gfs2_dinode_out(ip, di); - mark_inode_dirty(inode); - } - - if (inode == sdp->sd_rindex) - adjust_fs_space(inode); - - brelse(dibh); - gfs2_trans_end(sdp); - gfs2_glock_dq(&ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); - return copied; -} - -/** - * gfs2_write_end - * @file: The file to write to - * @mapping: The address space to write to - * @pos: The file position - * @len: The length of the data - * @copied: - * @page: The page that has been written - * @fsdata: The fsdata (unused in GFS2) - * - * The main write_end function for GFS2. We have a separate one for - * stuffed files as they are slightly different, otherwise we just - * put our locking around the VFS provided functions. - * - * Returns: errno - */ - -static int gfs2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh; - struct gfs2_alloc *al = ip->i_alloc; - unsigned int from = pos & (PAGE_CACHE_SIZE - 1); - unsigned int to = from + len; - int ret; - - BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); - - ret = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(ret)) { - unlock_page(page); - page_cache_release(page); - goto failed; - } - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) - return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); - - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (ret > 0) { - if (inode->i_size > ip->i_disksize) - ip->i_disksize = inode->i_size; - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - } - - if (inode == sdp->sd_rindex) - adjust_fs_space(inode); - - brelse(dibh); - gfs2_trans_end(sdp); -failed: - if (al) { - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - gfs2_glock_dq(&ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); - return ret; -} - -/** - * gfs2_set_page_dirty - Page dirtying function - * @page: The page to dirty - * - * Returns: 1 if it dirtyed the page, or 0 otherwise - */ - -static int gfs2_set_page_dirty(struct page *page) -{ - SetPageChecked(page); - return __set_page_dirty_buffers(page); -} - -/** - * gfs2_bmap - Block map function - * @mapping: Address space info - * @lblock: The block to map - * - * Returns: The disk address for the block or 0 on hole or error - */ - -static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) -{ - struct gfs2_inode 
*ip = GFS2_I(mapping->host); - struct gfs2_holder i_gh; - sector_t dblock = 0; - int error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return 0; - - if (!gfs2_is_stuffed(ip)) - dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); - - gfs2_glock_dq_uninit(&i_gh); - - return dblock; -} - -static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) -{ - struct gfs2_bufdata *bd; - - lock_buffer(bh); - gfs2_log_lock(sdp); - clear_buffer_dirty(bh); - bd = bh->b_private; - if (bd) { - if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); - else - gfs2_remove_from_journal(bh, current->journal_info, 0); - } - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - gfs2_log_unlock(sdp); - unlock_buffer(bh); -} - -static void gfs2_invalidatepage(struct page *page, unsigned long offset) -{ - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - struct buffer_head *bh, *head; - unsigned long pos = 0; - - BUG_ON(!PageLocked(page)); - if (offset == 0) - ClearPageChecked(page); - if (!page_has_buffers(page)) - goto out; - - bh = head = page_buffers(page); - do { - if (offset <= pos) - gfs2_discard(sdp, bh); - pos += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); -out: - if (offset == 0) - try_to_release_page(page, 0); -} - -/** - * gfs2_ok_for_dio - check that dio is valid on this file - * @ip: The inode - * @rw: READ or WRITE - * @offset: The offset at which we are reading or writing - * - * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) - * 1 (to accept the i/o request) - */ -static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) -{ - /* - * Should we return an error here? I can't see that O_DIRECT for - * a stuffed file makes any sense. For now we'll silently fall - * back to buffered I/O - */ - if (gfs2_is_stuffed(ip)) - return 0; - - if (offset >= i_size_read(&ip->i_inode)) - return 0; - return 1; -} - - - -static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int rv; - - /* - * Deferred lock, even if its a write, since we do no allocation - * on this path. All we need change is atime, and this lock mode - * ensures that other nodes have flushed their buffered read caches - * (i.e. their page cache entries for this inode). We do not, - * unfortunately have the option of only flushing a range like - * the VFS does. - */ - gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); - rv = gfs2_glock_nq(&gh); - if (rv) - return rv; - rv = gfs2_ok_for_dio(ip, rw, offset); - if (rv != 1) - goto out; /* dio not valid, fall back to buffered i/o */ - - rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, - gfs2_get_block_direct, NULL); -out: - gfs2_glock_dq_m(1, &gh); - gfs2_holder_uninit(&gh); - return rv; -} - -/** - * gfs2_releasepage - free the metadata associated with a page - * @page: the page that's being released - * @gfp_mask: passed from Linux VFS, ignored by us - * - * Call try_to_free_buffers() if the buffers in this page can be - * released. 
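
One detail of gfs2_invalidatepage() above that is easy to miss: a buffer is discarded only when its starting byte position within the page is at or past the invalidation offset, so a buffer that straddles the offset survives. The small userspace sketch below shows which buffers that rule selects; the page and buffer sizes are illustrative assumptions.

#include <stdio.h>

#define PAGE_SIZE_ASSUMED  4096U /* illustrative */
#define BLOCK_SIZE_ASSUMED 1024U /* illustrative per-buffer size */

/*
 * gfs2_invalidatepage() discards a buffer only when the buffer's start
 * position within the page is at or past the invalidation offset
 * (offset <= pos), so a buffer straddling the offset is kept. Print
 * which buffers that rule selects for a partial-page invalidation.
 */
static void show_discarded(unsigned int offset)
{
	unsigned int pos = 0;
	int i = 0;

	while (pos < PAGE_SIZE_ASSUMED) {
		if (offset <= pos)
			printf("discard buffer %d (bytes %u-%u)\n",
			       i, pos, pos + BLOCK_SIZE_ASSUMED - 1);
		pos += BLOCK_SIZE_ASSUMED;
		i++;
	}
}

int main(void)
{
	show_discarded(1500); /* keeps buffers 0 and 1, discards 2 and 3 */
	return 0;
}
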
- * - * Returns: 0 - */ - -int gfs2_releasepage(struct page *page, gfp_t gfp_mask) -{ - struct inode *aspace = page->mapping->host; - struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; - struct buffer_head *bh, *head; - struct gfs2_bufdata *bd; - - if (!page_has_buffers(page)) - return 0; - - gfs2_log_lock(sdp); - head = bh = page_buffers(page); - do { - if (atomic_read(&bh->b_count)) - goto cannot_release; - bd = bh->b_private; - if (bd && bd->bd_ail) - goto cannot_release; - gfs2_assert_warn(sdp, !buffer_pinned(bh)); - gfs2_assert_warn(sdp, !buffer_dirty(bh)); - bh = bh->b_this_page; - } while(bh != head); - gfs2_log_unlock(sdp); - - head = bh = page_buffers(page); - do { - gfs2_log_lock(sdp); - bd = bh->b_private; - if (bd) { - gfs2_assert_warn(sdp, bd->bd_bh == bh); - gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); - if (!list_empty(&bd->bd_le.le_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; - bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) - kmem_cache_free(gfs2_bufdata_cachep, bd); - - bh = bh->b_this_page; - } while (bh != head); - - return try_to_free_buffers(page); -cannot_release: - gfs2_log_unlock(sdp); - return 0; -} - -static const struct address_space_operations gfs2_writeback_aops = { - .writepage = gfs2_writeback_writepage, - .writepages = gfs2_writeback_writepages, - .readpage = gfs2_readpage, - .readpages = gfs2_readpages, - .sync_page = block_sync_page, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, - .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, - .direct_IO = gfs2_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, -}; - -static const struct address_space_operations gfs2_ordered_aops = { - .writepage = gfs2_ordered_writepage, - .readpage = gfs2_readpage, - .readpages = gfs2_readpages, - .sync_page = block_sync_page, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, - .set_page_dirty = gfs2_set_page_dirty, - .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, - .direct_IO = gfs2_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, -}; - -static const struct address_space_operations gfs2_jdata_aops = { - .writepage = gfs2_jdata_writepage, - .writepages = gfs2_jdata_writepages, - .readpage = gfs2_readpage, - .readpages = gfs2_readpages, - .sync_page = block_sync_page, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, - .set_page_dirty = gfs2_set_page_dirty, - .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, - .is_partially_uptodate = block_is_partially_uptodate, -}; - -void gfs2_set_aops(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - if (gfs2_is_writeback(ip)) - inode->i_mapping->a_ops = &gfs2_writeback_aops; - else if (gfs2_is_ordered(ip)) - inode->i_mapping->a_ops = &gfs2_ordered_aops; - else if (gfs2_is_jdata(ip)) - inode->i_mapping->a_ops = &gfs2_jdata_aops; - else - BUG(); -} - diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h deleted file mode 100644 index 5da21285bba..00000000000 --- a/fs/gfs2/ops_address.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 
- * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_ADDRESS_DOT_H__ -#define __OPS_ADDRESS_DOT_H__ - -#include -#include -#include - -extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); -extern int gfs2_internal_read(struct gfs2_inode *ip, - struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size); -extern void gfs2_set_aops(struct inode *inode); - -#endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c deleted file mode 100644 index 022c66cd560..00000000000 --- a/fs/gfs2/ops_dentry.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "dir.h" -#include "glock.h" -#include "super.h" -#include "util.h" -#include "inode.h" - -/** - * gfs2_drevalidate - Check directory lookup consistency - * @dentry: the mapping to check - * @nd: - * - * Check to make sure the lookup necessary to arrive at this inode from its - * parent is still good. - * - * Returns: 1 if the dentry is ok, 0 if it isn't - */ - -static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct dentry *parent = dget_parent(dentry); - struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); - struct gfs2_inode *dip = GFS2_I(parent->d_inode); - struct inode *inode = dentry->d_inode; - struct gfs2_holder d_gh; - struct gfs2_inode *ip = NULL; - int error; - int had_lock = 0; - - if (inode) { - if (is_bad_inode(inode)) - goto invalid; - ip = GFS2_I(inode); - } - - if (sdp->sd_args.ar_localcaching) - goto valid; - - had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); - if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - if (error) - goto fail; - } - - error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); - switch (error) { - case 0: - if (!inode) - goto invalid_gunlock; - break; - case -ENOENT: - if (!inode) - goto valid_gunlock; - goto invalid_gunlock; - default: - goto fail_gunlock; - } - -valid_gunlock: - if (!had_lock) - gfs2_glock_dq_uninit(&d_gh); -valid: - dput(parent); - return 1; - -invalid_gunlock: - if (!had_lock) - gfs2_glock_dq_uninit(&d_gh); -invalid: - if (inode && S_ISDIR(inode->i_mode)) { - if (have_submounts(dentry)) - goto valid; - shrink_dcache_parent(dentry); - } - d_drop(dentry); - dput(parent); - return 0; - -fail_gunlock: - gfs2_glock_dq_uninit(&d_gh); -fail: - dput(parent); - return 0; -} - -static int gfs2_dhash(struct dentry *dentry, struct qstr *str) -{ - str->hash = gfs2_disk_hash(str->name, str->len); - return 0; -} - -const struct dentry_operations gfs2_dops = { - .d_revalidate = gfs2_drevalidate, - .d_hash = gfs2_dhash, -}; - diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c deleted file mode 100644 index 9200ef22171..00000000000 --- a/fs/gfs2/ops_export.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
- * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "dir.h" -#include "glock.h" -#include "glops.h" -#include "inode.h" -#include "super.h" -#include "rgrp.h" -#include "util.h" - -#define GFS2_SMALL_FH_SIZE 4 -#define GFS2_LARGE_FH_SIZE 8 -#define GFS2_OLD_FH_SIZE 10 - -static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, - int connectable) -{ - __be32 *fh = (__force __be32 *)p; - struct inode *inode = dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct gfs2_inode *ip = GFS2_I(inode); - - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) - return 255; - - fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); - fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); - fh[2] = cpu_to_be32(ip->i_no_addr >> 32); - fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); - *len = GFS2_SMALL_FH_SIZE; - - if (!connectable || inode == sb->s_root->d_inode) - return *len; - - spin_lock(&dentry->d_lock); - inode = dentry->d_parent->d_inode; - ip = GFS2_I(inode); - igrab(inode); - spin_unlock(&dentry->d_lock); - - fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); - fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); - fh[6] = cpu_to_be32(ip->i_no_addr >> 32); - fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); - *len = GFS2_LARGE_FH_SIZE; - - iput(inode); - - return *len; -} - -struct get_name_filldir { - struct gfs2_inum_host inum; - char *name; -}; - -static int get_name_filldir(void *opaque, const char *name, int length, - loff_t offset, u64 inum, unsigned int type) -{ - struct get_name_filldir *gnfd = opaque; - - if (inum != gnfd->inum.no_addr) - return 0; - - memcpy(gnfd->name, name, length); - gnfd->name[length] = 0; - - return 1; -} - -static int gfs2_get_name(struct dentry *parent, char *name, - struct dentry *child) -{ - struct inode *dir = parent->d_inode; - struct inode *inode = child->d_inode; - struct gfs2_inode *dip, *ip; - struct get_name_filldir gnfd; - struct gfs2_holder gh; - u64 offset = 0; - int error; - - if (!dir) - return -EINVAL; - - if (!S_ISDIR(dir->i_mode) || !inode) - return -EINVAL; - - dip = GFS2_I(dir); - ip = GFS2_I(inode); - - *name = 0; - gnfd.inum.no_addr = ip->i_no_addr; - gnfd.inum.no_formal_ino = ip->i_no_formal_ino; - gnfd.name = name; - - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); - if (error) - return error; - - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); - - gfs2_glock_dq_uninit(&gh); - - if (!error && !*name) - error = -ENOENT; - - return error; -} - -static struct dentry *gfs2_get_parent(struct dentry *child) -{ - struct qstr dotdot; - struct dentry *dentry; - - /* - * XXX(hch): it would be a good idea to keep this around as a - * static variable. - */ - gfs2_str2qstr(&dotdot, ".."); - - dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); - if (!IS_ERR(dentry)) - dentry->d_op = &gfs2_dops; - return dentry; -} - -static struct dentry *gfs2_get_dentry(struct super_block *sb, - struct gfs2_inum_host *inum) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh, ri_gh, rgd_gh; - struct gfs2_rgrpd *rgd; - struct inode *inode; - struct dentry *dentry; - int error; - - /* System files? 
*/ - - inode = gfs2_ilookup(sb, inum->no_addr); - if (inode) { - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - return ERR_PTR(-ESTALE); - } - goto out_inode; - } - - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; - - error = -EINVAL; - rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); - if (!rgd) - goto fail_rindex; - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); - if (error) - goto fail_rindex; - - error = -ESTALE; - if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) - goto fail_rgd; - - gfs2_glock_dq_uninit(&rgd_gh); - gfs2_glock_dq_uninit(&ri_gh); - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, - inum->no_addr, - 0, 0); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) { - iput(inode); - goto fail; - } - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - goto fail; - } - - error = -EIO; - if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { - iput(inode); - goto fail; - } - - gfs2_glock_dq_uninit(&i_gh); - -out_inode: - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) - dentry->d_op = &gfs2_dops; - return dentry; - -fail_rgd: - gfs2_glock_dq_uninit(&rgd_gh); - -fail_rindex: - gfs2_glock_dq_uninit(&ri_gh); - -fail: - gfs2_glock_dq_uninit(&i_gh); - return ERR_PTR(error); -} - -static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct gfs2_inum_host this; - __be32 *fh = (__force __be32 *)fid->raw; - - switch (fh_type) { - case GFS2_SMALL_FH_SIZE: - case GFS2_LARGE_FH_SIZE: - case GFS2_OLD_FH_SIZE: - this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; - this.no_formal_ino |= be32_to_cpu(fh[1]); - this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; - this.no_addr |= be32_to_cpu(fh[3]); - return gfs2_get_dentry(sb, &this); - default: - return NULL; - } -} - -static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct gfs2_inum_host parent; - __be32 *fh = (__force __be32 *)fid->raw; - - switch (fh_type) { - case GFS2_LARGE_FH_SIZE: - case GFS2_OLD_FH_SIZE: - parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; - parent.no_formal_ino |= be32_to_cpu(fh[5]); - parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; - parent.no_addr |= be32_to_cpu(fh[7]); - return gfs2_get_dentry(sb, &parent); - default: - return NULL; - } -} - -const struct export_operations gfs2_export_ops = { - .encode_fh = gfs2_encode_fh, - .fh_to_dentry = gfs2_fh_to_dentry, - .fh_to_parent = gfs2_fh_to_parent, - .get_name = gfs2_get_name, - .get_parent = gfs2_get_parent, -}; - diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c deleted file mode 100644 index 0ee7bd287c5..00000000000 --- a/fs/gfs2/ops_file.c +++ /dev/null @@ -1,766 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
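
The export helpers above (gfs2_encode_fh() and gfs2_fh_to_dentry()/gfs2_fh_to_parent()) store each 64-bit number as two big-endian 32-bit words, high half first. Here is a minimal round-trip sketch of that packing, using htonl()/ntohl() as userspace stand-ins for cpu_to_be32()/be32_to_cpu(); it is an illustration of the layout, not the kernel code.

#include <arpa/inet.h> /* htonl()/ntohl() as stand-ins for cpu_to_be32()/be32_to_cpu() */
#include <stdint.h>
#include <stdio.h>

/*
 * Pack a 64-bit number into two big-endian 32-bit words, high half
 * first, the way gfs2_encode_fh() lays out no_formal_ino and no_addr.
 */
static void pack_u64(uint32_t fh[2], uint64_t val)
{
	fh[0] = htonl(val >> 32);
	fh[1] = htonl(val & 0xFFFFFFFF);
}

/* Reassemble it the way gfs2_fh_to_dentry()/gfs2_fh_to_parent() do. */
static uint64_t unpack_u64(const uint32_t fh[2])
{
	uint64_t val = (uint64_t)ntohl(fh[0]) << 32;

	return val | ntohl(fh[1]);
}

int main(void)
{
	uint32_t fh[2];
	uint64_t no_addr = 0x123456789abcdef0ULL;

	pack_u64(fh, no_addr);
	printf("round trip ok: %d\n", unpack_u64(fh) == no_addr); /* 1 */
	return 0;
}
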
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "bmap.h" -#include "dir.h" -#include "glock.h" -#include "glops.h" -#include "inode.h" -#include "log.h" -#include "meta_io.h" -#include "quota.h" -#include "rgrp.h" -#include "trans.h" -#include "util.h" -#include "eaops.h" -#include "ops_address.h" - -/** - * gfs2_llseek - seek to a location in a file - * @file: the file - * @offset: the offset - * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) - * - * SEEK_END requires the glock for the file because it references the - * file's size. - * - * Returns: The new offset, or errno - */ - -static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - loff_t error; - - if (origin == 2) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, - &i_gh); - if (!error) { - error = generic_file_llseek_unlocked(file, offset, origin); - gfs2_glock_dq_uninit(&i_gh); - } - } else - error = generic_file_llseek_unlocked(file, offset, origin); - - return error; -} - -/** - * gfs2_readdir - Read directory entries from a directory - * @file: The directory to read from - * @dirent: Buffer for dirents - * @filldir: Function used to do the copying - * - * Returns: errno - */ - -static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) -{ - struct inode *dir = file->f_mapping->host; - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_holder d_gh; - u64 offset = file->f_pos; - int error; - - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - error = gfs2_glock_nq(&d_gh); - if (error) { - gfs2_holder_uninit(&d_gh); - return error; - } - - error = gfs2_dir_read(dir, &offset, dirent, filldir); - - gfs2_glock_dq_uninit(&d_gh); - - file->f_pos = offset; - - return error; -} - -/** - * fsflags_cvt - * @table: A table of 32 u32 flags - * @val: a 32 bit value to convert - * - * This function can be used to convert between fsflags values and - * GFS2's own flags values. 
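
fsflags_cvt(), shown just below, is a table-driven bit translation: it walks the input value one bit at a time and ORs in the table entry for every set bit. The standalone sketch that follows uses the same loop with a made-up table; the values are illustrative only, not the real FS_*_FL or GFS2_DIF_* constants.

#include <stdint.h>
#include <stdio.h>

/*
 * Same walk as fsflags_cvt(): for every set bit in val, OR the table
 * entry at that bit position into the result. Unmapped bits simply
 * contribute 0.
 */
static uint32_t cvt(const uint32_t *table, uint32_t val)
{
	uint32_t res = 0;

	while (val) {
		if (val & 1)
			res |= *table;
		table++;
		val >>= 1;
	}
	return res;
}

int main(void)
{
	/* bit 0 -> 0x10, bit 1 -> 0x20, bit 2 unmapped, bit 3 -> 0x40 */
	static const uint32_t table[32] = { 0x10, 0x20, 0, 0x40 };

	printf("0x%x\n", cvt(table, 0x9)); /* bits 0 and 3 set -> 0x50 */
	return 0;
}
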
- * - * Returns: the converted flags - */ -static u32 fsflags_cvt(const u32 *table, u32 val) -{ - u32 res = 0; - while(val) { - if (val & 1) - res |= *table; - table++; - val >>= 1; - } - return res; -} - -static const u32 fsflags_to_gfs2[32] = { - [3] = GFS2_DIF_SYNC, - [4] = GFS2_DIF_IMMUTABLE, - [5] = GFS2_DIF_APPENDONLY, - [7] = GFS2_DIF_NOATIME, - [12] = GFS2_DIF_EXHASH, - [14] = GFS2_DIF_INHERIT_JDATA, -}; - -static const u32 gfs2_to_fsflags[32] = { - [gfs2fl_Sync] = FS_SYNC_FL, - [gfs2fl_Immutable] = FS_IMMUTABLE_FL, - [gfs2fl_AppendOnly] = FS_APPEND_FL, - [gfs2fl_NoAtime] = FS_NOATIME_FL, - [gfs2fl_ExHash] = FS_INDEX_FL, - [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, -}; - -static int gfs2_get_flags(struct file *filp, u32 __user *ptr) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - u32 fsflags; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - error = gfs2_glock_nq(&gh); - if (error) - return error; - - fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); - if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) - fsflags |= FS_JOURNAL_DATA_FL; - if (put_user(fsflags, ptr)) - error = -EFAULT; - - gfs2_glock_dq(&gh); - gfs2_holder_uninit(&gh); - return error; -} - -void gfs2_set_inode_flags(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int flags = inode->i_flags; - - flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) - flags |= S_IMMUTABLE; - if (ip->i_diskflags & GFS2_DIF_APPENDONLY) - flags |= S_APPEND; - if (ip->i_diskflags & GFS2_DIF_NOATIME) - flags |= S_NOATIME; - if (ip->i_diskflags & GFS2_DIF_SYNC) - flags |= S_SYNC; - inode->i_flags = flags; -} - -/* Flags that can be set by user space */ -#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ - GFS2_DIF_IMMUTABLE| \ - GFS2_DIF_APPENDONLY| \ - GFS2_DIF_NOATIME| \ - GFS2_DIF_SYNC| \ - GFS2_DIF_SYSTEM| \ - GFS2_DIF_INHERIT_JDATA) - -/** - * gfs2_set_flags - set flags on an inode - * @inode: The inode - * @flags: The flags to set - * @mask: Indicates which flags are valid - * - */ -static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *bh; - struct gfs2_holder gh; - int error; - u32 new_flags, flags; - - error = mnt_want_write(filp->f_path.mnt); - if (error) - return error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (error) - goto out_drop_write; - - flags = ip->i_diskflags; - new_flags = (flags & ~mask) | (reqflags & mask); - if ((new_flags ^ flags) == 0) - goto out; - - error = -EINVAL; - if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) - goto out; - - error = -EPERM; - if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) - goto out; - if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) - goto out; - if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && - !capable(CAP_LINUX_IMMUTABLE)) - goto out; - if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE); - if (error) - goto out; - } - if ((flags ^ new_flags) & GFS2_DIF_JDATA) { - if (flags & GFS2_DIF_JDATA) - gfs2_log_flush(sdp, ip->i_gl); - error = filemap_fdatawrite(inode->i_mapping); - if (error) - goto out; - error = filemap_fdatawait(inode->i_mapping); - if (error) - goto out; - } - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - goto out; - error 
= gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_trans_end; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_diskflags = new_flags; - gfs2_dinode_out(ip, bh->b_data); - brelse(bh); - gfs2_set_inode_flags(inode); - gfs2_set_aops(inode); -out_trans_end: - gfs2_trans_end(sdp); -out: - gfs2_glock_dq_uninit(&gh); -out_drop_write: - mnt_drop_write(filp->f_path.mnt); - return error; -} - -static int gfs2_set_flags(struct file *filp, u32 __user *ptr) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - u32 fsflags, gfsflags; - if (get_user(fsflags, ptr)) - return -EFAULT; - gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); - if (!S_ISDIR(inode->i_mode)) { - if (gfsflags & GFS2_DIF_INHERIT_JDATA) - gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~0); - } - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); -} - -static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - switch(cmd) { - case FS_IOC_GETFLAGS: - return gfs2_get_flags(filp, (u32 __user *)arg); - case FS_IOC_SETFLAGS: - return gfs2_set_flags(filp, (u32 __user *)arg); - } - return -ENOTTY; -} - -/** - * gfs2_allocate_page_backing - Use bmap to allocate blocks - * @page: The (locked) page to allocate backing for - * - * We try to allocate all the blocks required for the page in - * one go. This might fail for various reasons, so we keep - * trying until all the blocks to back this page are allocated. - * If some of the blocks are already allocated, thats ok too. - */ - -static int gfs2_allocate_page_backing(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct buffer_head bh; - unsigned long size = PAGE_CACHE_SIZE; - u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - do { - bh.b_state = 0; - bh.b_size = size; - gfs2_block_map(inode, lblock, &bh, 1); - if (!buffer_mapped(&bh)) - return -EIO; - size -= bh.b_size; - lblock += (bh.b_size >> inode->i_blkbits); - } while(size > 0); - return 0; -} - -/** - * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable - * @vma: The virtual memory area - * @page: The page which is about to become writable - * - * When the page becomes writable, we need to ensure that we have - * blocks allocated on disk to back that page. - */ - -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned long last_index; - u64 pos = page->index << PAGE_CACHE_SHIFT; - unsigned int data_blocks, ind_blocks, rblocks; - int alloc_required = 0; - struct gfs2_holder gh; - struct gfs2_alloc *al; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret) - goto out; - - set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); - set_bit(GIF_SW_PAGED, &ip->i_flags); - - ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); - if (ret || !alloc_required) - goto out_unlock; - ret = -ENOMEM; - al = gfs2_alloc_get(ip); - if (al == NULL) - goto out_unlock; - - ret = gfs2_quota_lock_check(ip); - if (ret) - goto out_alloc_put; - gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - ret = gfs2_inplace_reserve(ip); - if (ret) - goto out_quota_unlock; - - rblocks = RES_DINODE + ind_blocks; - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? 
data_blocks : 1; - if (ind_blocks || data_blocks) - rblocks += RES_STATFS + RES_QUOTA; - ret = gfs2_trans_begin(sdp, rblocks, 0); - if (ret) - goto out_trans_fail; - - lock_page(page); - ret = -EINVAL; - last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) - goto out_unlock_page; - ret = 0; - if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) - goto out_unlock_page; - if (gfs2_is_stuffed(ip)) { - ret = gfs2_unstuff_dinode(ip, page); - if (ret) - goto out_unlock_page; - } - ret = gfs2_allocate_page_backing(page); - -out_unlock_page: - unlock_page(page); - gfs2_trans_end(sdp); -out_trans_fail: - gfs2_inplace_release(ip); -out_quota_unlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&gh); -out: - gfs2_holder_uninit(&gh); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else if (ret) - ret = VM_FAULT_SIGBUS; - return ret; -} - -static struct vm_operations_struct gfs2_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = gfs2_page_mkwrite, -}; - -/** - * gfs2_mmap - - * @file: The file to map - * @vma: The VMA which described the mapping - * - * There is no need to get a lock here unless we should be updating - * atime. We ignore any locking errors since the only consequence is - * a missed atime update (which will just be deferred until later). - * - * Returns: 0 - */ - -static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - - if (!(file->f_flags & O_NOATIME)) { - struct gfs2_holder i_gh; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - file_accessed(file); - if (error == 0) - gfs2_glock_dq_uninit(&i_gh); - } - vma->vm_ops = &gfs2_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; - - return 0; -} - -/** - * gfs2_open - open a file - * @inode: the inode to open - * @file: the struct file for this opening - * - * Returns: errno - */ - -static int gfs2_open(struct inode *inode, struct file *file) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; - struct gfs2_file *fp; - int error; - - fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); - if (!fp) - return -ENOMEM; - - mutex_init(&fp->f_fl_mutex); - - gfs2_assert_warn(GFS2_SB(inode), !file->private_data); - file->private_data = fp; - - if (S_ISREG(ip->i_inode.i_mode)) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, - &i_gh); - if (error) - goto fail; - - if (!(file->f_flags & O_LARGEFILE) && - ip->i_disksize > MAX_NON_LFS) { - error = -EOVERFLOW; - goto fail_gunlock; - } - - gfs2_glock_dq_uninit(&i_gh); - } - - return 0; - -fail_gunlock: - gfs2_glock_dq_uninit(&i_gh); -fail: - file->private_data = NULL; - kfree(fp); - return error; -} - -/** - * gfs2_close - called to close a struct file - * @inode: the inode the struct file belongs to - * @file: the struct file being closed - * - * Returns: errno - */ - -static int gfs2_close(struct inode *inode, struct file *file) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_file *fp; - - fp = file->private_data; - file->private_data = NULL; - - if (gfs2_assert_warn(sdp, fp)) - return -EIO; - - kfree(fp); - - return 0; -} - -/** - * gfs2_fsync - sync the dirty data for a file (across the cluster) - * @file: the file that points to the dentry (we ignore this) - * @dentry: the dentry that points to the inode to sync - * - * The VFS will flush "normal" data for us. We only need to worry - * about metadata here. 
For journaled data, we just do a log flush - * as we can't avoid it. Otherwise we can just bale out if datasync - * is set. For stuffed inodes we must flush the log in order to - * ensure that all data is on disk. - * - * The call to write_inode_now() is there to write back metadata and - * the inode itself. It does also try and write the data, but thats - * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite() - * for us. - * - * Returns: errno - */ - -static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); - int ret = 0; - - if (gfs2_is_jdata(GFS2_I(inode))) { - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); - return 0; - } - - if (sync_state != 0) { - if (!datasync) - ret = write_inode_now(inode, 0); - - if (gfs2_is_stuffed(GFS2_I(inode))) - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); - } - - return ret; -} - -#ifdef CONFIG_GFS2_FS_LOCKING_DLM - -/** - * gfs2_setlease - acquire/release a file lease - * @file: the file pointer - * @arg: lease type - * @fl: file lock - * - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - * - * Returns: errno - */ - -static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) -{ - return -EINVAL; -} - -/** - * gfs2_lock - acquire/release a posix lock on a file - * @file: the file pointer - * @cmd: either modify or retrieve lock state, possibly wait - * @fl: type and range of lock - * - * Returns: errno - */ - -static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - - if (!(fl->fl_flags & FL_POSIX)) - return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; - - if (cmd == F_CANCELLK) { - /* Hack: */ - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - return -EIO; - if (IS_GETLK(cmd)) - return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (fl->fl_type == F_UNLCK) - return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); - else - return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); -} - -static int do_flock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_file *fp = file->private_data; - struct gfs2_holder *fl_gh = &fp->f_fl_gh; - struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); - struct gfs2_glock *gl; - unsigned int state; - int flags; - int error = 0; - - state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 
0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; - - mutex_lock(&fp->f_fl_mutex); - - gl = fl_gh->gh_gl; - if (gl) { - if (fl_gh->gh_state == state) - goto out; - flock_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); - gfs2_glock_dq_wait(fl_gh); - gfs2_holder_reinit(state, flags, fl_gh); - } else { - error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, - &gfs2_flock_glops, CREATE, &gl); - if (error) - goto out; - gfs2_holder_init(gl, state, flags, fl_gh); - gfs2_glock_put(gl); - } - error = gfs2_glock_nq(fl_gh); - if (error) { - gfs2_holder_uninit(fl_gh); - if (error == GLR_TRYFAILED) - error = -EAGAIN; - } else { - error = flock_lock_file_wait(file, fl); - gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); - } - -out: - mutex_unlock(&fp->f_fl_mutex); - return error; -} - -static void do_unflock(struct file *file, struct file_lock *fl) -{ - struct gfs2_file *fp = file->private_data; - struct gfs2_holder *fl_gh = &fp->f_fl_gh; - - mutex_lock(&fp->f_fl_mutex); - flock_lock_file_wait(file, fl); - if (fl_gh->gh_gl) - gfs2_glock_dq_uninit(fl_gh); - mutex_unlock(&fp->f_fl_mutex); -} - -/** - * gfs2_flock - acquire/release a flock lock on a file - * @file: the file pointer - * @cmd: either modify or retrieve lock state, possibly wait - * @fl: type and range of lock - * - * Returns: errno - */ - -static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; - - if (fl->fl_type == F_UNLCK) { - do_unflock(file, fl); - return 0; - } else { - return do_flock(file, cmd, fl); - } -} - -const struct file_operations gfs2_file_fops = { - .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .unlocked_ioctl = gfs2_ioctl, - .mmap = gfs2_mmap, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .lock = gfs2_lock, - .flock = gfs2_flock, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, - .setlease = gfs2_setlease, -}; - -const struct file_operations gfs2_dir_fops = { - .readdir = gfs2_readdir, - .unlocked_ioctl = gfs2_ioctl, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .lock = gfs2_lock, - .flock = gfs2_flock, -}; - -#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ - -const struct file_operations gfs2_file_fops_nolock = { - .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .unlocked_ioctl = gfs2_ioctl, - .mmap = gfs2_mmap, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, - .setlease = generic_setlease, -}; - -const struct file_operations gfs2_dir_fops_nolock = { - .readdir = gfs2_readdir, - .unlocked_ioctl = gfs2_ioctl, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, -}; - diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 152e6c4a0dc..2e9b9326bfc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -60,7 +60,6 @@ #include "super.h" #include "trans.h" #include "inode.h" -#include "ops_address.h" #include "util.h" #define QUOTA_USER 1 diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ee3d5c1876a..6122c7ee364 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -29,7 +29,6 @@ #include "util.h" #include "log.h" #include 
"inode.h" -#include "ops_address.h" #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) -- cgit v1.2.3 From 9e6e0a128bca0a151d8d3fbd9459b22fc21cfebb Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:36:01 +0100 Subject: GFS2: Merge mount.c and ops_super.c into super.c mount.c only contained a single function, so is not really worth retaining on its own. All of the super related code is now either in super.c or ops_fstype.c Signed-off-by: Steven Whitehouse --- fs/gfs2/Makefile | 4 +- fs/gfs2/mount.c | 195 ------------ fs/gfs2/ops_super.c | 757 ------------------------------------------- fs/gfs2/super.c | 903 +++++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 903 insertions(+), 956 deletions(-) delete mode 100644 fs/gfs2/mount.c delete mode 100644 fs/gfs2/ops_super.c (limited to 'fs') diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 4f7332c7682..d53a9bea1c2 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,8 +1,8 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ - mount.o aops.o dentry.o export.o file.o \ - ops_fstype.o ops_inode.o ops_super.o quota.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o ops_inode.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c deleted file mode 100644 index 947af151fa2..00000000000 --- a/fs/gfs2/mount.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. 
- */ - -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "super.h" -#include "sys.h" -#include "util.h" - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_quota, - Opt_noquota, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_meta, - Opt_discard, - Opt_nodiscard, - Opt_commit, - Opt_err, -}; - -static const match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_meta, "meta"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_commit, "commit=%d"}, - {Opt_err, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @sdp: - * @data: - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) -{ - char *o; - int token; - substring_t tmp[MAX_OPT_ARGS]; - int rv; - - /* Split the options into tokens with the "," character and - process them */ - - while (1) { - o = strsep(&options, ","); - if (o == NULL) - break; - if (*o == '\0') - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - match_strlcpy(args->ar_lockproto, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_locktable: - match_strlcpy(args->ar_locktable, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_hostdata: - match_strlcpy(args->ar_hostdata, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_spectator: - args->ar_spectator = 1; - break; - case Opt_ignore_local_fs: - args->ar_ignore_local_fs = 1; - break; - case Opt_localflocks: - args->ar_localflocks = 1; - break; - case Opt_localcaching: - args->ar_localcaching = 1; - break; - case Opt_debug: - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - args->ar_upgrade = 1; - break; - case Opt_acl: - args->ar_posix_acl = 1; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - break; - case Opt_quota_off: - case Opt_noquota: - args->ar_quota = GFS2_QUOTA_OFF; - break; - case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - case Opt_quota: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_meta: - args->ar_meta = 1; - break; - case Opt_discard: - args->ar_discard = 1; - break; - case Opt_nodiscard: - args->ar_discard = 0; - break; - case Opt_commit: - rv = match_int(&tmp[0], &args->ar_commit); - if (rv || args->ar_commit <= 0) { - fs_info(sdp, "commit mount option requires a positive numeric 
argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_err: - default: - fs_info(sdp, "invalid mount option: %s\n", o); - return -EINVAL; - } - } - - return 0; -} - diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c deleted file mode 100644 index 2fd1dcbcc5b..00000000000 --- a/fs/gfs2/ops_super.c +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "glock.h" -#include "inode.h" -#include "log.h" -#include "quota.h" -#include "recovery.h" -#include "rgrp.h" -#include "super.h" -#include "sys.h" -#include "util.h" -#include "trans.h" -#include "dir.h" -#include "eattr.h" -#include "bmap.h" -#include "meta_io.h" - -#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) - -/** - * gfs2_write_inode - Make sure the inode is stable on the disk - * @inode: The inode - * @sync: synchronous write flag - * - * Returns: errno - */ - -static int gfs2_write_inode(struct inode *inode, int sync) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_holder gh; - struct buffer_head *bh; - struct timespec atime; - struct gfs2_dinode *di; - int ret = 0; - - /* Check this is a "normal" inode, etc */ - if (!test_bit(GIF_USER, &ip->i_flags) || - (current->flags & PF_MEMALLOC)) - return 0; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret) - goto do_unlock; - ret = gfs2_meta_inode_buffer(ip, &bh); - if (ret == 0) { - di = (struct gfs2_dinode *)bh->b_data; - atime.tv_sec = be64_to_cpu(di->di_atime); - atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); - if (timespec_compare(&inode->i_atime, &atime) > 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); - } - brelse(bh); - } - gfs2_trans_end(sdp); -do_unlock: - gfs2_glock_dq_uninit(&gh); -do_flush: - if (sync != 0) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - return ret; -} - -/** - * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one - * @sdp: the filesystem - * - * Returns: errno - */ - -static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) -{ - struct gfs2_holder t_gh; - int error; - - gfs2_quota_sync(sdp); - gfs2_statfs_sync(sdp); - - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); - if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return error; - - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); - - gfs2_quota_cleanup(sdp); - - return error; -} - -static int gfs2_umount_recovery_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * gfs2_put_super - Unmount the filesystem - * @sb: The VFS superblock - * - */ - -static void gfs2_put_super(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - struct gfs2_jdesc *jd; - - /* Unfreeze the filesystem, if we need to */ - - mutex_lock(&sdp->sd_freeze_lock); - if (sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - mutex_unlock(&sdp->sd_freeze_lock); - 
- /* No more recovery requests */ - set_bit(SDF_NORECOVERY, &sdp->sd_flags); - smp_mb(); - - /* Wait on outstanding recovery */ -restart: - spin_lock(&sdp->sd_jindex_spin); - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) - continue; - spin_unlock(&sdp->sd_jindex_spin); - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, - gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); - goto restart; - } - spin_unlock(&sdp->sd_jindex_spin); - - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - - if (!(sb->s_flags & MS_RDONLY)) { - error = gfs2_make_fs_ro(sdp); - if (error) - gfs2_io_error(sdp); - } - /* At this point, we're through modifying the disk */ - - /* Release stuff */ - - iput(sdp->sd_jindex); - iput(sdp->sd_inum_inode); - iput(sdp->sd_statfs_inode); - iput(sdp->sd_rindex); - iput(sdp->sd_quota_inode); - - gfs2_glock_put(sdp->sd_rename_gl); - gfs2_glock_put(sdp->sd_trans_gl); - - if (!sdp->sd_args.ar_spectator) { - gfs2_glock_dq_uninit(&sdp->sd_journal_gh); - gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); - gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_ir_inode); - iput(sdp->sd_sc_inode); - iput(sdp->sd_qc_inode); - } - - gfs2_glock_dq_uninit(&sdp->sd_live_gh); - gfs2_clear_rgrpd(sdp); - gfs2_jindex_free(sdp); - /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp); - /* Unmount the locking protocol */ - gfs2_lm_unmount(sdp); - - /* At this point, we're through participating in the lockspace */ - gfs2_sys_fs_del(sdp); -} - -/** - * gfs2_write_super - * @sb: the superblock - * - */ - -static void gfs2_write_super(struct super_block *sb) -{ - sb->s_dirt = 0; -} - -/** - * gfs2_sync_fs - sync the filesystem - * @sb: the superblock - * - * Flushes the log to disk. - */ - -static int gfs2_sync_fs(struct super_block *sb, int wait) -{ - sb->s_dirt = 0; - if (wait && sb->s_fs_info) - gfs2_log_flush(sb->s_fs_info, NULL); - return 0; -} - -/** - * gfs2_freeze - prevent further writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static int gfs2_freeze(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - - if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return -EINVAL; - - for (;;) { - error = gfs2_freeze_fs(sdp); - if (!error) - break; - - switch (error) { - case -EBUSY: - fs_err(sdp, "waiting for recovery before freeze\n"); - break; - - default: - fs_err(sdp, "error freezing FS: %d\n", error); - break; - } - - fs_err(sdp, "retrying...\n"); - msleep(1000); - } - return 0; -} - -/** - * gfs2_unfreeze - reallow writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static int gfs2_unfreeze(struct super_block *sb) -{ - gfs2_unfreeze_fs(sb->s_fs_info); - return 0; -} - -/** - * statfs_fill - fill in the sg for a given RG - * @rgd: the RG - * @sc: the sc structure - * - * Returns: 0 on success, -ESTALE if the LVB is invalid - */ - -static int statfs_slow_fill(struct gfs2_rgrpd *rgd, - struct gfs2_statfs_change_host *sc) -{ - gfs2_rgrp_verify(rgd); - sc->sc_total += rgd->rd_data; - sc->sc_free += rgd->rd_free; - sc->sc_dinodes += rgd->rd_dinodes; - return 0; -} - -/** - * gfs2_statfs_slow - Stat a filesystem using asynchronous locking - * @sdp: the filesystem - * @sc: the sc info that will be returned - * - * Any error (other than a signal) will cause this routine to fall back - * to the synchronous version. 
- * - * FIXME: This really shouldn't busy wait like this. - * - * Returns: errno - */ - -static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_holder ri_gh; - struct gfs2_rgrpd *rgd_next; - struct gfs2_holder *gha, *gh; - unsigned int slots = 64; - unsigned int x; - int done; - int error = 0, err; - - memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); - if (!gha) - return -ENOMEM; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - - rgd_next = gfs2_rgrpd_get_first(sdp); - - for (;;) { - done = 1; - - for (x = 0; x < slots; x++) { - gh = gha + x; - - if (gh->gh_gl && gfs2_glock_poll(gh)) { - err = gfs2_glock_wait(gh); - if (err) { - gfs2_holder_uninit(gh); - error = err; - } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); - gfs2_glock_dq_uninit(gh); - } - } - - if (gh->gh_gl) - done = 0; - else if (rgd_next && !error) { - error = gfs2_glock_nq_init(rgd_next->rd_gl, - LM_ST_SHARED, - GL_ASYNC, - gh); - rgd_next = gfs2_rgrpd_get_next(rgd_next); - done = 0; - } - - if (signal_pending(current)) - error = -ERESTARTSYS; - } - - if (done) - break; - - yield(); - } - - gfs2_glock_dq_uninit(&ri_gh); - -out: - kfree(gha); - return error; -} - -/** - * gfs2_statfs_i - Do a statfs - * @sdp: the filesystem - * @sg: the sg structure - * - * Returns: errno - */ - -static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - - spin_lock(&sdp->sd_statfs_spin); - - *sc = *m_sc; - sc->sc_total += l_sc->sc_total; - sc->sc_free += l_sc->sc_free; - sc->sc_dinodes += l_sc->sc_dinodes; - - spin_unlock(&sdp->sd_statfs_spin); - - if (sc->sc_free < 0) - sc->sc_free = 0; - if (sc->sc_free > sc->sc_total) - sc->sc_free = sc->sc_total; - if (sc->sc_dinodes < 0) - sc->sc_dinodes = 0; - - return 0; -} - -/** - * gfs2_statfs - Gather and return stats about the filesystem - * @sb: The superblock - * @statfsbuf: The buffer - * - * Returns: 0 on success or error code - */ - -static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_inode->i_sb; - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_statfs_change_host sc; - int error; - - if (gfs2_tune_get(sdp, gt_statfs_slow)) - error = gfs2_statfs_slow(sdp, &sc); - else - error = gfs2_statfs_i(sdp, &sc); - - if (error) - return error; - - buf->f_type = GFS2_MAGIC; - buf->f_bsize = sdp->sd_sb.sb_bsize; - buf->f_blocks = sc.sc_total; - buf->f_bfree = sc.sc_free; - buf->f_bavail = sc.sc_free; - buf->f_files = sc.sc_dinodes + sc.sc_free; - buf->f_ffree = sc.sc_free; - buf->f_namelen = GFS2_FNAMESIZE; - - return 0; -} - -/** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_args args = sdp->sd_args; /* Default to current settings */ - struct gfs2_tune *gt = &sdp->sd_tune; - int error; - - spin_lock(>->gt_spin); - args.ar_commit = gt->gt_log_flush_secs; - spin_unlock(>->gt_spin); - error = gfs2_mount_args(sdp, &args, data); - if (error) - return error; - - /* Not allowed to change locking details */ - if (strcmp(args.ar_lockproto, 
sdp->sd_args.ar_lockproto) || - strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || - strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) - return -EINVAL; - - /* Some flags must not be changed */ - if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, ignore_local_fs) || - args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, localcaching) || - args_neq(&args, &sdp->sd_args, meta)) - return -EINVAL; - - if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; - - if ((sb->s_flags ^ *flags) & MS_RDONLY) { - if (*flags & MS_RDONLY) - error = gfs2_make_fs_ro(sdp); - else - error = gfs2_make_fs_rw(sdp); - if (error) - return error; - } - - sdp->sd_args = args; - if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; - else - sb->s_flags &= ~MS_POSIXACL; - spin_lock(>->gt_spin); - gt->gt_log_flush_secs = args.ar_commit; - spin_unlock(>->gt_spin); - - return 0; -} - -/** - * gfs2_drop_inode - Drop an inode (test for remote unlink) - * @inode: The inode to drop - * - * If we've received a callback on an iopen lock then its because a - * remote node tried to deallocate the inode but failed due to this node - * still having the inode open. Here we mark the link count zero - * since we know that it must have reached zero if the GLF_DEMOTE flag - * is set on the iopen glock. If we didn't do a disk read since the - * remote node removed the final link then we might otherwise miss - * this event. This check ensures that this node will deallocate the - * inode's blocks, or alternatively pass the baton on to another - * node for later deallocation. - */ - -static void gfs2_drop_inode(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { - struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; - if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) - clear_nlink(inode); - } - generic_drop_inode(inode); -} - -/** - * gfs2_clear_inode - Deallocate an inode when VFS is done with it - * @inode: The VFS inode - * - */ - -static void gfs2_clear_inode(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - /* This tells us its a "real" inode and not one which only - * serves to contain an address space (see rgrp.c, meta_io.c) - * which therefore doesn't have its own glocks. 
- */ - if (test_bit(GIF_USER, &ip->i_flags)) { - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); - ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - } - } -} - -static int is_ancestor(const struct dentry *d1, const struct dentry *d2) -{ - do { - if (d1 == d2) - return 1; - d1 = d1->d_parent; - } while (!IS_ROOT(d1)); - return 0; -} - -/** - * gfs2_show_options - Show mount options for /proc/mounts - * @s: seq_file structure - * @mnt: vfsmount - * - * Returns: 0 on success or error code - */ - -static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) -{ - struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; - struct gfs2_args *args = &sdp->sd_args; - int lfsecs; - - if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) - seq_printf(s, ",meta"); - if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); - if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); - if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); - if (args->ar_spectator) - seq_printf(s, ",spectator"); - if (args->ar_ignore_local_fs) - seq_printf(s, ",ignore_local_fs"); - if (args->ar_localflocks) - seq_printf(s, ",localflocks"); - if (args->ar_localcaching) - seq_printf(s, ",localcaching"); - if (args->ar_debug) - seq_printf(s, ",debug"); - if (args->ar_upgrade) - seq_printf(s, ",upgrade"); - if (args->ar_posix_acl) - seq_printf(s, ",acl"); - if (args->ar_quota != GFS2_QUOTA_DEFAULT) { - char *state; - switch (args->ar_quota) { - case GFS2_QUOTA_OFF: - state = "off"; - break; - case GFS2_QUOTA_ACCOUNT: - state = "account"; - break; - case GFS2_QUOTA_ON: - state = "on"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",quota=%s", state); - } - if (args->ar_suiddir) - seq_printf(s, ",suiddir"); - if (args->ar_data != GFS2_DATA_DEFAULT) { - char *state; - switch (args->ar_data) { - case GFS2_DATA_WRITEBACK: - state = "writeback"; - break; - case GFS2_DATA_ORDERED: - state = "ordered"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",data=%s", state); - } - if (args->ar_discard) - seq_printf(s, ",discard"); - lfsecs = sdp->sd_tune.gt_log_flush_secs; - if (lfsecs != 60) - seq_printf(s, ",commit=%d", lfsecs); - return 0; -} - -/* - * We have to (at the moment) hold the inodes main lock to cover - * the gap between unlocking the shared lock on the iopen lock and - * taking the exclusive lock. I'd rather do a shared -> exclusive - * conversion on the iopen lock, but we can change that later. This - * is safe, just less efficient. 
- */ - -static void gfs2_delete_inode(struct inode *inode) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - - if (!test_bit(GIF_USER, &ip->i_flags)) - goto out; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (unlikely(error)) { - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - goto out; - } - - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); - error = gfs2_glock_nq(&ip->i_iopen_gh); - if (error) - goto out_truncate; - - if (S_ISDIR(inode->i_mode) && - (ip->i_diskflags & GFS2_DIF_EXHASH)) { - error = gfs2_dir_exhash_dealloc(ip); - if (error) - goto out_unlock; - } - - if (ip->i_eattr) { - error = gfs2_ea_dealloc(ip); - if (error) - goto out_unlock; - } - - if (!gfs2_is_stuffed(ip)) { - error = gfs2_file_dealloc(ip); - if (error) - goto out_unlock; - } - - error = gfs2_dinode_dealloc(ip); - if (error) - goto out_unlock; - -out_truncate: - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - goto out_unlock; - /* Needs to be done before glock release & also in a transaction */ - truncate_inode_pages(&inode->i_data, 0); - gfs2_trans_end(sdp); - -out_unlock: - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) - gfs2_glock_dq(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); - gfs2_glock_dq_uninit(&gh); - if (error && error != GLR_TRYFAILED && error != -EROFS) - fs_warn(sdp, "gfs2_delete_inode: %d\n", error); -out: - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); -} - -static struct inode *gfs2_alloc_inode(struct super_block *sb) -{ - struct gfs2_inode *ip; - - ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); - if (ip) { - ip->i_flags = 0; - ip->i_gl = NULL; - } - return &ip->i_inode; -} - -static void gfs2_destroy_inode(struct inode *inode) -{ - kmem_cache_free(gfs2_inode_cachep, inode); -} - -const struct super_operations gfs2_super_ops = { - .alloc_inode = gfs2_alloc_inode, - .destroy_inode = gfs2_destroy_inode, - .write_inode = gfs2_write_inode, - .delete_inode = gfs2_delete_inode, - .put_super = gfs2_put_super, - .write_super = gfs2_write_super, - .sync_fs = gfs2_sync_fs, - .freeze_fs = gfs2_freeze, - .unfreeze_fs = gfs2_unfreeze, - .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, - .clear_inode = gfs2_clear_inode, - .drop_inode = gfs2_drop_inode, - .show_options = gfs2_show_options, -}; - diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 601913e0a48..40bcc37e5a7 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -7,14 +7,20 @@ * of the GNU General Public License version 2. 
*/ +#include #include #include #include #include #include -#include +#include +#include +#include +#include +#include #include -#include +#include +#include #include "gfs2.h" #include "incore.h" @@ -31,6 +37,183 @@ #include "super.h" #include "trans.h" #include "util.h" +#include "sys.h" +#include "eattr.h" + +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + +enum { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_nodebug, + Opt_upgrade, + Opt_acl, + Opt_noacl, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, + Opt_quota, + Opt_noquota, + Opt_suiddir, + Opt_nosuiddir, + Opt_data_writeback, + Opt_data_ordered, + Opt_meta, + Opt_discard, + Opt_nodiscard, + Opt_commit, + Opt_error, +}; + +static const match_table_t tokens = { + {Opt_lockproto, "lockproto=%s"}, + {Opt_locktable, "locktable=%s"}, + {Opt_hostdata, "hostdata=%s"}, + {Opt_spectator, "spectator"}, + {Opt_ignore_local_fs, "ignore_local_fs"}, + {Opt_localflocks, "localflocks"}, + {Opt_localcaching, "localcaching"}, + {Opt_debug, "debug"}, + {Opt_nodebug, "nodebug"}, + {Opt_upgrade, "upgrade"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_quota_off, "quota=off"}, + {Opt_quota_account, "quota=account"}, + {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_suiddir, "suiddir"}, + {Opt_nosuiddir, "nosuiddir"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, + {Opt_error, NULL} +}; + +/** + * gfs2_mount_args - Parse mount options + * @sdp: + * @data: + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) +{ + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; + int rv; + + /* Split the options into tokens with the "," character and + process them */ + + while (1) { + o = strsep(&options, ","); + if (o == NULL) + break; + if (*o == '\0') + continue; + + token = match_token(o, tokens, tmp); + switch (token) { + case Opt_lockproto: + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + args->ar_ignore_local_fs = 1; + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + args->ar_localcaching = 1; + break; + case Opt_debug: + args->ar_debug = 1; + break; + case Opt_nodebug: + args->ar_debug = 0; + break; + case Opt_upgrade: + args->ar_upgrade = 1; + break; + case Opt_acl: + args->ar_posix_acl = 1; + break; + case Opt_noacl: + args->ar_posix_acl = 0; + break; + case Opt_quota_off: + case Opt_noquota: + args->ar_quota = GFS2_QUOTA_OFF; + break; + case Opt_quota_account: + args->ar_quota = GFS2_QUOTA_ACCOUNT; + break; + case Opt_quota_on: + case Opt_quota: + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = 1; + break; + case Opt_nosuiddir: + args->ar_suiddir = 0; + break; + case Opt_data_writeback: + args->ar_data = GFS2_DATA_WRITEBACK; + break; + case Opt_data_ordered: + args->ar_data = GFS2_DATA_ORDERED; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = 1; + break; + case 
Opt_nodiscard: + args->ar_discard = 0; + break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + fs_info(sdp, "commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_error: + default: + fs_info(sdp, "invalid mount option: %s\n", o); + return -EINVAL; + } + } + + return 0; +} /** * gfs2_jindex_free - Clear all the journal index information @@ -436,3 +619,719 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) mutex_unlock(&sdp->sd_freeze_lock); } + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @sync: synchronous write flag + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, int sync) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + struct buffer_head *bh; + struct timespec atime; + struct gfs2_dinode *di; + int ret = 0; + + /* Check this is a "normal" inode, etc */ + if (!test_bit(GIF_USER, &ip->i_flags) || + (current->flags & PF_MEMALLOC)) + return 0; + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) + goto do_unlock; + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + di = (struct gfs2_dinode *)bh->b_data; + atime.tv_sec = be64_to_cpu(di->di_atime); + atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); + if (timespec_compare(&inode->i_atime, &atime) > 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + } + brelse(bh); + } + gfs2_trans_end(sdp); +do_unlock: + gfs2_glock_dq_uninit(&gh); +do_flush: + if (sync != 0) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + return ret; +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + gfs2_quota_sync(sdp); + gfs2_statfs_sync(sdp); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + struct gfs2_jdesc *jd; + + /* Unfreeze the filesystem, if we need to */ + + mutex_lock(&sdp->sd_freeze_lock); + if (sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_lock); + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + 
gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_jindex); + iput(sdp->sd_inum_inode); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_trans_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_ir_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_ir_inode); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); +} + +/** + * gfs2_write_super + * @sb: the superblock + * + */ + +static void gfs2_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * + * Flushes the log to disk. + */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + sb->s_dirt = 0; + if (wait && sb->s_fs_info) + gfs2_log_flush(sb->s_fs_info, NULL); + return 0; +} + +/** + * gfs2_freeze - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return -EINVAL; + + for (;;) { + error = gfs2_freeze_fs(sdp); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + return 0; +} + +/** + * gfs2_unfreeze - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_unfreeze(struct super_block *sb) +{ + gfs2_unfreeze_fs(sb->s_fs_info); + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. 
+ * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; + int error; + + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_log_flush_secs; + spin_unlock(>->gt_spin); + error = gfs2_mount_args(sdp, &args, data); + if (error) + return error; + + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, 
sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; + + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, ignore_local_fs) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, localcaching) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else + error = gfs2_make_fs_rw(sdp); + if (error) + return error; + } + + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + spin_lock(>->gt_spin); + gt->gt_log_flush_secs = args.ar_commit; + spin_unlock(>->gt_spin); + + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then its because a + * remote node tried to deallocate the inode but failed due to this node + * still having the inode open. Here we mark the link count zero + * since we know that it must have reached zero if the GLF_DEMOTE flag + * is set on the iopen glock. If we didn't do a disk read since the + * remote node removed the final link then we might otherwise miss + * this event. This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static void gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + generic_drop_inode(inode); +} + +/** + * gfs2_clear_inode - Deallocate an inode when VFS is done with it + * @inode: The VFS inode + * + */ + +static void gfs2_clear_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + /* This tells us its a "real" inode and not one which only + * serves to contain an address space (see rgrp.c, meta_io.c) + * which therefore doesn't have its own glocks. 
+ */ + if (test_bit(GIF_USER, &ip->i_flags)) { + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } + } +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @mnt: vfsmount + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + int lfsecs; + + if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_ignore_local_fs) + seq_printf(s, ",ignore_local_fs"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_localcaching) + seq_printf(s, ",localcaching"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_upgrade) + seq_printf(s, ",upgrade"); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_printf(s, ",discard"); + lfsecs = sdp->sd_tune.gt_log_flush_secs; + if (lfsecs != 60) + seq_printf(s, ",commit=%d", lfsecs); + return 0; +} + +/* + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. 
+ */ + +static void gfs2_delete_inode(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (!test_bit(GIF_USER, &ip->i_flags)) + goto out; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + if (error) + goto out_unlock; + +out_truncate: + error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (error) + goto out_unlock; + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + gfs2_trans_end(sdp); + +out_unlock: + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error && error != GLR_TRYFAILED && error != -EROFS) + fs_warn(sdp, "gfs2_delete_inode: %d\n", error); +out: + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + } + return &ip->i_inode; +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(gfs2_inode_cachep, inode); +} + +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .delete_inode = gfs2_delete_inode, + .put_super = gfs2_put_super, + .write_super = gfs2_write_super, + .sync_fs = gfs2_sync_fs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .clear_inode = gfs2_clear_inode, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + -- cgit v1.2.3 From 2286dbfad1fb622ee2691537e5caaedee4618860 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:45:09 +0100 Subject: GFS2: Move gfs2_rmdiri into ops_inode.c Move gfs2_rmdiri() into ops_inode.c and make it static. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 52 ---------------------------------------------------- fs/gfs2/inode.h | 2 -- fs/gfs2/ops_inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c03a1a384e7..9b17447a0f9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1046,58 +1046,6 @@ fail: return ERR_PTR(error); } -/** - * gfs2_rmdiri - Remove a directory - * @dip: The parent directory of the directory to be removed - * @name: The name of the directory to be removed - * @ip: The GFS2 inode of the directory to be removed - * - * Assumes Glocks on dip and ip are held - * - * Returns: errno - */ - -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) -{ - struct qstr dotname; - int error; - - if (ip->i_entries != 2) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - - error = gfs2_dir_del(dip, name); - if (error) - return error; - - error = gfs2_change_nlink(dip, -1); - if (error) - return error; - - gfs2_str2qstr(&dotname, "."); - error = gfs2_dir_del(ip, &dotname); - if (error) - return error; - - gfs2_str2qstr(&dotname, ".."); - error = gfs2_dir_del(ip, &dotname); - if (error) - return error; - - /* It looks odd, but it really should be done twice */ - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - return error; -} /* * gfs2_unlink_ok - check to see that a inode is still in a directory diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 2c3ec072d60..6cd39284eb0 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1c70fa5168d..5dacd647ff0 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -472,6 +472,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) return 0; } +/** + * gfs2_rmdiri - Remove a directory + * @dip: The parent directory of the directory to be removed + * @name: The name of the directory to be removed + * @ip: The GFS2 inode of the directory to be removed + * + * Assumes Glocks on dip and ip are held + * + * Returns: errno + */ + +static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct qstr dotname; + int error; + + if (ip->i_entries != 2) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(ip); + return -EIO; + } + + error = gfs2_dir_del(dip, name); + if (error) + return error; + + error = gfs2_change_nlink(dip, -1); + if (error) + return error; + + gfs2_str2qstr(&dotname, "."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + gfs2_str2qstr(&dotname, ".."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + /* It looks odd, but it really should be done twice */ + error = gfs2_change_nlink(ip, -1); + if (error) + return error; + + error = gfs2_change_nlink(ip, -1); + if (error) + return error; + + return error; +} + /** * 
gfs2_rmdir - Remove a directory * @dir: The parent directory of the directory to be removed -- cgit v1.2.3 From 536baf02f650f4547f105386878b4736fbc181e8 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:48:59 +0100 Subject: GFS2: Move gfs2_readlinki into ops_inode.c Move gfs2_readlinki into ops_inode.c and make it static Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 58 +---------------------------------------------------- fs/gfs2/inode.h | 1 - fs/gfs2/ops_inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9b17447a0f9..676e750fc84 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1085,63 +1085,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -/** - * gfs2_readlinki - return the contents of a symlink - * @ip: the symlink's inode - * @buf: a pointer to the buffer to be filled - * @len: a pointer to the length of @buf - * - * If @buf is too small, a piece of memory is kmalloc()ed and needs - * to be freed by the caller. - * - * Returns: errno - */ - -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) -{ - struct gfs2_holder i_gh; - struct buffer_head *dibh; - unsigned int x; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - return error; - } - - if (!ip->i_disksize) { - gfs2_consist_inode(ip); - error = -EIO; - goto out; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out; - - x = ip->i_disksize + 1; - if (x > *len) { - *buf = kmalloc(x, GFP_NOFS); - if (!*buf) { - error = -ENOMEM; - goto out_brelse; - } - } - - memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); - *len = x; - -out_brelse: - brelse(dibh); -out: - gfs2_glock_dq_uninit(&i_gh); - return error; -} - -static int -__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { struct buffer_head *dibh; int error; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6cd39284eb0..fc9a08f45be 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,6 @@ extern struct inode *gfs2_createi(struct gfs2_holder *ghs, extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); -extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 5dacd647ff0..f607f0908cf 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -937,6 +937,61 @@ out: return error; } +/** + * gfs2_readlinki - return the contents of a symlink + * @ip: the symlink's inode + * @buf: a pointer to the buffer to be filled + * @len: a pointer to the length of @buf + * + * If @buf is too small, a piece of memory is kmalloc()ed and needs + * to be freed by the caller. 
+ * + * Returns: errno + */ + +static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) +{ + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int x; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return error; + } + + if (!ip->i_disksize) { + gfs2_consist_inode(ip); + error = -EIO; + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + x = ip->i_disksize + 1; + if (x > *len) { + *buf = kmalloc(x, GFP_NOFS); + if (!*buf) { + error = -ENOMEM; + goto out_brelse; + } + } + + memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); + *len = x; + +out_brelse: + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + return error; +} + /** * gfs2_readlink - Read the value of a symlink * @dentry: the symlink -- cgit v1.2.3 From 87ec21741138bb42e7f943bb142b1d8567c10925 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:54:50 +0100 Subject: GFS2: Move gfs2_unlink_ok into ops_inode.c Another function which is only called from one ops_inode.c so we move it and make it static. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 39 --------------------------------------- fs/gfs2/inode.h | 2 -- fs/gfs2/ops_inode.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 676e750fc84..2f94bd72369 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1046,45 +1046,6 @@ fail: return ERR_PTR(error); } - -/* - * gfs2_unlink_ok - check to see that a inode is still in a directory - * @dip: the directory - * @name: the name of the file - * @ip: the inode - * - * Assumes that the lock on (at least) @dip is held. 
- * - * Returns: 0 if the parent/child relationship is correct, errno if it isn't - */ - -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip) -{ - int error; - - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - return -EPERM; - - if ((dip->i_inode.i_mode & S_ISVTX) && - dip->i_inode.i_uid != current_fsuid() && - ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) - return -EPERM; - - if (IS_APPEND(&dip->i_inode)) - return -EPERM; - - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); - if (error) - return error; - - error = gfs2_dir_check(&dip->i_inode, name, ip); - if (error) - return error; - - return 0; -} - static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { struct buffer_head *dibh; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index fc9a08f45be..c341aaf67ad 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f607f0908cf..f8bd20baf99 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -262,6 +262,44 @@ out_parent: return error; } +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. + * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + dip->i_inode.i_uid != current_fsuid() && + ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + if (error) + return error; + + error = gfs2_dir_check(&dip->i_inode, name, ip); + if (error) + return error; + + return 0; +} + /** * gfs2_unlink - Unlink a file * @dir: The inode of the directory containing the file to unlink -- cgit v1.2.3 From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:49 -0400 Subject: block: Do away with the notion of hardsect_size Until now we have had a 1:1 mapping between storage device physical block size and the logical block sized used when addressing the device. With SATA 4KB drives coming out that will no longer be the case. The sector size will be 4KB but the logical block size will remain 512-bytes. Hence we need to distinguish between the physical block size and the logical ditto. This patch renames hardsect_size to logical_block_size. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- fs/bio.c | 3 ++- fs/block_dev.c | 6 +++--- fs/buffer.c | 6 +++--- fs/direct-io.c | 2 +- fs/ext3/super.c | 4 ++-- fs/ext4/super.c | 2 +- fs/gfs2/ops_fstype.c | 4 ++-- fs/gfs2/rgrp.c | 2 +- fs/nilfs2/the_nilfs.c | 2 +- fs/ntfs/super.c | 6 +++--- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/super.c | 2 +- fs/partitions/ibm.c | 2 +- fs/partitions/msdos.c | 4 ++-- fs/udf/super.c | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 16 files changed, 26 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 81dc93e7253..4445c382173 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) sector_t bio_sector_offset(struct bio *bio, unsigned short index, unsigned int offset) { - unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); + unsigned int sector_sz; struct bio_vec *bv; sector_t sectors; int i; + sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); sectors = 0; if (index >= bio->bi_idx) diff --git a/fs/block_dev.c b/fs/block_dev.c index a85fe310fc6..a29b4dcc1bc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -76,7 +76,7 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_hardsect_size(bdev)) + if (size < bdev_logical_block_size(bdev)) return -EINVAL; /* Don't change the size if it is same as current */ @@ -106,7 +106,7 @@ EXPORT_SYMBOL(sb_set_blocksize); int sb_min_blocksize(struct super_block *sb, int size) { - int minsize = bdev_hardsect_size(sb->s_bdev); + int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); @@ -1117,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_hardsect_size(bdev); + unsigned bsize = bdev_logical_block_size(bdev); bdev->bd_inode->i_size = size; while (bsize < PAGE_CACHE_SIZE) { diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb..36e2bbc60ec 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1085,12 +1085,12 @@ static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, int size) { /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_hardsect_size(bdev)-1) || + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || (size < 512 || size > PAGE_SIZE))) { printk(KERN_ERR "getblk(): invalid block size %d requested\n", size); - printk(KERN_ERR "hardsect size: %d\n", - bdev_hardsect_size(bdev)); + printk(KERN_ERR "logical block size: %d\n", + bdev_logical_block_size(bdev)); dump_stack(); return NULL; diff --git a/fs/direct-io.c b/fs/direct-io.c index 05763bbc205..8b10b87dc01 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, rw = WRITE_ODIRECT; if (bdev) - bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); + bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); if (offset & blocksize_mask) { if (bdev) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe504c..acbb94fdf90 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - hblock = bdev_hardsect_size(sb->s_bdev); + hblock = bdev_logical_block_size(sb->s_bdev); if (sb->s_blocksize != blocksize) { /* * Make sure the blocksize for the 
filesystem is larger @@ -2119,7 +2119,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT3-fs: blocksize too small for journal device.\n"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e6f22..a30549f7a30 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2962,7 +2962,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT4-fs: blocksize too small for journal device.\n"); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473ea75..a3b2ac989fc 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) } /* Set up the buffer cache and SB for real */ - if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too small for device " "block size (%u)\n", - sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); goto out; } if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 565038243fa..a971d24e10c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -845,7 +845,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / - bdev_hardsect_size(sb->s_bdev); + bdev_logical_block_size(sb->s_bdev); u64 blk; sector_t start = 0; sector_t nr_sects = 0; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 7f65b3be4aa..a91f15b8673 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (sb->s_blocksize != blocksize) { - int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { printk(KERN_ERR diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index f76951dcd4a..6aa7c471353 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -25,7 +25,7 @@ #include #include #include -#include /* For bdev_hardsect_size(). */ +#include /* For bdev_logical_block_size(). */ #include #include #include @@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto err_out_now; /* We support sector sizes up to the PAGE_CACHE_SIZE. */ - if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { if (!silent) ntfs_error(sb, "Device has unsupported sector size " "(%i). 
The maximum supported sector " "size on this architecture is %lu " "bytes.", - bdev_hardsect_size(sb->s_bdev), + bdev_logical_block_size(sb->s_bdev), PAGE_CACHE_SIZE); goto err_out_now; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4f85eceab37..09cc25d0461 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_hardsect_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg->hr_bdev); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 79ff8d9d37e..5c6163f5503 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb, *bh = NULL; /* may be > 512 */ - *sector_size = bdev_hardsect_size(sb->s_bdev); + *sector_size = bdev_logical_block_size(sb->s_bdev); if (*sector_size > OCFS2_MAX_BLOCKSIZE) { mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", *sector_size, OCFS2_MAX_BLOCKSIZE); diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 46297683cd3..fc71aab0846 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) Sector sect; res = 0; - blocksize = bdev_hardsect_size(bdev); + blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_exit; i_size = i_size_read(bdev->bd_inode); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 796511886f2..0028d2ef066 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, Sector sect; unsigned char *data; u32 this_sector, this_size; - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -415,7 +415,7 @@ static struct { int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) { - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; Sector sect; unsigned char *data; struct partition *p; diff --git a/fs/udf/super.c b/fs/udf/super.c index 72348cc855a..0ba44107d8f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { ret = udf_load_vrs(sb, &uopt, silent, &fileset); } else { - uopt.blocksize = bdev_hardsect_size(sb->s_bdev); + uopt.blocksize = bdev_logical_block_size(sb->s_bdev); ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e28800a9f2b..1418b916fc2 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early( struct block_device *bdev) { return xfs_setsize_buftarg_flags(btp, - PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); + PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); } int -- cgit v1.2.3 From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Fri, 22 May 2009 17:17:50 -0400 Subject: block: Use accessor functions for queue limits Convert all external users of queue limits to using wrapper functions instead of poking the request queue variables directly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 4445c382173..ab423a1024a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -499,11 +499,11 @@ int bio_get_nr_vecs(struct block_device *bdev) struct request_queue *q = bdev_get_queue(bdev); int nr_pages; - nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > q->max_phys_segments) - nr_pages = q->max_phys_segments; - if (nr_pages > q->max_hw_segments) - nr_pages = q->max_hw_segments; + nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > queue_max_phys_segments(q)) + nr_pages = queue_max_phys_segments(q); + if (nr_pages > queue_max_hw_segments(q)) + nr_pages = queue_max_hw_segments(q); return nr_pages; } @@ -562,8 +562,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * make this too complex. */ - while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_phys_segments >= q->max_hw_segments) { + while (bio->bi_phys_segments >= queue_max_phys_segments(q) + || bio->bi_phys_segments >= queue_max_hw_segments(q)) { if (retried_segments) return 0; @@ -634,7 +634,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); + return __bio_add_page(q, bio, page, len, offset, + queue_max_hw_sectors(q)); } /** @@ -654,7 +655,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, q->max_sectors); + return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); } struct bio_map_data { -- cgit v1.2.3 From c72758f33784e5e2a1a4bb9421ef3e6de8f9fcf3 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:53 -0400 Subject: block: Export I/O topology for block devices and partitions To support devices with physical block sizes bigger than 512 bytes we need to ensure proper alignment. This patch adds support for exposing I/O topology characteristics as devices are stacked. logical_block_size is the smallest unit the device can address. physical_block_size indicates the smallest I/O the device can write without incurring a read-modify-write penalty. The io_min parameter is the smallest preferred I/O size reported by the device. In many cases this is the same as the physical block size. However, the io_min parameter can be scaled up when stacking (RAID5 chunk size > physical block size). The io_opt characteristic indicates the optimal I/O size reported by the device. This is usually the stripe width for arrays. The alignment_offset parameter indicates the number of bytes the start of the device/partition is offset from the device's natural alignment. Partition tools and MD/DM utilities can use this to pad their offsets so filesystems start on proper boundaries. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- fs/partitions/check.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 99e33ef40be..0af36085eb2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev, static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; + p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); -- cgit v1.2.3 From 79f52b77b89e8b7aa9fbe62135eea198a2ecbd5b Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Sat, 23 May 2009 20:28:41 -0500 Subject: jfs: Add missing mutex_unlock call to error path Jan Kucera found an missing call to mutex_unlock() with his static code checker. It's an unlikely error path to hit in the real world, but it should be fixed. Signed-off-by: Dave Kleikamp Reported-by: Jan Kucera --- fs/jfs/jfs_imap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 346057218ed..0fc30407f03 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) txAbort(tid, 0); txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); /* release the inode map lock */ IWRITE_UNLOCK(ipimap); -- cgit v1.2.3 From 759d427aa5a9d88a81afd11817cdeb40aea85234 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 May 2009 11:51:00 -0400 Subject: ext4: remove unused function __ext4_write_dirty_metadata The __ext4_write_dirty_metadata() function was introduced by commit 0390131b, "ext4: Allow ext4 to run without a journal", but nothing ever used the function, either then or since. So let's remove it and save a bit of space. 
Cc: Frank Mayhar Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dadd3f995db..14c00fff371 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4790,25 +4790,6 @@ int ext4_write_inode(struct inode *inode, int wait) return ext4_force_commit(inode->i_sb); } -int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) -{ - int err = 0; - - mark_buffer_dirty(bh); - if (inode && inode_needs_sync(inode)) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, __func__, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long)bh->b_blocknr); - err = -EIO; - } - } - return err; -} - /* * ext4_setattr() * -- cgit v1.2.3 From 88b6edd17c62b7d346d21f4087893ce7d4ef828a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 May 2009 11:50:39 -0400 Subject: ext4: Clean up calls to ext4_get_group_desc() If the caller isn't planning on modifying the block group descriptors, there's no need to pass in a pointer to a struct buffer_head. Nuking this saves a tiny amount of CPU time and stack space usage. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 3 +-- fs/ext4/super.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 82f7d1d7eae..3743bd849bc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -347,7 +347,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *desc; - struct buffer_head *bh; struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); @@ -402,7 +401,7 @@ find_close_to_parent: found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, &bh); + desc = ext4_get_group_desc(sb, i, NULL); if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eca6c057b11..91b98b58ccb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1660,7 +1660,6 @@ static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - struct buffer_head *bh; ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; @@ -1693,7 +1692,7 @@ static int ext4_fill_flex_info(struct super_block *sb) } for (i = 0; i < sbi->s_groups_count; i++) { - gdp = ext4_get_group_desc(sb, i, &bh); + gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, -- cgit v1.2.3 From e1b28aab5804aa477c33d19855d6747607a885fd Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 May 2009 15:41:27 +0100 Subject: GFS2: Remove lockstruct subdir from gfs2 sysfs files The lockstruct sub directory contained two entries, both of which are duplicated elsewhere in the gfs2 sysfs files as well as being available via /proc/mounts. There is no userland program using either of them, so this patch removes them. 
Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9f6d48b75fd..94bd59ec54e 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -276,25 +276,6 @@ static struct kobj_type gfs2_ktype = { .sysfs_ops = &gfs2_attr_ops, }; -/* - * display struct lm_lockstruct fields - */ - -#define LOCKSTRUCT_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ -} \ -static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name) - -LOCKSTRUCT_ATTR(jid, "%u\n"); -LOCKSTRUCT_ATTR(first, "%u\n"); - -static struct attribute *lockstruct_attrs[] = { - &lockstruct_attr_jid.attr, - &lockstruct_attr_first.attr, - NULL, -}; /* * lock_module. Originally from lock_dlm @@ -397,6 +378,11 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_recover_jid_status); } +static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); +} + #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) @@ -404,6 +390,7 @@ GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(jid, 0444, jid_show, NULL); GDLM_ATTR(first, 0444, lkfirst_show, NULL); GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0200, NULL, recover_store); @@ -415,7 +402,7 @@ static struct attribute *lock_module_attrs[] = { &gdlm_attr_block.attr, &gdlm_attr_withdraw.attr, &gdlm_attr_id.attr, - &lockstruct_attr_jid.attr, + &gdlm_attr_jid.attr, &gdlm_attr_first.attr, &gdlm_attr_first_done.attr, &gdlm_attr_recover.attr, @@ -558,11 +545,6 @@ static struct attribute *tune_attrs[] = { NULL, }; -static struct attribute_group lockstruct_group = { - .name = "lockstruct", - .attrs = lockstruct_attrs, -}; - static struct attribute_group args_group = { .name = "args", .attrs = args_attrs, @@ -588,13 +570,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail; - error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group); - if (error) - goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &args_group); if (error) - goto fail_lockstruct; + goto fail_reg; error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) @@ -611,8 +589,6 @@ fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_args: sysfs_remove_group(&sdp->sd_kobj, &args_group); -fail_lockstruct: - sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); fail_reg: kobject_put(&sdp->sd_kobj); fail: @@ -624,7 +600,6 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &args_group); - sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } -- cgit v1.2.3 From f6eb53498ee8f725832f3a0fffca90566bb118a6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 May 2009 15:50:25 +0100 Subject: GFS2: Remove args subdir from gfs2 sysfs files Since we can cat /proc/mounts there is no need to have this subdirectory in the gfs2 sysfs files. In fact this does not reflect the full range of possible mount argumenmts, where as /proc/mounts does. 
There was only one userland user of this set of sysfs files and it will function perfectly well without these files being present (in fact that subcommand of gfs2_tool is obsolete anyway). The tune/* subdirectory is also considered mostly obsolete, but there are a few uses of this until mount arguments can be added for the last few functions for which there are no equivalents currently. However the tune/* directory is still in my sights and new code should avoid using it. Only the gfs2_quota and gfs2_tool programs are know to use tune/* at the moment. Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 52 +--------------------------------------------------- 1 file changed, 1 insertion(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 94bd59ec54e..23419dc3027 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -411,44 +411,6 @@ static struct attribute *lock_module_attrs[] = { NULL, }; -#define ARGS_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ -} \ -static struct gfs2_attr args_attr_##name = __ATTR_RO(name) - -ARGS_ATTR(lockproto, "%s\n"); -ARGS_ATTR(locktable, "%s\n"); -ARGS_ATTR(hostdata, "%s\n"); -ARGS_ATTR(spectator, "%d\n"); -ARGS_ATTR(ignore_local_fs, "%d\n"); -ARGS_ATTR(localcaching, "%d\n"); -ARGS_ATTR(localflocks, "%d\n"); -ARGS_ATTR(debug, "%d\n"); -ARGS_ATTR(upgrade, "%d\n"); -ARGS_ATTR(posix_acl, "%d\n"); -ARGS_ATTR(quota, "%u\n"); -ARGS_ATTR(suiddir, "%d\n"); -ARGS_ATTR(data, "%d\n"); - -static struct attribute *args_attrs[] = { - &args_attr_lockproto.attr, - &args_attr_locktable.attr, - &args_attr_hostdata.attr, - &args_attr_spectator.attr, - &args_attr_ignore_local_fs.attr, - &args_attr_localcaching.attr, - &args_attr_localflocks.attr, - &args_attr_debug.attr, - &args_attr_upgrade.attr, - &args_attr_posix_acl.attr, - &args_attr_quota.attr, - &args_attr_suiddir.attr, - &args_attr_data.attr, - NULL, -}; - /* * get and set struct gfs2_tune fields */ @@ -545,11 +507,6 @@ static struct attribute *tune_attrs[] = { NULL, }; -static struct attribute_group args_group = { - .name = "args", - .attrs = args_attrs, -}; - static struct attribute_group tune_group = { .name = "tune", .attrs = tune_attrs, @@ -570,13 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail; - error = sysfs_create_group(&sdp->sd_kobj, &args_group); - if (error) - goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) - goto fail_args; + goto fail_reg; error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); if (error) @@ -587,8 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); -fail_args: - sysfs_remove_group(&sdp->sd_kobj, &args_group); fail_reg: kobject_put(&sdp->sd_kobj); fail: @@ -599,7 +550,6 @@ fail: void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { sysfs_remove_group(&sdp->sd_kobj, &tune_group); - sysfs_remove_group(&sdp->sd_kobj, &args_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } -- cgit v1.2.3 From 46a7574caf5bc533c24b315800ed323c187614f5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 24 May 2009 18:45:17 -0400 Subject: cifs: fix artificial limit on reading symlinks There's no reason to limit the size of a symlink that we can read to 4000 bytes. That may be nowhere near PATH_MAX if the server is sending UCS2 strings. CIFS should be able to read in a symlink up to the size of the buffer. 
The size of the header has already been accounted for when creating the slabcache, so CIFSMaxBufSize should be the correct size to pass in. Fixes samba bug #6384. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d06260251c3..aece2a8c1a7 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2427,8 +2427,7 @@ querySymLinkRetry: params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - /* BB find exact max data count below from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; -- cgit v1.2.3 From f55ed1a83d099f275c9560ad7d4c4700d1e54bdd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 26 May 2009 16:28:11 -0400 Subject: cifs: tighten up default file_mode/dir_mode The current default file mode is 02767 and dir mode is 0777. This is extremely "loose". Given that CIFS is a single-user protocol, these permissions allow anyone to use the mount -- in effect, giving anyone on the machine access to the credentials used to mount the share. Change this by making the default permissions restrict write access to the default owner of the mount. Give read and execute permissions to everyone else. These are the same permissions that VFAT mounts get by default so there is some precedent here. Note that this patch also removes the mandatory locking flags from the default file_mode. After having looked at how these flags are used by the kernel, I don't think that keeping them as the default offers any real benefit. That flag combination makes it so that the kernel enforces mandatory locking. Since the server is going to do that for us anyway, I don't think we want the client to enforce this by default on applications that just want advisory locks. Anyone that does want this behavior can always enable it by setting the file_mode appropriately. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4aa81a507b7..f32c9036741 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -827,9 +827,9 @@ cifs_parse_mount_options(char *options, const char *devname, vol->target_rfc1001_name[0] = 0; vol->linux_uid = current_uid(); /* use current_euid() instead? */ vol->linux_gid = current_gid(); - vol->dir_mode = S_IRWXUGO; - /* 2767 perms indicate mandatory locking support */ - vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP); + + /* default to only allowing write access to owner of the mount */ + vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ vol->rw = true; -- cgit v1.2.3 From 14dba5331b90c20588ae6504fea8049c7283028d Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 27 May 2009 09:31:52 -0400 Subject: integrity: nfsd imbalance bug fix An nfsd exported file is opened/closed by the kernel causing the integrity imbalance message. Before a file is opened, there normally is permission checking, which is done in inode_permission(). However, as integrity checking requires a dentry and mount point, which is not available in inode_permission(), the integrity (permission) checking must be called separately. 
In order to detect any missing integrity checking calls, we keep track of file open/closes. ima_path_check() increments these counts and does the integrity (permission) checking. As a result, the number of calls to ima_path_check()/ima_file_free() should be balanced. An extra call to fput(), indicates the file could have been accessed without first calling ima_path_check(). In nfsv3 permission checking is done once, followed by multiple reads, which do an open/close for each read. The integrity (permission) checking call should be in nfsd_permission() after the inode_permission() call, but as there is no correlation between the number of permission checking and open calls, the integrity checking call should not increment the counters, but defer it to when the file is actually opened. This patch adds: - integrity (permission) checking for nfsd exported files in nfsd_permission(). - a call to increment counts for files opened by nfsd. This patch has been updated to return the nfs error types. Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/nfsd/vfs.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6c68ffd6b4b..81ff0f4de4b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -55,6 +55,7 @@ #include #endif /* CONFIG_NFSD_V4 */ #include +#include #include @@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags, cred); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); + else + ima_counts_get(*filp); out_nfserr: err = nfserrno(host_err); out: @@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct dentry *dentry, int acc) { struct inode *inode = dentry->d_inode; + struct path path; int err; if (acc == NFSD_MAY_NOP) @@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if (err == -EACCES && S_ISREG(inode->i_mode) && acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) err = inode_permission(inode, MAY_EXEC); + if (err) + goto nfsd_out; + /* Do integrity (permission) checking now, but defer incrementing + * IMA counts to the actual file open. + */ + path.mnt = exp->ex_path.mnt; + path.dentry = dentry; + err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC), + IMA_COUNT_LEAVE); +nfsd_out: return err? nfserrno(err) : 0; } -- cgit v1.2.3 From 07119a4df8c8c77d888f2f46964ea9512ea84ff8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: have cifs_NTtimeToUnix take a little-endian arg ...and just have the function call le64_to_cpu. 
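For illustration, a minimal userspace sketch of the conversion this patch centralizes: an NT timestamp (100-nanosecond intervals since 1601-01-01, carried little-endian on the wire) turned into seconds and nanoseconds since the Unix epoch. This is not the kernel code; it assumes a little-endian host (the kernel version byte-swaps the __le64 argument first), and the struct and helper names are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* 100-nanosecond intervals between 1601-01-01 and 1970-01-01 */
#define NTFS_TIME_OFFSET ((uint64_t)(369 * 365 + 89) * 24 * 3600 * 10000000)

struct unix_ts {
	int64_t tv_sec;
	long    tv_nsec;
};

static struct unix_ts nt_to_unix(uint64_t ntutc)
{
	/* subtract the 1601->1970 offset, then split into 1s intervals */
	uint64_t t = ntutc - NTFS_TIME_OFFSET;
	struct unix_ts ts;

	ts.tv_nsec = (long)(t % 10000000) * 100;
	ts.tv_sec  = (int64_t)(t / 10000000);
	return ts;
}

int main(void)
{
	/* 128752416000000000 is 2009-01-01 00:00:00 UTC as an NT timestamp */
	struct unix_ts ts = nt_to_unix(128752416000000000ULL);

	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}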
Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 +- fs/cifs/file.c | 2 +- fs/cifs/inode.c | 15 ++++++--------- fs/cifs/netmisc.c | 12 ++++++------ fs/cifs/readdir.c | 12 ++++++------ 5 files changed, 20 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fae083930ee..8831f649720 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -90,7 +90,7 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16, struct cifsTconInfo *); extern void DeleteOplockQEntry(struct oplock_q_entry *); extern void DeleteTconOplockQEntries(struct cifsTconInfo *); -extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601); +extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 302ea15f02e..06866841b97 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, /* BB need same check in cifs_create too? */ /* if not oplocked, invalidate inode pages if mtime or file size changed */ - temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); + temp = cifs_NTtimeToUnix(buf->LastWriteTime); if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && (file->f_path.dentry->d_inode->i_size == (loff_t)le64_to_cpu(buf->EndOfFile))) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9c869a6dcba..42d6e0fb6f3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode, __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes); __u64 end_of_file = le64_to_cpu(info->EndOfFile); - inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime)); + inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime); inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime)); - inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange)); + cifs_NTtimeToUnix(info->LastModificationTime); + inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange); inode->i_mode = le64_to_cpu(info->Permissions); /* @@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode, /* Linux can not store file creation time so ignore it */ if (pfindData->LastAccessTime) - inode->i_atime = cifs_NTtimeToUnix - (le64_to_cpu(pfindData->LastAccessTime)); + inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime); else /* do not need to use current_fs_time - time not stored */ inode->i_atime = CURRENT_TIME; - inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); - inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); + inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime); + inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime); cFYI(DBG2, ("Attributes came in as 0x%x", attr)); if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index e2fe998989a..d3ba75ef014 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr) #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) - /* - * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) - * 
into Unix UTC (based 1970-01-01, in seconds). - */ +/* + * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) + * into Unix UTC (based 1970-01-01, in seconds). + */ struct timespec -cifs_NTtimeToUnix(u64 ntutc) +cifs_NTtimeToUnix(__le64 ntutc) { struct timespec ts; /* BB what about the timezone? BB */ @@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc) /* Subtract the NTFS time offset, then convert to 1s intervals. */ u64 t; - t = ntutc - NTFS_TIME_OFFSET; + t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; ts.tv_nsec = do_div(t, 10000000) * 100; ts.tv_sec = t; return ts; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 964e097c820..79c46c2226c 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -150,11 +150,11 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, allocation_size = le64_to_cpu(pfindData->AllocationSize); end_of_file = le64_to_cpu(pfindData->EndOfFile); tmp_inode->i_atime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); + cifs_NTtimeToUnix(pfindData->LastAccessTime); tmp_inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); + cifs_NTtimeToUnix(pfindData->LastWriteTime); tmp_inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); + cifs_NTtimeToUnix(pfindData->ChangeTime); } else { /* legacy, OS2 and DOS style */ /* struct timespec ts;*/ FIND_FILE_STANDARD_INFO *pfindData = @@ -331,11 +331,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode, local_size = tmp_inode->i_size; tmp_inode->i_atime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); + cifs_NTtimeToUnix(pfindData->LastAccessTime); tmp_inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime)); + cifs_NTtimeToUnix(pfindData->LastModificationTime); tmp_inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange)); + cifs_NTtimeToUnix(pfindData->LastStatusChange); tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions); /* since we set the inode type below we need to mask off type -- cgit v1.2.3 From c4a2c08db7d976c2e23a97da5d69ec7c9701034d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: make cnvrtDosUnixTm take a little-endian args and an offset The callers primarily end up converting the args from le anyway. Also, most of the callers end up needing to add an offset to the result. The exception to these rules is cnvrtDosCifsTm, but there are no callers of that function, so we might as well remove it. 
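A userspace-compilable sketch of the legacy DOS date/time decoding plus the new timezone offset argument, for illustration only. It assumes host-order 16-bit fields (the kernel function now takes __le16 and converts internally), relies on timegm(), which is a glibc/BSD extension rather than standard C, and all helper names are invented for the example.

#define _GNU_SOURCE		/* for timegm() */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* date: bits 15-9 year-1980, 8-5 month, 4-0 day
 * time: bits 15-11 hours, 10-5 minutes, 4-0 seconds/2 */
static time_t dos_to_unix(uint16_t date, uint16_t tim, int offset)
{
	struct tm tm = {
		.tm_sec  = (tim & 0x1f) * 2,
		.tm_min  = (tim >> 5) & 0x3f,
		.tm_hour = (tim >> 11) & 0x1f,
		.tm_mday = date & 0x1f,
		.tm_mon  = ((date >> 5) & 0x0f) - 1,
		.tm_year = ((date >> 9) & 0x7f) + 80,	/* years since 1900 */
	};

	/* interpret the fields as UTC, then add the server's timezone
	 * delta, mirroring the 'offset' argument added by this patch */
	return timegm(&tm) + offset;
}

int main(void)
{
	/* 2009-06-03 10:07:44 with a +7200 second server time adjustment */
	uint16_t d = (29 << 9) | (6 << 5) | 3;
	uint16_t t = (10 << 11) | (7 << 5) | (44 / 2);

	printf("%lld\n", (long long)dos_to_unix(d, t, 7200));
	return 0;
}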
Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 4 ++-- fs/cifs/cifssmb.c | 4 ++-- fs/cifs/netmisc.c | 12 ++++-------- fs/cifs/readdir.c | 32 ++++++++++---------------------- 4 files changed, 18 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8831f649720..d542cf1f69c 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,8 +92,8 @@ extern void DeleteOplockQEntry(struct oplock_q_entry *); extern void DeleteTconOplockQEntries(struct cifsTconInfo *); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); -extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); -extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); +extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, + int offset); extern int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, int oflags, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index aece2a8c1a7..b84c61d5bca 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) int val, seconds, remain, result; struct timespec ts, utc; utc = CURRENT_TIME; - ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date), - le16_to_cpu(rsp->SrvTime.Time)); + ts = cnvrtDosUnixTm(rsp->SrvTime.Date, + rsp->SrvTime.Time, 0); cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", (int)ts.tv_sec, (int)utc.tv_sec, (int)(utc.tv_sec - ts.tv_sec))); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d3ba75ef014..32d6baa0a54 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t) static int total_days_of_prev_months[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; - -__le64 cnvrtDosCifsTm(__u16 date, __u16 time) -{ - return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time))); -} - -struct timespec cnvrtDosUnixTm(__u16 date, __u16 time) +struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) { struct timespec ts; int sec, min, days, month, year; + u16 date = le16_to_cpu(le_date); + u16 time = le16_to_cpu(le_time); SMB_TIME *st = (SMB_TIME *)&time; SMB_DATE *sd = (SMB_DATE *)&date; @@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time) days -= ((year & 0x03) == 0) && (month < 2 ? 
1 : 0); sec += 24 * 60 * 60 * days; - ts.tv_sec = sec; + ts.tv_sec = sec + offset; /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 79c46c2226c..86d0055dc52 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file, return rc; } -static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode) -{ - if ((tcon) && (tcon->ses) && (tcon->ses->server)) { - inode->i_ctime.tv_sec += tcon->ses->server->timeAdj; - inode->i_mtime.tv_sec += tcon->ses->server->timeAdj; - inode->i_atime.tv_sec += tcon->ses->server->timeAdj; - } - return; -} - - static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, char *buf, unsigned int *pobject_type, int isNewInode) { @@ -156,20 +145,19 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, tmp_inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime); } else { /* legacy, OS2 and DOS style */ -/* struct timespec ts;*/ + int offset = cifs_sb->tcon->ses->server->timeAdj; FIND_FILE_STANDARD_INFO *pfindData = (FIND_FILE_STANDARD_INFO *)buf; - tmp_inode->i_mtime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastWriteDate), - le16_to_cpu(pfindData->LastWriteTime)); - tmp_inode->i_atime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastAccessDate), - le16_to_cpu(pfindData->LastAccessTime)); - tmp_inode->i_ctime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastWriteDate), - le16_to_cpu(pfindData->LastWriteTime)); - AdjustForTZ(cifs_sb->tcon, tmp_inode); + tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate, + pfindData->LastWriteTime, + offset); + tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate, + pfindData->LastAccessTime, + offset); + tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate, + pfindData->LastWriteTime, + offset); attr = le16_to_cpu(pfindData->Attributes); allocation_size = le32_to_cpu(pfindData->AllocationSize); end_of_file = le32_to_cpu(pfindData->DataSize); -- cgit v1.2.3 From bd433d4cf4d8593a5f1764776b91f1794fce5a77 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 09:37:34 -0400 Subject: cifs: rename cifs_iget to cifs_root_iget The current cifs_iget isn't suitable for anything but the root inode. Rename it with a more appropriate name. 
Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 +- fs/cifs/cifsfs.h | 2 +- fs/cifs/inode.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5e6d35804d7..0a10a59b639 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data, #endif sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ - inode = cifs_iget(sb, ROOT_I); + inode = cifs_root_iget(sb, ROOT_I); if (IS_ERR(inode)) { rc = PTR_ERR(inode); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 051b71cfdea..3b6a85cd484 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; -extern struct inode *cifs_iget(struct super_block *, unsigned long); +extern struct inode *cifs_root_iget(struct super_block *, unsigned long); extern int cifs_create(struct inode *, struct dentry *, int, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 42d6e0fb6f3..84b7bea7367 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -696,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) } /* gets root inode */ -struct inode *cifs_iget(struct super_block *sb, unsigned long ino) +struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) { int xid; struct cifs_sb_info *cifs_sb; -- cgit v1.2.3 From a0c9217f64ee3cd1e534966da8c5f05768e1ab09 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 15:40:47 -0400 Subject: cifs: make serverino the default when mounting Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f32c9036741..8ae563f028b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -835,6 +835,8 @@ cifs_parse_mount_options(char *options, const char *devname, vol->rw = true; /* default is always to request posix paths. */ vol->posix_paths = 1; + /* default to using server inode numbers where available */ + vol->server_ino = 1; if (!options) return 1; -- cgit v1.2.3 From c5077ec42303e07c2c685b0f6cb8eee0f2c7751c Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 28 May 2009 15:09:04 +0000 Subject: [CIFS] Update readme to indicate change to default mount (serverino) Signed-off-by: Steve French --- fs/cifs/CHANGES | 8 ++++++++ fs/cifs/README | 7 ++++++- fs/cifs/cifsfs.h | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index f20c4069c22..227c681b816 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,9 @@ +Version 1.59 +------------ +Client uses server inode numbers (which are persistent) rather than +client generated ones by default (mount option "serverino" turned +on by default if server supports it). + Version 1.58 ------------ Guard against buffer overruns in various UCS-2 to UTF-8 string conversions @@ -10,6 +16,8 @@ we converted from). Fix endianness of the vcnum field used during session setup to distinguish multiple mounts to same server from different userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental flag to be set to 2, and mount must enable krb5 to turn on extended security). 
+Performance of file create to Samba improved (posix create on lookup +removes 1 of 2 network requests sent on file create) Version 1.57 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index db208ddb989..6d1608fabde 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -388,8 +388,13 @@ A partial list of the supported mount options follows: or the CIFS Unix Extensions equivalent and for those this mount option will have no effect. Exporting cifs mounts under nfsd requires this mount option on the cifs mount. + This is now the default if server supports the + required network operation. noserverino Client generates inode numbers (rather than using the actual one - from the server) by default. + from the server). These inode numbers will vary after + unmount or reboot which can confuse some applications, + but not all server filesystems support unique inode + numbers. setuids If the CIFS Unix extensions are negotiated with the server the client will attempt to set the effective uid and gid of the local process on newly created files, directories, and diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 3b6a85cd484..9570a0e8023 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.58" +#define CIFS_VERSION "1.59" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From 1bf4072da67c14d6b02cfeef02212aa5a6211df2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: reorganize get_cifs_acl Thus spake Christoph: "But this whole set_cifs_acl function is a real mess anyway and needs some splitting up." With this change too, it's possible to call acl_to_uid_mode() with a NULL inode pointer. That (or something close to it) will eventually be necessary when cifs_get_inode_info is reorganized. 
Signed-off-by: Christoph Hellwig Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 100 +++++++++++++++++++++++++++------------------------- fs/cifs/cifsproto.h | 4 +-- fs/cifs/inode.c | 2 +- 3 files changed, 55 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 57ecdc83c26..7f8e6c46d11 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -552,67 +552,66 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, return rc; } - -/* Retrieve an ACL from the server */ -static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, - const char *path, const __u16 *pfid) +static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, + __u16 fid, u32 *pacllen) { - struct cifsFileInfo *open_file = NULL; - bool unlock_file = false; - int xid; - int rc = -EIO; - __u16 fid; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; struct cifs_ntsd *pntsd = NULL; + int xid, rc; - cFYI(1, ("get mode from ACL for %s", path)); + xid = GetXid(); + rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); + FreeXid(xid); - if (inode == NULL) - return NULL; - xid = GetXid(); - if (pfid == NULL) - open_file = find_readable_file(CIFS_I(inode)); - else - fid = *pfid; + cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); + return pntsd; +} - sb = inode->i_sb; - if (sb == NULL) { - FreeXid(xid); - return NULL; - } - cifs_sb = CIFS_SB(sb); +static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, + const char *path, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + int oplock = 0; + int xid, rc; + __u16 fid; - if (open_file) { - unlock_file = true; - fid = open_file->netfid; - } else if (pfid == NULL) { - int oplock = 0; - /* open file */ - rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, - READ_CONTROL, 0, &fid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) { - cERROR(1, ("Unable to open file to get ACL")); - FreeXid(xid); - return NULL; - } + xid = GetXid(); + + rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cERROR(1, ("Unable to open file to get ACL")); + goto out; } rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); - if (unlock_file == true) /* find_readable_file increments ref count */ - atomic_dec(&open_file->wrtPending); - else if (pfid == NULL) /* if opened above we have to close the handle */ - CIFSSMBClose(xid, cifs_sb->tcon, fid); - /* else handle was passed in by caller */ + CIFSSMBClose(xid, cifs_sb->tcon, fid); + out: FreeXid(xid); return pntsd; } +/* Retrieve an ACL from the server */ +static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, + struct inode *inode, const char *path, + u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + struct cifsFileInfo *open_file = NULL; + + if (inode) + open_file = find_readable_file(CIFS_I(inode)); + if (!open_file) + return get_cifs_acl_by_path(cifs_sb, path, pacllen); + + pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); + atomic_dec(&open_file->wrtPending); + return pntsd; +} + /* Set an ACL on the server */ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path) @@ -668,14 +667,19 @@ static int 
set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ -void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid) +void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, + const char *path, const __u16 *pfid) { struct cifs_ntsd *pntsd = NULL; u32 acllen = 0; int rc = 0; cFYI(DBG2, ("converting ACL to mode for %s", path)); - pntsd = get_cifs_acl(&acllen, inode, path, pfid); + + if (pfid) + pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); + else + pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (pntsd) @@ -698,7 +702,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) cFYI(DBG2, ("set ACL from mode for %s", path)); /* Get the security descriptor */ - pntsd = get_cifs_acl(&secdesclen, inode, path, NULL); + pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); /* Add three ACEs for owner, group, everyone getting rid of other ACEs as chmod disables ACEs and set the security descriptor */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index d542cf1f69c..f9452329bcc 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode, extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, int xid); -extern void acl_to_uid_mode(struct inode *inode, const char *path, - const __u16 *pfid); +extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, + const char *path, const __u16 *pfid); extern int mode_to_acl(struct inode *inode, const char *path, __u64); extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 84b7bea7367..fad882b075b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -626,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode, /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { cFYI(1, ("Getting mode bits from ACL")); - acl_to_uid_mode(inode, full_path, pfid); + acl_to_uid_mode(cifs_sb, inode, full_path, pfid); } #endif if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { -- cgit v1.2.3 From b96d31a62f714566fa6420851b3bb3615c796322 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: clean up set_cifs_acl interfaces Signed-off-by: Christoph Hellwig Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 78 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 7f8e6c46d11..1403b5d86a7 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -612,57 +612,61 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -/* Set an ACL on the server */ -static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, - struct inode *inode, const char *path) +static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, + struct cifs_ntsd *pnntsd, u32 acllen) { - struct cifsFileInfo *open_file; - bool unlock_file = false; - int xid; - int rc = -EIO; - __u16 fid; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; + int xid, rc; - cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); + xid = GetXid(); + rc = 
CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); + FreeXid(xid); - if (!inode) - return rc; + cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); + return rc; +} - sb = inode->i_sb; - if (sb == NULL) - return rc; +static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, + struct cifs_ntsd *pnntsd, u32 acllen) +{ + int oplock = 0; + int xid, rc; + __u16 fid; - cifs_sb = CIFS_SB(sb); xid = GetXid(); - open_file = find_readable_file(CIFS_I(inode)); - if (open_file) { - unlock_file = true; - fid = open_file->netfid; - } else { - int oplock = 0; - /* open file */ - rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, - WRITE_DAC, 0, &fid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) { - cERROR(1, ("Unable to open file to set ACL")); - FreeXid(xid); - return rc; - } + rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cERROR(1, ("Unable to open file to set ACL")); + goto out; } rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); - if (unlock_file) - atomic_dec(&open_file->wrtPending); - else - CIFSSMBClose(xid, cifs_sb->tcon, fid); + CIFSSMBClose(xid, cifs_sb->tcon, fid); + out: FreeXid(xid); + return rc; +} +/* Set an ACL on the server */ +static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *open_file; + int rc; + + cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); + + open_file = find_readable_file(CIFS_I(inode)); + if (!open_file) + return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); + + rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); + atomic_dec(&open_file->wrtPending); return rc; } -- cgit v1.2.3 From 1f23920dbf1377fa9e4aef4f3d20c34a06a71a35 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Thu, 7 May 2009 19:49:45 -0500 Subject: xfs: fix double unlock in xfs_swap_extents() Regreesion from commit ef8f7fc, which rearranged the code in xfs_swap_extents() leading to double unlock of xfs inode ilock. That resulted in xfs_fsr deadlocking itself on platforms, which don't handle double unlock of rw_semaphore nicely. It caused the count go negative, which represents the write holder, without really having one. ia64 is one of the platforms where deadlock was easily reproduced and the fix was tested. 
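The shape of the fix is easier to see in a reduced form. Below is a compilable sketch of the corrected label ordering, with pthread rwlocks standing in for the XFS inode locks; it is not the real XFS code, and it assumes the commit step releases the lock it was handed (as the transaction commit does for inodes joined to it), which is why the success path must fall through to the common exit without ever reaching the unlock label.

#include <pthread.h>
#include <stdlib.h>

static pthread_rwlock_t ilock = PTHREAD_RWLOCK_INITIALIZER;

/* stands in for the transaction commit: it owns the lock now and
 * drops it, so the caller must not unlock again afterwards */
static int commit(void)
{
	pthread_rwlock_unlock(&ilock);
	return 0;
}

static int swap_extents_sketch(void *tempifp)
{
	int error;

	pthread_rwlock_wrlock(&ilock);

	if (tempifp == NULL) {
		error = -1;
		goto out_unlock;	/* error: we still hold the lock */
	}

	error = commit();		/* success: lock already dropped */
out:
	free(tempifp);
	return error;

out_unlock:
	pthread_rwlock_unlock(&ilock);	/* reached only from error paths */
	goto out;
}

int main(void)
{
	int ok  = swap_extents_sketch(malloc(16));	/* success path */
	int bad = swap_extents_sketch(NULL);		/* error path */

	return (ok == 0 && bad != 0) ? 0 : 1;
}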
Signed-off-by: Eric Sandeen Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_dfrag.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e6d839bddbf..7465f9ee125 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -347,13 +347,15 @@ xfs_swap_extents( error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); out: kmem_free(tempifp); return error; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + out_trans_cancel: xfs_trans_cancel(tp, 0); goto out_unlock; -- cgit v1.2.3 From e6da7c9fed111ba1243297ee6eda8e24ae11c384 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 23 May 2009 14:30:12 -0500 Subject: xfs: fix overflow in xfs_growfs_data_private In the case where growing a filesystem would leave the last AG too small, the fixup code has an overflow in the calculation of the new size with one fewer ag, because "nagcount" is a 32 bit number. If the new filesystem has > 2^32 blocks in it this causes a problem resulting in an EINVAL return from growfs: # xfs_io -f -c "truncate 19998630180864" fsfile # mkfs.xfs -f -bsize=4096 -dagsize=76288719b,size=3905982455b fsfile # mount -o loop fsfile /mnt # xfs_growfs /mnt meta-data=/dev/loop0 isize=256 agcount=52, agsize=76288719 blks = sectsz=512 attr=2 data = bsize=4096 blocks=3905982455, imaxpct=5 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0 log =internal bsize=4096 blocks=32768, version=2 = sectsz=512 sunit=0 blks, lazy-count=0 realtime =none extsz=4096 blocks=0, rtextents=0 xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Invalid argument Reported-by: richard.ems@cape-horn-eng.com Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8379e3bca26..cbd451bb484 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -160,7 +160,7 @@ xfs_growfs_data_private( nagcount = new + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; - nb = nagcount * mp->m_sb.sb_agblocks; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) return XFS_ERROR(EINVAL); } -- cgit v1.2.3 From 1b17d766463d51904cb242f194a780737e5f73ef Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Mon, 1 Jun 2009 13:13:24 -0500 Subject: xfs: prevent deadlock in xfs_qm_shake() It's possible to recurse into filesystem from the memory allocation, which deadlocks in xfs_qm_shake(). Add check for __GFP_FS, and bail out if it is not set. 
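The predicate being tightened is small enough to show in isolation. A userspace-compilable sketch follows; the real definitions live in fs/xfs/linux-2.6/kmem.h and xfs_qm_shake(), and the flag values below are stand-ins rather than the kernel's. The point is that a reclaim callback may re-enter filesystem code only when the allocation that triggered reclaim may both block and call back into the FS.

#include <stdio.h>

typedef unsigned int gfp_t;
#define __GFP_WAIT	0x10u	/* illustrative values, not the kernel's */
#define __GFP_FS	0x80u

static int kmem_shake_allow(gfp_t gfp_mask)
{
	/* blocking alone is not enough; FS re-entry must also be allowed */
	return (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS);
}

int main(void)
{
	/* GFP_NOFS-style allocation: may block, must not re-enter the FS */
	printf("GFP_NOFS   -> shake allowed: %d\n",
	       kmem_shake_allow(__GFP_WAIT));
	/* GFP_KERNEL-style allocation: both blocking and FS re-entry OK */
	printf("GFP_KERNEL -> shake allowed: %d\n",
	       kmem_shake_allow(__GFP_WAIT | __GFP_FS));
	return 0;
}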
Signed-off-by: Felix Blyakher Signed-off-by: Hedi Berriche Reviewed-by: Christoph Hellwig Reviewed-by: Andi Kleen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/kmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index af6843c7ee4..179cbd630f6 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -103,7 +103,7 @@ extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); static inline int kmem_shake_allow(gfp_t gfp_mask) { - return (gfp_mask & __GFP_WAIT) != 0; + return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); } #endif /* __XFS_SUPPORT_KMEM_H__ */ -- cgit v1.2.3 From a12af1ebe675e85831fde3c4d0908fc3b0908b7a Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Mon, 1 Jun 2009 12:30:03 -0500 Subject: GFS2: smbd proccess hangs with flock() call. GFS2 currently does not support mandatory flocks. An flock() call with LOCK_MAND triggers unexpected behavior because gfs2 is not checking for this lock type. This patch corrects that. Signed-off-by: Abhi Das Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 73b6f552f06..841ddc97938 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -698,8 +698,8 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; if (fl->fl_type == F_UNLCK) { do_unflock(file, fl); -- cgit v1.2.3 From 50b64e3b77d569c217a48e078cd565dbd6462ad0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Jun 2009 06:55:20 -0400 Subject: cifs: fix IPv6 address length check For IPv6 the userspace mount helper sends an address in the "ip=" option. This check fails if the length is > 35 characters. I have no idea where the magic 35 character limit came from, but it's clearly not enough for IPv6. Fix it by making it use the INET6_ADDRSTRLEN #define. While we're at it, use the same #define for the address length in SPNEGO upcalls. Reported-by: Charles R. 
Anderson Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_spnego.c | 6 ++---- fs/cifs/connect.c | 4 +++- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 67bf93a40d2..4a4581cb2b5 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "cifsglob.h" #include "cifs_spnego.h" #include "cifs_debug.h" @@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = { * strlen(";sec=ntlmsspi") */ #define MAX_MECH_STR_LEN 13 -/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */ -#define MAX_IPV6_ADDR_LEN 43 - /* strlen of "host=" */ #define HOST_KEY_LEN 5 @@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) host=hostname sec=mechanism uid=0xFF user=username */ desc_len = MAX_VER_STR_LEN + HOST_KEY_LEN + strlen(hostname) + - IP_KEY_LEN + MAX_IPV6_ADDR_LEN + + IP_KEY_LEN + INET6_ADDRSTRLEN + MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + USER_KEY_LEN + strlen(sesInfo->userName) + 1; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8ae563f028b..74b5a87e919 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -960,7 +961,8 @@ cifs_parse_mount_options(char *options, const char *devname, } else if (strnicmp(data, "ip", 2) == 0) { if (!value || !*value) { vol->UNCip = NULL; - } else if (strnlen(value, 35) < 35) { + } else if (strnlen(value, INET6_ADDRSTRLEN) < + INET6_ADDRSTRLEN) { vol->UNCip = value; } else { printk(KERN_WARNING "CIFS: ip address " -- cgit v1.2.3 From e09f9446b94ac64b27d37e98c1110f29d712cdad Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Jun 2009 10:07:44 +0100 Subject: GFS2: Remove unused variable Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 841ddc97938..73318a3ce6f 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -694,8 +694,6 @@ static void do_unflock(struct file *file, struct file_lock *fl) static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; if (fl->fl_type & LOCK_MAND) -- cgit v1.2.3 From bfcd3555af478dbf04c87adc9bb1a739d0a6ccff Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Tue, 9 Jun 2009 00:06:20 -0400 Subject: jbd2: Fix minor typos in comments in fs/jbd2/journal.c Signed-off-by: Alberto Bertogli Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 58144102bf2..62be7d294ec 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) * Journal abort has very specific semantics, which we describe * for journal abort. * - * Two internal function, which provide abort to te jbd layer + * Two internal functions, which provide abort to the jbd layer * itself are here. */ @@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) * int jbd2_journal_errno () - returns the journal's error state. * @journal: journal to examine. 
* - * This is the errno numbet set with jbd2_journal_abort(), the last + * This is the errno number set with jbd2_journal_abort(), the last * time the journal was mounted - if the journal was stopped * without calling abort this will be 0. * @@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal) * int jbd2_journal_clear_err () - clears the journal's error state * @journal: journal to act on. * - * An error must be cleared or Acked to take a FS out of readonly + * An error must be cleared or acked to take a FS out of readonly * mode. */ int jbd2_journal_clear_err(journal_t *journal) @@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal) * void jbd2_journal_ack_err() - Ack journal err. * @journal: journal to act on. * - * An error must be cleared or Acked to take a FS out of readonly + * An error must be cleared or acked to take a FS out of readonly * mode. */ void jbd2_journal_ack_err(journal_t *journal) -- cgit v1.2.3 From 0b8e58a140cae2ba1c4a21ccae7c6c3c939c51f9 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 3 Jun 2009 17:59:28 -0400 Subject: ext4: super.c whitespace cleanup Cleanup of whitespace and formatting. Initially driven by confusing indents for the ext4_{block,inode}_bitmap() et. al. helper routines, but figured I'd cleanup some other 80-column wrapping and other indenting problems at the same time. Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 117 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 91b98b58ccb..0a97b1ad3e1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -79,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, @@ -87,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, @@ -95,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_blks_count(struct super_block *sb, @@ -103,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb, { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, @@ -111,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb, { return le16_to_cpu(bg->bg_free_inodes_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? 
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, @@ -119,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb, { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, @@ -127,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb, { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, @@ -207,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) journal = EXT4_SB(sb)->s_journal; if (journal) { if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, - "Detected aborted journal"); + ext4_abort(sb, __func__, "Detected aborted journal"); return ERR_PTR(-EROFS); } return jbd2_journal_start(journal, nblocks); @@ -436,7 +435,7 @@ void ext4_warning(struct super_block *sb, const char *function, } void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, - const char *function, const char *fmt, ...) + const char *function, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { @@ -472,7 +471,6 @@ __acquires(bitlock) return; } - void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -638,7 +636,6 @@ static void ext4_put_super(struct super_block *sb) lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); - return; } static struct kmem_cache *ext4_inode_cachep; @@ -653,6 +650,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; + #ifdef CONFIG_EXT4_FS_POSIX_ACL ei->i_acl = EXT4_ACL_NOT_CACHED; ei->i_default_acl = EXT4_ACL_NOT_CACHED; @@ -673,6 +671,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + return &ei->vfs_inode; } @@ -879,12 +878,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noauto_da_alloc"); ext4_show_quota_options(seq, sb); + return 0; } - static struct inode *ext4_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) + u64 ino, u32 generation) { struct inode *inode; @@ -913,14 +912,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) + int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) + int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); @@ -932,7 +931,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, * which would prevent try_to_free_buffers() from freeing them, we must use * jbd2 layer's try_to_free_buffers() function to release them. 
*/ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) { journal_t *journal = EXT4_SB(sb)->s_journal; @@ -1133,8 +1133,9 @@ static ext4_fsblk_t get_sb_block(void **data) if (!options || strncmp(options, "sb=", 3) != 0) return 1; /* Default location */ + options += 3; - /*todo: use simple_strtoll with >32bit ext4 */ + /* TODO: use simple_strtoll with >32bit ext4 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", @@ -1144,6 +1145,7 @@ static ext4_fsblk_t get_sb_block(void **data) if (*options == ',') options++; *data = (void *) options; + return sb_block; } @@ -1626,7 +1628,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, printk(KERN_WARNING "EXT4-fs warning: checktime reached, " "running e2fsck is recommended\n"); - if (!sbi->s_journal) + if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); @@ -1810,7 +1812,7 @@ static int ext4_check_descriptors(struct super_block *sb) } ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); - sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -1926,6 +1928,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ } + /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk @@ -1976,19 +1979,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; loff_t upper_limit; - /* This is calculated to be the largest file size for a - * dense, bitmapped file such that the total number of - * sectors in the file, including data and all indirect blocks, - * does not exceed 2^48 -1 - * __u32 i_blocks_lo and _u16 i_blocks_high representing the - * total number of 512 bytes blocks of the file + /* This is calculated to be the largest file size for a dense, block + * mapped file such that the file's total number of 512-byte sectors, + * including data and all indirect blocks, does not exceed (2^48 - 1). + * + * __u32 i_blocks_lo and _u16 i_blocks_high represent the total + * number of 512-byte sectors of the file. 
*/ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * !has_huge_files or CONFIG_LBD is not enabled - * implies the inode i_block represent total blocks in - * 512 bytes 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LBD not enabled implies that + * the inode i_block field represents total file blocks in + * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -2030,7 +2033,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) } static ext4_fsblk_t descriptor_loc(struct super_block *sb, - ext4_fsblk_t logical_sb_block, int nr) + ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; @@ -2044,6 +2047,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; + return (has_super + ext4_group_first_block_no(sb, bg)); } @@ -2148,7 +2152,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, } static ssize_t sbi_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) + struct ext4_sb_info *sbi, char *buf) { unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); @@ -2253,7 +2257,6 @@ static struct kobj_type ext4_ktype = { static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) - { struct buffer_head *bh; struct ext4_super_block *es = NULL; @@ -2379,7 +2382,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_devnum, &journal_ioprio, NULL, 0)) goto failed_mount; @@ -2442,7 +2444,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sb->s_blocksize != blocksize) { - /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { printk(KERN_ERR "EXT4-fs: bad block size %d.\n", @@ -2489,6 +2490,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); } + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || @@ -2501,10 +2503,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) goto cantfind_ext4; + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; @@ -2515,6 +2519,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; @@ -2566,12 +2571,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - /* - * It makes no sense for the first data block to be beyond the end - * of the filesystem. 
- */ - if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - printk(KERN_WARNING "EXT4-fs: bad geometry: first data" + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + printk(KERN_WARNING "EXT4-fs: bad geometry: first data" "block %u is beyond end of filesystem (%llu)\n", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); @@ -3082,6 +3087,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, EXT4_SB(sb)->journal_bdev = bdev; ext4_init_journal_params(sb, journal); return journal; + out_journal: jbd2_journal_destroy(journal); out_bdev: @@ -3116,7 +3122,6 @@ static int ext4_load_journal(struct super_block *sb, * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { printk(KERN_INFO "EXT4-fs: INFO: recovery " @@ -3234,7 +3239,6 @@ static int ext4_commit_super(struct super_block *sb, int sync) return error; } - /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a @@ -3485,8 +3489,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Make sure the group descriptor checksums - * are sane. If they aren't, refuse to - * remount r/w. + * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = @@ -3545,6 +3548,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) kfree(old_opts.s_qf_names[i]); #endif return 0; + restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; @@ -3628,11 +3632,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) le64_to_cpup((void *)es->s_uuid + sizeof(u64)); buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; } -/* Helper function for writing quotas on sync - we need to start transaction before quota file - * is locked for write. Otherwise the are possible deadlocks: +/* Helper function for writing quotas on sync - we need to start transaction + * before quota file is locked for write. 
Otherwise the are possible deadlocks: * Process 1 Process 2 * ext4_create() quota_sync() * jbd2_journal_start() write_dquot() @@ -3656,7 +3661,7 @@ static int ext4_write_dquot(struct dquot *dquot) inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, - EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); @@ -3672,7 +3677,7 @@ static int ext4_acquire_dquot(struct dquot *dquot) handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); @@ -3688,7 +3693,7 @@ static int ext4_release_dquot(struct dquot *dquot) handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); @@ -3736,7 +3741,7 @@ static int ext4_write_info(struct super_block *sb, int type) static int ext4_quota_on_mount(struct super_block *sb, int type) { return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], - EXT4_SB(sb)->s_jquota_fmt, type); + EXT4_SB(sb)->s_jquota_fmt, type); } /* @@ -3907,10 +3912,10 @@ out: #endif -static int ext4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static int ext4_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } static struct file_system_type ext4_fs_type = { @@ -3922,14 +3927,14 @@ static struct file_system_type ext4_fs_type = { }; #ifdef CONFIG_EXT4DEV_COMPAT -static int ext4dev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data,struct vfsmount *mnt) { printk(KERN_WARNING "EXT4-fs: Update your userspace programs " "to mount using ext4\n"); printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " "will go away by 2.6.31\n"); - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } static struct file_system_type ext4dev_fs_type = { -- cgit v1.2.3 From 2cc3c559fb2fe8cecca82a517bc56e88b0c1effd Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 4 Jun 2009 09:23:50 -0400 Subject: Btrfs: set device->total_disk_bytes when adding new device It was not being properly initialized, and so the size saved to disk was not correct. 
Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5f01dad4b69..a6d35b0054c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1440,6 +1440,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->io_align = root->sectorsize; device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; -- cgit v1.2.3 From 44fb5511638938a2c37c895abc14df648ffc07e9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 4 Jun 2009 15:34:51 -0400 Subject: Btrfs: Fix oops and use after free during space balancing The btrfs allocator uses list_for_each to walk the available block groups when searching for free blocks. It starts off with a hint to help find the best block group for a given allocation. The hint is resolved into a block group, but we don't properly check to make sure the block group we find isn't in the middle of being freed due to filesystem shrinking or balancing. If it is being freed, the list pointers in it are bogus and can't be trusted. But, the code happily goes along and uses them in the list_for_each loop, leading to all kinds of fun. The fix used here is to check to make sure the block group we find really is on the list before we use it. list_del_init is used when removing it from the list, so we can do a proper check. The allocation clustering code has a similar bug where it will trust the block group in the current free space cluster. If our allocation flags have changed (going from single spindle dup to raid1 for example) because the drives in the FS have changed, we're not allowed to use the old block group any more. The fix used here is to check the current cluster against the current allocation flags. 
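For illustration, a minimal kernel-style sketch (not taken from the btrfs code; the struct and helper names are hypothetical) of the list_del_init()/list_empty() idiom the fix depends on -- removing a node with list_del_init() points it back at itself, so a later list_empty() on that node tells you whether it is still on any list:

    #include <linux/list.h>

    struct group {
            struct list_head list;
    };

    static void remove_group(struct group *grp)
    {
            list_del_init(&grp->list);      /* node now points at itself */
    }

    static int group_still_listed(struct group *grp)
    {
            /* valid only because removal used list_del_init(), not list_del() */
            return !list_empty(&grp->list);
    }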
Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3e2c7c738f2..35af9335506 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2622,7 +2622,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start); if (block_group && block_group_bits(block_group, data)) { down_read(&space_info->groups_sem); - goto have_block_group; + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else + goto have_block_group; } else if (block_group) { btrfs_put_block_group(block_group); } @@ -2656,6 +2667,13 @@ have_block_group: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } + offset = btrfs_alloc_from_cluster(block_group, last_ptr, num_bytes, search_start); if (offset) { @@ -2681,10 +2699,17 @@ have_block_group: last_ptr_loop = 1; search_start = block_group->key.objectid; + /* + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list + */ goto have_block_group; } spin_unlock(&last_ptr->lock); - +refill_cluster: /* * this cluster didn't work out, free it and * start over @@ -5968,6 +5993,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; struct btrfs_key key; int ret; @@ -5979,6 +6005,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + path = btrfs_alloc_path(); BUG_ON(!path); @@ -5988,7 +6029,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); spin_lock(&block_group->space_info->lock); -- cgit v1.2.3 From 172124e220f1854acc99ee394671781b8b5e2120 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2009 22:34:44 +0200 Subject: Revert "block: implement blkdev_readpages" This reverts commit db2dbb12dc47a50c7a4c5678f526014063e486f6. It apparently causes problems with partition table read-ahead on archs with large page sizes. 
Until that problem is diagnosed further, just drop the readpages support on block devices. Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index a29b4dcc1bc..2dfc6cdcebb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -331,12 +331,6 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); -} - static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1405,7 +1399,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = blkdev_readpages, .writepage = blkdev_writepage, .sync_page = block_sync_page, .write_begin = blkdev_write_begin, -- cgit v1.2.3 From 03f5d8bcf094a5e3b501bd2ae1553656efa8d1be Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 9 Jun 2009 00:17:05 -0400 Subject: ext4: Get rid of EXTEND_DISKSIZE flag of ext4_get_blocks_handle() Get rid of the EXTEND_DISKSIZE flag of ext4_get_blocks_handle(). It appears to be a relic from older days, and setting disksize in this function does not make much sense. Currently it is set only by ext4_getblk(). Since the parameter has some effect only if create == 1, it is easy to check by grepping through the sources that the three callers which end up calling ext4_getblk() with create == 1 (ext4_append, ext4_quota_write, ext4_mkdir) do the right thing and set disksize themselves.
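For illustration, a simplified sketch (not the actual ext4 code; the helper is hypothetical and the i_data_sem locking the real callers need is omitted) of what "set disksize themselves" amounts to for those callers:

    #include "ext4.h"   /* for EXT4_I() and struct ext4_inode_info */

    static void extend_disksize(struct inode *inode, loff_t new_end)
    {
            struct ext4_inode_info *ei = EXT4_I(inode);

            /* push the on-disk size forward once the new blocks are allocated */
            if (new_end > ei->i_disksize)
                    ei->i_disksize = new_end;
    }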
Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ++---- fs/ext4/extents.c | 9 --------- fs/ext4/inode.c | 23 +++-------------------- 3 files changed, 5 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4311cc85b53..59657ff7b8f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -323,15 +323,13 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) - /* Update the ext4_inode_info i_disksize field */ -#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 0x0004 /* Caller is from the delayed allocation writeout path, so set the magic i_delalloc_reserve_flag after taking the inode allocation semaphore for */ -#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0008 +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* Call ext4_da_update_reserve_space() after successfully allocating the blocks */ -#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0010 +#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d4e99e96fdd..9c35a7b1f0a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2779,7 +2779,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; - loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%u requested for inode %u\n", @@ -2969,14 +2968,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) { - disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = disksize; - } - set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 14c00fff371..17ed0d244db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -933,11 +933,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, int indirect_blks; int blocks_to_boundary = 0; int depth; - struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; - loff_t disksize; - J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); @@ -1003,19 +1000,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by i_data_sem. 
Don't forget to - * protect it if you're about to implement concurrent - * ext4_get_block() -bzzz - */ - if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) { - disksize = ((loff_t) iblock + count) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > ei->i_disksize) - ei->i_disksize = disksize; - } - if (err) + else goto cleanup; set_buffer_new(bh_result); @@ -1321,7 +1306,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, { struct buffer_head dummy; int fatal = 0, err; - int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE; + int flags = 0; J_ASSERT(handle != NULL || create == 0); @@ -2153,9 +2138,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } /* - * Update on-disk size along with block allocation we don't - * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change - * within already allocated block -bzzz + * Update on-disk size along with block allocation. */ disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; if (disksize > i_size_read(mpd->inode)) -- cgit v1.2.3 From b31e15527a9bb71b6a11a425d17ce139a62f5af5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 4 Jun 2009 17:36:36 -0400 Subject: ext4: Change all super.c messages to print the device This patch changes ext4 super.c to include the device name with all warning/error messages, by using a new utility function ext4_msg. It's a rather large patch, but very mechanical. I left debug printks alone. This is a straightforward port of a patch which Andi Kleen did for ext3. Cc: Andi Kleen Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 + fs/ext4/super.c | 462 ++++++++++++++++++++++++++++---------------------------- 2 files changed, 235 insertions(+), 229 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 59657ff7b8f..cc7d5edc38c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1401,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext4_warning(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ext4_msg(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, const char *, const char *, ...)
__attribute__ ((format (printf, 4, 5))); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0a97b1ad3e1..c191d0f65fe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -306,7 +306,7 @@ static void ext4_handle_error(struct super_block *sb) jbd2_journal_abort(journal, -EIO); } if (test_opt(sb, ERRORS_RO)) { - printk(KERN_CRIT "Remounting filesystem read-only\n"); + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } ext4_commit_super(sb, 1); @@ -399,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function, { va_list args; - printk(KERN_CRIT "ext4_abort called.\n"); - va_start(args, fmt); printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); vprintk(fmt, args); @@ -413,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function, if (sb->s_flags & MS_RDONLY) return; - printk(KERN_CRIT "Remounting filesystem read-only\n"); + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; @@ -421,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } +void ext4_msg (struct super_block * sb, const char *prefix, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk("%sEXT4-fs (%s): ", prefix, sb->s_id); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + void ext4_warning(struct super_block *sb, const char *function, const char *fmt, ...) { @@ -499,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) /* * Open the external journal device */ -static struct block_device *ext4_blkdev_get(dev_t dev) +static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; char b[BDEVNAME_SIZE]; @@ -510,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev) return bdev; fail: - printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", + ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; } @@ -546,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; - printk(KERN_ERR "sb orphan head is %d\n", - le32_to_cpu(sbi->s_es->s_last_orphan)); + ext4_msg(sb, KERN_ERR, "sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { @@ -678,8 +688,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { - printk("EXT4 Inode %p: orphan list check failed!\n", - EXT4_I(inode)); + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): orphan list check failed!", + inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); @@ -1239,8 +1250,7 @@ static int parse_options(char *options, struct super_block *sb, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk(KERN_ERR "EXT4 (no)user_xattr options " - "not supported\n"); + ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); break; #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -1253,8 +1263,7 @@ static int parse_options(char *options, struct super_block *sb, #else case Opt_acl: case Opt_noacl: - printk(KERN_ERR "EXT4 (no)acl options " - "not supported\n"); + ext4_msg(sb, KERN_ERR, "(no)acl options not 
supported"); break; #endif case Opt_journal_update: @@ -1264,16 +1273,16 @@ static int parse_options(char *options, struct super_block *sb, user to specify an existing inode to be the journal file. */ if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); return 0; } set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; case Opt_journal_dev: if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); return 0; } if (match_int(&args[0], &option)) @@ -1327,9 +1336,8 @@ static int parse_options(char *options, struct super_block *sb, if (is_remount) { if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) != data_opt) { - printk(KERN_ERR - "EXT4-fs: cannot change data " - "mode on remount\n"); + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); return 0; } } else { @@ -1359,31 +1367,31 @@ static int parse_options(char *options, struct super_block *sb, set_qf_name: if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { - printk(KERN_ERR - "EXT4-fs: Cannot change journaled " - "quota options when quota turned on.\n"); + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); return 0; } qname = match_strdup(&args[0]); if (!qname) { - printk(KERN_ERR - "EXT4-fs: not enough memory for " - "storing quotafile name.\n"); + ext4_msg(sb, KERN_ERR, + "Not enough memory for " + "storing quotafile name"); return 0; } if (sbi->s_qf_names[qtype] && strcmp(sbi->s_qf_names[qtype], qname)) { - printk(KERN_ERR - "EXT4-fs: %s quota file already " - "specified.\n", QTYPE2NAME(qtype)); + ext4_msg(sb, KERN_ERR, + "%s quota file already " + "specified", QTYPE2NAME(qtype)); kfree(qname); return 0; } sbi->s_qf_names[qtype] = qname; if (strchr(sbi->s_qf_names[qtype], '/')) { - printk(KERN_ERR - "EXT4-fs: quotafile must be on " - "filesystem root.\n"); + ext4_msg(sb, KERN_ERR, + "quotafile must be on " + "filesystem root"); kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; return 0; @@ -1398,9 +1406,9 @@ set_qf_name: clear_qf_name: if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { - printk(KERN_ERR "EXT4-fs: Cannot change " + ext4_msg(sb, KERN_ERR, "Cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on"); return 0; } /* @@ -1417,9 +1425,9 @@ clear_qf_name: set_qf_format: if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { - printk(KERN_ERR "EXT4-fs: Cannot change " + ext4_msg(sb, KERN_ERR, "Cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on"); return 0; } sbi->s_jquota_fmt = qfmt; @@ -1435,8 +1443,8 @@ set_qf_format: break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { - printk(KERN_ERR "EXT4-fs: Cannot change quota " - "options when quota turned on.\n"); + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); return 0; } clear_opt(sbi->s_mount_opt, QUOTA); @@ -1447,8 +1455,8 @@ set_qf_format: case Opt_quota: case Opt_usrquota: case Opt_grpquota: - printk(KERN_ERR - "EXT4-fs: quota options not supported.\n"); + ext4_msg(sb, KERN_ERR, + "quota options not supported"); break; case Opt_usrjquota: case Opt_grpjquota: @@ -1456,9 +1464,8 @@ set_qf_format: case Opt_offgrpjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: - printk(KERN_ERR - "EXT4-fs: journaled quota options not " - "supported.\n"); + ext4_msg(sb, KERN_ERR, + "journaled quota options not 
supported"); break; case Opt_noquota: break; @@ -1483,8 +1490,9 @@ set_qf_format: break; case Opt_resize: if (!is_remount) { - printk("EXT4-fs: resize option only available " - "for remount\n"); + ext4_msg(sb, KERN_ERR, + "resize option only available " + "for remount"); return 0; } if (match_int(&args[0], &option) != 0) @@ -1526,8 +1534,9 @@ set_qf_format: if (option < 0 || option > (1 << 30)) return 0; if (!is_power_of_2(option)) { - printk(KERN_ERR "EXT4-fs: inode_readahead_blks" - " must be a power of 2\n"); + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); return 0; } sbi->s_inode_readahead_blks = option; @@ -1554,9 +1563,9 @@ set_qf_format: set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; default: - printk(KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); return 0; } } @@ -1574,21 +1583,21 @@ set_qf_format: (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || (sbi->s_qf_names[GRPQUOTA] && (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { - printk(KERN_ERR "EXT4-fs: old and new quota " - "format mixing.\n"); + ext4_msg(sb, KERN_ERR, "old and new quota " + "format mixing"); return 0; } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journaled quota format " - "not specified.\n"); + ext4_msg(sb, KERN_ERR, "journaled quota format " + "not specified"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journaled quota format " + ext4_msg(sb, KERN_ERR, "journaled quota format " "specified with no journaling " - "enabled.\n"); + "enabled"); return 0; } } @@ -1603,31 +1612,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { - printk(KERN_ERR "EXT4-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext4_msg(sb, KERN_ERR, "revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT4_VALID_FS)) - printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " + "running e2fsck is recommended"); else if ((sbi->s_mount_state & EXT4_ERROR_FS)) - printk(KERN_WARNING - "EXT4-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk(KERN_WARNING - "EXT4-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk(KERN_WARNING - "EXT4-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) @@ -1649,11 +1658,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, sbi->s_mount_opt); if (EXT4_SB(sb)->s_journal) { - printk(KERN_INFO "EXT4 FS on %s, 
%s journal on %s\n", - sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : + ext4_msg(sb, KERN_INFO, "%s journal on %s", + EXT4_SB(sb)->s_journal->j_inode ? "internal" : "external", EXT4_SB(sb)->s_journal->j_devname); } else { - printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); + ext4_msg(sb, KERN_INFO, "no journal"); } return res; } @@ -1688,8 +1697,8 @@ static int ext4_fill_flex_info(struct super_block *sb) memset(sbi->s_flex_groups, 0, size); } if (sbi->s_flex_groups == NULL) { - printk(KERN_ERR "EXT4-fs: not enough memory for " - "%u flex groups\n", flex_group_count); + ext4_msg(sb, KERN_ERR, "not enough memory for " + "%u flex groups", flex_group_count); goto failed; } @@ -1775,32 +1784,32 @@ static int ext4_check_descriptors(struct super_block *sb) block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " - "(block %llu)!\n", i, block_bitmap); + "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " - "(block %llu)!\n", i, inode_bitmap); + "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " - "(block %llu)!\n", i, inode_table); + "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Checksum for group %u failed (%u!=%u)\n", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, - gdp)), le16_to_cpu(gdp->bg_checksum)); + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Checksum for group %u failed (%u!=%u)", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { ext4_unlock_group(sb, i); return 0; @@ -1847,8 +1856,8 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (bdev_read_only(sb->s_bdev)) { - printk(KERN_ERR "EXT4-fs: write access " - "unavailable, skipping orphan cleanup.\n"); + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); return; } @@ -1862,8 +1871,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n", - sb->s_id); + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~MS_RDONLY; } #ifdef CONFIG_QUOTA @@ -1874,9 +1882,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (ret < 0) - printk(KERN_ERR - "EXT4-fs: Cannot turn on journaled " - "quota: error %d\n", ret); + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); } } #endif @@ -1893,16 +1901,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); vfs_dq_init(inode); if (inode->i_nlink) { - printk(KERN_DEBUG - "%s: truncating inode %lu to %lld bytes\n", + ext4_msg(sb, KERN_DEBUG, + "%s: 
truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); ext4_truncate(inode); nr_truncates++; } else { - printk(KERN_DEBUG - "%s: deleting unreferenced inode %lu\n", + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); @@ -1914,11 +1922,11 @@ static void ext4_orphan_cleanup(struct super_block *sb, #define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) - printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", - sb->s_id, PLURAL(nr_orphans)); + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); if (nr_truncates) - printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n", - sb->s_id, PLURAL(nr_truncates)); + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { @@ -2307,7 +2315,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); goto out_fail; } @@ -2323,7 +2331,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logical_sb_block))) { - printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); + ext4_msg(sb, KERN_ERR, "unable to read superblock"); goto out_fail; } /* @@ -2393,9 +2401,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk(KERN_WARNING - "EXT4-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); /* * Check feature flags regardless of the revision level, since we @@ -2404,16 +2412,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); if (features) { - printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", sb->s_id, + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); goto failed_mount; } features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", sb->s_id, + ext4_msg(sb, KERN_ERR, + "Couldn't mount RDWR because of " + "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); goto failed_mount; @@ -2427,9 +2437,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (sizeof(root->i_blocks) < sizeof(u64) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " + ext4_msg(sb, KERN_ERR, "Filesystem with huge " "files cannot be mounted read-write " - "without CONFIG_LBD.\n", sb->s_id); + "without CONFIG_LBD"); goto failed_mount; } } @@ -2437,16 +2447,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (blocksize < 
EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { - printk(KERN_ERR - "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n", - blocksize, sb->s_id); + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d", blocksize); goto failed_mount; } if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad block size %d.\n", + ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); goto failed_mount; } @@ -2456,15 +2465,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) offset = do_div(logical_sb_block, blocksize); bh = sb_bread(sb, logical_sb_block); if (!bh) { - printk(KERN_ERR - "EXT4-fs: Can't read superblock on 2nd try.\n"); + ext4_msg(sb, KERN_ERR, + "Can't read superblock on 2nd try"); goto failed_mount; } es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - printk(KERN_ERR - "EXT4-fs: Magic mismatch, very weird !\n"); + ext4_msg(sb, KERN_ERR, + "Magic mismatch, very weird!"); goto failed_mount; } } @@ -2482,8 +2491,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk(KERN_ERR - "EXT4-fs: unsupported inode size: %d\n", + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", sbi->s_inode_size); goto failed_mount; } @@ -2496,8 +2505,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { - printk(KERN_ERR - "EXT4-fs: unsupported descriptor size %lu\n", + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", sbi->s_desc_size); goto failed_mount; } @@ -2537,25 +2546,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_blocks_per_group > blocksize * 8) { - printk(KERN_ERR - "EXT4-fs: #blocks per group too big: %lu\n", + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > blocksize * 8) { - printk(KERN_ERR - "EXT4-fs: #inodes per group too big: %lu\n", + ext4_msg(sb, KERN_ERR, + "#inodes per group too big: %lu", sbi->s_inodes_per_group); goto failed_mount; } if (ext4_blocks_count(es) > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to mount safely\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely"); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " - "enabled\n"); + ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled"); goto failed_mount; } @@ -2565,8 +2573,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* check blocks count against device size */ blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; if (blocks_count && ext4_blocks_count(es) > blocks_count) { - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " - "exceeds size of device (%llu blocks)\n", + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); goto failed_mount; } @@ -2576,10 +2584,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * of the filesystem. 
*/ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - printk(KERN_WARNING "EXT4-fs: bad geometry: first data" - "block %u is beyond end of filesystem (%llu)\n", - le32_to_cpu(es->s_first_data_block), - ext4_blocks_count(es)); + ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); goto failed_mount; } blocks_count = (ext4_blocks_count(es) - @@ -2587,9 +2595,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { - printk(KERN_WARNING "EXT4-fs: groups count too large: %u " + ext4_msg(sb, KERN_WARNING, "groups count too large: %u " "(block count %llu, first data block %u, " - "blocks per group %lu)\n", sbi->s_groups_count, + "blocks per group %lu)", sbi->s_groups_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); @@ -2601,7 +2609,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk(KERN_ERR "EXT4-fs: not enough memory\n"); + ext4_msg(sb, KERN_ERR, "not enough memory"); goto failed_mount; } @@ -2616,21 +2624,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk(KERN_ERR "EXT4-fs: " - "can't read group descriptor %d\n", i); + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); db_count = i; goto failed_mount2; } } if (!ext4_check_descriptors(sb)) { - printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { - printk(KERN_ERR - "EXT4-fs: unable to initialize " - "flex_bg meta info!\n"); + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); goto failed_mount2; } @@ -2652,7 +2660,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); } if (err) { - printk(KERN_ERR "EXT4-fs: insufficient memory\n"); + ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -2692,13 +2700,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount3; if (!(sb->s_flags & MS_RDONLY) && EXT4_SB(sb)->s_journal->j_failed_commit) { - printk(KERN_CRIT "EXT4-fs error (device %s): " + ext4_msg(sb, KERN_CRIT, "error: " "ext4_fill_super: Journal transaction " - "%u is corrupt\n", sb->s_id, + "%u is corrupt", EXT4_SB(sb)->s_journal->j_failed_commit); if (test_opt(sb, ERRORS_RO)) { - printk(KERN_CRIT - "Mounting filesystem read-only\n"); + ext4_msg(sb, KERN_CRIT, + "Mounting filesystem read-only"); sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); @@ -2712,8 +2720,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { - printk(KERN_ERR "EXT4-fs: required journal recovery " - "suppressed and not mounted read-only\n"); + 
ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); goto failed_mount4; } else { clear_opt(sbi->s_mount_opt, DATA_FLAGS); @@ -2726,7 +2734,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_blocks_count(es) > 0xffffffffULL && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { - printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto failed_mount4; } @@ -2764,8 +2772,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - printk(KERN_ERR "EXT4-fs: Journal does not support " - "requested data journaling mode\n"); + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); goto failed_mount4; } default: @@ -2777,8 +2785,8 @@ no_journal: if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { - printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " - "its supported only with writeback mode\n"); + ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " + "its supported only with writeback mode"); clear_opt(sbi->s_mount_opt, NOBH); } } @@ -2789,18 +2797,18 @@ no_journal: root = ext4_iget(sb, EXT4_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "EXT4-fs: get root inode failed\n"); + ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); - printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { - printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); + ext4_msg(sb, KERN_ERR, "get root dentry failed"); iput(root); ret = -ENOMEM; goto failed_mount4; @@ -2829,29 +2837,29 @@ no_journal: sbi->s_inode_size) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; - printk(KERN_INFO "EXT4-fs: required extra inode space not" - "available.\n"); + ext4_msg(sb, KERN_INFO, "required extra inode space not" + "available"); } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); + ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " + "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + ext4_msg(sb, KERN_INFO, "delayed allocation enabled"); err = ext4_setup_system_zone(sb); if (err) { - printk(KERN_ERR "EXT4-fs: failed to initialize system " - "zone (%d)\n", err); + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)\n", err); goto failed_mount4; } ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", - err); + ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", + err); goto failed_mount4; } @@ -2869,7 +2877,7 @@ no_journal: ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; if (needs_recovery) { - printk(KERN_INFO "EXT4-fs: recovery complete.\n"); + ext4_msg(sb, KERN_INFO, "recovery complete"); ext4_mark_recovery_complete(sb, es); } if (EXT4_SB(sb)->s_journal) { @@ -2882,20 
+2890,18 @@ no_journal: } else descr = "out journal"; - printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", - sb->s_id, descr); + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); lock_kernel(); return 0; cantfind_ext4: if (!silent) - printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", - sb->s_id); + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; failed_mount4: - printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "mount failed"); ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -2973,27 +2979,27 @@ static journal_t *ext4_get_journal(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { - printk(KERN_ERR "EXT4-fs: no journal found.\n"); + ext4_msg(sb, KERN_ERR, "no journal found"); return NULL; } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); - printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return NULL; } jbd_debug(2, "Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { - printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); + ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; } journal = jbd2_journal_init_inode(journal_inode); if (!journal) { - printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return NULL; } @@ -3017,13 +3023,13 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); - bdev = ext4_blkdev_get(j_dev); + bdev = ext4_blkdev_get(j_dev, sb); if (bdev == NULL) return NULL; if (bd_claim(bdev, sb)) { - printk(KERN_ERR - "EXT4-fs: failed to claim external journal device.\n"); + ext4_msg(sb, KERN_ERR, + "failed to claim external journal device"); blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -3031,8 +3037,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, blocksize = sb->s_blocksize; hblock = bdev_hardsect_size(bdev); if (blocksize < hblock) { - printk(KERN_ERR - "EXT4-fs: blocksize too small for journal device.\n"); + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); goto out_bdev; } @@ -3040,8 +3046,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); if (!(bh = __bread(bdev, sb_block, blocksize))) { - printk(KERN_ERR "EXT4-fs: couldn't read superblock of " - "external journal\n"); + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); goto out_bdev; } @@ -3049,14 +3055,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - printk(KERN_ERR "EXT4-fs: external journal has " - "bad superblock\n"); + ext4_msg(sb, KERN_ERR, "external journal has " + "bad superblock"); brelse(bh); goto out_bdev; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); brelse(bh); goto out_bdev; } @@ -3068,19 +3074,19 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, journal = jbd2_journal_init_dev(bdev, 
sb->s_bdev, start, len, blocksize); if (!journal) { - printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); + ext4_msg(sb, KERN_ERR, "failed to create device journal"); goto out_bdev; } journal->j_private = sb; ll_rw_block(READ, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { - printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); + ext4_msg(sb, KERN_ERR, "I/O error on journal device"); goto out_journal; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - printk(KERN_ERR "EXT4-fs: External journal has more than one " - "user (unsupported) - %d\n", + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } @@ -3109,8 +3115,8 @@ static int ext4_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { - printk(KERN_INFO "EXT4-fs: external journal device major/minor " - "numbers have changed\n"); + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); @@ -3124,21 +3130,21 @@ static int ext4_load_journal(struct super_block *sb, */ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT4-fs: INFO: recovery " - "required on readonly filesystem.\n"); + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); if (really_read_only) { - printk(KERN_ERR "EXT4-fs: write access " - "unavailable, cannot proceed.\n"); + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed"); return -EROFS; } - printk(KERN_INFO "EXT4-fs: write access will " - "be enabled during recovery.\n"); + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); } } if (journal_inum && journal_dev) { - printk(KERN_ERR "EXT4-fs: filesystem has both journal " - "and inode journals!\n"); + ext4_msg(sb, KERN_ERR, "filesystem has both journal " + "and inode journals!"); return -EINVAL; } @@ -3151,14 +3157,14 @@ static int ext4_load_journal(struct super_block *sb, } if (journal->j_flags & JBD2_BARRIER) - printk(KERN_INFO "EXT4-fs: barriers enabled\n"); + ext4_msg(sb, KERN_INFO, "barriers enabled"); else - printk(KERN_INFO "EXT4-fs: barriers disabled\n"); + ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = jbd2_journal_update_format(journal); if (err) { - printk(KERN_ERR "EXT4-fs: error updating journal.\n"); + ext4_msg(sb, KERN_ERR, "error updating journal"); jbd2_journal_destroy(journal); return err; } @@ -3170,7 +3176,7 @@ static int ext4_load_journal(struct super_block *sb, err = jbd2_journal_load(journal); if (err) { - printk(KERN_ERR "EXT4-fs: error loading journal.\n"); + ext4_msg(sb, KERN_ERR, "error loading journal"); jbd2_journal_destroy(journal); return err; } @@ -3206,8 +3212,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) * be remapped. Nothing we can do but to retry the * write and hope for the best. 
*/ - printk(KERN_ERR "EXT4-fs: previous I/O error to " - "superblock detected for %s.\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } @@ -3230,8 +3236,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) error = buffer_write_io_error(sbh); if (error) { - printk(KERN_ERR "EXT4-fs: I/O error while writing " - "superblock for %s.\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } @@ -3478,9 +3484,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP))) { - printk(KERN_WARNING "EXT4-fs: %s: couldn't " + ext4_msg(sb, KERN_WARNING, "couldn't " "remount RDWR because of unsupported " - "optional features (%x).\n", sb->s_id, + "optional features (%x)", (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); err = -EROFS; @@ -3496,9 +3502,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { - printk(KERN_ERR - "EXT4-fs: ext4_remount: " - "Checksum for group %u failed (%u!=%u)\n", + ext4_msg(sb, KERN_ERR, + "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EINVAL; @@ -3512,11 +3517,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * require a full umount/remount for now. */ if (es->s_last_orphan) { - printk(KERN_WARNING "EXT4-fs: %s: couldn't " + ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead.\n", - sb->s_id); + "umount/remount instead"); err = -EINVAL; goto restore_opts; } @@ -3772,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path.dentry->d_parent != sb->s_root) - printk(KERN_WARNING - "EXT4-fs: Quota file not on filesystem root. " - "Journaled quota will not work.\n"); + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. 
" + "Journaled quota will not work"); } /* @@ -3857,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, handle_t *handle = journal_current_handle(); if (EXT4_SB(sb)->s_journal && !handle) { - printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" - " cancelled because transaction is not started.\n", + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } @@ -3930,10 +3934,10 @@ static struct file_system_type ext4_fs_type = { static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data,struct vfsmount *mnt) { - printk(KERN_WARNING "EXT4-fs: Update your userspace programs " - "to mount using ext4\n"); - printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " - "will go away by 2.6.31\n"); + printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs " + "to mount using ext4\n", dev_name); + printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility " + "will go away by 2.6.31\n", dev_name); return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } -- cgit v1.2.3 From 1938a150c25bf7c2c47182e753a1038945b70b0e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 5 Jun 2009 01:00:26 -0400 Subject: ext4: Avoid leaking blocks after a block allocation failure We should add inode to the orphan list in the same transaction as block allocation. This ensures that if we crash after a failed block allocation and before we do a vmtruncate we don't leak block (ie block marked as used in bitmap but not claimed by the inode). Signed-off-by: Aneesh Kumar K.V CC: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 17ed0d244db..8d215881172 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1459,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + int ret, needed_blocks; handle_t *handle; int retries = 0; struct page *page; @@ -1470,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, "dev %s ino %lu pos %llu len %u flags %u", inode->i_sb->s_id, inode->i_ino, (unsigned long long) pos, len, flags); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1503,15 +1508,30 @@ retry: if (ret) { unlock_page(page); - ext4_journal_stop(handle); page_cache_release(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before + * truncate finishes */ if (pos + len > inode->i_size) + ext4_orphan_add(handle, inode); + + ext4_journal_stop(handle); + if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. 
+ */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.2.3 From f8514083cd61daef12fba5ef883ad9352c450428 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 5 Jun 2009 00:56:49 -0400 Subject: ext4: truncate the file properly if we fail to copy data from userspace In generic_perform_write if we fail to copy the user data we don't update the inode->i_size. We should truncate the file in the above case so that we don't have blocks allocated outside inode->i_size. Add the inode to orphan list in the same transaction as block allocation This ensures that if we crash in between the recovery would do the truncate. Signed-off-by: Aneesh Kumar K.V CC: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 128 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d215881172..2c10d346f7a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1549,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) return ext4_handle_dirty_metadata(handle, NULL, bh); } +static int ext4_generic_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i_size_changed = 0; + struct inode *inode = mapping->host; + handle_t *handle = ext4_journal_current_handle(); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos + copied > inode->i_size) { + i_size_write(inode, pos + copied); + i_size_changed = 1; + } + + if (pos + copied > EXT4_I(inode)->i_disksize) { + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_update_i_disksize(inode, (pos + copied)); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ext4_mark_inode_dirty(handle, inode); + + return copied; +} + /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). @@ -1572,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file, ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ext4_mark_inode_dirty(handle, inode); - } - - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; + if (pos + len > inode->i_size) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. 
So truncate them + */ + ext4_orphan_add(handle, inode); if (ret2 < 0) ret = ret2; } @@ -1594,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file, if (!ret) ret = ret2; + if (pos + len > inode->i_size) { + vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; } @@ -1605,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file, handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; int ret = 0, ret2; - loff_t new_i_size; trace_mark(ext4_writeback_write_end, "dev %s ino %lu pos %llu len %u copied %u", inode->i_sb->s_id, inode->i_ino, (unsigned long long) pos, len, copied); - new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ext4_mark_inode_dirty(handle, inode); - } - - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; + if (pos + len > inode->i_size) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + if (ret2 < 0) ret = ret2; @@ -1631,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file, if (!ret) ret = ret2; + if (pos + len > inode->i_size) { + vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; } @@ -1675,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file, } unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - page_cache_release(page); + if (pos + len > inode->i_size) { + vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } return ret ? ret : copied; } -- cgit v1.2.3 From f6d03139d745198b434f65a28aabed524f415a4c Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 5 Jun 2009 07:18:57 +0100 Subject: GFS2: Fix locking issue mounting gfs2meta fs This patch uses sget() to get a reference to the existing gfs2 sb when mouting the gfs2meta filesystem (in fact thats just another mount of the gfs2 filesystem with a different root and this interface is for backward compatibility). 
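A condensed sketch of the mechanism (the two callbacks are the ones this patch introduces; the caller fragment is abbreviated and error handling is trimmed): sget() walks the superblocks already registered for gfs2_fs_type, the test callback matches on the underlying block device, and the set callback deliberately fails so that a fresh superblock can never be created here; only an already-mounted gfs2 filesystem can be returned.

static int test_meta_super(struct super_block *s, void *ptr)
{
	struct block_device *bdev = ptr;

	/* the meta mount matches whichever gfs2 sb sits on this bdev */
	return (bdev == s->s_bdev);
}

static int set_meta_super(struct super_block *s, void *ptr)
{
	/* refuse to instantiate a new sb: gfs2meta only piggybacks on
	 * an existing gfs2 mount */
	return -EINVAL;
}

	/* caller side: returns the existing sb with a reference held,
	 * or ERR_PTR(-EINVAL) if no gfs2 mount exists on that device */
	s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
		 path.dentry->d_inode->i_sb->s_bdev);
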
Signed-off-by: Steven Whitehouse Reported-by: Benjamin Marzinski Tested-by: Benjamin Marzinski Cc: Christoph Hellwig --- fs/gfs2/ops_fstype.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 2cd1164c88d..9da161cbb30 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1273,9 +1273,20 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); } +static int test_meta_super(struct super_block *s, void *ptr) +{ + struct block_device *bdev = ptr; + return (bdev == s->s_bdev); +} + +static int set_meta_super(struct super_block *s, void *ptr) +{ + return -EINVAL; +} + static struct super_block *get_gfs2_sb(const char *dev_name) { - struct super_block *sb; + struct super_block *s; struct path path; int error; @@ -1283,30 +1294,27 @@ static struct super_block *get_gfs2_sb(const char *dev_name) if (error) { printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", dev_name, error); - return NULL; + return ERR_PTR(-ENOENT); } - sb = path.dentry->d_inode->i_sb; - if (sb && (sb->s_type == &gfs2_fs_type)) - atomic_inc(&sb->s_active); - else - sb = NULL; + s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, + path.dentry->d_inode->i_sb->s_bdev); path_put(&path); - return sb; + return s; } static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - struct super_block *sb = NULL; + struct super_block *s; struct gfs2_sbd *sdp; - sb = get_gfs2_sb(dev_name); - if (!sb) { + s = get_gfs2_sb(dev_name); + if (IS_ERR(s)) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); - return -ENOENT; + return PTR_ERR(s); } - sdp = sb->s_fs_info; - mnt->mnt_sb = sb; + sdp = s->s_fs_info; + mnt->mnt_sb = s; mnt->mnt_root = dget(sdp->sd_master_dir); return 0; } -- cgit v1.2.3 From 460bcf57b128ce1c0dd553d905fedc097f9955c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 07:37:56 -0400 Subject: Fix nobh_truncate_page() to not pass stack garbage to get_block() The nobh_truncate_page() function is used by ext2, exofs, and jfs. Of these three, only ext2 and jfs's get_block() function pays attention to bh->b_size --- which is normally always the filesystem blocksize except when the get_block() function is called by either mpage_readpage(), mpage_readpages(), or the direct I/O routines in fs/direct_io.c. Unfortunately, nobh_truncate_page() does not initialize map_bh before calling the filesystem-supplied get_block() function. So ext2 and jfs will try to calculate the number of blocks to map by taking stack garbage and shifting it left by inode->i_blkbits. This should be *mostly* harmless (except the filesystem will do some unnneeded work) unless the stack garbage is less than filesystem's blocksize, in which case maxblocks will be zero, and the attempt to find out whether or not the filesystem has a hole at a given logical block will fail, and the page cache entry might not get zero'ed out. Also if the stack garbage in in map_bh->state happens to have the BH_Mapped bit set, there could be an attempt to call readpage() on a non-existent page, which could cause nobh_truncate_page() to return an error when it should not. Fix this by initializing map_bh->state and map_bh->size. Fortunately, it's probably fairly unlikely that ext2 and jfs users mount with nobh these days. 
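A minimal sketch of the pattern the fix enforces (locals such as blocksize, iblock and the get_block callback are assumed from the surrounding truncate helper): an on-stack buffer_head must have its state cleared and its size seeded before it is handed to get_block(), otherwise the callback sees stack garbage.

	struct buffer_head map_bh;

	/* b_state must start out clear (no stray BH_Mapped and friends)
	 * and b_size tells get_block() how much mapping is wanted: here,
	 * exactly one filesystem block */
	map_bh.b_size = blocksize;
	map_bh.b_state = 0;

	err = get_block(inode, iblock, &map_bh, 0);
	if (err)
		goto unlock;
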
Signed-off-by: "Theodore Ts'o" Cc: Dave Kleikamp Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- fs/buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb..49106127a4a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2736,6 +2736,8 @@ has_buffers: pos += blocksize; } + map_bh.b_size = blocksize; + map_bh.b_state = 0; err = get_block(inode, iblock, &map_bh, 0); if (err) goto unlock; -- cgit v1.2.3 From 72a43d63cb51057393edfbcfc4596066205ad15d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 13 May 2009 19:13:40 +0100 Subject: ext3/4 with synchronous writes gets wedged by Postfix OK, that's probably the easiest way to do that, as much as I don't like it... Since iget() et.al. will not accept I_FREEING (will wait to go away and restart), and since we'd better have serialization between new/free on fs data structures anyway, we can afford simply skipping I_FREEING et.al. in insert_inode_locked(). We do that from new_inode, so it won't race with free_inode in any interesting ways and it won't race with iget (of any origin; nfsd or in case of fs corruption a lookup) since both still will wait for I_LOCK. Reviewed-by: "Theodore Ts'o" Acked-by: Jan Kara Tested-by: David Watson Signed-off-by: Al Viro --- fs/inode.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 0571983755d..a4876e56195 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1053,13 +1053,22 @@ int insert_inode_locked(struct inode *inode) struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - struct inode *old; inode->i_state |= I_LOCK|I_NEW; while (1) { + struct hlist_node *node; + struct inode *old = NULL; spin_lock(&inode_lock); - old = find_inode_fast(sb, head, ino); - if (likely(!old)) { + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_ino != ino) + continue; + if (old->i_sb != sb) + continue; + if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + continue; + break; + } + if (likely(!node)) { hlist_add_head(&inode->i_hash, head); spin_unlock(&inode_lock); return 0; @@ -1081,14 +1090,24 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, { struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - struct inode *old; inode->i_state |= I_LOCK|I_NEW; while (1) { + struct hlist_node *node; + struct inode *old = NULL; + spin_lock(&inode_lock); - old = find_inode(sb, head, test, data); - if (likely(!old)) { + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_sb != sb) + continue; + if (!test(old, data)) + continue; + if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + continue; + break; + } + if (likely(!node)) { hlist_add_head(&inode->i_hash, head); spin_unlock(&inode_lock); return 0; -- cgit v1.2.3 From 4ae1507f6d266d0cc3dd36e474d83aad70fec9e4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 24 May 2009 18:45:15 -0400 Subject: cifs: make overriding of ownership conditional on new mount options We have a bit of a problem with the uid= option. The basic issue is that it means too many things and has too many side-effects. It's possible to allow an unprivileged user to mount a filesystem if the user owns the mountpoint, /bin/mount is setuid root, and the mount is set up in /etc/fstab with the "user" option. When doing this though, /bin/mount automatically adds the "uid=" and "gid=" options to the share. 
This is fortunate since the correct uid= option is needed in order to tell the upcall what user's credcache to use when generating the SPNEGO blob. On a mount without unix extensions this is fine -- you generally will want the files to be owned by the "owner" of the mount. The problem comes in on a mount with unix extensions. With those enabled, the uid/gid options cause the ownership of files to be overriden even though the server is sending along the ownership info. This means that it's not possible to have a mount by an unprivileged user that shows the server's file ownership info. The result is also inode permissions that have no reflection at all on the server. You simply cannot separate ownership from the mode in this fashion. This behavior also makes MultiuserMount option less usable. Once you pass in the uid= option for a mount, then you can't use unix ownership info and allow someone to share the mount. While I'm not thrilled with it, the only solution I can see is to stop making uid=/gid= force the overriding of ownership on mounts, and to add new mount options that turn this behavior on. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 74b5a87e919..10151f8d849 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1096,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname, return 1; } } else if (strnicmp(data, "uid", 3) == 0) { - if (value && *value) { + if (value && *value) vol->linux_uid = simple_strtoul(value, &value, 0); + } else if (strnicmp(data, "forceuid", 8) == 0) { vol->override_uid = 1; - } } else if (strnicmp(data, "gid", 3) == 0) { - if (value && *value) { + if (value && *value) vol->linux_gid = simple_strtoul(value, &value, 0); + } else if (strnicmp(data, "forcegid", 8) == 0) { vol->override_gid = 1; - } } else if (strnicmp(data, "file_mode", 4) == 0) { if (value && *value) { vol->file_mode = -- cgit v1.2.3 From f0472d0ec89bef2ea4432828c3daa1b26ef569aa Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 6 Jun 2009 21:09:39 +0000 Subject: [CIFS] Add mention of new mount parm (forceuid) to cifs readme Also update fs/cifs/CHANGES Signed-off-by: Steve French --- fs/cifs/CHANGES | 5 ++++- fs/cifs/README | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 227c681b816..b4868983942 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -2,7 +2,10 @@ Version 1.59 ------------ Client uses server inode numbers (which are persistent) rather than client generated ones by default (mount option "serverino" turned -on by default if server supports it). +on by default if server supports it). Add forceuid and forcegid +mount options (so that when negotiating unix extensions specifying +which uid mounted does not immediately force the server's reported +uids to be overridden). Version 1.58 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 6d1608fabde..ad92921dbde 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -262,7 +262,8 @@ A partial list of the supported mount options follows: mount. domain Set the SMB/CIFS workgroup name prepended to the username during CIFS session establishment - uid Set the default uid for inodes. For mounts to servers + forceuid Set the default uid for inodes based on the uid + passed in. 
For mounts to servers which do support the CIFS Unix extensions, such as a properly configured Samba server, the server provides the uid, gid and mode so this parameter should not be @@ -292,6 +293,12 @@ A partial list of the supported mount options follows: the client. Note that the mount.cifs helper must be at version 1.10 or higher to support specifying the uid (or gid) in non-numeric form. + forcegid (similar to above but for the groupid instead of uid) + uid Set the default uid for inodes, and indicate to the + cifs kernel driver which local user mounted . If the server + supports the unix extensions the default uid is + not used to fill in the owner fields of inodes (files) + unless the "forceuid" parameter is specified. gid Set the default gid for inodes (similar to above). file_mode If CIFS Unix extensions are not supported by the server this overrides the default mode for file inodes. -- cgit v1.2.3 From f07502dae230a2c3b65381fd1b06e8a18b2c7525 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 6 Jun 2009 21:18:09 +0100 Subject: integrity: fix IMA inode leak CONFIG_IMA=y inode activity leaks iint_cache and radix_tree_node objects until the system runs out of memory. Nowhere is calling ima_inode_free() a.k.a. ima_iint_delete(). Fix that by calling it from destroy_inode(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index a4876e56195..bca0c618fdb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -219,6 +219,7 @@ static struct inode *alloc_inode(struct super_block *sb) void destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + ima_inode_free(inode); security_inode_free(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); -- cgit v1.2.3 From 9aee2286071c23c535fe9928eec1a26e0bcf256d Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Mon, 8 Jun 2009 12:41:35 -0400 Subject: ext4: fix dx_map_entry to support 256k directory blocks The dx_map_entry structure doesn't support over 64KB block size by current usage of its member("offs"). Because "offs" treats an offset of copies of the ext4_dir_entry_2 structure as is. This member size is 16 bits. But real offset for over 64KB(256KB) block size needs 18 bits. However, real offset keeps 4 byte boundary, so lower 2 bits is not used. Therefore, we do the following to fix this limitation: For "store": we divide the real offset by 4 and then store this result to "offs" member. For "use": we multiply "offs" member by 4 and then use this result as real offset. 
Signed-off-by: Toshiyuki Okajima Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f2bc160463b..07eb6649e4f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -749,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, ext4fs_dirhash(de->name, de->name_len, &h); map_tail--; map_tail->hash = h.hash; - map_tail->offs = (u16) ((char *) de - base); + map_tail->offs = ((char *) de - base)>>2; map_tail->size = le16_to_cpu(de->rec_len); count++; cond_resched(); @@ -1147,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, unsigned rec_len = 0; while (count--) { - struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs); + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = -- cgit v1.2.3 From 0eab928221bac8895a0b494a16a8810002bd8645 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 9 Jun 2009 09:54:40 -0400 Subject: ext4: Don't treat a truncation of a zero-length file as replace-via-truncate If a non-existent file is opened via O_WRONLY|O_CREAT|O_TRUNC, there's no need to treat this as a true file truncation, so we shouldn't activate the replace-via-truncate hueristic. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c10d346f7a..875db944b22 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4127,7 +4127,8 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + if (ei->i_disksize && inode->i_size == 0 && + !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { -- cgit v1.2.3 From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Jun 2009 13:43:05 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the deivce from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print. While blktrace do the convertion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array(). 
I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and no regression when using those trace events vs blktrace. And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array(). Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. Signed-off-by: Li Zefan LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- fs/bio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 98711647ece..740699c4f90 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -26,10 +26,9 @@ #include #include #include -#include #include /* for struct sg_iovec */ -DEFINE_TRACE(block_split); +#include /* * Test patch to inline a certain number of bi_io_vec's inside the bio -- cgit v1.2.3 From 463aea1a1c49f1a7d4b50656dfd6c8bb33358b1b Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 9 Jun 2009 16:26:24 -0700 Subject: autofs4: remove hashed check in validate_wait() The recent ->lookup() deadlock correction required the directory inode mutex to be dropped while waiting for expire completion. We were concerned about side effects from this change and one has been identified. I saw several error messages. They cause autofs to become quite confused and don't really point to the actual problem. Things like: handle_packet_missing_direct:1376: can't find map entry for (43,1827932) which is usually totally fatal (although in this case it wouldn't be except that I treat is as such because it normally is). 
do_mount_direct: direct trigger not valid or already mounted /test/nested/g3c/s1/ss1 which is recoverable, however if this problem is at play it can cause autofs to become quite confused as to the dependencies in the mount tree because mount triggers end up mounted multiple times. It's hard to accurately check for this over mounting case and automount shouldn't need to if the kernel module is doing its job. There was one other message, similar in consequence of this last one but I can't locate a log example just now. When checking if a mount has already completed prior to adding a new mount request to the wait queue we check if the dentry is hashed and, if so, if it is a mount point. But, if a mount successfully completed while we slept on the wait queue mutex the dentry must exist for the mount to have completed so the test is not really needed. Mounts can also be done on top of a global root dentry, so for the above case, where a mount request completes and the wait queue entry has already been removed, the hashed test returning false can cause an incorrect callback to the daemon. Also, d_mountpoint() is not sufficient to check if a mount has completed for the multi-mount case when we don't have a real mount at the base of the tree. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index eeb24684590..2341375386f 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { /* - * If the dentry isn't hashed just go ahead and try the - * mount again with a new wait (not much else we can do). - */ - if (!d_unhashed(dentry)) { - /* - * But if the dentry is hashed, that means that we - * got here through the revalidate path. Thus, we - * need to check if the dentry has been mounted - * while we waited on the wq_mutex. If it has, - * simply return success. - */ - if (d_mountpoint(dentry)) - return 0; - } + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at it's base) we can + * continue on and create a new request. + */ + if (have_submounts(dentry)) + return 0; } return 1; -- cgit v1.2.3 From a61d90d75d0f9e86432c45b496b4b0fbf0fd03dc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 9 Jun 2009 16:26:26 -0700 Subject: jbd: fix race in buffer processing in commit code In commit code, we scan buffers attached to a transaction. During this scan, we sometimes have to drop j_list_lock and then we recheck whether the journal buffer head didn't get freed by journal_try_to_free_buffers(). But checking for buffer_jbd(bh) isn't enough because a new journal head could get attached to our buffer head. So add a check whether the journal head remained the same and whether it's still at the same transaction and list. This is a nasty bug and can cause problems like memory corruption (use after free) or trigger various assertions in JBD code (observed). 
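A condensed sketch of the recheck (following the shape of the fix; the surrounding commit loop, locals and locking are assumed): after j_list_lock has been dropped and retaken, testing buffer_jbd() alone is not enough, because a new journal head may have been attached to the same buffer_head in the meantime.

	/* the jh may have been freed and even replaced while the lock was
	 * dropped: make sure it is still our jh, still on the committing
	 * transaction, and still on the list we expect */
	if (!buffer_jbd(bh) || bh2jh(bh) != jh ||
	    jh->b_transaction != commit_transaction ||
	    jh->b_jlist != BJ_SyncData) {
		jbd_unlock_bh_state(bh);
		continue;
	}
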
Signed-off-by: Jan Kara Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 06560c520f4..618e21c0b7a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -241,7 +241,7 @@ write_out_data: spin_lock(&journal->j_list_lock); } /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) + if (!buffer_jbd(bh) || bh2jh(bh) != jh || jh->b_transaction != commit_transaction || jh->b_jlist != BJ_SyncData) { jbd_unlock_bh_state(bh); @@ -478,7 +478,9 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_list_lock); continue; } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) { __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); -- cgit v1.2.3 From 40bc9a27e00d6c8c7e4dc2865c02d7402a950472 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 10 Jun 2009 09:09:40 +0100 Subject: GFS2: Fix cache coherency between truncate and O_DIRECT read If a page was partially zeroed as the result of a truncate, then it was not being correctly marked dirty. This resulted in the deleted data reappearing if the file was read back via direct I/O. Reported-by: Eric Sandeen Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1153a078920..329763530dc 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1012,7 +1012,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) gfs2_trans_add_bh(ip->i_gl, bh, 0); zero_user(page, offset, length); - + mark_buffer_dirty(bh); unlock: unlock_page(page); page_cache_release(page); -- cgit v1.2.3 From 003dec8913d6bebb4ecc989ec04a235cf38f5ea9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 10 Jun 2009 10:31:45 +0100 Subject: GFS2: Merge gfs2_get_sb into gfs2_get_sb_meta These don't need to be separate functions. 
Reported-by: Christoph Hellwig Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_fstype.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9da161cbb30..f234aba36fb 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1284,9 +1284,11 @@ static int set_meta_super(struct super_block *s, void *ptr) return -EINVAL; } -static struct super_block *get_gfs2_sb(const char *dev_name) +static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) { struct super_block *s; + struct gfs2_sbd *sdp; struct path path; int error; @@ -1294,21 +1296,11 @@ static struct super_block *get_gfs2_sb(const char *dev_name) if (error) { printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", dev_name, error); - return ERR_PTR(-ENOENT); + return error; } s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); - return s; -} - -static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) -{ - struct super_block *s; - struct gfs2_sbd *sdp; - - s = get_gfs2_sb(dev_name); if (IS_ERR(s)) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); return PTR_ERR(s); -- cgit v1.2.3 From 58f7f68f228c3aba2ba4468d92e2cec35724ba2e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Jun 2009 09:57:55 -0400 Subject: cifs: add addr= mount option alias for ip= When you look in /proc/mounts, the address of the server gets displayed as "addr=". That's really a better option to use anyway since it's more generic. What if we eventually want to support non-IP transports? It also makes CIFS option consistent with the NFS option of the same name. Begin the migration to that option name by adding an alias for ip= called addr=. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 10151f8d849..6298dc32ade 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -958,7 +958,8 @@ cifs_parse_mount_options(char *options, const char *devname, } strcpy(vol->password, value); } - } else if (strnicmp(data, "ip", 2) == 0) { + } else if (!strnicmp(data, "ip", 2) || + !strnicmp(data, "addr", 4)) { if (!value || !*value) { vol->UNCip = NULL; } else if (strnlen(value, INET6_ADDRSTRLEN) < -- cgit v1.2.3 From a41f20716975910d9beb90b7efc61107901492b8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 10 Jun 2009 14:22:55 -0400 Subject: ext4: Avoid corrupting the uninitialized bit in the extent during truncate The unitialized bit was not properly getting preserved in in an extent which is partially truncated because the it was geting set to the value of the first extent to be removed or truncated as part of the truncate operation, and if there are multiple extents are getting removed or modified as part of the truncate operation, it is only the last extent which will might be partially truncated, and its uninitalized bit is not necessarily the same as the first extent to be truncated. 
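A condensed sketch of the corrected loop shape (locals are assumed from ext4_ext_rm_leaf(); the body that actually removes or shrinks the extent is elided): the flag has to be re-read for every extent visited, so that the extent which ends up partially truncated keeps its own uninitialized state rather than the state of the first extent the loop happened to see.

	while (ex >= EXT_FIRST_EXTENT(eh) &&
	       ex_ee_block + ex_ee_len > start) {

		/* sample the flag from the extent being processed in this
		 * iteration, not once before the loop */
		if (ext4_ext_is_uninitialized(ex))
			uninitialized = 1;
		else
			uninitialized = 0;

		/* ... remove or truncate this extent, then move on to the
		 * previous one ... */
	}
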
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9c35a7b1f0a..2593f748c3a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2083,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; ex_ee_len = ext4_ext_get_actual_len(ex); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { + + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + else + uninitialized = 0; + ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); path[depth].p_ext = ex; -- cgit v1.2.3 From 61b6bc525a34931bb73e4c95bfe009cd9572a288 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Jun 2009 10:04:58 -0400 Subject: cifs: remove never-used in6_addr option This option was never used to my knowledge. Remove it before someone does... Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6298dc32ade..97f4311b9a8 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -62,7 +62,6 @@ struct smb_vol { char *domainname; char *UNC; char *UNCip; - char *in6_addr; /* ipv6 address as human readable form of in6_addr */ char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[16]; /* netbios name of client */ char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ @@ -1320,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname, vol->direct_io = 1; } else if (strnicmp(data, "forcedirectio", 13) == 0) { vol->direct_io = 1; - } else if (strnicmp(data, "in6_addr", 8) == 0) { - if (!value || !*value) { - vol->in6_addr = NULL; - } else if (strnlen(value, 49) == 48) { - vol->in6_addr = value; - } else { - printk(KERN_WARNING "CIFS: ip v6 address not " - "48 characters long\n"); - return 1; - } } else if (strnicmp(data, "noac", 4) == 0) { printk(KERN_WARNING "CIFS: Mount option noac not " "supported. 
Instead set " -- cgit v1.2.3 From 0e0c62123b517d2b3c26922342c0cc5bb63a93f8 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 10 Jun 2009 12:57:07 -0700 Subject: fs/bio.c: add missing __user annotation As reported by sparse: fs/bio.c:720:13: warning: incorrect type in assignment (different address spaces) fs/bio.c:720:13: expected char *iov_addr fs/bio.c:720:13: got void [noderef] * fs/bio.c:724:36: warning: incorrect type in argument 2 (different address spaces) fs/bio.c:724:36: expected void const [noderef] *from fs/bio.c:724:36: got char *iov_addr Signed-off-by: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index ab423a1024a..533266a5e58 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -722,7 +722,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, while (bv_len && iov_idx < iov_count) { unsigned int bytes; - char *iov_addr; + char __user *iov_addr; bytes = min_t(unsigned int, iov[iov_idx].iov_len - iov_off, bv_len); -- cgit v1.2.3 From 93d5581e20600593ec3236921b6620225fb76034 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 14:03:55 +0100 Subject: devpts: unregister the file system on error Closes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=13429 Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- fs/devpts/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c68edb96944..9b1d285f9fe 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -557,8 +557,10 @@ static int __init init_devpts_fs(void) int err = register_filesystem(&devpts_fs_type); if (!err) { devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) + if (IS_ERR(devpts_mnt)) { err = PTR_ERR(devpts_mnt); + unregister_filesystem(&devpts_fs_type); + } } return err; } -- cgit v1.2.3 From 2e1483c995bbd0fa6cbd055ad76088a520799ba4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 11 Jun 2009 13:24:13 +0100 Subject: kmemleak: Remove some of the kmemleak false positives There are allocations for which the main pointer cannot be found but they are not memory leaks. This patch fixes some of them. For more information on false positives, see Documentation/kmemleak.txt. Signed-off-by: Catalin Marinas --- fs/block_dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index f45dbc18dd1..d250f807fd8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "internal.h" @@ -492,6 +493,11 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); + /* + * This vfsmount structure is only used to obtain the + * blockdev_superblock, so tell kmemleak not to report it. + */ + kmemleak_not_leak(bd_mnt); blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } -- cgit v1.2.3