From e55014923e65e4ee8e477a1212381cca0125f3aa Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Wed, 19 Sep 2007 14:38:12 +1000
Subject: [POWERPC] spufs: Cleanup ELF coredump extra notes logic

To start with, arch_notes_size() etc. is a little too ambiguous a name for
my liking, so change the function names to be more explicit.

Calling through macros is ugly, especially with hidden parameters, so don't
do that, call the routines directly.

Use ARCH_HAVE_EXTRA_ELF_NOTES as the only flag, and based on it decide
whether we want the extern declarations or the empty versions.

Since we have empty routines, actually use them in the coredump code to
save a few #ifdefs.

We want to change the handling of foffset so that the write routine updates
foffset as it goes, instead of using file->f_pos (so that writing to a pipe
works).  So pass foffset to the write routine, and for now just set it to
file->f_pos at the end of writing.

It should also be possible for the write routine to fail, so change it to
return int and treat a non-zero return as failure.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 fs/binfmt_elf.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4482a0673b1..b1013f34085 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1514,9 +1514,6 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 	int thread_status_size = 0;
 	elf_addr_t *auxv;
 	unsigned long mm_flags;
-#ifdef ELF_CORE_WRITE_EXTRA_NOTES
-	int extra_notes_size;
-#endif
 
 	/*
 	 * We no longer stop all VM operations.
@@ -1645,10 +1642,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		
 		sz += thread_status_size;
 
-#ifdef ELF_CORE_WRITE_EXTRA_NOTES
-		extra_notes_size = ELF_CORE_EXTRA_NOTES_SIZE;
-		sz += extra_notes_size;
-#endif
+		sz += elf_coredump_extra_notes_size();
 
 		fill_elf_note_phdr(&phdr, sz, offset);
 		offset += sz;
@@ -1698,10 +1692,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file)
 		if (!writenote(notes + i, file, &foffset))
 			goto end_coredump;
 
-#ifdef ELF_CORE_WRITE_EXTRA_NOTES
-	ELF_CORE_WRITE_EXTRA_NOTES;
-	foffset += extra_notes_size;
-#endif
+	if (elf_coredump_extra_notes_write(file, &foffset))
+		goto end_coredump;
 
 	/* write out the thread status notes section */
 	list_for_each(t, &thread_list) {
-- 
cgit v1.2.3


From f5ff8422bbdd59f8c1f699df248e1b7a11073027 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 21 Sep 2007 09:19:54 +0200
Subject: Fix warnings with !CONFIG_BLOCK

Hide everything in blkdev.h with CONFIG_BLOCK isn't set, and fixup
the (few) files that fail to build because they were relying on blkdev.h
pulling in extra includes for them.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/fs-writeback.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a4b142a6a2c..8d23b0b3871 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -14,6 +14,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
-- 
cgit v1.2.3


From 9cc54d40b8ca01fcefc9151044b6996565061d90 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 27 Sep 2007 12:46:12 +0200
Subject: Only call bi_end_io once for any bio

Currently bi_end_io can be called multiple times as sub-requests
complete.  However no ->bi_end_io function wants to know about that.
So only call when the bio is complete.

Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./fs/bio.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff .prev/fs/bio.c ./fs/bio.c
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 29a44c1b64c..5720b940bb5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1018,6 +1018,8 @@ void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
 {
 	if (error)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
+	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		error = -EIO;
 
 	if (unlikely(bytes_done > bio->bi_size)) {
 		printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
@@ -1028,7 +1030,7 @@ void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
 	bio->bi_size -= bytes_done;
 	bio->bi_sector += (bytes_done >> 9);
 
-	if (bio->bi_end_io)
+	if (bio->bi_size && bio->bi_end_io)
 		bio->bi_end_io(bio, bytes_done, error);
 }
 
-- 
cgit v1.2.3


From 5bb23a688b2de23d7765a1dd439d89c038378978 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 27 Sep 2007 12:46:13 +0200
Subject: Don't decrement bi_size in bio_endio

The only caller of bio_endio that does not pass the full bi_size
is end_that_request_first.  Also, no ->bi_end_io method is really
interested in bi_size being decremented.

So move the decrement and related code into ll_rw_blk and merge it
with order_bio_endio to form req_bio_endio which does endio functionality
specific to request completion.

As some ->bi_end_io methods do check bi_size of 0, we set it thus for
now, but that will go in the next patch.

Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./block/ll_rw_blk.c |   42 +++++++++++++++++++++++++++---------------
 ./fs/bio.c          |   23 +++++++++++------------
 2 files changed, 38 insertions(+), 27 deletions(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 5720b940bb5..3adecd64ff6 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1006,13 +1006,14 @@ void bio_check_pages_dirty(struct bio *bio)
  * @error:	error, if any
  *
  * Description:
- *   bio_endio() will end I/O on @bytes_done number of bytes. This may be
- *   just a partial part of the bio, or it may be the whole bio. bio_endio()
- *   is the preferred way to end I/O on a bio, it takes care of decrementing
- *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success, and
- *   and one of the established -Exxxx (-EIO, for instance) error values in
- *   case something went wrong. Noone should call bi_end_io() directly on
- *   a bio unless they own it and thus know that it has an end_io function.
+ *   bio_endio() will end I/O on @bytes_done number of bytes. This
+ *   must always be the whole (remaining) bio. bio_endio() is the
+ *   preferred way to end I/O on a bio, it takes care of clearing
+ *   BIO_UPTODATE on error. @error is 0 on success, and and one of the
+ *   established -Exxxx (-EIO, for instance) error values in case
+ *   something went wrong. Noone should call bi_end_io() directly on a
+ *   bio unless they own it and thus know that it has an end_io
+ *   function.
  **/
 void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
 {
@@ -1021,16 +1022,14 @@ void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
 
-	if (unlikely(bytes_done > bio->bi_size)) {
+	if (unlikely(bytes_done != bio->bi_size)) {
 		printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
 						bytes_done, bio->bi_size);
 		bytes_done = bio->bi_size;
 	}
 
-	bio->bi_size -= bytes_done;
-	bio->bi_sector += (bytes_done >> 9);
-
-	if (bio->bi_size && bio->bi_end_io)
+	bio->bi_size = 0; /* expected by some callees - will be removed */
+	if (bio->bi_end_io)
 		bio->bi_end_io(bio, bytes_done, error);
 }
 
-- 
cgit v1.2.3


From 6712ecf8f648118c3363c142196418f89a510b90 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 27 Sep 2007 12:47:43 +0200
Subject: Drop 'size' argument from bio_endio and bi_end_io

As bi_end_io is only called once when the reqeust is complete,
the 'size' argument is now redundant.  Remove it.

Now there is no need for bio_endio to subtract the size completed
from bi_size.  So don't do that either.

While we are at it, change bi_end_io to return void.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c                     | 35 +++++++----------------------------
 fs/block_dev.c               |  2 +-
 fs/buffer.c                  |  6 +-----
 fs/direct-io.c               | 13 ++-----------
 fs/gfs2/super.c              |  4 +---
 fs/jfs/jfs_logmgr.c          |  5 +----
 fs/jfs/jfs_metapage.c        | 12 ++----------
 fs/mpage.c                   | 12 ++----------
 fs/ocfs2/cluster/heartbeat.c |  4 ----
 fs/xfs/linux-2.6/xfs_aops.c  |  4 ----
 fs/xfs/linux-2.6/xfs_buf.c   |  4 ----
 11 files changed, 17 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 3adecd64ff6..5f604f269df 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -798,13 +798,9 @@ void bio_unmap_user(struct bio *bio)
 	bio_put(bio);
 }
 
-static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
+static void bio_map_kern_endio(struct bio *bio, int err)
 {
-	if (bio->bi_size)
-		return 1;
-
 	bio_put(bio);
-	return 0;
 }
 
 
@@ -1002,12 +998,10 @@ void bio_check_pages_dirty(struct bio *bio)
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
- * @bytes_done:	number of bytes completed
  * @error:	error, if any
  *
  * Description:
- *   bio_endio() will end I/O on @bytes_done number of bytes. This
- *   must always be the whole (remaining) bio. bio_endio() is the
+ *   bio_endio() will end I/O on the whole bio. bio_endio() is the
  *   preferred way to end I/O on a bio, it takes care of clearing
  *   BIO_UPTODATE on error. @error is 0 on success, and and one of the
  *   established -Exxxx (-EIO, for instance) error values in case
@@ -1015,22 +1009,15 @@ void bio_check_pages_dirty(struct bio *bio)
  *   bio unless they own it and thus know that it has an end_io
  *   function.
  **/
-void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
+void bio_endio(struct bio *bio, int error)
 {
 	if (error)
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
 
-	if (unlikely(bytes_done != bio->bi_size)) {
-		printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
-						bytes_done, bio->bi_size);
-		bytes_done = bio->bi_size;
-	}
-
-	bio->bi_size = 0; /* expected by some callees - will be removed */
 	if (bio->bi_end_io)
-		bio->bi_end_io(bio, bytes_done, error);
+		bio->bi_end_io(bio, error);
 }
 
 void bio_pair_release(struct bio_pair *bp)
@@ -1038,37 +1025,29 @@ void bio_pair_release(struct bio_pair *bp)
 	if (atomic_dec_and_test(&bp->cnt)) {
 		struct bio *master = bp->bio1.bi_private;
 
-		bio_endio(master, master->bi_size, bp->error);
+		bio_endio(master, bp->error);
 		mempool_free(bp, bp->bio2.bi_private);
 	}
 }
 
-static int bio_pair_end_1(struct bio * bi, unsigned int done, int err)
+static void bio_pair_end_1(struct bio *bi, int err)
 {
 	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
 
 	if (err)
 		bp->error = err;
 
-	if (bi->bi_size)
-		return 1;
-
 	bio_pair_release(bp);
-	return 0;
 }
 
-static int bio_pair_end_2(struct bio * bi, unsigned int done, int err)
+static void bio_pair_end_2(struct bio *bi, int err)
 {
 	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
 
 	if (err)
 		bp->error = err;
 
-	if (bi->bi_size)
-		return 1;
-
 	bio_pair_release(bp);
-	return 0;
 }
 
 /*
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2980eabe577..6339a30879b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,7 +172,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 }
 
 #if 0
-static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+static void blk_end_aio(struct bio *bio, int error)
 {
 	struct kiocb *iocb = bio->bi_private;
 	atomic_t *bio_count = &iocb->ki_bio_count;
diff --git a/fs/buffer.c b/fs/buffer.c
index 0e5ec371ce7..75b51dfa5e0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2634,13 +2634,10 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
 	return tmp.b_blocknr;
 }
 
-static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
+static void end_bio_bh_io_sync(struct bio *bio, int err)
 {
 	struct buffer_head *bh = bio->bi_private;
 
-	if (bio->bi_size)
-		return 1;
-
 	if (err == -EOPNOTSUPP) {
 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
 		set_bit(BH_Eopnotsupp, &bh->b_state);
@@ -2648,7 +2645,6 @@ static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
 
 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
 	bio_put(bio);
-	return 0;
 }
 
 int submit_bh(int rw, struct buffer_head * bh)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 901dc55e9f5..b5928a7b6a5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -264,15 +264,12 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
 /*
  * Asynchronous IO callback. 
  */
-static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
+static void dio_bio_end_aio(struct bio *bio, int error)
 {
 	struct dio *dio = bio->bi_private;
 	unsigned long remaining;
 	unsigned long flags;
 
-	if (bio->bi_size)
-		return 1;
-
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
 
@@ -287,8 +284,6 @@ static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 		aio_complete(dio->iocb, ret, 0);
 		kfree(dio);
 	}
-
-	return 0;
 }
 
 /*
@@ -298,21 +293,17 @@ static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
  * During I/O bi_private points at the dio.  After I/O, bi_private is used to
  * implement a singly-linked list of completed BIOs, at dio->bio_list.
  */
-static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
+static void dio_bio_end_io(struct bio *bio, int error)
 {
 	struct dio *dio = bio->bi_private;
 	unsigned long flags;
 
-	if (bio->bi_size)
-		return 1;
-
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	bio->bi_private = dio->bio_list;
 	dio->bio_list = bio;
 	if (--dio->refcount == 1 && dio->waiter)
 		wake_up_process(dio->waiter);
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
-	return 0;
 }
 
 static int
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f916b9740c7..2473e2a86d1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -160,11 +160,9 @@ int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
 }
 
 
-static int end_bio_io_page(struct bio *bio, unsigned int bytes_done, int error)
+static void end_bio_io_page(struct bio *bio, int error)
 {
 	struct page *page = bio->bi_private;
-	if (bio->bi_size)
-		return 1;
 
 	if (!error)
 		SetPageUptodate(page);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index de3e4a506db..57c3b8ac36b 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2200,16 +2200,13 @@ static int lbmIOWait(struct lbuf * bp, int flag)
  *
  * executed at INTIODONE level
  */
-static int lbmIODone(struct bio *bio, unsigned int bytes_done, int error)
+static void lbmIODone(struct bio *bio, int error)
 {
 	struct lbuf *bp = bio->bi_private;
 	struct lbuf *nextbp, *tail;
 	struct jfs_log *log;
 	unsigned long flags;
 
-	if (bio->bi_size)
-		return 1;
-
 	/*
 	 * get back jfs buffer bound to the i/o buffer
 	 */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 62e96be02ac..1332adc0b9f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -280,14 +280,10 @@ static void last_read_complete(struct page *page)
 	unlock_page(page);
 }
 
-static int metapage_read_end_io(struct bio *bio, unsigned int bytes_done,
-				int err)
+static void metapage_read_end_io(struct bio *bio, int err)
 {
 	struct page *page = bio->bi_private;
 
-	if (bio->bi_size)
-		return 1;
-
 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 		printk(KERN_ERR "metapage_read_end_io: I/O error\n");
 		SetPageError(page);
@@ -341,16 +337,12 @@ static void last_write_complete(struct page *page)
 	end_page_writeback(page);
 }
 
-static int metapage_write_end_io(struct bio *bio, unsigned int bytes_done,
-				 int err)
+static void metapage_write_end_io(struct bio *bio, int err)
 {
 	struct page *page = bio->bi_private;
 
 	BUG_ON(!PagePrivate(page));
 
-	if (bio->bi_size)
-		return 1;
-
 	if (! test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 		printk(KERN_ERR "metapage_write_end_io: I/O error\n");
 		SetPageError(page);
diff --git a/fs/mpage.c b/fs/mpage.c
index c1698f2291a..b1c3e589050 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -39,14 +39,11 @@
  * status of that page is hard.  See end_buffer_async_read() for the details.
  * There is no point in duplicating all that complexity.
  */
-static int mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
+static void mpage_end_io_read(struct bio *bio, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 
-	if (bio->bi_size)
-		return 1;
-
 	do {
 		struct page *page = bvec->bv_page;
 
@@ -62,17 +59,13 @@ static int mpage_end_io_read(struct bio *bio, unsigned int bytes_done, int err)
 		unlock_page(page);
 	} while (bvec >= bio->bi_io_vec);
 	bio_put(bio);
-	return 0;
 }
 
-static int mpage_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
+static void mpage_end_io_write(struct bio *bio, int err)
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 
-	if (bio->bi_size)
-		return 1;
-
 	do {
 		struct page *page = bvec->bv_page;
 
@@ -87,7 +80,6 @@ static int mpage_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
 		end_page_writeback(page);
 	} while (bvec >= bio->bi_io_vec);
 	bio_put(bio);
-	return 0;
 }
 
 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 2bd7f788cf3..da2c2b442b4 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -217,7 +217,6 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
 }
 
 static int o2hb_bio_end_io(struct bio *bio,
-			   unsigned int bytes_done,
 			   int error)
 {
 	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
@@ -227,9 +226,6 @@ static int o2hb_bio_end_io(struct bio *bio,
 		wc->wc_error = error;
 	}
 
-	if (bio->bi_size)
-		return 1;
-
 	o2hb_bio_wait_dec(wc, 1);
 	bio_put(bio);
 	return 0;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 5f152f60d74..3f13519436a 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -326,14 +326,10 @@ xfs_iomap_valid(
 STATIC int
 xfs_end_bio(
 	struct bio		*bio,
-	unsigned int		bytes_done,
 	int			error)
 {
 	xfs_ioend_t		*ioend = bio->bi_private;
 
-	if (bio->bi_size)
-		return 1;
-
 	ASSERT(atomic_read(&bio->bi_cnt) >= 1);
 	ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b0f0e58866d..6a75f4d984a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1106,16 +1106,12 @@ _xfs_buf_ioend(
 STATIC int
 xfs_buf_bio_end_io(
 	struct bio		*bio,
-	unsigned int		bytes_done,
 	int			error)
 {
 	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
 	unsigned int		blocksize = bp->b_target->bt_bsize;
 	struct bio_vec		*bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 
-	if (bio->bi_size)
-		return 1;
-
 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		bp->b_error = EIO;
 
-- 
cgit v1.2.3


From f58c4c0a17e500e767473598b3deafaa1d64051d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Oct 2007 13:23:51 +0200
Subject: compat_ioctl: move common block ioctls to compat_blkdev_ioctl

Make compat_blkdev_ioctl and blkdev_ioctl reflect the respective
native versions. This is somewhat more efficient and makes it easier
to keep the two in sync.

Also get rid of the bogus handling for broken_blkgetsize and the
duplicate entry for BLKRASET.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/compat_ioctl.c | 45 ---------------------------------------------
 1 file changed, 45 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 37310b0e810..6be121868c6 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1537,12 +1537,6 @@ ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 }
 
 #ifdef CONFIG_BLOCK
-static int broken_blkgetsize(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	/* The mkswap binary hard codes it to Intel value :-((( */
-	return w_long(fd, BLKGETSIZE, arg);
-}
-
 struct blkpg_ioctl_arg32 {
 	compat_int_t op;
 	compat_int_t flags;
@@ -1578,29 +1572,6 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg);
 }
 
-#ifdef CONFIG_BLOCK
-/* Fix sizeof(sizeof()) breakage */
-#define BLKBSZGET_32   _IOR(0x12,112,int)
-#define BLKBSZSET_32   _IOW(0x12,113,int)
-#define BLKGETSIZE64_32        _IOR(0x12,114,int)
-
-static int do_blkbszget(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-       return sys_ioctl(fd, BLKBSZGET, (unsigned long)compat_ptr(arg));
-}
-
-static int do_blkbszset(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-       return sys_ioctl(fd, BLKBSZSET, (unsigned long)compat_ptr(arg));
-}
-
-static int do_blkgetsize64(unsigned int fd, unsigned int cmd,
-                          unsigned long arg)
-{
-       return sys_ioctl(fd, BLKGETSIZE64, (unsigned long)compat_ptr(arg));
-}
-#endif
-
 /* Bluetooth ioctls */
 #define HCIUARTSETPROTO	_IOW('U', 200, int)
 #define HCIUARTGETPROTO	_IOR('U', 201, int)
@@ -2546,19 +2517,11 @@ COMPATIBLE_IOCTL(FDFMTTRK)
 COMPATIBLE_IOCTL(FDRAWCMD)
 /* 0x12 */
 #ifdef CONFIG_BLOCK
-COMPATIBLE_IOCTL(BLKRASET)
-COMPATIBLE_IOCTL(BLKROSET)
-COMPATIBLE_IOCTL(BLKROGET)
-COMPATIBLE_IOCTL(BLKRRPART)
-COMPATIBLE_IOCTL(BLKFLSBUF)
 COMPATIBLE_IOCTL(BLKSECTSET)
-COMPATIBLE_IOCTL(BLKSSZGET)
 COMPATIBLE_IOCTL(BLKTRACESTART)
 COMPATIBLE_IOCTL(BLKTRACESTOP)
 COMPATIBLE_IOCTL(BLKTRACESETUP)
 COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
-ULONG_IOCTL(BLKRASET)
-ULONG_IOCTL(BLKFRASET)
 #endif
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
@@ -3337,11 +3300,6 @@ HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
-HANDLE_IOCTL(BLKRAGET, w_long)
-HANDLE_IOCTL(BLKGETSIZE, w_long)
-HANDLE_IOCTL(0x1260, broken_blkgetsize)
-HANDLE_IOCTL(BLKFRAGET, w_long)
-HANDLE_IOCTL(BLKSECTGET, w_long)
 HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
 HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
@@ -3415,9 +3373,6 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl)
 HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl)
 /* block stuff */
 #ifdef CONFIG_BLOCK
-HANDLE_IOCTL(BLKBSZGET_32, do_blkbszget)
-HANDLE_IOCTL(BLKBSZSET_32, do_blkbszset)
-HANDLE_IOCTL(BLKGETSIZE64_32, do_blkgetsize64)
 /* Raw devices */
 HANDLE_IOCTL(RAW_SETBIND, raw_ioctl)
 HANDLE_IOCTL(RAW_GETBIND, raw_ioctl)
-- 
cgit v1.2.3


From 7199d4cdd8485f802df3e1bc131245c69009b9a4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Oct 2007 13:23:52 +0200
Subject: compat_ioctl: add compat_blkdev_driver_ioctl()

Handle those blockdev ioctl calls that are compatible
directly from the compat_blkdev_ioctl() function, instead
of having to go through the compat_ioctl hash lookup.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/compat_ioctl.c | 83 -------------------------------------------------------
 1 file changed, 83 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 6be121868c6..16d681c331f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -2477,47 +2477,8 @@ COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
-/* 0x03 -- HD/IDE ioctl's used by hdparm and friends.
- *         Some need translations, these do not.
- */
-COMPATIBLE_IOCTL(HDIO_GET_IDENTITY)
-COMPATIBLE_IOCTL(HDIO_DRIVE_TASK)
-COMPATIBLE_IOCTL(HDIO_DRIVE_CMD)
-ULONG_IOCTL(HDIO_SET_MULTCOUNT)
-ULONG_IOCTL(HDIO_SET_UNMASKINTR)
-ULONG_IOCTL(HDIO_SET_KEEPSETTINGS)
-ULONG_IOCTL(HDIO_SET_32BIT)
-ULONG_IOCTL(HDIO_SET_NOWERR)
-ULONG_IOCTL(HDIO_SET_DMA)
-ULONG_IOCTL(HDIO_SET_PIO_MODE)
-ULONG_IOCTL(HDIO_SET_NICE)
-ULONG_IOCTL(HDIO_SET_WCACHE)
-ULONG_IOCTL(HDIO_SET_ACOUSTIC)
-ULONG_IOCTL(HDIO_SET_BUSSTATE)
-ULONG_IOCTL(HDIO_SET_ADDRESS)
-COMPATIBLE_IOCTL(HDIO_SCAN_HWIF)
-/* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */
-COMPATIBLE_IOCTL(0x330)
-/* 0x02 -- Floppy ioctls */
-COMPATIBLE_IOCTL(FDMSGON)
-COMPATIBLE_IOCTL(FDMSGOFF)
-COMPATIBLE_IOCTL(FDSETEMSGTRESH)
-COMPATIBLE_IOCTL(FDFLUSH)
-COMPATIBLE_IOCTL(FDWERRORCLR)
-COMPATIBLE_IOCTL(FDSETMAXERRS)
-COMPATIBLE_IOCTL(FDGETMAXERRS)
-COMPATIBLE_IOCTL(FDGETDRVTYP)
-COMPATIBLE_IOCTL(FDEJECT)
-COMPATIBLE_IOCTL(FDCLRPRM)
-COMPATIBLE_IOCTL(FDFMTBEG)
-COMPATIBLE_IOCTL(FDFMTEND)
-COMPATIBLE_IOCTL(FDRESET)
-COMPATIBLE_IOCTL(FDTWADDLE)
-COMPATIBLE_IOCTL(FDFMTTRK)
-COMPATIBLE_IOCTL(FDRAWCMD)
 /* 0x12 */
 #ifdef CONFIG_BLOCK
-COMPATIBLE_IOCTL(BLKSECTSET)
 COMPATIBLE_IOCTL(BLKTRACESTART)
 COMPATIBLE_IOCTL(BLKTRACESTOP)
 COMPATIBLE_IOCTL(BLKTRACESETUP)
@@ -2770,50 +2731,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
 COMPATIBLE_IOCTL(PPGETPHASE)
 COMPATIBLE_IOCTL(PPGETFLAGS)
 COMPATIBLE_IOCTL(PPSETFLAGS)
-/* CDROM stuff */
-COMPATIBLE_IOCTL(CDROMPAUSE)
-COMPATIBLE_IOCTL(CDROMRESUME)
-COMPATIBLE_IOCTL(CDROMPLAYMSF)
-COMPATIBLE_IOCTL(CDROMPLAYTRKIND)
-COMPATIBLE_IOCTL(CDROMREADTOCHDR)
-COMPATIBLE_IOCTL(CDROMREADTOCENTRY)
-COMPATIBLE_IOCTL(CDROMSTOP)
-COMPATIBLE_IOCTL(CDROMSTART)
-COMPATIBLE_IOCTL(CDROMEJECT)
-COMPATIBLE_IOCTL(CDROMVOLCTRL)
-COMPATIBLE_IOCTL(CDROMSUBCHNL)
-ULONG_IOCTL(CDROMEJECT_SW)
-COMPATIBLE_IOCTL(CDROMMULTISESSION)
-COMPATIBLE_IOCTL(CDROM_GET_MCN)
-COMPATIBLE_IOCTL(CDROMRESET)
-COMPATIBLE_IOCTL(CDROMVOLREAD)
-COMPATIBLE_IOCTL(CDROMSEEK)
-COMPATIBLE_IOCTL(CDROMPLAYBLK)
-COMPATIBLE_IOCTL(CDROMCLOSETRAY)
-ULONG_IOCTL(CDROM_SET_OPTIONS)
-ULONG_IOCTL(CDROM_CLEAR_OPTIONS)
-ULONG_IOCTL(CDROM_SELECT_SPEED)
-ULONG_IOCTL(CDROM_SELECT_DISC)
-ULONG_IOCTL(CDROM_MEDIA_CHANGED)
-ULONG_IOCTL(CDROM_DRIVE_STATUS)
-COMPATIBLE_IOCTL(CDROM_DISC_STATUS)
-COMPATIBLE_IOCTL(CDROM_CHANGER_NSLOTS)
-ULONG_IOCTL(CDROM_LOCKDOOR)
-ULONG_IOCTL(CDROM_DEBUG)
-COMPATIBLE_IOCTL(CDROM_GET_CAPABILITY)
-/* Ignore cdrom.h about these next 5 ioctls, they absolutely do
- * not take a struct cdrom_read, instead they take a struct cdrom_msf
- * which is compatible.
- */
-COMPATIBLE_IOCTL(CDROMREADMODE2)
-COMPATIBLE_IOCTL(CDROMREADMODE1)
-COMPATIBLE_IOCTL(CDROMREADRAW)
-COMPATIBLE_IOCTL(CDROMREADCOOKED)
-COMPATIBLE_IOCTL(CDROMREADALL)
-/* DVD ioctls */
-COMPATIBLE_IOCTL(DVD_READ_STRUCT)
-COMPATIBLE_IOCTL(DVD_WRITE_STRUCT)
-COMPATIBLE_IOCTL(DVD_AUTH)
 /* pktcdvd */
 COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
 /* Big A */
-- 
cgit v1.2.3


From 171044d449611c6e5040b37210ff6aba47f33ee4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Oct 2007 13:23:53 +0200
Subject: compat_ioctl: handle blk_trace ioctls

blk_trace_setup is broken on x86_64 compat systems,
this makes the code work correctly on all 64 bit architectures
in compat mode.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/compat_ioctl.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 16d681c331f..71065603a58 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -62,7 +62,6 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
-#include <linux/blktrace_api.h>
 #include <linux/loop.h>
 
 #include <net/bluetooth/bluetooth.h>
@@ -2477,13 +2476,6 @@ COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
-/* 0x12 */
-#ifdef CONFIG_BLOCK
-COMPATIBLE_IOCTL(BLKTRACESTART)
-COMPATIBLE_IOCTL(BLKTRACESTOP)
-COMPATIBLE_IOCTL(BLKTRACESETUP)
-COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
-#endif
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
 COMPATIBLE_IOCTL(GET_ARRAY_INFO)
-- 
cgit v1.2.3


From 9617db085c119879cd371e3212806a15596e121a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Oct 2007 13:23:55 +0200
Subject: compat_ioctl: move hdio calls to block/compat_ioctl.c

These are common to multiple block drivers, so they should
be handled by the block layer.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/compat_ioctl.c | 60 -------------------------------------------------------
 1 file changed, 60 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 71065603a58..e6a94714b1b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -21,7 +21,6 @@
 #include <linux/if.h>
 #include <linux/if_bridge.h>
 #include <linux/slab.h>
-#include <linux/hdreg.h>
 #include <linux/raid/md.h>
 #include <linux/kd.h>
 #include <linux/dirent.h>
@@ -667,53 +666,6 @@ out:
 #endif
 
 #ifdef CONFIG_BLOCK
-struct hd_geometry32 {
-	unsigned char heads;
-	unsigned char sectors;
-	unsigned short cylinders;
-	u32 start;
-};
-                        
-static int hdio_getgeo(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	mm_segment_t old_fs = get_fs();
-	struct hd_geometry geo;
-	struct hd_geometry32 __user *ugeo;
-	int err;
-	
-	set_fs (KERNEL_DS);
-	err = sys_ioctl(fd, HDIO_GETGEO, (unsigned long)&geo);
-	set_fs (old_fs);
-	ugeo = compat_ptr(arg);
-	if (!err) {
-		err = copy_to_user (ugeo, &geo, 4);
-		err |= __put_user (geo.start, &ugeo->start);
-		if (err)
-			err = -EFAULT;
-	}
-	return err;
-}
-
-static int hdio_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	mm_segment_t old_fs = get_fs();
-	unsigned long kval;
-	unsigned int __user *uvp;
-	int error;
-
-	set_fs(KERNEL_DS);
-	error = sys_ioctl(fd, cmd, (long)&kval);
-	set_fs(old_fs);
-
-	if(error == 0) {
-		uvp = compat_ptr(arg);
-		if(put_user(kval, uvp))
-			error = -EFAULT;
-	}
-	return error;
-}
-
-
 typedef struct sg_io_hdr32 {
 	compat_int_t interface_id;	/* [i] 'S' for SCSI generic (required) */
 	compat_int_t dxfer_direction;	/* [i] data transfer direction  */
@@ -3208,19 +3160,7 @@ HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
 HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
-HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
 HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_UNMASKINTR, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_MULTCOUNT, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_KEEPSETTINGS, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_32BIT, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_NOWERR, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_DMA, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_NICE, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_WCACHE, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_ACOUSTIC, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_ADDRESS, hdio_ioctl_trans)
-HANDLE_IOCTL(HDIO_GET_BUSSTATE, hdio_ioctl_trans)
 HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
-- 
cgit v1.2.3


From 18cf7f8723d913ce02bea43e468bebdd07bc880c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Oct 2007 13:23:56 +0200
Subject: compat_ioctl: move BLKPG handling to block/compat_ioctl.c

BLKPG is common to all block devices, so it should be handled
by common code.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/compat_ioctl.c | 33 ---------------------------------
 1 file changed, 33 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e6a94714b1b..3baa90d3109 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -47,7 +47,6 @@
 #include <linux/netdevice.h>
 #include <linux/raw.h>
 #include <linux/smb_fs.h>
-#include <linux/blkpg.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/rtc.h>
@@ -1487,37 +1486,6 @@ ret_einval(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return -EINVAL;
 }
 
-#ifdef CONFIG_BLOCK
-struct blkpg_ioctl_arg32 {
-	compat_int_t op;
-	compat_int_t flags;
-	compat_int_t datalen;
-	compat_caddr_t data;
-};
-
-static int blkpg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct blkpg_ioctl_arg32 __user *ua32 = compat_ptr(arg);
-	struct blkpg_ioctl_arg __user *a = compat_alloc_user_space(sizeof(*a));
-	compat_caddr_t udata;
-	compat_int_t n;
-	int err;
-	
-	err = get_user(n, &ua32->op);
-	err |= put_user(n, &a->op);
-	err |= get_user(n, &ua32->flags);
-	err |= put_user(n, &a->flags);
-	err |= get_user(n, &ua32->datalen);
-	err |= put_user(n, &a->datalen);
-	err |= get_user(udata, &ua32->data);
-	err |= put_user(compat_ptr(udata), &a->data);
-	if (err)
-		return err;
-
-	return sys_ioctl(fd, cmd, (unsigned long)a);
-}
-#endif
-
 static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
 	return rw_long(fd, AUTOFS_IOC_SETTIMEOUT, arg);
@@ -3160,7 +3128,6 @@ HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
 HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
-HANDLE_IOCTL(BLKPG, blkpg_ioctl_trans)
 HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
 HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
-- 
cgit v1.2.3


From b3087cc4f31a66c8c7b63419e913ed9d34145f10 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Oct 2007 13:23:56 +0200
Subject: compat_ioctl: move cdrom handlers to block/compat_ioctl.c

These are shared by all cd-rom drivers and should have common
handlers. Do slight cosmetic cleanups in the process.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/compat_ioctl.c | 105 ------------------------------------------------------
 1 file changed, 105 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 3baa90d3109..24c74357112 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -37,7 +37,6 @@
 #include <linux/if_ppp.h>
 #include <linux/if_pppox.h>
 #include <linux/mtio.h>
-#include <linux/cdrom.h>
 #include <linux/auto_fs.h>
 #include <linux/auto_fs4.h>
 #include <linux/tty.h>
@@ -1039,108 +1038,6 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err ? -EFAULT: 0;
 }
 
-struct cdrom_read_audio32 {
-	union cdrom_addr	addr;
-	u8			addr_format;
-	compat_int_t		nframes;
-	compat_caddr_t		buf;
-};
-
-struct cdrom_generic_command32 {
-	unsigned char	cmd[CDROM_PACKET_SIZE];
-	compat_caddr_t	buffer;
-	compat_uint_t	buflen;
-	compat_int_t	stat;
-	compat_caddr_t	sense;
-	unsigned char	data_direction;
-	compat_int_t	quiet;
-	compat_int_t	timeout;
-	compat_caddr_t	reserved[1];
-};
-  
-static int cdrom_do_read_audio(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct cdrom_read_audio __user *cdread_audio;
-	struct cdrom_read_audio32 __user *cdread_audio32;
-	__u32 data;
-	void __user *datap;
-
-	cdread_audio = compat_alloc_user_space(sizeof(*cdread_audio));
-	cdread_audio32 = compat_ptr(arg);
-
-	if (copy_in_user(&cdread_audio->addr,
-			 &cdread_audio32->addr,
-			 (sizeof(*cdread_audio32) -
-			  sizeof(compat_caddr_t))))
-	 	return -EFAULT;
-
-	if (get_user(data, &cdread_audio32->buf))
-		return -EFAULT;
-	datap = compat_ptr(data);
-	if (put_user(datap, &cdread_audio->buf))
-		return -EFAULT;
-
-	return sys_ioctl(fd, cmd, (unsigned long) cdread_audio);
-}
-
-static int cdrom_do_generic_command(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	struct cdrom_generic_command __user *cgc;
-	struct cdrom_generic_command32 __user *cgc32;
-	u32 data;
-	unsigned char dir;
-	int itmp;
-
-	cgc = compat_alloc_user_space(sizeof(*cgc));
-	cgc32 = compat_ptr(arg);
-
-	if (copy_in_user(&cgc->cmd, &cgc32->cmd, sizeof(cgc->cmd)) ||
-	    get_user(data, &cgc32->buffer) ||
-	    put_user(compat_ptr(data), &cgc->buffer) ||
-	    copy_in_user(&cgc->buflen, &cgc32->buflen,
-			 (sizeof(unsigned int) + sizeof(int))) ||
-	    get_user(data, &cgc32->sense) ||
-	    put_user(compat_ptr(data), &cgc->sense) ||
-	    get_user(dir, &cgc32->data_direction) ||
-	    put_user(dir, &cgc->data_direction) ||
-	    get_user(itmp, &cgc32->quiet) ||
-	    put_user(itmp, &cgc->quiet) ||
-	    get_user(itmp, &cgc32->timeout) ||
-	    put_user(itmp, &cgc->timeout) ||
-	    get_user(data, &cgc32->reserved[0]) ||
-	    put_user(compat_ptr(data), &cgc->reserved[0]))
-		return -EFAULT;
-
-	return sys_ioctl(fd, cmd, (unsigned long) cgc);
-}
-
-static int cdrom_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	int err;
-
-	switch(cmd) {
-	case CDROMREADAUDIO:
-		err = cdrom_do_read_audio(fd, cmd, arg);
-		break;
-
-	case CDROM_SEND_PACKET:
-		err = cdrom_do_generic_command(fd, cmd, arg);
-		break;
-
-	default:
-		do {
-			static int count;
-			if (++count <= 20)
-				printk("cdrom_ioctl: Unknown cmd fd(%d) "
-				       "cmd(%08x) arg(%08x)\n",
-				       (int)fd, (unsigned int)cmd, (unsigned int)arg);
-		} while(0);
-		err = -EINVAL;
-		break;
-	};
-
-	return err;
-}
 #endif /* CONFIG_BLOCK */
 
 #ifdef CONFIG_VT
@@ -3147,8 +3044,6 @@ HANDLE_IOCTL(PPPIOCSACTIVE32, ppp_sock_fprog_ioctl_trans)
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans)
 HANDLE_IOCTL(MTIOCPOS32, mt_ioctl_trans)
-HANDLE_IOCTL(CDROMREADAUDIO, cdrom_ioctl_trans)
-HANDLE_IOCTL(CDROM_SEND_PACKET, cdrom_ioctl_trans)
 #endif
 #define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,unsigned int)
 HANDLE_IOCTL(AUTOFS_IOC_SETTIMEOUT32, ioc_settimeout)
-- 
cgit v1.2.3


From 1ca91cd0336b05b91c51b403c9ed9d297813533f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 9 Oct 2007 13:23:57 +0200
Subject: compat_ioctl: move floppy handlers to block/compat_ioctl.c

The floppy ioctls are used by multiple drivers, so they should be
handled in a shared location. Also, add minor cleanups.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/compat_ioctl.c | 337 ------------------------------------------------------
 1 file changed, 337 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 24c74357112..b9e3357bcc2 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -32,7 +32,6 @@
 #include <linux/vt.h>
 #include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/fd.h>
 #include <linux/ppp_defs.h>
 #include <linux/if_ppp.h>
 #include <linux/if_pppox.h>
@@ -1407,333 +1406,6 @@ static int ioc_settimeout(unsigned int fd, unsigned int cmd, unsigned long arg)
 #define HIDPGETCONNLIST	_IOR('H', 210, int)
 #define HIDPGETCONNINFO	_IOR('H', 211, int)
 
-#ifdef CONFIG_BLOCK
-struct floppy_struct32 {
-	compat_uint_t	size;
-	compat_uint_t	sect;
-	compat_uint_t	head;
-	compat_uint_t	track;
-	compat_uint_t	stretch;
-	unsigned char	gap;
-	unsigned char	rate;
-	unsigned char	spec1;
-	unsigned char	fmt_gap;
-	const compat_caddr_t name;
-};
-
-struct floppy_drive_params32 {
-	char		cmos;
-	compat_ulong_t	max_dtr;
-	compat_ulong_t	hlt;
-	compat_ulong_t	hut;
-	compat_ulong_t	srt;
-	compat_ulong_t	spinup;
-	compat_ulong_t	spindown;
-	unsigned char	spindown_offset;
-	unsigned char	select_delay;
-	unsigned char	rps;
-	unsigned char	tracks;
-	compat_ulong_t	timeout;
-	unsigned char	interleave_sect;
-	struct floppy_max_errors max_errors;
-	char		flags;
-	char		read_track;
-	short		autodetect[8];
-	compat_int_t	checkfreq;
-	compat_int_t	native_format;
-};
-
-struct floppy_drive_struct32 {
-	signed char	flags;
-	compat_ulong_t	spinup_date;
-	compat_ulong_t	select_date;
-	compat_ulong_t	first_read_date;
-	short		probed_format;
-	short		track;
-	short		maxblock;
-	short		maxtrack;
-	compat_int_t	generation;
-	compat_int_t	keep_data;
-	compat_int_t	fd_ref;
-	compat_int_t	fd_device;
-	compat_int_t	last_checked;
-	compat_caddr_t dmabuf;
-	compat_int_t	bufblocks;
-};
-
-struct floppy_fdc_state32 {
-	compat_int_t	spec1;
-	compat_int_t	spec2;
-	compat_int_t	dtr;
-	unsigned char	version;
-	unsigned char	dor;
-	compat_ulong_t	address;
-	unsigned int	rawcmd:2;
-	unsigned int	reset:1;
-	unsigned int	need_configure:1;
-	unsigned int	perp_mode:2;
-	unsigned int	has_fifo:1;
-	unsigned int	driver_version;
-	unsigned char	track[4];
-};
-
-struct floppy_write_errors32 {
-	unsigned int	write_errors;
-	compat_ulong_t	first_error_sector;
-	compat_int_t	first_error_generation;
-	compat_ulong_t	last_error_sector;
-	compat_int_t	last_error_generation;
-	compat_uint_t	badness;
-};
-
-#define FDSETPRM32 _IOW(2, 0x42, struct floppy_struct32)
-#define FDDEFPRM32 _IOW(2, 0x43, struct floppy_struct32)
-#define FDGETPRM32 _IOR(2, 0x04, struct floppy_struct32)
-#define FDSETDRVPRM32 _IOW(2, 0x90, struct floppy_drive_params32)
-#define FDGETDRVPRM32 _IOR(2, 0x11, struct floppy_drive_params32)
-#define FDGETDRVSTAT32 _IOR(2, 0x12, struct floppy_drive_struct32)
-#define FDPOLLDRVSTAT32 _IOR(2, 0x13, struct floppy_drive_struct32)
-#define FDGETFDCSTAT32 _IOR(2, 0x15, struct floppy_fdc_state32)
-#define FDWERRORGET32  _IOR(2, 0x17, struct floppy_write_errors32)
-
-static struct {
-	unsigned int	cmd32;
-	unsigned int	cmd;
-} fd_ioctl_trans_table[] = {
-	{ FDSETPRM32, FDSETPRM },
-	{ FDDEFPRM32, FDDEFPRM },
-	{ FDGETPRM32, FDGETPRM },
-	{ FDSETDRVPRM32, FDSETDRVPRM },
-	{ FDGETDRVPRM32, FDGETDRVPRM },
-	{ FDGETDRVSTAT32, FDGETDRVSTAT },
-	{ FDPOLLDRVSTAT32, FDPOLLDRVSTAT },
-	{ FDGETFDCSTAT32, FDGETFDCSTAT },
-	{ FDWERRORGET32, FDWERRORGET }
-};
-
-#define NR_FD_IOCTL_TRANS ARRAY_SIZE(fd_ioctl_trans_table)
-
-static int fd_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	mm_segment_t old_fs = get_fs();
-	void *karg = NULL;
-	unsigned int kcmd = 0;
-	int i, err;
-
-	for (i = 0; i < NR_FD_IOCTL_TRANS; i++)
-		if (cmd == fd_ioctl_trans_table[i].cmd32) {
-			kcmd = fd_ioctl_trans_table[i].cmd;
-			break;
-		}
-	if (!kcmd)
-		return -EINVAL;
-
-	switch (cmd) {
-		case FDSETPRM32:
-		case FDDEFPRM32:
-		case FDGETPRM32:
-		{
-			compat_uptr_t name;
-			struct floppy_struct32 __user *uf;
-			struct floppy_struct *f;
-
-			uf = compat_ptr(arg);
-			f = karg = kmalloc(sizeof(struct floppy_struct), GFP_KERNEL);
-			if (!karg)
-				return -ENOMEM;
-			if (cmd == FDGETPRM32)
-				break;
-			err = __get_user(f->size, &uf->size);
-			err |= __get_user(f->sect, &uf->sect);
-			err |= __get_user(f->head, &uf->head);
-			err |= __get_user(f->track, &uf->track);
-			err |= __get_user(f->stretch, &uf->stretch);
-			err |= __get_user(f->gap, &uf->gap);
-			err |= __get_user(f->rate, &uf->rate);
-			err |= __get_user(f->spec1, &uf->spec1);
-			err |= __get_user(f->fmt_gap, &uf->fmt_gap);
-			err |= __get_user(name, &uf->name);
-			f->name = compat_ptr(name);
-			if (err) {
-				err = -EFAULT;
-				goto out;
-			}
-			break;
-		}
-		case FDSETDRVPRM32:
-		case FDGETDRVPRM32:
-		{
-			struct floppy_drive_params32 __user *uf;
-			struct floppy_drive_params *f;
-
-			uf = compat_ptr(arg);
-			f = karg = kmalloc(sizeof(struct floppy_drive_params), GFP_KERNEL);
-			if (!karg)
-				return -ENOMEM;
-			if (cmd == FDGETDRVPRM32)
-				break;
-			err = __get_user(f->cmos, &uf->cmos);
-			err |= __get_user(f->max_dtr, &uf->max_dtr);
-			err |= __get_user(f->hlt, &uf->hlt);
-			err |= __get_user(f->hut, &uf->hut);
-			err |= __get_user(f->srt, &uf->srt);
-			err |= __get_user(f->spinup, &uf->spinup);
-			err |= __get_user(f->spindown, &uf->spindown);
-			err |= __get_user(f->spindown_offset, &uf->spindown_offset);
-			err |= __get_user(f->select_delay, &uf->select_delay);
-			err |= __get_user(f->rps, &uf->rps);
-			err |= __get_user(f->tracks, &uf->tracks);
-			err |= __get_user(f->timeout, &uf->timeout);
-			err |= __get_user(f->interleave_sect, &uf->interleave_sect);
-			err |= __copy_from_user(&f->max_errors, &uf->max_errors, sizeof(f->max_errors));
-			err |= __get_user(f->flags, &uf->flags);
-			err |= __get_user(f->read_track, &uf->read_track);
-			err |= __copy_from_user(f->autodetect, uf->autodetect, sizeof(f->autodetect));
-			err |= __get_user(f->checkfreq, &uf->checkfreq);
-			err |= __get_user(f->native_format, &uf->native_format);
-			if (err) {
-				err = -EFAULT;
-				goto out;
-			}
-			break;
-		}
-		case FDGETDRVSTAT32:
-		case FDPOLLDRVSTAT32:
-			karg = kmalloc(sizeof(struct floppy_drive_struct), GFP_KERNEL);
-			if (!karg)
-				return -ENOMEM;
-			break;
-		case FDGETFDCSTAT32:
-			karg = kmalloc(sizeof(struct floppy_fdc_state), GFP_KERNEL);
-			if (!karg)
-				return -ENOMEM;
-			break;
-		case FDWERRORGET32:
-			karg = kmalloc(sizeof(struct floppy_write_errors), GFP_KERNEL);
-			if (!karg)
-				return -ENOMEM;
-			break;
-		default:
-			return -EINVAL;
-	}
-	set_fs (KERNEL_DS);
-	err = sys_ioctl (fd, kcmd, (unsigned long)karg);
-	set_fs (old_fs);
-	if (err)
-		goto out;
-	switch (cmd) {
-		case FDGETPRM32:
-		{
-			struct floppy_struct *f = karg;
-			struct floppy_struct32 __user *uf = compat_ptr(arg);
-
-			err = __put_user(f->size, &uf->size);
-			err |= __put_user(f->sect, &uf->sect);
-			err |= __put_user(f->head, &uf->head);
-			err |= __put_user(f->track, &uf->track);
-			err |= __put_user(f->stretch, &uf->stretch);
-			err |= __put_user(f->gap, &uf->gap);
-			err |= __put_user(f->rate, &uf->rate);
-			err |= __put_user(f->spec1, &uf->spec1);
-			err |= __put_user(f->fmt_gap, &uf->fmt_gap);
-			err |= __put_user((u64)f->name, (compat_caddr_t __user *)&uf->name);
-			break;
-		}
-		case FDGETDRVPRM32:
-		{
-			struct floppy_drive_params32 __user *uf;
-			struct floppy_drive_params *f = karg;
-
-			uf = compat_ptr(arg);
-			err = __put_user(f->cmos, &uf->cmos);
-			err |= __put_user(f->max_dtr, &uf->max_dtr);
-			err |= __put_user(f->hlt, &uf->hlt);
-			err |= __put_user(f->hut, &uf->hut);
-			err |= __put_user(f->srt, &uf->srt);
-			err |= __put_user(f->spinup, &uf->spinup);
-			err |= __put_user(f->spindown, &uf->spindown);
-			err |= __put_user(f->spindown_offset, &uf->spindown_offset);
-			err |= __put_user(f->select_delay, &uf->select_delay);
-			err |= __put_user(f->rps, &uf->rps);
-			err |= __put_user(f->tracks, &uf->tracks);
-			err |= __put_user(f->timeout, &uf->timeout);
-			err |= __put_user(f->interleave_sect, &uf->interleave_sect);
-			err |= __copy_to_user(&uf->max_errors, &f->max_errors, sizeof(f->max_errors));
-			err |= __put_user(f->flags, &uf->flags);
-			err |= __put_user(f->read_track, &uf->read_track);
-			err |= __copy_to_user(uf->autodetect, f->autodetect, sizeof(f->autodetect));
-			err |= __put_user(f->checkfreq, &uf->checkfreq);
-			err |= __put_user(f->native_format, &uf->native_format);
-			break;
-		}
-		case FDGETDRVSTAT32:
-		case FDPOLLDRVSTAT32:
-		{
-			struct floppy_drive_struct32 __user *uf;
-			struct floppy_drive_struct *f = karg;
-
-			uf = compat_ptr(arg);
-			err = __put_user(f->flags, &uf->flags);
-			err |= __put_user(f->spinup_date, &uf->spinup_date);
-			err |= __put_user(f->select_date, &uf->select_date);
-			err |= __put_user(f->first_read_date, &uf->first_read_date);
-			err |= __put_user(f->probed_format, &uf->probed_format);
-			err |= __put_user(f->track, &uf->track);
-			err |= __put_user(f->maxblock, &uf->maxblock);
-			err |= __put_user(f->maxtrack, &uf->maxtrack);
-			err |= __put_user(f->generation, &uf->generation);
-			err |= __put_user(f->keep_data, &uf->keep_data);
-			err |= __put_user(f->fd_ref, &uf->fd_ref);
-			err |= __put_user(f->fd_device, &uf->fd_device);
-			err |= __put_user(f->last_checked, &uf->last_checked);
-			err |= __put_user((u64)f->dmabuf, &uf->dmabuf);
-			err |= __put_user((u64)f->bufblocks, &uf->bufblocks);
-			break;
-		}
-		case FDGETFDCSTAT32:
-		{
-			struct floppy_fdc_state32 __user *uf;
-			struct floppy_fdc_state *f = karg;
-
-			uf = compat_ptr(arg);
-			err = __put_user(f->spec1, &uf->spec1);
-			err |= __put_user(f->spec2, &uf->spec2);
-			err |= __put_user(f->dtr, &uf->dtr);
-			err |= __put_user(f->version, &uf->version);
-			err |= __put_user(f->dor, &uf->dor);
-			err |= __put_user(f->address, &uf->address);
-			err |= __copy_to_user((char __user *)&uf->address + sizeof(uf->address),
-					   (char *)&f->address + sizeof(f->address), sizeof(int));
-			err |= __put_user(f->driver_version, &uf->driver_version);
-			err |= __copy_to_user(uf->track, f->track, sizeof(f->track));
-			break;
-		}
-		case FDWERRORGET32:
-		{
-			struct floppy_write_errors32 __user *uf;
-			struct floppy_write_errors *f = karg;
-
-			uf = compat_ptr(arg);
-			err = __put_user(f->write_errors, &uf->write_errors);
-			err |= __put_user(f->first_error_sector, &uf->first_error_sector);
-			err |= __put_user(f->first_error_generation, &uf->first_error_generation);
-			err |= __put_user(f->last_error_sector, &uf->last_error_sector);
-			err |= __put_user(f->last_error_generation, &uf->last_error_generation);
-			err |= __put_user(f->badness, &uf->badness);
-			break;
-		}
-		default:
-			break;
-	}
-	if (err)
-		err = -EFAULT;
-
-out:
-	kfree(karg);
-	return err;
-}
-#endif
-
 struct mtd_oob_buf32 {
 	u_int32_t start;
 	u_int32_t length;
@@ -3025,15 +2697,6 @@ HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
 HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
-HANDLE_IOCTL(FDSETPRM32, fd_ioctl_trans)
-HANDLE_IOCTL(FDDEFPRM32, fd_ioctl_trans)
-HANDLE_IOCTL(FDGETPRM32, fd_ioctl_trans)
-HANDLE_IOCTL(FDSETDRVPRM32, fd_ioctl_trans)
-HANDLE_IOCTL(FDGETDRVPRM32, fd_ioctl_trans)
-HANDLE_IOCTL(FDGETDRVSTAT32, fd_ioctl_trans)
-HANDLE_IOCTL(FDPOLLDRVSTAT32, fd_ioctl_trans)
-HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans)
-HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans)
 HANDLE_IOCTL(SG_IO,sg_ioctl_trans)
 HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans)
 #endif
-- 
cgit v1.2.3


From 87124e581bfeaa5864662a435b6ee2a19e91b905 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 23 Jul 2007 09:54:36 +0100
Subject: [GFS2] Fix two races relating to glock callbacks

One of the races relates to referencing a variable while not holding
its protecting spinlock. The patch simply moves the test inside the
spin lock. The other races occurs when a demote to unlocked request
occurs during the time a demote to shared request is already running.
This of course only happens in the case that the lock was in the
exclusive mode to start with. The patch adds a check to see if another
demote request has occurred in the mean time and if it has, then it
performs a second demote.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3f0974e1afe..6a3eeba102f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -545,12 +545,14 @@ static int rq_demote(struct gfs2_glock *gl)
 		return 0;
 	}
 	set_bit(GLF_LOCK, &gl->gl_flags);
-	spin_unlock(&gl->gl_spin);
 	if (gl->gl_demote_state == LM_ST_UNLOCKED ||
-	    gl->gl_state != LM_ST_EXCLUSIVE)
+	    gl->gl_state != LM_ST_EXCLUSIVE) {
+		spin_unlock(&gl->gl_spin);
 		gfs2_glock_drop_th(gl);
-	else
+	} else {
+		spin_unlock(&gl->gl_spin);
 		gfs2_glock_xmote_th(gl, NULL);
+	}
 	spin_lock(&gl->gl_spin);
 
 	return 0;
@@ -760,10 +762,20 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
 
 	if (!gh) {
 		gl->gl_stamp = jiffies;
-		if (ret & LM_OUT_CANCELED)
+		if (ret & LM_OUT_CANCELED) {
 			op_done = 0;
-		else
+		} else {
+			spin_lock(&gl->gl_spin);
+			if (gl->gl_state != gl->gl_demote_state) {
+				gl->gl_req_bh = NULL;
+				spin_unlock(&gl->gl_spin);
+				gfs2_glock_drop_th(gl);
+				gfs2_glock_put(gl);
+				return;
+			}
 			gfs2_demote_wake(gl);
+			spin_unlock(&gl->gl_spin);
+		}
 	} else {
 		spin_lock(&gl->gl_spin);
 		list_del_init(&gh->gh_list);
@@ -817,7 +829,7 @@ out:
  *
  */
 
-void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
+static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	int flags = gh ? gh->gh_flags : 0;
-- 
cgit v1.2.3


From 26caee5bc643b318fa2e9bd4f66dace1755ec413 Mon Sep 17 00:00:00 2001
From: Josef Whiter <jwhiter@redhat.com>
Date: Mon, 23 Jul 2007 10:02:40 +0100
Subject: [GFS2] Fix calculation of demote state

If a glock is in the exclusive state and a request for demote to
deferred has been received, then further requests for demote to
shared are being ignored. This patch fixes that by ensuring that
we demote to unlocked in that case.

Signed-off-by: Josef Whiter <jwhiter@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6a3eeba102f..6b6ae453734 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -697,8 +697,9 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remot
 			}
 			return;
 		}
-	} else if (gl->gl_demote_state != LM_ST_UNLOCKED) {
-		gl->gl_demote_state = state;
+	} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
+			gl->gl_demote_state != state) {
+		gl->gl_demote_state = LM_ST_UNLOCKED;
 	}
 	spin_unlock(&gl->gl_spin);
 }
-- 
cgit v1.2.3


From aa0481e58a9a97a97035725a712920b5fe32f348 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Sat, 21 Jul 2007 17:03:22 +0200
Subject: [GFS2] Clean up duplicate includes in fs/gfs2/

This patch cleans up duplicate includes in
	fs/gfs2/

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c                | 2 --
 fs/gfs2/locking/dlm/lock_dlm.h | 1 -
 fs/gfs2/locking/nolock/main.c  | 1 -
 3 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6b6ae453734..d403fd70807 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -25,8 +25,6 @@
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
-#include <linux/module.h>
-#include <linux/kallsyms.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 24d70f73b65..9e8265d2837 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/module.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/list.h>
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index 0d149c8c493..d3b8ce6fbbe 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -9,7 +9,6 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/module.h>
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/fs.h>
-- 
cgit v1.2.3


From afd0942d98f74296b74993739e41d2ca7cb9fd5a Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Fri, 20 Jul 2007 13:07:26 -0500
Subject: [GFS2] GFS2 not checking pointer on create when running under nfsd

When looking at an unrelated problem, I noticed that nfsd does not
set nameidata pointer on create (ie nd is NULL).  This should
cause an oops in some cases in which when NFSd is mounted over GFS2.

Signed-off-by: Steve French <sfrench@us.ibm.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 911c115b5c6..5b8b994b991 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -69,7 +69,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
 			mark_inode_dirty(inode);
 			break;
 		} else if (PTR_ERR(inode) != -EEXIST ||
-			   (nd->intent.open.flags & O_EXCL)) {
+			   (nd && (nd->intent.open.flags & O_EXCL))) {
 			gfs2_holder_uninit(ghs);
 			return PTR_ERR(inode);
 		}
-- 
cgit v1.2.3


From 7b08fc620109c2f66575e9ae884f45c37933ea18 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 24 Jul 2007 13:53:36 +0100
Subject: [GFS2] Fix an oops in glock dumping

This fixes an oops which was occurring during glock dumping due to the
seq file code not taking a reference to the glock. Also this fixes a
memory leak which occurred in certain cases, in turn preventing the
filesystem from unmounting.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 73 +++++++++++++++++++++++++++------------------------------
 1 file changed, 35 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d403fd70807..e4bc8ae561a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -46,7 +46,6 @@ struct glock_iter {
 	int hash;                     /* hash bucket index         */
 	struct gfs2_sbd *sdp;         /* incore superblock         */
 	struct gfs2_glock *gl;        /* current glock struct      */
-	struct hlist_head *hb_list;   /* current hash bucket ptr   */
 	struct seq_file *seq;         /* sequence file for debugfs */
 	char string[512];             /* scratch space             */
 };
@@ -1990,47 +1989,38 @@ int __init gfs2_glock_init(void)
 
 static int gfs2_glock_iter_next(struct glock_iter *gi)
 {
+	struct gfs2_glock *gl;
+
 	read_lock(gl_lock_addr(gi->hash));
-	while (1) {
-		if (!gi->hb_list) {  /* If we don't have a hash bucket yet */
-			gi->hb_list = &gl_hash_table[gi->hash].hb_list;
-			if (hlist_empty(gi->hb_list)) {
-				read_unlock(gl_lock_addr(gi->hash));
-				gi->hash++;
-				read_lock(gl_lock_addr(gi->hash));
-				gi->hb_list = NULL;
-				if (gi->hash >= GFS2_GL_HASH_SIZE) {
-					read_unlock(gl_lock_addr(gi->hash));
-					return 1;
-				}
-				else
-					continue;
-			}
-			if (!hlist_empty(gi->hb_list)) {
-				gi->gl = list_entry(gi->hb_list->first,
-						    struct gfs2_glock,
-						    gl_list);
-			}
-		} else {
-			if (gi->gl->gl_list.next == NULL) {
-				read_unlock(gl_lock_addr(gi->hash));
-				gi->hash++;
-				read_lock(gl_lock_addr(gi->hash));
-				gi->hb_list = NULL;
-				continue;
-			}
-			gi->gl = list_entry(gi->gl->gl_list.next,
-					    struct gfs2_glock, gl_list);
-		}
+	gl = gi->gl;
+	if (gl) {
+		gi->gl = hlist_entry(gl->gl_list.next, struct gfs2_glock,
+				     gl_list);
 		if (gi->gl)
-			break;
+			gfs2_glock_hold(gi->gl);
 	}
 	read_unlock(gl_lock_addr(gi->hash));
+	if (gl)
+		gfs2_glock_put(gl);
+
+	while(gi->gl == NULL) {
+		gi->hash++;
+		if (gi->hash >= GFS2_GL_HASH_SIZE)
+			return 1;
+		read_lock(gl_lock_addr(gi->hash));
+		gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
+				     struct gfs2_glock, gl_list);
+		if (gi->gl)
+			gfs2_glock_hold(gi->gl);
+		read_unlock(gl_lock_addr(gi->hash));
+	}
 	return 0;
 }
 
 static void gfs2_glock_iter_free(struct glock_iter *gi)
 {
+	if (gi->gl)
+		gfs2_glock_put(gi->gl);
 	kfree(gi);
 }
 
@@ -2044,12 +2034,17 @@ static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
 
 	gi->sdp = sdp;
 	gi->hash = 0;
-	gi->gl = NULL;
-	gi->hb_list = NULL;
 	gi->seq = NULL;
 	memset(gi->string, 0, sizeof(gi->string));
 
-	if (gfs2_glock_iter_next(gi)) {
+	read_lock(gl_lock_addr(gi->hash));
+	gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
+			     struct gfs2_glock, gl_list);
+	if (gi->gl)
+		gfs2_glock_hold(gi->gl);
+	read_unlock(gl_lock_addr(gi->hash));
+
+	if (!gi->gl && gfs2_glock_iter_next(gi)) {
 		gfs2_glock_iter_free(gi);
 		return NULL;
 	}
@@ -2066,7 +2061,7 @@ static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
 	if (!gi)
 		return NULL;
 
-	while (n--) {
+	while(n--) {
 		if (gfs2_glock_iter_next(gi)) {
 			gfs2_glock_iter_free(gi);
 			return NULL;
@@ -2093,7 +2088,9 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
 
 static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
 {
-	/* nothing for now */
+	struct glock_iter *gi = iter_ptr;
+	if (gi)
+		gfs2_glock_iter_free(gi);
 }
 
 static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
-- 
cgit v1.2.3


From 905d2aefa9e06ebb995df96920d273a516fcd3f9 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 24 Jul 2007 14:05:31 -0500
Subject: [GFS2] Move some code inside the log lock

This is the first of five patches for bug #248176:

There were still some critical variables being manipulated outside
the log_lock spinlock.  That usually resulted in a hang.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 3b395c41b2f..a0371f835cf 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -117,7 +117,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 	struct buffer_head *bh;
 	struct gfs2_log_descriptor *ld;
 	struct gfs2_bufdata *bd1 = NULL, *bd2;
-	unsigned int total = sdp->sd_log_num_buf;
+	unsigned int total;
 	unsigned int offset = BUF_OFFSET;
 	unsigned int limit;
 	unsigned int num;
@@ -127,12 +127,16 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 	limit = buf_limit(sdp);
 	/* for 4k blocks, limit = 503 */
 
+	gfs2_log_lock(sdp);
+	total = sdp->sd_log_num_buf;
 	bd1 = bd2 = list_prepare_entry(bd1, &sdp->sd_log_le_buf, bd_le.le_list);
 	while(total) {
 		num = total;
 		if (total > limit)
 			num = limit;
+		gfs2_log_unlock(sdp);
 		bh = gfs2_log_get_buf(sdp);
+		gfs2_log_lock(sdp);
 		ld = (struct gfs2_log_descriptor *)bh->b_data;
 		ptr = (__be64 *)(bh->b_data + offset);
 		ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -152,21 +156,27 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 				break;
 		}
 
+		gfs2_log_unlock(sdp);
 		set_buffer_dirty(bh);
 		ll_rw_block(WRITE, 1, &bh);
+		gfs2_log_lock(sdp);
 
 		n = 0;
 		list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
 					     bd_le.le_list) {
+			gfs2_log_unlock(sdp);
 			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
 			set_buffer_dirty(bh);
 			ll_rw_block(WRITE, 1, &bh);
+			gfs2_log_lock(sdp);
 			if (++n >= num)
 				break;
 		}
 
+		BUG_ON(total < num);
 		total -= num;
 	}
+	gfs2_log_unlock(sdp);
 }
 
 static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
@@ -524,7 +534,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 	struct gfs2_log_descriptor *ld;
 	unsigned int limit;
 	unsigned int total_dbuf;
-	unsigned int total_jdata = sdp->sd_log_num_jdata;
+	unsigned int total_jdata;
 	unsigned int num, n;
 	__be64 *ptr = NULL;
 
@@ -536,6 +546,7 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 	 */
 	gfs2_log_lock(sdp);
 	total_dbuf = sdp->sd_log_num_databuf;
+	total_jdata = sdp->sd_log_num_jdata;
 	bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
 				       bd_le.le_list);
 	while(total_dbuf) {
@@ -621,10 +632,10 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 		gfs2_log_unlock(sdp);
 		if (bh) {
-			set_buffer_mapped(bh);
 			set_buffer_dirty(bh);
 			ll_rw_block(WRITE, 1, &bh);
 			bh = NULL;
+			ptr = NULL;
 		}
 		n = 0;
 		gfs2_log_lock(sdp);
-- 
cgit v1.2.3


From 693ddeabbb3e563f192a7ac74ec04168aa92e8d8 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 24 Jul 2007 14:07:33 -0500
Subject: [GFS2] Revert part of earlier log.c changes

This is patch 2 of 5 for bug #248176.

The list_move code previously concocted in log.c for bug #238162
(see https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=238162#c23)
never runs as bh can now never be NULL at this point.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f49a12e2408..f7c0608332f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -83,11 +83,6 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 
 			gfs2_assert(sdp, bd->bd_ail == ai);
 
-			if (!bh){
-				list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
-                                continue;
-                        }
-
 			if (!buffer_busy(bh)) {
 				if (!buffer_uptodate(bh)) {
 					gfs2_log_unlock(sdp);
@@ -130,11 +125,6 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
 					 bd_ail_st_list) {
 		bh = bd->bd_bh;
 
-		if (!bh){
-			list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
-			continue;
-		}
-
 		gfs2_assert(sdp, bd->bd_ail == ai);
 
 		if (buffer_busy(bh)) {
@@ -155,13 +145,14 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
 
 static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
 {
-	struct list_head *head = &sdp->sd_ail1_list;
+	struct list_head *head;
 	u64 sync_gen;
 	struct list_head *first;
 	struct gfs2_ail *first_ai, *ai, *tmp;
 	int done = 0;
 
 	gfs2_log_lock(sdp);
+	head = &sdp->sd_ail1_list;
 	if (list_empty(head)) {
 		gfs2_log_unlock(sdp);
 		return;
-- 
cgit v1.2.3


From 6760bdcd03a12d7d082794311ccbaf44bfc23b06 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 24 Jul 2007 14:09:32 -0500
Subject: [GFS2] Prevent infinite loop in try_rgrp_unlink()

This is patch three of five for bug #248176.

The try_rgrp_unlink code in rgrp.c had an infinite loop.  This was
caused because the bitmap function rgblk_search can return a block
less than the "goal" block, in which case it was looping.  The fix is
to make it always march forward as needed.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ce48c4594ec..b93ac45b88b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -31,6 +31,7 @@
 #include "inode.h"
 
 #define BFITNOENT ((u32)~0)
+#define NO_BLOCK ((u64)~0)
 
 /*
  * These routines are used by the resource group routines (rgrp.c)
@@ -116,8 +117,7 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
  * @buffer: the buffer that holds the bitmaps
  * @buflen: the length (in bytes) of the buffer
  * @goal: start search at this block's bit-pair (within @buffer)
- * @old_state: GFS2_BLKST_XXX the state of the block we're looking for;
- *       bit 0 = alloc(1)/free(0), bit 1 = meta(1)/data(0)
+ * @old_state: GFS2_BLKST_XXX the state of the block we're looking for.
  *
  * Scope of @goal and returned block number is only within this bitmap buffer,
  * not entire rgrp or filesystem.  @buffer will be offset from the actual
@@ -137,9 +137,13 @@ static u32 gfs2_bitfit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
 	byte = buffer + (goal / GFS2_NBBY);
 	bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
 	end = buffer + buflen;
-	alloc = (old_state & 1) ? 0 : 0x55;
+	alloc = (old_state == GFS2_BLKST_FREE) ? 0x55 : 0;
 
 	while (byte < end) {
+		/* If we're looking for a free block we can eliminate all
+		   bitmap settings with 0x55, which represents four data
+		   blocks in a row.  If we're looking for a data block, we can
+		   eliminate 0x00 which corresponds to four free blocks. */
 		if ((*byte & 0x55) == alloc) {
 			blk += (8 - bit) >> 1;
 
@@ -859,19 +863,21 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 {
 	struct inode *inode;
-	u32 goal = 0;
+	u32 goal = 0, block;
 	u64 no_addr;
 
 	for(;;) {
 		if (goal >= rgd->rd_data)
 			break;
-		goal = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
-				    GFS2_BLKST_UNLINKED);
-		if (goal == BFITNOENT)
+		block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
+				     GFS2_BLKST_UNLINKED);
+		if (block == BFITNOENT)
 			break;
-		no_addr = goal + rgd->rd_data0;
+		/* rgblk_search can return a block < goal, so we need to
+		   keep it marching forward. */
+		no_addr = block + rgd->rd_data0;
 		goal++;
-		if (no_addr < *last_unlinked)
+		if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked)
 			continue;
 		*last_unlinked = no_addr;
 		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
@@ -1152,7 +1158,7 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 	struct gfs2_alloc *al = &ip->i_alloc;
 	struct inode *inode;
 	int error = 0;
-	u64 last_unlinked = 0;
+	u64 last_unlinked = NO_BLOCK;
 
 	if (gfs2_assert_warn(sdp, al->al_requested))
 		return -EINVAL;
@@ -1305,9 +1311,7 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 		goal = 0;
 	}
 
-	if (old_state != new_state) {
-		gfs2_assert_withdraw(rgd->rd_sbd, blk != BFITNOENT);
-
+	if (blk != BFITNOENT && old_state != new_state) {
 		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
 		gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset,
 			    bi->bi_len, blk, new_state);
-- 
cgit v1.2.3


From cee23c79d08c57bbbb9923703409af3b17518c58 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Wed, 25 Jul 2007 17:53:58 +0800
Subject: [GFS2] use an temp variable to reduce a spin_unlock

this is more clear.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/plock.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/gfs2/locking/dlm/plock.c
index fba1f1d87e4..1f7b038530b 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/gfs2/locking/dlm/plock.c
@@ -346,15 +346,16 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 
 static unsigned int dev_poll(struct file *file, poll_table *wait)
 {
+	unsigned int mask = 0;
+
 	poll_wait(file, &send_wq, wait);
 
 	spin_lock(&ops_lock);
-	if (!list_empty(&send_list)) {
-		spin_unlock(&ops_lock);
-		return POLLIN | POLLRDNORM;
-	}
+	if (!list_empty(&send_list))
+		mask = POLLIN | POLLRDNORM;
 	spin_unlock(&ops_lock);
-	return 0;
+
+	return mask;
 }
 
 static const struct file_operations dev_fops = {
-- 
cgit v1.2.3


From 0f8468c8bef3d04637c924e7bef20ca53018b319 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 25 Jul 2007 10:06:22 -0500
Subject: [GFS2] Detach buf data during in-place writeback

This is patch 5 of 5 for bug #248176

Metadata corruption was occurring because page references weren't
being removed in all cases.  I previously added a function called
detach_bufdata, but I discovered there already WAS a function out
there to do the job.  It's called gfs2_meta_cache_flush.  So I added
a call to that to remove the page references.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f7c0608332f..00ab6c070a1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -219,6 +219,7 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 {
 	struct list_head *head = &ai->ai_ail2_list;
 	struct gfs2_bufdata *bd;
+	struct gfs2_inode *bh_ip;
 
 	while (!list_empty(head)) {
 		bd = list_entry(head->prev, struct gfs2_bufdata,
@@ -228,6 +229,8 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 		list_del(&bd->bd_ail_st_list);
 		list_del(&bd->bd_ail_gl_list);
 		atomic_dec(&bd->bd_gl->gl_ail_count);
+		bh_ip = GFS2_I(bd->bd_bh->b_page->mapping->host);
+		gfs2_meta_cache_flush(bh_ip);
 		brelse(bd->bd_bh);
 	}
 }
-- 
cgit v1.2.3


From 4ef290025ccde7c52ba219cf733a4295acd5401f Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Tue, 31 Jul 2007 18:31:11 +0800
Subject: [GFS2] mark struct *_operations const

these struct *_operations are all method tables, thus should be const.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/eaops.c | 8 ++++----
 fs/gfs2/eaops.h | 4 ++--
 fs/gfs2/glock.c | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index 1ab3e9d7388..aa8dbf303f6 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -200,28 +200,28 @@ static int security_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	return gfs2_ea_remove_i(ip, er);
 }
 
-static struct gfs2_eattr_operations gfs2_user_eaops = {
+static const struct gfs2_eattr_operations gfs2_user_eaops = {
 	.eo_get = user_eo_get,
 	.eo_set = user_eo_set,
 	.eo_remove = user_eo_remove,
 	.eo_name = "user",
 };
 
-struct gfs2_eattr_operations gfs2_system_eaops = {
+const struct gfs2_eattr_operations gfs2_system_eaops = {
 	.eo_get = system_eo_get,
 	.eo_set = system_eo_set,
 	.eo_remove = system_eo_remove,
 	.eo_name = "system",
 };
 
-static struct gfs2_eattr_operations gfs2_security_eaops = {
+static const struct gfs2_eattr_operations gfs2_security_eaops = {
 	.eo_get = security_eo_get,
 	.eo_set = security_eo_set,
 	.eo_remove = security_eo_remove,
 	.eo_name = "security",
 };
 
-struct gfs2_eattr_operations *gfs2_ea_ops[] = {
+const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
 	NULL,
 	&gfs2_user_eaops,
 	&gfs2_system_eaops,
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
index 508b4f7a244..da2f7fbbb40 100644
--- a/fs/gfs2/eaops.h
+++ b/fs/gfs2/eaops.h
@@ -22,9 +22,9 @@ struct gfs2_eattr_operations {
 
 unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
 
-extern struct gfs2_eattr_operations gfs2_system_eaops;
+extern const struct gfs2_eattr_operations gfs2_system_eaops;
 
-extern struct gfs2_eattr_operations *gfs2_ea_ops[];
+extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
 
 #endif /* __EAOPS_DOT_H__ */
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e4bc8ae561a..0054b7df714 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2103,7 +2103,7 @@ static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
 	return 0;
 }
 
-static struct seq_operations gfs2_glock_seq_ops = {
+static const struct seq_operations gfs2_glock_seq_ops = {
 	.start = gfs2_glock_seq_start,
 	.next  = gfs2_glock_seq_next,
 	.stop  = gfs2_glock_seq_stop,
-- 
cgit v1.2.3


From ca5a939b33166a9f5a2556e6c4ec031524852ba2 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Tue, 31 Jul 2007 18:31:12 +0800
Subject: [GFS2] use the declaration of gfs2_dops in the header file instead

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index cf5aa505054..9a5e84038dd 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -28,6 +28,7 @@
 #include "lm.h"
 #include "mount.h"
 #include "ops_fstype.h"
+#include "ops_dentry.h"
 #include "ops_super.h"
 #include "recovery.h"
 #include "rgrp.h"
@@ -38,8 +39,6 @@
 #define DO 0
 #define UNDO 1
 
-extern struct dentry_operations gfs2_dops;
-
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp;
-- 
cgit v1.2.3


From 8fbbfd214c853102b614f4705c1904ed14f5a808 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 1 Aug 2007 13:57:10 +0100
Subject: [GFS2] Reduce number of gfs2_scand processes to one

We only need a single gfs2_scand process rather than the one
per filesystem which we had previously. As a result the parameter
determining the frequency of gfs2_scand runs becomes a module
parameter rather than a mount parameter as it was before.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/daemon.c     | 24 --------------------
 fs/gfs2/daemon.h     |  1 -
 fs/gfs2/glock.c      | 63 +++++++++++++++++++++++++++++++++++++++-------------
 fs/gfs2/glock.h      |  4 ++--
 fs/gfs2/incore.h     |  2 --
 fs/gfs2/main.c       |  3 +++
 fs/gfs2/ops_fstype.c |  9 --------
 fs/gfs2/ops_super.c  |  1 -
 fs/gfs2/super.c      |  1 -
 fs/gfs2/sys.c        |  2 --
 10 files changed, 53 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 3548d9f31e0..3731ab0771d 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -34,30 +34,6 @@
 
    The kthread functions used to start these daemons block and flush signals. */
 
-/**
- * gfs2_scand - Look for cached glocks and inodes to toss from memory
- * @sdp: Pointer to GFS2 superblock
- *
- * One of these daemons runs, finding candidates to add to sd_reclaim_list.
- * See gfs2_glockd()
- */
-
-int gfs2_scand(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-
-	while (!kthread_should_stop()) {
-		gfs2_scand_internal(sdp);
-		t = gfs2_tune_get(sdp, gt_scand_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
-	return 0;
-}
-
 /**
  * gfs2_glockd - Reclaim unused glock structures
  * @sdp: Pointer to GFS2 superblock
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 801007120fb..0de9b355795 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -10,7 +10,6 @@
 #ifndef __DAEMON_DOT_H__
 #define __DAEMON_DOT_H__
 
-int gfs2_scand(void *data);
 int gfs2_glockd(void *data);
 int gfs2_recoverd(void *data);
 int gfs2_logd(void *data);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 0054b7df714..559937c710f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -25,6 +25,8 @@
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -58,6 +60,8 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
 static void gfs2_glock_drop_th(struct gfs2_glock *gl);
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
+static struct task_struct *scand_process;
+static unsigned int scand_secs = 5;
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
@@ -1627,7 +1631,7 @@ static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp,
 		goto out;
 	gl = list_entry(head->first, struct gfs2_glock, gl_list);
 	while(1) {
-		if (gl->gl_sbd == sdp) {
+		if (!sdp || gl->gl_sbd == sdp) {
 			gfs2_glock_hold(gl);
 			read_unlock(gl_lock_addr(hash));
 			if (prev)
@@ -1645,6 +1649,7 @@ out:
 	read_unlock(gl_lock_addr(hash));
 	if (prev)
 		gfs2_glock_put(prev);
+	cond_resched();
 	return has_entries;
 }
 
@@ -1672,20 +1677,6 @@ out_schedule:
 	gfs2_glock_schedule_for_reclaim(gl);
 }
 
-/**
- * gfs2_scand_internal - Look for glocks and inodes to toss from memory
- * @sdp: the filesystem
- *
- */
-
-void gfs2_scand_internal(struct gfs2_sbd *sdp)
-{
-	unsigned int x;
-
-	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-		examine_bucket(scan_glock, sdp, x);
-}
-
 /**
  * clear_glock - look at a glock and see if we can free it from glock cache
  * @gl: the glock to look at
@@ -1973,6 +1964,35 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
 	return error;
 }
 
+/**
+ * gfs2_scand - Look for cached glocks and inodes to toss from memory
+ * @sdp: Pointer to GFS2 superblock
+ *
+ * One of these daemons runs, finding candidates to add to sd_reclaim_list.
+ * See gfs2_glockd()
+ */
+
+static int gfs2_scand(void *data)
+{
+	unsigned x;
+	unsigned delay;
+
+	while (!kthread_should_stop()) {
+		for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+			examine_bucket(scan_glock, NULL, x);
+		if (freezing(current))
+			refrigerator();
+		delay = scand_secs;
+		if (delay < 1)
+			delay = 1;
+		schedule_timeout_interruptible(delay * HZ);
+	}
+
+	return 0;
+}
+
+
+
 int __init gfs2_glock_init(void)
 {
 	unsigned i;
@@ -1984,9 +2004,22 @@ int __init gfs2_glock_init(void)
 		rwlock_init(&gl_hash_locks[i]);
 	}
 #endif
+
+	scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
+	if (IS_ERR(scand_process))
+		return PTR_ERR(scand_process);
+
 	return 0;
 }
 
+void gfs2_glock_exit(void)
+{
+	kthread_stop(scand_process);
+}
+
+module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
+
 static int gfs2_glock_iter_next(struct glock_iter *gi)
 {
 	struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 7721ca3fff9..f7a8e626aa0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -132,11 +132,11 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
 
 void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-
-void gfs2_scand_internal(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
 
 int __init gfs2_glock_init(void);
+void gfs2_glock_exit(void);
+
 int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
 void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
 int gfs2_register_debugfs(void);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 170ba93829c..1390b30daf1 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -429,7 +429,6 @@ struct gfs2_tune {
 	unsigned int gt_log_flush_secs;
 	unsigned int gt_jindex_refresh_secs; /* Check for new journal index */
 
-	unsigned int gt_scand_secs;
 	unsigned int gt_recoverd_secs;
 	unsigned int gt_logd_secs;
 	unsigned int gt_quotad_secs;
@@ -574,7 +573,6 @@ struct gfs2_sbd {
 
 	/* Daemon stuff */
 
-	struct task_struct *sd_scand_process;
 	struct task_struct *sd_recoverd_process;
 	struct task_struct *sd_logd_process;
 	struct task_struct *sd_quotad_process;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index d5d4e68b880..79c91fd8381 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -107,6 +107,8 @@ static int __init init_gfs2_fs(void)
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
 fail:
+	gfs2_glock_exit();
+
 	if (gfs2_bufdata_cachep)
 		kmem_cache_destroy(gfs2_bufdata_cachep);
 
@@ -127,6 +129,7 @@ fail:
 
 static void __exit exit_gfs2_fs(void)
 {
+	gfs2_glock_exit();
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 9a5e84038dd..58c730be207 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -160,14 +160,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
 	if (undo)
 		goto fail_trans;
 
-	p = kthread_run(gfs2_scand, sdp, "gfs2_scand");
-	error = IS_ERR(p);
-	if (error) {
-		fs_err(sdp, "can't start scand thread: %d\n", error);
-		return error;
-	}
-	sdp->sd_scand_process = p;
-
 	for (sdp->sd_glockd_num = 0;
 	     sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
 	     sdp->sd_glockd_num++) {
@@ -228,7 +220,6 @@ fail:
 	while (sdp->sd_glockd_num--)
 		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
 
-	kthread_stop(sdp->sd_scand_process);
 	return error;
 }
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 603d940f115..4316690d86f 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -92,7 +92,6 @@ static void gfs2_put_super(struct super_block *sb)
 	kthread_stop(sdp->sd_recoverd_process);
 	while (sdp->sd_glockd_num--)
 		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
-	kthread_stop(sdp->sd_scand_process);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		error = gfs2_make_fs_ro(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f916b9740c7..55898023782 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -58,7 +58,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_incore_log_blocks = 1024;
 	gt->gt_log_flush_secs = 60;
 	gt->gt_jindex_refresh_secs = 60;
-	gt->gt_scand_secs = 15;
 	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
 	gt->gt_quotad_secs = 5;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index c26c21b53c1..ba3a1729cc1 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -442,7 +442,6 @@ TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(quota_cache_secs, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
-TUNE_ATTR_DAEMON(scand_secs, scand_process);
 TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
 TUNE_ATTR_DAEMON(logd_secs, logd_process);
 TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
@@ -464,7 +463,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_quota_cache_secs.attr,
 	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
-	&tune_attr_scand_secs.attr,
 	&tune_attr_recoverd_secs.attr,
 	&tune_attr_logd_secs.attr,
 	&tune_attr_quotad_secs.attr,
-- 
cgit v1.2.3


From 5f3eae7546093d845ca8ada1b95714202a136a1a Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 8 Aug 2007 16:52:09 -0500
Subject: [GFS2] invalid metadata block - REVISED

This is for bugzilla bug #248176: GFS2: invalid metadata block

Patches 1 thru 3 were accepted upstream, but there were problems
with 4 and 5.  Those issues have been resolved and now the recovery
tests are passing without errors.  This code has gone through
41 * 3 successful gfs2 recovery tests before it hit an
unrelated (openais) problem.

This is a complete rewrite of patch 4 for bug #248176.

Part of the problem was that inodes were being recycled
before their buffers were flushed to the journal logs.
Another problem was that the clone bitmaps were being
searched for deleted inodes to recycle, but only the
"real" bitmaps should be searched for that purpose.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index b93ac45b88b..2d7f7ea0c9a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -865,12 +865,15 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 	struct inode *inode;
 	u32 goal = 0, block;
 	u64 no_addr;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
 
 	for(;;) {
 		if (goal >= rgd->rd_data)
 			break;
+		down_write(&sdp->sd_log_flush_lock);
 		block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
 				     GFS2_BLKST_UNLINKED);
+		up_write(&sdp->sd_log_flush_lock);
 		if (block == BFITNOENT)
 			break;
 		/* rgblk_search can return a block < goal, so we need to
@@ -1295,7 +1298,9 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 	   allocatable block anywhere else, we want to be able wrap around and
 	   search in the first part of our first-searched bit block.  */
 	for (x = 0; x <= length; x++) {
-		if (bi->bi_clone)
+		/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
+		   bitmaps, so we must search the originals for that. */
+		if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
 			blk = gfs2_bitfit(rgd, bi->bi_clone + bi->bi_offset,
 					  bi->bi_len, goal, old_state);
 		else
-- 
cgit v1.2.3


From 75be73a8246ef96f7fa3f05a6a1450159fbb7a64 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 8 Aug 2007 17:08:14 -0500
Subject: [GFS2] Ensure journal file cache is flushed after recovery

This is for bugzilla bug #248176: GFS2: invalid metadata block

Patches 1 thru 3 were accepted upstream, but there were problems
with 4 and 5.  Those issues have been resolved and now the recovery
tests are passing without errors.  This code has gone through
41 * 3 successful gfs2 recovery tests before it hit an
unrelated (openais) problem.  I'm continuing to test it.

This is a complete rewrite of patch 5 for bug #248176, written by
Steve Whitehouse.  This is referred to in the bugzilla record as
"new 6" and "a different solution".

The problem was that the journal inodes, although protected by
a glock, were not synched with the other nodes because they don't
use the inode glock synch operations (i.e. no "glops" were defined).
Therefore, journal recovery on a journal-recovering node were causing
the blocks to get out of sync with the node that was actually trying
to use that journal as it comes back up from a reboot.

There are two possible solutions: (1) To make the journals use the
normal inode glock sync operations, or (2) To make the journal
operations take effect immediately (i.e. no caching).  Although
option 1 works, it turns out to be a lot more code.  Steve opted
for option 2, which is much simpler and therefore less prone to
regression errors.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

--
---
 fs/gfs2/ops_fstype.c | 2 +-
 fs/gfs2/recovery.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 58c730be207..f0bcaa25737 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -358,7 +358,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 
 		ip = GFS2_I(sdp->sd_jdesc->jd_inode);
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
-					   LM_FLAG_NOEXP | GL_EXACT,
+					   LM_FLAG_NOEXP | GL_EXACT | GL_NOCACHE,
 					   &sdp->sd_jinode_gh);
 		if (error) {
 			fs_err(sdp, "can't acquire journal inode glock: %d\n",
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 5ada38c99a2..beb6c7ac008 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -469,7 +469,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 		};
 
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
-					   LM_FLAG_NOEXP, &ji_gh);
+					   LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh);
 		if (error)
 			goto fail_gunlock_j;
 	} else {
-- 
cgit v1.2.3


From adb4ec13cdddb6ab8280e4c7808ba30f46504e1d Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Sat, 11 Aug 2007 10:27:07 +0800
Subject: [GFS2] use list_for_each_entry instead

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index f0bcaa25737..32b2859a89a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -808,7 +808,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
 	struct nameidata nd;
 	struct file_system_type *fstype;
 	struct super_block *sb = NULL, *s;
-	struct list_head *l;
 	int error;
 
 	error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
@@ -820,8 +819,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
 	error = vfs_getattr(nd.mnt, nd.dentry, &stat);
 
 	fstype = get_fs_type("gfs2");
-	list_for_each(l, &fstype->fs_supers) {
-		s = list_entry(l, struct super_block, s_instances);
+	list_for_each_entry(s, &fstype->fs_supers, s_instances) {
 		if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
 		    (S_ISDIR(stat.mode) && s == nd.dentry->d_inode->i_sb)) {
 			sb = s;
-- 
cgit v1.2.3


From 2d3ba1ea97d839e60d742ccf9a6de5bf039c0b53 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Sat, 11 Aug 2007 10:27:08 +0800
Subject: [GFS2] unneeded typecast

sb->s_fs_info is a void pointer, thus the type cast is not needed.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 32b2859a89a..25cfab95eb9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -849,7 +849,7 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 		error = -ENOENT;
 		goto error;
 	}
-	sdp = (struct gfs2_sbd*) sb->s_fs_info;
+	sdp = sb->s_fs_info;
 	if (sdp->sd_vfs_meta) {
 		printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
 		error = -EBUSY;
-- 
cgit v1.2.3


From 5d35e31f43c4910d0b6afc5160728a84bbaf86f0 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Mon, 13 Aug 2007 11:01:58 +0800
Subject: [GFS2] better code for translating characters

the original code could work, but I think this code could work better.

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 25cfab95eb9..6c820cb2347 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -144,7 +144,8 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
 	snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
 	snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
 
-	while ((table = strchr(sdp->sd_table_name, '/')))
+	table = sdp->sd_table_name;
+	while ((table = strchr(table, '/')))
 		*table = '_';
 
 out:
-- 
cgit v1.2.3


From 0fd5355470ea40355b8af76d01748ec7b9926d4d Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Tue, 14 Aug 2007 15:34:58 -0500
Subject: [GFS2] Force unstuff of hidden quota inode

This patch forcibly unstuffs (if stuffed) the hidden quota inode at the
first availble opportunity. In any practical scenario the quota inode
won't be stuffed, so this is ok to do. Unstuffing the quota inode allows
us to ignore the case of a stuffed quota inode in gfs2_adjust_quota().

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6e546ee8f3d..5dfa4656122 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -614,6 +614,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
 	s64 value;
 	int err = -EIO;
 
+	if (gfs2_is_stuffed(ip)) {
+		struct gfs2_alloc *al = NULL;
+		al = gfs2_alloc_get(ip);
+		/* just request 1 blk */
+		al->al_requested = 1;
+		gfs2_inplace_reserve(ip);
+		gfs2_unstuff_dinode(ip, NULL);
+		gfs2_inplace_release(ip);
+		gfs2_alloc_put(ip);
+	}
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 34eaae398e29dadeed95efd30f1eb694e5932b34 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Wed, 15 Aug 2007 23:54:44 +0800
Subject: [GFS2] fixed a NULL pointer assignment BUG

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 6c820cb2347..c1c6672ebb8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -292,8 +292,9 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
 		fs_err(sdp, "can't get root dentry\n");
 		error = -ENOMEM;
 		iput(inode);
-	}
-	sb->s_root->d_op = &gfs2_dops;
+	} else
+		sb->s_root->d_op = &gfs2_dops;
+	
 out:
 	gfs2_glock_dq_uninit(&sb_gh);
 	return error;
-- 
cgit v1.2.3


From 2d9a4bbf6d28673f4057682cc02d16bf288b4a35 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Wed, 15 Aug 2007 11:25:05 -0500
Subject: [GFS2] Fix quota do_list operation hang

This is the filesystem part of the patches to fix this bz. There are
additional userland patches (gfs2_quota, libgfs2) for the complete
solution. This patch adds a new field qu_ll_next to the gfs2_quota
structure. This field allows us to create linked lists of quotas in the
ondisk quota inode. Instead of scanning through the entire sparse quota
file for valid quotas, we can now simply walk through the user and group
quota linked lists to perform the do_list operation.

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 5dfa4656122..addb51e0f13 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -70,6 +70,7 @@ struct gfs2_quota_host {
 	u64 qu_limit;
 	u64 qu_warn;
 	s64 qu_value;
+	u32 qu_ll_next;
 };
 
 struct gfs2_quota_change_host {
@@ -580,6 +581,7 @@ static void gfs2_quota_in(struct gfs2_quota_host *qu, const void *buf)
 	qu->qu_limit = be64_to_cpu(str->qu_limit);
 	qu->qu_warn = be64_to_cpu(str->qu_warn);
 	qu->qu_value = be64_to_cpu(str->qu_value);
+	qu->qu_ll_next = be32_to_cpu(str->qu_ll_next);
 }
 
 static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
@@ -589,6 +591,7 @@ static void gfs2_quota_out(const struct gfs2_quota_host *qu, void *buf)
 	str->qu_limit = cpu_to_be64(qu->qu_limit);
 	str->qu_warn = cpu_to_be64(qu->qu_warn);
 	str->qu_value = cpu_to_be64(qu->qu_value);
+	str->qu_ll_next = cpu_to_be32(qu->qu_ll_next);
 	memset(&str->qu_reserved, 0, sizeof(str->qu_reserved));
 }
 
-- 
cgit v1.2.3


From bb3b0e3df5420fdf2c6bbb4417525c6d2ef55bbb Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 16 Aug 2007 16:03:57 +0100
Subject: [GFS2] Clean up invalidatepage/releasepage

This patch fixes some bugs relating to journaled data files by cleaning
up the gfs2_invalidatepage() and gfs2_releasepage() functions. We now
never block during gfs2_releasepage(), instead we always either release
or refuse to release depending on the status of the buffers.

This fixes Red Hat bugzillas #248969 and #252392.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/glops.c       |   4 +-
 fs/gfs2/log.c         |   6 ++-
 fs/gfs2/ops_address.c | 132 ++++++--------------------------------------------
 fs/gfs2/ops_fstype.c  |   2 +
 4 files changed, 24 insertions(+), 120 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 777ca46010e..88342e0b4bc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -156,9 +156,11 @@ static void inode_go_sync(struct gfs2_glock *gl)
 		ip = NULL;
 
 	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		if (ip)
+		if (ip && !gfs2_is_jdata(ip))
 			filemap_fdatawrite(ip->i_inode.i_mapping);
 		gfs2_log_flush(gl->gl_sbd, gl);
+		if (ip && gfs2_is_jdata(ip))
+			filemap_fdatawrite(ip->i_inode.i_mapping);
 		gfs2_meta_sync(gl);
 		if (ip) {
 			struct address_space *mapping = ip->i_inode.i_mapping;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 00ab6c070a1..d0e6b42c86e 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -229,8 +229,10 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 		list_del(&bd->bd_ail_st_list);
 		list_del(&bd->bd_ail_gl_list);
 		atomic_dec(&bd->bd_gl->gl_ail_count);
-		bh_ip = GFS2_I(bd->bd_bh->b_page->mapping->host);
-		gfs2_meta_cache_flush(bh_ip);
+		if (bd->bd_bh->b_page->mapping) {
+			bh_ip = GFS2_I(bd->bd_bh->b_page->mapping->host);
+			gfs2_meta_cache_flush(bh_ip);
+		}
 		brelse(bd->bd_bh);
 	}
 }
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 42a5f58f6fc..8407d1db4ea 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -616,58 +616,13 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
 	return dblock;
 }
 
-static void discard_buffer(struct gfs2_sbd *sdp, struct buffer_head *bh)
-{
-	struct gfs2_bufdata *bd;
-
-	gfs2_log_lock(sdp);
-	bd = bh->b_private;
-	if (bd) {
-		bd->bd_bh = NULL;
-		bh->b_private = NULL;
-		if (!bd->bd_ail && list_empty(&bd->bd_le.le_list))
-			kmem_cache_free(gfs2_bufdata_cachep, bd);
-	}
-	gfs2_log_unlock(sdp);
-
-	lock_buffer(bh);
-	clear_buffer_dirty(bh);
-	bh->b_bdev = NULL;
-	clear_buffer_mapped(bh);
-	clear_buffer_req(bh);
-	clear_buffer_new(bh);
-	clear_buffer_delay(bh);
-	unlock_buffer(bh);
-}
-
 static void gfs2_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
-	struct buffer_head *head, *bh, *next;
-	unsigned int curr_off = 0;
-
 	BUG_ON(!PageLocked(page));
 	if (offset == 0)
 		ClearPageChecked(page);
-	if (!page_has_buffers(page))
-		return;
 
-	bh = head = page_buffers(page);
-	do {
-		unsigned int next_off = curr_off + bh->b_size;
-		next = bh->b_this_page;
-
-		if (offset <= curr_off)
-			discard_buffer(sdp, bh);
-
-		curr_off = next_off;
-		bh = next;
-	} while (bh != head);
-
-	if (!offset)
-		try_to_release_page(page, 0);
-
-	return;
+	block_invalidatepage(page, offset);
 }
 
 /**
@@ -735,59 +690,6 @@ out:
 	return rv;
 }
 
-/**
- * stuck_releasepage - We're stuck in gfs2_releasepage().  Print stuff out.
- * @bh: the buffer we're stuck on
- *
- */
-
-static void stuck_releasepage(struct buffer_head *bh)
-{
-	struct inode *inode = bh->b_page->mapping->host;
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_bufdata *bd = bh->b_private;
-	struct gfs2_glock *gl;
-static unsigned limit = 0;
-
-	if (limit > 3)
-		return;
-	limit++;
-
-	fs_warn(sdp, "stuck in gfs2_releasepage() %p\n", inode);
-	fs_warn(sdp, "blkno = %llu, bh->b_count = %d\n",
-		(unsigned long long)bh->b_blocknr, atomic_read(&bh->b_count));
-	fs_warn(sdp, "pinned = %u\n", buffer_pinned(bh));
-	fs_warn(sdp, "bh->b_private = %s\n", (bd) ? "!NULL" : "NULL");
-
-	if (!bd)
-		return;
-
-	gl = bd->bd_gl;
-
-	fs_warn(sdp, "gl = (%u, %llu)\n",
-		gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number);
-
-	fs_warn(sdp, "bd_list_tr = %s, bd_le.le_list = %s\n",
-		(list_empty(&bd->bd_list_tr)) ? "no" : "yes",
-		(list_empty(&bd->bd_le.le_list)) ? "no" : "yes");
-
-	if (gl->gl_ops == &gfs2_inode_glops) {
-		struct gfs2_inode *ip = gl->gl_object;
-		unsigned int x;
-
-		if (!ip)
-			return;
-
-		fs_warn(sdp, "ip = %llu %llu\n",
-			(unsigned long long)ip->i_no_formal_ino,
-			(unsigned long long)ip->i_no_addr);
-
-		for (x = 0; x < GFS2_MAX_META_HEIGHT; x++)
-			fs_warn(sdp, "ip->i_cache[%u] = %s\n",
-				x, (ip->i_cache[x]) ? "!NULL" : "NULL");
-	}
-}
-
 /**
  * gfs2_releasepage - free the metadata associated with a page
  * @page: the page that's being released
@@ -805,38 +707,31 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 	struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info;
 	struct buffer_head *bh, *head;
 	struct gfs2_bufdata *bd;
-	unsigned long t = jiffies + gfs2_tune_get(sdp, gt_stall_secs) * HZ;
 
 	if (!page_has_buffers(page))
 		goto out;
 
+	gfs2_log_lock(sdp);
 	head = bh = page_buffers(page);
 	do {
-		while (atomic_read(&bh->b_count)) {
-			if (!atomic_read(&aspace->i_writecount))
-				return 0;
-
-			if (!(gfp_mask & __GFP_WAIT))
-				return 0;
-
-			if (time_after_eq(jiffies, t)) {
-				stuck_releasepage(bh);
-				/* should we withdraw here? */
-				return 0;
-			}
-
-			yield();
-		}
-
+		if (atomic_read(&bh->b_count))
+			goto cannot_release;
+		bd = bh->b_private;
+		if (bd && bd->bd_ail)
+			goto cannot_release;
 		gfs2_assert_warn(sdp, !buffer_pinned(bh));
 		gfs2_assert_warn(sdp, !buffer_dirty(bh));
+		bh = bh->b_this_page;
+	} while(bh != head);
+	gfs2_log_unlock(sdp);
 
+	head = bh = page_buffers(page);
+	do {
 		gfs2_log_lock(sdp);
 		bd = bh->b_private;
 		if (bd) {
 			gfs2_assert_warn(sdp, bd->bd_bh == bh);
 			gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
-			gfs2_assert_warn(sdp, !bd->bd_ail);
 			bd->bd_bh = NULL;
 			if (!list_empty(&bd->bd_le.le_list))
 				bd = NULL;
@@ -851,6 +746,9 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 
 out:
 	return try_to_free_buffers(page);
+cannot_release:
+	gfs2_log_unlock(sdp);
+	return 0;
 }
 
 const struct address_space_operations gfs2_file_aops = {
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1c6672ebb8..9e0e9be1e41 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -35,6 +35,7 @@
 #include "super.h"
 #include "sys.h"
 #include "util.h"
+#include "log.h"
 
 #define DO 0
 #define UNDO 1
@@ -887,6 +888,7 @@ error:
 static void gfs2_kill_sb(struct super_block *sb)
 {
 	gfs2_delete_debugfs_file(sb->s_fs_info);
+	gfs2_meta_syncfs(sb->s_fs_info);
 	kill_block_super(sb);
 }
 
-- 
cgit v1.2.3


From 382e6e256b0cb1a84a45a520cef75d1b8ff44663 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 16 Aug 2007 17:08:20 +0100
Subject: [GFS2] Add a missing gfs2_trans_add_bh()

This was missing from the dir_split_leaf() function although in
most cases its not a problem due to other functions having
already previously called gfs2_trans_add_bh. This makes certain
that it is correct.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Wendy Cheng <wcheng@redhat.com>
---
 fs/gfs2/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2beb2f401aa..08c6dd0c671 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1043,6 +1043,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 
 	error = gfs2_meta_inode_buffer(dip, &dibh);
 	if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
+		gfs2_trans_add_bh(dip->i_gl, dibh, 1);
 		dip->i_di.di_blocks++;
 		gfs2_set_inode_blocks(&dip->i_inode);
 		gfs2_dinode_out(dip, dibh->b_data);
-- 
cgit v1.2.3


From 9a5ad13856cbd10be429f09517c51277c02530f7 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Fri, 17 Aug 2007 20:22:07 -0500
Subject: [GFS2] Add NULL entry to token table

match_token() was returning garbage data instead of a fail value. This data
happened to match a valid option id for an option that required an argument (in
this case, lockproto=%s) For match_token() to correctly fail if the option
doesn't match any of the tokens, the token table must end with a NULL entry.
This patch adds the NULL entry.

Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/mount.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 4864659555d..b941f9f9f95 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,6 +42,7 @@ enum {
 	Opt_nosuiddir,
 	Opt_data_writeback,
 	Opt_data_ordered,
+	Opt_err,
 };
 
 static match_table_t tokens = {
@@ -64,7 +65,8 @@ static match_table_t tokens = {
 	{Opt_suiddir, "suiddir"},
 	{Opt_nosuiddir, "nosuiddir"},
 	{Opt_data_writeback, "data=writeback"},
-	{Opt_data_ordered, "data=ordered"}
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_err, NULL}
 };
 
 /**
@@ -237,6 +239,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 		case Opt_data_ordered:
 			args->ar_data = GFS2_DATA_ORDERED;
 			break;
+		case Opt_err:
 		default:
 			fs_info(sdp, "unknown option: %s\n", o);
 			error = -EINVAL;
-- 
cgit v1.2.3


From a13b8c5f2381495879e6facd3b3ada51c9e68194 Mon Sep 17 00:00:00 2001
From: Wendy Cheng <wcheng@redhat.com>
Date: Mon, 20 Aug 2007 09:29:53 -0400
Subject: [GFS2] Reduce truncate IO traffic

Current GFS2 setattr call unconditionally invokes do_shrink even the
requested size and actual file size are equal. This has generated large
amount of extra IOs found during NFS benchmark runs. This patch moves
the relevant logic out of shrink code path. Since setattr is a system
call, the time stamps update is still required.

Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index cd805a66880..9b8990444e6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1085,6 +1085,33 @@ static int do_shrink(struct gfs2_inode *ip, u64 size)
 	return error;
 }
 
+static int do_touch(struct gfs2_inode *ip, u64 size)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *dibh;
+	int error;
+
+	error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (error)
+		return error;
+
+	down_write(&ip->i_rw_mutex);
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto do_touch_out;
+
+	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+	gfs2_dinode_out(ip, dibh->b_data);
+	brelse(dibh);
+
+do_touch_out:
+	up_write(&ip->i_rw_mutex);
+	gfs2_trans_end(sdp);
+	return error;
+}
+
 /**
  * gfs2_truncatei - make a file a given size
  * @ip: the inode
@@ -1105,8 +1132,11 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
 
 	if (size > ip->i_di.di_size)
 		error = do_grow(ip, size);
-	else
+	else if (size < ip->i_di.di_size)
 		error = do_shrink(ip, size);
+	else
+		/* update time stamps */
+		error = do_touch(ip, size);
 
 	return error;
 }
-- 
cgit v1.2.3


From 61d96be0f474df354c2ff4a2b2bf410b23a5cd60 Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Mon, 20 Aug 2007 15:13:38 +0100
Subject: [DLM] Fix lowcomms socket closing

This patch fixes the slight mess made in lowcomms closing by previous patches
and fixes all sorts of DLM hangs.

Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/lowcomms.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9e9d2e82f40..62a8a6ccd99 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -334,18 +334,8 @@ static void close_connection(struct connection *con, bool and_other)
 		con->rx_page = NULL;
 	}
 
-	/* If we are an 'othercon' then NULL the pointer to us
-	   from the parent and tidy ourself up */
-	if (test_bit(CF_IS_OTHERCON, &con->flags)) {
-		struct connection *parent = __nodeid2con(con->nodeid, 0);
-		parent->othercon = NULL;
-		kmem_cache_free(con_cache, con);
-	}
-	else {
-		/* Parent connections get reused */
-		con->retries = 0;
-		mutex_unlock(&con->sock_mutex);
-	}
+	con->retries = 0;
+	mutex_unlock(&con->sock_mutex);
 }
 
 /* We only send shutdown messages to nodes that are not part of the cluster */
@@ -731,6 +721,8 @@ static int tcp_accept_from_sock(struct connection *con)
 			INIT_WORK(&othercon->swork, process_send_sockets);
 			INIT_WORK(&othercon->rwork, process_recv_sockets);
 			set_bit(CF_IS_OTHERCON, &othercon->flags);
+		}
+		if (!othercon->sock) {
 			newcon->othercon = othercon;
 			othercon->sock = newsock;
 			newsock->sk->sk_user_data = othercon;
-- 
cgit v1.2.3


From a947e0335699a1d387c3826e5b8eff9e0afe505e Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Tue, 21 Aug 2007 09:57:29 -0500
Subject: [GFS2] Wendy's dump lockname in hex & fix glock dump

With this patch, gfs2 glockdump through the debugfs filesystem will only
dump glocks for the specified filesystem instead of all glocks. Also, to
aid debugging, the glock number is dumped in hex instead of decimal.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Abhijith Das <adas@redhat.com>
---
 fs/gfs2/glock.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 559937c710f..3d949187fed 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1844,7 +1844,7 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
 
 	spin_lock(&gl->gl_spin);
 
-	print_dbg(gi, "Glock 0x%p (%u, %llu)\n", gl, gl->gl_name.ln_type,
+	print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
 		   (unsigned long long)gl->gl_name.ln_number);
 	print_dbg(gi, "  gl_flags =");
 	for (x = 0; x < 32; x++) {
@@ -2024,20 +2024,21 @@ static int gfs2_glock_iter_next(struct glock_iter *gi)
 {
 	struct gfs2_glock *gl;
 
+restart:
 	read_lock(gl_lock_addr(gi->hash));
 	gl = gi->gl;
 	if (gl) {
-		gi->gl = hlist_entry(gl->gl_list.next, struct gfs2_glock,
-				     gl_list);
+		gi->gl = hlist_entry(gl->gl_list.next,
+				     struct gfs2_glock, gl_list);
 		if (gi->gl)
 			gfs2_glock_hold(gi->gl);
 	}
 	read_unlock(gl_lock_addr(gi->hash));
 	if (gl)
 		gfs2_glock_put(gl);
-
-	while(gi->gl == NULL) {
+	if (gl && gi->gl == NULL)
 		gi->hash++;
+	while(gi->gl == NULL) {
 		if (gi->hash >= GFS2_GL_HASH_SIZE)
 			return 1;
 		read_lock(gl_lock_addr(gi->hash));
@@ -2046,7 +2047,12 @@ static int gfs2_glock_iter_next(struct glock_iter *gi)
 		if (gi->gl)
 			gfs2_glock_hold(gi->gl);
 		read_unlock(gl_lock_addr(gi->hash));
+		gi->hash++;
 	}
+
+	if (gi->sdp != gi->gl->gl_sbd)
+		goto restart;
+
 	return 0;
 }
 
@@ -2068,16 +2074,10 @@ static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
 	gi->sdp = sdp;
 	gi->hash = 0;
 	gi->seq = NULL;
+	gi->gl = NULL;
 	memset(gi->string, 0, sizeof(gi->string));
 
-	read_lock(gl_lock_addr(gi->hash));
-	gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
-			     struct gfs2_glock, gl_list);
-	if (gi->gl)
-		gfs2_glock_hold(gi->gl);
-	read_unlock(gl_lock_addr(gi->hash));
-
-	if (!gi->gl && gfs2_glock_iter_next(gi)) {
+	if (gfs2_glock_iter_next(gi)) {
 		gfs2_glock_iter_free(gi);
 		return NULL;
 	}
-- 
cgit v1.2.3


From ec217e0ece60f2240772e6f08e0529775846c627 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 22 Aug 2007 11:15:29 -0500
Subject: [GFS2] Patch to protect sd_log_num_jdata

This is a patch to GFS2 to protect sd_log_num_jdata with the
gfs2_log_lock.  Without this patch, there is a timing window
where you can get hit the following assert from function
gfs2_log_flush():

gfs2_assert_withdraw(sdp,
			sdp->sd_log_num_buf + sdp->sd_log_num_jdata ==
			sdp->sd_log_commited_buf +
			sdp->sd_log_commited_databuf);

I've tested it on my roth cluster and it fixes the problem.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index a0371f835cf..7ef33562337 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -492,11 +492,12 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 
 	gfs2_trans_add_gl(bd->bd_gl);
 	if (gfs2_is_jdata(ip)) {
-		sdp->sd_log_num_jdata++;
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
 	}
 	gfs2_log_lock(sdp);
+	if (gfs2_is_jdata(ip))
+		sdp->sd_log_num_jdata++;
 	sdp->sd_log_num_databuf++;
 	list_add(&le->le_list, &sdp->sd_log_le_databuf);
 	gfs2_log_unlock(sdp);
-- 
cgit v1.2.3


From d1e2777d4f419a865ddccdb9b3412021d0e4de51 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Thu, 23 Aug 2007 13:33:01 -0500
Subject: [GFS2] panic after can't parse mount arguments

When you try to mount gfs2 with -o garbage, the mount fails and the gfs2
superblock is deallocated and becomes NULL. The vfs comes around later
on and calls gfs2_kill_sb. At this point the hidden gfs2 superblock
pointer (sb->s_fs_info) is NULL and dereferencing it through
gfs2_meta_syncfs causes the panic. (the other function call to
gfs2_delete_debugfs_file() succeeds because this function already checks
for a NULL pointer)

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 9e0e9be1e41..314c1134d12 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -887,8 +887,10 @@ error:
 
 static void gfs2_kill_sb(struct super_block *sb)
 {
-	gfs2_delete_debugfs_file(sb->s_fs_info);
-	gfs2_meta_syncfs(sb->s_fs_info);
+	if (sb->s_fs_info) {
+		gfs2_delete_debugfs_file(sb->s_fs_info);
+		gfs2_meta_syncfs(sb->s_fs_info);
+	}
 	kill_block_super(sb);
 }
 
-- 
cgit v1.2.3


From c4f68a130fc1795e4a75ec5bdaf9e85d86c22419 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Thu, 23 Aug 2007 13:19:05 -0500
Subject: [GFS2] delay glock demote for a minimum hold time

When a lot of IO, with some distributed mmap IO, is run on a GFS2 filesystem in
a cluster, it will deadlock. The reason is that do_no_page() will repeatedly
call gfs2_sharewrite_nopage(), because each node keeps giving up the glock
too early, and is forced to call unmap_mapping_range(). This bumps the
mapping->truncate_count sequence count, forcing do_no_page() to retry. This
patch institutes a minimum glock hold time a tenth a second.  This insures
that even in heavy contention cases, the node has enough time to get some
useful work done before it gives up the glock.

A second issue is that when gfs2_glock_dq() is called from within a page fault
to demote a lock, and the associated page needs to be written out, it will
try to acqire a lock on it, but it has already been locked at a higher level.
This patch puts makes gfs2_glock_dq() use the work queue as well, to avoid this
issue. This is the same patch as Steve Whitehouse originally proposed to fix
this issue, execpt that gfs2_glock_dq() now grabs a reference to the glock
before it queues up the work on it.

Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c  | 75 ++++++++++++++++++++++++++++++++++++++++++++------------
 fs/gfs2/glops.c  |  2 ++
 fs/gfs2/incore.h |  5 ++++
 3 files changed, 66 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 3d949187fed..931368a385c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -27,6 +27,8 @@
 #include <linux/debugfs.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/workqueue.h>
+#include <linux/jiffies.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -58,10 +60,13 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
 static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
 static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
 static void gfs2_glock_drop_th(struct gfs2_glock *gl);
+static void run_queue(struct gfs2_glock *gl);
+
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
 static struct task_struct *scand_process;
 static unsigned int scand_secs = 5;
+static struct workqueue_struct *glock_workqueue;
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
@@ -277,6 +282,18 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
 	return gl;
 }
 
+static void glock_work_func(struct work_struct *work)
+{
+	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+
+	spin_lock(&gl->gl_spin);
+	if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
+		set_bit(GLF_DEMOTE, &gl->gl_flags);
+	run_queue(gl);
+	spin_unlock(&gl->gl_spin);
+	gfs2_glock_put(gl);
+}
+
 /**
  * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
  * @sdp: The GFS2 superblock
@@ -316,6 +333,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_name = name;
 	atomic_set(&gl->gl_ref, 1);
 	gl->gl_state = LM_ST_UNLOCKED;
+	gl->gl_demote_state = LM_ST_EXCLUSIVE;
 	gl->gl_hash = hash;
 	gl->gl_owner_pid = 0;
 	gl->gl_ip = 0;
@@ -324,10 +342,12 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_req_bh = NULL;
 	gl->gl_vn = 0;
 	gl->gl_stamp = jiffies;
+	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
 	gl->gl_aspace = NULL;
 	lops_init_le(&gl->gl_le, &gfs2_glock_lops);
+	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
 
 	/* If this glock protects actual on-disk data or metadata blocks,
 	   create a VFS inode to manage the pages/buffers holding them. */
@@ -441,6 +461,8 @@ static void wait_on_holder(struct gfs2_holder *gh)
 
 static void gfs2_demote_wake(struct gfs2_glock *gl)
 {
+	BUG_ON(!spin_is_locked(&gl->gl_spin));
+	gl->gl_demote_state = LM_ST_EXCLUSIVE;
         clear_bit(GLF_DEMOTE, &gl->gl_flags);
         smp_mb__after_clear_bit();
         wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
@@ -682,10 +704,14 @@ static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
  * practise: LM_ST_SHARED and LM_ST_UNLOCKED
  */
 
-static void handle_callback(struct gfs2_glock *gl, unsigned int state, int remote)
+static void handle_callback(struct gfs2_glock *gl, unsigned int state,
+			    int remote, unsigned long delay)
 {
+	int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
+
 	spin_lock(&gl->gl_spin);
-	if (test_and_set_bit(GLF_DEMOTE, &gl->gl_flags) == 0) {
+	set_bit(bit, &gl->gl_flags);
+	if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
 		gl->gl_demote_state = state;
 		gl->gl_demote_time = jiffies;
 		if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
@@ -727,6 +753,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 	}
 
 	gl->gl_state = new_state;
+	gl->gl_tchange = jiffies;
 }
 
 /**
@@ -813,7 +840,6 @@ out:
 		gl->gl_req_gh = NULL;
 		gl->gl_req_bh = NULL;
 		clear_bit(GLF_LOCK, &gl->gl_flags);
-		run_queue(gl);
 		spin_unlock(&gl->gl_spin);
 	}
 
@@ -885,7 +911,6 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	gfs2_assert_warn(sdp, !ret);
 
 	state_change(gl, LM_ST_UNLOCKED);
-	gfs2_demote_wake(gl);
 
 	if (glops->go_inval)
 		glops->go_inval(gl, DIO_METADATA);
@@ -898,10 +923,10 @@ static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
 	}
 
 	spin_lock(&gl->gl_spin);
+	gfs2_demote_wake(gl);
 	gl->gl_req_gh = NULL;
 	gl->gl_req_bh = NULL;
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	run_queue(gl);
 	spin_unlock(&gl->gl_spin);
 
 	gfs2_glock_put(gl);
@@ -1209,9 +1234,10 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
+	unsigned delay = 0;
 
 	if (gh->gh_flags & GL_NOCACHE)
-		handle_callback(gl, LM_ST_UNLOCKED, 0);
+		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
 
 	gfs2_glmutex_lock(gl);
 
@@ -1229,8 +1255,14 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 	}
 
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	run_queue(gl);
 	spin_unlock(&gl->gl_spin);
+
+	gfs2_glock_hold(gl);
+	if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
+		delay = gl->gl_ops->go_min_hold_time;
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+		gfs2_glock_put(gl);
 }
 
 void gfs2_glock_dq_wait(struct gfs2_holder *gh)
@@ -1457,18 +1489,21 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 			unsigned int state)
 {
 	struct gfs2_glock *gl;
+	unsigned long delay = 0;
+	unsigned long holdtime;
+	unsigned long now = jiffies;
 
 	gl = gfs2_glock_find(sdp, name);
 	if (!gl)
 		return;
 
-	handle_callback(gl, state, 1);
-
-	spin_lock(&gl->gl_spin);
-	run_queue(gl);
-	spin_unlock(&gl->gl_spin);
+	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+	if (time_before(now, holdtime))
+		delay = holdtime - now;
 
-	gfs2_glock_put(gl);
+	handle_callback(gl, state, 1, delay);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+		gfs2_glock_put(gl);
 }
 
 /**
@@ -1509,7 +1544,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
 			return;
 		if (!gfs2_assert_warn(sdp, gl->gl_req_bh))
 			gl->gl_req_bh(gl, async->lc_ret);
-		gfs2_glock_put(gl);
+		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			gfs2_glock_put(gl);
 		up_read(&gfs2_umount_flush_sem);
 		return;
 	}
@@ -1602,7 +1638,7 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
 	if (gfs2_glmutex_trylock(gl)) {
 		if (list_empty(&gl->gl_holders) &&
 		    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
-			handle_callback(gl, LM_ST_UNLOCKED, 0);
+			handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
 		gfs2_glmutex_unlock(gl);
 	}
 
@@ -1702,7 +1738,7 @@ static void clear_glock(struct gfs2_glock *gl)
 	if (gfs2_glmutex_trylock(gl)) {
 		if (list_empty(&gl->gl_holders) &&
 		    gl->gl_state != LM_ST_UNLOCKED)
-			handle_callback(gl, LM_ST_UNLOCKED, 0);
+			handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
 		gfs2_glmutex_unlock(gl);
 	}
 }
@@ -2009,11 +2045,18 @@ int __init gfs2_glock_init(void)
 	if (IS_ERR(scand_process))
 		return PTR_ERR(scand_process);
 
+	glock_workqueue = create_workqueue("glock_workqueue");
+	if (IS_ERR(glock_workqueue)) {
+		kthread_stop(scand_process);
+		return PTR_ERR(glock_workqueue);
+	}
+
 	return 0;
 }
 
 void gfs2_glock_exit(void)
 {
+	destroy_workqueue(glock_workqueue);
 	kthread_stop(scand_process);
 }
 
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 88342e0b4bc..7ef6b23bb38 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -454,6 +454,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_lock = inode_go_lock,
 	.go_unlock = inode_go_unlock,
 	.go_type = LM_TYPE_INODE,
+	.go_min_hold_time = HZ / 10,
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -464,6 +465,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_lock = rgrp_go_lock,
 	.go_unlock = rgrp_go_unlock,
 	.go_type = LM_TYPE_RGRP,
+	.go_min_hold_time = HZ / 10,
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1390b30daf1..23b611aa70d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -11,6 +11,7 @@
 #define __INCORE_DOT_H__
 
 #include <linux/fs.h>
+#include <linux/workqueue.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -130,6 +131,7 @@ struct gfs2_glock_operations {
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
 	const int go_type;
+	const unsigned long go_min_hold_time;
 };
 
 enum {
@@ -161,6 +163,7 @@ enum {
 	GLF_LOCK		= 1,
 	GLF_STICKY		= 2,
 	GLF_DEMOTE		= 3,
+	GLF_PENDING_DEMOTE	= 4,
 	GLF_DIRTY		= 5,
 };
 
@@ -193,6 +196,7 @@ struct gfs2_glock {
 
 	u64 gl_vn;
 	unsigned long gl_stamp;
+	unsigned long gl_tchange;
 	void *gl_object;
 
 	struct list_head gl_reclaim;
@@ -203,6 +207,7 @@ struct gfs2_glock {
 	struct gfs2_log_element gl_le;
 	struct list_head gl_ail_list;
 	atomic_t gl_ail_count;
+	struct delayed_work gl_work;
 };
 
 struct gfs2_alloc {
-- 
cgit v1.2.3


From e9bd2b3bafd29bf75522546207f0bba0ec4515c2 Mon Sep 17 00:00:00 2001
From: Wendy Cheng <wcheng@redhat.com>
Date: Fri, 24 Aug 2007 09:15:01 -0400
Subject: [GFS2] fix inode meta data corruption

Fix a nasty inode meta data corruption issue by keeping the buffer head in
icache array. This buffer needs to stay in memory until journal flush occurs
Otherwise, gfs2_meta_inode_buffer could do a disk read before the inode hits
disk. It ends up with meta data corruptions. The buffer will be released as
part of the existing journal flush logic.

Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 34f7bcdea1e..013f00b90cc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -244,6 +244,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	return 0;
 }
 
+static void gfs2_inode_bh(struct gfs2_inode *ip, struct buffer_head *bh)
+{
+	ip->i_cache[0] = bh;
+}
+
 /**
  * gfs2_inode_refresh - Refresh the incore copy of the dinode
  * @ip: The GFS2 inode
@@ -688,7 +693,7 @@ out:
 static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 			const struct gfs2_inum_host *inum, unsigned int mode,
 			unsigned int uid, unsigned int gid,
-			const u64 *generation, dev_t dev)
+			const u64 *generation, dev_t dev, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_dinode *di;
@@ -743,13 +748,15 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
 	di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
 	memset(&di->di_reserved, 0, sizeof(di->di_reserved));
+	
+	set_buffer_uptodate(dibh);
 
-	brelse(dibh);
+	*bhp = dibh;
 }
 
 static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 		       unsigned int mode, const struct gfs2_inum_host *inum,
-		       const u64 *generation, dev_t dev)
+		       const u64 *generation, dev_t dev, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	unsigned int uid, gid;
@@ -770,7 +777,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	if (error)
 		goto out_quota;
 
-	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev);
+	init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp);
 	gfs2_quota_change(dip, +1, uid, gid);
 	gfs2_trans_end(sdp);
 
@@ -909,6 +916,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
 	int error;
 	u64 generation;
+	struct buffer_head *bh=NULL;
 
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -935,7 +943,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (error)
 		goto fail_gunlock;
 
-	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev);
+	error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh);
 	if (error)
 		goto fail_gunlock2;
 
@@ -945,6 +953,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
+	gfs2_inode_bh(GFS2_I(inode), bh);
+
 	error = gfs2_inode_refresh(GFS2_I(inode));
 	if (error)
 		goto fail_gunlock2;
-- 
cgit v1.2.3


From 8497a46e178addb27ad1c981befaa17ca788b5c3 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Sun, 26 Aug 2007 14:23:56 +0100
Subject: [GFS2] Correct lock ordering in unlink

This patch corrects the lock ordering in unlink to be the same as
that in the rest of GFS2, i.e. parent -> child -> rgrp.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_inode.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 5b8b994b991..2cbe5a321e8 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -278,17 +278,25 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
 
 
-	error = gfs2_glock_nq_m(3, ghs);
+	error = gfs2_glock_nq(ghs); /* parent */
 	if (error)
-		goto out;
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = gfs2_glock_nq(ghs + 2); /* rgrp */
+	if (error)
+		goto out_rgrp;
 
 	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
 	if (error)
-		goto out_gunlock;
+		goto out_rgrp;
 
 	error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
 	if (error)
-		goto out_gunlock;
+		goto out_rgrp;
 
 	error = gfs2_dir_del(dip, &dentry->d_name);
         if (error)
@@ -298,12 +306,15 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 
 out_end_trans:
 	gfs2_trans_end(sdp);
-out_gunlock:
-	gfs2_glock_dq_m(3, ghs);
-out:
-	gfs2_holder_uninit(ghs);
-	gfs2_holder_uninit(ghs + 1);
+	gfs2_glock_dq(ghs + 2);
+out_rgrp:
 	gfs2_holder_uninit(ghs + 2);
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_holder_uninit(ghs + 1);
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs);
 	gfs2_glock_dq_uninit(&ri_gh);
 	return error;
 }
-- 
cgit v1.2.3


From 1e1a3d03e927d39282208aed676e49d25129feea Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 27 Aug 2007 09:45:26 +0100
Subject: [GFS2] Introduce gfs2_remove_from_ail

This collects together the operations required to remove a gfs2_bufdata
from the ail lists. Its only called from two places to start with, but
expect to see more of this function in future.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c |  6 +-----
 fs/gfs2/log.c   | 31 +++++++++++++++++++++----------
 fs/gfs2/log.h   |  1 +
 3 files changed, 23 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 7ef6b23bb38..b17346a355b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -60,11 +60,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 		blkno = bh->b_blocknr;
 		gfs2_assert_withdraw(sdp, !buffer_busy(bh));
 
-		bd->bd_ail = NULL;
-		list_del(&bd->bd_ail_st_list);
-		list_del(&bd->bd_ail_gl_list);
-		atomic_dec(&gl->gl_ail_count);
-		brelse(bh);
+		gfs2_remove_from_ail(NULL, bd);
 		gfs2_log_unlock(sdp);
 
 		gfs2_trans_add_revoke(sdp, blkno);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index d0e6b42c86e..d8232ec2539 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -59,6 +59,26 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 	return blks;
 }
 
+/**
+ * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters
+ * @mapping: The associated mapping (maybe NULL)
+ * @bd: The gfs2_bufdata to remove
+ *
+ * The log lock _must_ be held when calling this function
+ *
+ */
+
+void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd)
+{
+	bd->bd_ail = NULL;
+	list_del(&bd->bd_ail_st_list);
+	list_del(&bd->bd_ail_gl_list);
+	atomic_dec(&bd->bd_gl->gl_ail_count);
+	if (mapping)
+		gfs2_meta_cache_flush(GFS2_I(mapping->host));
+	brelse(bd->bd_bh);
+}
+
 /**
  * gfs2_ail1_start_one - Start I/O on a part of the AIL
  * @sdp: the filesystem
@@ -219,21 +239,12 @@ static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 {
 	struct list_head *head = &ai->ai_ail2_list;
 	struct gfs2_bufdata *bd;
-	struct gfs2_inode *bh_ip;
 
 	while (!list_empty(head)) {
 		bd = list_entry(head->prev, struct gfs2_bufdata,
 				bd_ail_st_list);
 		gfs2_assert(sdp, bd->bd_ail == ai);
-		bd->bd_ail = NULL;
-		list_del(&bd->bd_ail_st_list);
-		list_del(&bd->bd_ail_gl_list);
-		atomic_dec(&bd->bd_gl->gl_ail_count);
-		if (bd->bd_bh->b_page->mapping) {
-			bh_ip = GFS2_I(bd->bd_bh->b_page->mapping->host);
-			gfs2_meta_cache_flush(bh_ip);
-		}
-		brelse(bd->bd_bh);
+		gfs2_remove_from_ail(bd->bd_bh->b_page->mapping, bd);
 	}
 }
 
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 8e7aa0f2910..639423561b2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -58,6 +58,7 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 				      struct buffer_head *real);
 void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
 void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd);
 
 void gfs2_log_shutdown(struct gfs2_sbd *sdp);
 void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
-- 
cgit v1.2.3


From eaf965270ffff3086ef929e660ace45e862cfd2d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 27 Aug 2007 09:49:37 +0100
Subject: [GFS2] Don't mark jdata dirty in gfs2_unstuffer_page()

Journaled data is marked dirty by gfs2_unpin and should not be marked
dirty here.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 9b8990444e6..1e56f4de735 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -95,7 +95,8 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 	set_buffer_uptodate(bh);
 	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
 		gfs2_trans_add_bh(ip->i_gl, bh, 0);
-	mark_buffer_dirty(bh);
+	if (!gfs2_is_jdata(ip))
+		mark_buffer_dirty(bh);
 
 	if (release) {
 		unlock_page(page);
-- 
cgit v1.2.3


From 9b9107a5a8b190e6cf09bbdf893869c6a9c482cc Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 27 Aug 2007 13:54:05 +0100
Subject: [GFS2] Move pin/unpin into lops.c, clean up locking

gfs2_pin and gfs2_unpin are only used in lops.c, despite being
defined in meta_io.c, so this patch moves them into lops.c and
makes them static. At the same time, its possible to clean up
the locking in the buf and databuf _lo_add() functions so that
we only need to grab the spinlock once. Also we have to move
lock_buffer() around the _lo_add() functions since we can't
do that in gfs2_pin() any more since we hold the spinlock
for the duration of that function.

As a result, the code shrinks by 12 lines and we do far fewer
operations when adding buffers to the log. It also makes the
code somewhat easier to read & understand.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c    | 117 +++++++++++++++++++++++++++++++++++++++++-------------
 fs/gfs2/meta_io.c |  70 --------------------------------
 fs/gfs2/meta_io.h |   3 --
 3 files changed, 89 insertions(+), 101 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 7ef33562337..3ec587135d4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -27,7 +27,71 @@
 #include "trans.h"
 #include "util.h"
 
-static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+/**
+ * gfs2_pin - Pin a buffer in memory
+ * @sdp: The superblock
+ * @bh: The buffer to be pinned
+ *
+ * The log lock must be held when calling this function
+ */
+static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	struct gfs2_bufdata *bd;
+
+	gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+
+	clear_buffer_dirty(bh);
+	if (test_set_buffer_pinned(bh))
+		gfs2_assert_withdraw(sdp, 0);
+	if (!buffer_uptodate(bh))
+		gfs2_io_error_bh(sdp, bh);
+	bd = bh->b_private;
+	/* If this buffer is in the AIL and it has already been written
+	 * to in-place disk block, remove it from the AIL.
+	 */
+	if (bd->bd_ail)
+		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
+	get_bh(bh);
+}
+
+/**
+ * gfs2_unpin - Unpin a buffer
+ * @sdp: the filesystem the buffer belongs to
+ * @bh: The buffer to unpin
+ * @ai:
+ *
+ */
+
+static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       struct gfs2_ail *ai)
+{
+	struct gfs2_bufdata *bd = bh->b_private;
+
+	gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
+
+	if (!buffer_pinned(bh))
+		gfs2_assert_withdraw(sdp, 0);
+
+	lock_buffer(bh);
+	mark_buffer_dirty(bh);
+	clear_buffer_pinned(bh);
+
+	gfs2_log_lock(sdp);
+	if (bd->bd_ail) {
+		list_del(&bd->bd_ail_st_list);
+		brelse(bh);
+	} else {
+		struct gfs2_glock *gl = bd->bd_gl;
+		list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
+		atomic_inc(&gl->gl_ail_count);
+	}
+	bd->bd_ail = ai;
+	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
+}
+
+static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
 	struct gfs2_glock *gl;
 	struct gfs2_trans *tr = current->journal_info;
@@ -38,15 +102,19 @@ static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(gl)))
 		return;
 
-	gfs2_log_lock(sdp);
-	if (!list_empty(&le->le_list)){
-		gfs2_log_unlock(sdp);
+	if (!list_empty(&le->le_list))
 		return;
-	}
+
 	gfs2_glock_hold(gl);
 	set_bit(GLF_DIRTY, &gl->gl_flags);
 	sdp->sd_log_num_gl++;
 	list_add(&le->le_list, &sdp->sd_log_le_gl);
+}
+
+static void glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
+{
+	gfs2_log_lock(sdp);
+	__glock_lo_add(sdp, le);
 	gfs2_log_unlock(sdp);
 }
 
@@ -71,30 +139,25 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
 	struct gfs2_trans *tr;
 
+	lock_buffer(bd->bd_bh);
 	gfs2_log_lock(sdp);
-	if (!list_empty(&bd->bd_list_tr)) {
-		gfs2_log_unlock(sdp);
-		return;
-	}
+	if (!list_empty(&bd->bd_list_tr))
+		goto out;
 	tr = current->journal_info;
 	tr->tr_touched = 1;
 	tr->tr_num_buf++;
 	list_add(&bd->bd_list_tr, &tr->tr_list_buf);
-	gfs2_log_unlock(sdp);
-
 	if (!list_empty(&le->le_list))
-		return;
-
-	gfs2_trans_add_gl(bd->bd_gl);
-
+		goto out;
+	__glock_lo_add(sdp, &bd->bd_gl->gl_le);
 	gfs2_meta_check(sdp, bd->bd_bh);
 	gfs2_pin(sdp, bd->bd_bh);
-	gfs2_log_lock(sdp);
 	sdp->sd_log_num_buf++;
 	list_add(&le->le_list, &sdp->sd_log_le_buf);
-	gfs2_log_unlock(sdp);
-
 	tr->tr_num_buf_new++;
+out:
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bd->bd_bh);
 }
 
 static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -476,31 +539,29 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	struct address_space *mapping = bd->bd_bh->b_page->mapping;
 	struct gfs2_inode *ip = GFS2_I(mapping->host);
 
+	lock_buffer(bd->bd_bh);
 	gfs2_log_lock(sdp);
-	if (!list_empty(&bd->bd_list_tr)) {
-		gfs2_log_unlock(sdp);
-		return;
-	}
+	if (!list_empty(&bd->bd_list_tr))
+		goto out;
 	tr->tr_touched = 1;
 	if (gfs2_is_jdata(ip)) {
 		tr->tr_num_buf++;
 		list_add(&bd->bd_list_tr, &tr->tr_list_buf);
 	}
-	gfs2_log_unlock(sdp);
 	if (!list_empty(&le->le_list))
-		return;
+		goto out;
 
-	gfs2_trans_add_gl(bd->bd_gl);
+	__glock_lo_add(sdp, &bd->bd_gl->gl_le);
 	if (gfs2_is_jdata(ip)) {
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
-	}
-	gfs2_log_lock(sdp);
-	if (gfs2_is_jdata(ip))
 		sdp->sd_log_num_jdata++;
+	}
 	sdp->sd_log_num_databuf++;
 	list_add(&le->le_list, &sdp->sd_log_le_databuf);
+out:
 	gfs2_log_unlock(sdp);
+	unlock_buffer(bd->bd_bh);
 }
 
 static int gfs2_check_magic(struct buffer_head *bh)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8da343b34ae..d762e4f7044 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -297,76 +297,6 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 		unlock_page(bh->b_page);
 }
 
-/**
- * gfs2_pin - Pin a buffer in memory
- * @sdp: the filesystem the buffer belongs to
- * @bh: The buffer to be pinned
- *
- */
-
-void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
-{
-	struct gfs2_bufdata *bd = bh->b_private;
-
-	gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
-
-	if (test_set_buffer_pinned(bh))
-		gfs2_assert_withdraw(sdp, 0);
-
-	wait_on_buffer(bh);
-
-	/* If this buffer is in the AIL and it has already been written
-	   to in-place disk block, remove it from the AIL. */
-
-	gfs2_log_lock(sdp);
-	if (bd->bd_ail && !buffer_in_io(bh))
-		list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
-	gfs2_log_unlock(sdp);
-
-	clear_buffer_dirty(bh);
-	wait_on_buffer(bh);
-
-	if (!buffer_uptodate(bh))
-		gfs2_io_error_bh(sdp, bh);
-
-	get_bh(bh);
-}
-
-/**
- * gfs2_unpin - Unpin a buffer
- * @sdp: the filesystem the buffer belongs to
- * @bh: The buffer to unpin
- * @ai:
- *
- */
-
-void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
-	        struct gfs2_ail *ai)
-{
-	struct gfs2_bufdata *bd = bh->b_private;
-
-	gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
-
-	if (!buffer_pinned(bh))
-		gfs2_assert_withdraw(sdp, 0);
-
-	mark_buffer_dirty(bh);
-	clear_buffer_pinned(bh);
-
-	gfs2_log_lock(sdp);
-	if (bd->bd_ail) {
-		list_del(&bd->bd_ail_st_list);
-		brelse(bh);
-	} else {
-		struct gfs2_glock *gl = bd->bd_gl;
-		list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list);
-		atomic_inc(&gl->gl_ail_count);
-	}
-	bd->bd_ail = ai;
-	list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
-	gfs2_log_unlock(sdp);
-}
-
 /**
  * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
  * @ip: the inode who owns the buffers
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 527bf19d969..fd67f07cd60 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -50,9 +50,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
 
 void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 			 int meta);
-void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
-void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
-		struct gfs2_ail *ai);
 
 void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
 
-- 
cgit v1.2.3


From d7b616e252b125f12b007c392f7644053bb6f140 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Sun, 2 Sep 2007 10:48:13 +0100
Subject: [GFS2] Clean up ordered write code

The following patch removes the ordered write processing from
databuf_lo_before_commit() and moves it to log.c. This has the effect of
greatly simplyfying databuf_lo_before_commit() and well as potentially
making the ordered write code more efficient.

As a side effect of this, its now possible to remove ordered buffers
from the ordered buffer list at any time, so we now make use of this in
invalidatepage and releasepage to ensure timely release of these
buffers.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h      |   2 +-
 fs/gfs2/log.c         |  57 +++++++++++++++++-
 fs/gfs2/lops.c        | 160 +++++++++++++-------------------------------------
 fs/gfs2/ops_address.c |  50 ++++++++++++++--
 fs/gfs2/ops_fstype.c  |   1 +
 5 files changed, 143 insertions(+), 127 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 23b611aa70d..388dc1bd736 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -612,13 +612,13 @@ struct gfs2_sbd {
 	unsigned int sd_log_num_revoke;
 	unsigned int sd_log_num_rg;
 	unsigned int sd_log_num_databuf;
-	unsigned int sd_log_num_jdata;
 
 	struct list_head sd_log_le_gl;
 	struct list_head sd_log_le_buf;
 	struct list_head sd_log_le_revoke;
 	struct list_head sd_log_le_rg;
 	struct list_head sd_log_le_databuf;
+	struct list_head sd_log_le_ordered;
 
 	unsigned int sd_log_blks_free;
 	struct mutex sd_log_reserve_mutex;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index d8232ec2539..20fa528d457 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -620,6 +620,57 @@ static void log_flush_commit(struct gfs2_sbd *sdp)
 	}
 }
 
+static void gfs2_ordered_write(struct gfs2_sbd *sdp)
+{
+	struct gfs2_bufdata *bd;
+	struct buffer_head *bh;
+	LIST_HEAD(written);
+
+	gfs2_log_lock(sdp);
+	while (!list_empty(&sdp->sd_log_le_ordered)) {
+		bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list);
+		list_move(&bd->bd_le.le_list, &written);
+		bh = bd->bd_bh;
+		if (!buffer_dirty(bh))
+			continue;
+		get_bh(bh);
+		gfs2_log_unlock(sdp);
+		lock_buffer(bh);
+		if (test_clear_buffer_dirty(bh)) {
+			bh->b_end_io = end_buffer_write_sync;
+			submit_bh(WRITE, bh);
+		} else {
+			unlock_buffer(bh);
+			brelse(bh);
+		}
+		gfs2_log_lock(sdp);
+	}
+	list_splice(&written, &sdp->sd_log_le_ordered);
+	gfs2_log_unlock(sdp);
+}
+
+static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
+{
+	struct gfs2_bufdata *bd;
+	struct buffer_head *bh;
+
+	gfs2_log_lock(sdp);
+	while (!list_empty(&sdp->sd_log_le_ordered)) {
+		bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list);
+		bh = bd->bd_bh;
+		if (buffer_locked(bh)) {
+			get_bh(bh);
+			gfs2_log_unlock(sdp);
+			wait_on_buffer(bh);
+			brelse(bh);
+			gfs2_log_lock(sdp);
+			continue;
+		}
+		list_del_init(&bd->bd_le.le_list);
+	}
+	gfs2_log_unlock(sdp);
+}
+
 /**
  * gfs2_log_flush - flush incore transaction(s)
  * @sdp: the filesystem
@@ -648,7 +699,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	INIT_LIST_HEAD(&ai->ai_ail2_list);
 
 	gfs2_assert_withdraw(sdp,
-			     sdp->sd_log_num_buf + sdp->sd_log_num_jdata ==
+			     sdp->sd_log_num_buf + sdp->sd_log_num_databuf ==
 			     sdp->sd_log_commited_buf +
 			     sdp->sd_log_commited_databuf);
 	gfs2_assert_withdraw(sdp,
@@ -658,7 +709,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	sdp->sd_log_flush_wrapped = 0;
 	ai->ai_first = sdp->sd_log_flush_head;
 
+	gfs2_ordered_write(sdp);
 	lops_before_commit(sdp);
+	gfs2_ordered_wait(sdp);
+
 	if (!list_empty(&sdp->sd_log_flush_list))
 		log_flush_commit(sdp);
 	else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
@@ -751,7 +805,6 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_gl);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
-	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_jdata);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 3ec587135d4..7e2d4e692b5 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -555,10 +555,11 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	if (gfs2_is_jdata(ip)) {
 		gfs2_pin(sdp, bd->bd_bh);
 		tr->tr_num_databuf_new++;
-		sdp->sd_log_num_jdata++;
+		sdp->sd_log_num_databuf++;
+		list_add(&le->le_list, &sdp->sd_log_le_databuf);
+	} else {
+		list_add(&le->le_list, &sdp->sd_log_le_ordered);
 	}
-	sdp->sd_log_num_databuf++;
-	list_add(&le->le_list, &sdp->sd_log_le_databuf);
 out:
 	gfs2_log_unlock(sdp);
 	unlock_buffer(bd->bd_bh);
@@ -583,114 +584,59 @@ static int gfs2_check_magic(struct buffer_head *bh)
 /**
  * databuf_lo_before_commit - Scan the data buffers, writing as we go
  *
- * Here we scan through the lists of buffers and make the assumption
- * that any buffer thats been pinned is being journaled, and that
- * any unpinned buffer is an ordered write data buffer and therefore
- * will be written back rather than journaled.
  */
+
 static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 {
-	LIST_HEAD(started);
-	struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
+	struct gfs2_bufdata *bd1 = NULL, *bd2;
 	struct buffer_head *bh = NULL,*bh1 = NULL;
 	struct gfs2_log_descriptor *ld;
 	unsigned int limit;
-	unsigned int total_dbuf;
-	unsigned int total_jdata;
+	unsigned int total;
 	unsigned int num, n;
 	__be64 *ptr = NULL;
+	int magic;
+
 
 	limit = databuf_limit(sdp);
 
-	/*
-	 * Start writing ordered buffers, write journaled buffers
-	 * into the log along with a header
-	 */
 	gfs2_log_lock(sdp);
-	total_dbuf = sdp->sd_log_num_databuf;
-	total_jdata = sdp->sd_log_num_jdata;
+	total = sdp->sd_log_num_databuf;
 	bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
 				       bd_le.le_list);
-	while(total_dbuf) {
-		num = total_jdata;
+	while(total) {
+		num = total;
 		if (num > limit)
 			num = limit;
+
+		gfs2_log_unlock(sdp);
+		bh = gfs2_log_get_buf(sdp);
+		gfs2_log_lock(sdp);
+
+		ld = (struct gfs2_log_descriptor *)bh->b_data;
+		ptr = (__be64 *)(bh->b_data + DATABUF_OFFSET);
+		ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+		ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+		ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+		ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_JDATA);
+		ld->ld_length = cpu_to_be32(num + 1);
+		ld->ld_data1 = cpu_to_be32(num);
+		ld->ld_data2 = cpu_to_be32(0);
+		memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+
 		n = 0;
-		list_for_each_entry_safe_continue(bd1, bdt,
-						  &sdp->sd_log_le_databuf,
-						  bd_le.le_list) {
-			/* store off the buffer head in a local ptr since
-			 * gfs2_bufdata might change when we drop the log lock
-			 */
+		list_for_each_entry_continue(bd1, &sdp->sd_log_le_databuf,
+					     bd_le.le_list) {
 			bh1 = bd1->bd_bh;
 
-			/* An ordered write buffer */
-			if (bh1 && !buffer_pinned(bh1)) {
-				list_move(&bd1->bd_le.le_list, &started);
-				if (bd1 == bd2) {
-					bd2 = NULL;
-					bd2 = list_prepare_entry(bd2,
-							&sdp->sd_log_le_databuf,
-							bd_le.le_list);
-				}
-				total_dbuf--;
-				if (bh1) {
-					if (buffer_dirty(bh1)) {
-						get_bh(bh1);
-
-						gfs2_log_unlock(sdp);
-
-						ll_rw_block(SWRITE, 1, &bh1);
-						brelse(bh1);
-
-						gfs2_log_lock(sdp);
-					}
-					continue;
-				}
-				continue;
-			} else if (bh1) { /* A journaled buffer */
-				int magic;
-				gfs2_log_unlock(sdp);
-				if (!bh) {
-					bh = gfs2_log_get_buf(sdp);
-					ld = (struct gfs2_log_descriptor *)
-					     bh->b_data;
-					ptr = (__be64 *)(bh->b_data +
-							 DATABUF_OFFSET);
-					ld->ld_header.mh_magic =
-						cpu_to_be32(GFS2_MAGIC);
-					ld->ld_header.mh_type =
-						cpu_to_be32(GFS2_METATYPE_LD);
-					ld->ld_header.mh_format =
-						cpu_to_be32(GFS2_FORMAT_LD);
-					ld->ld_type =
-						cpu_to_be32(GFS2_LOG_DESC_JDATA);
-					ld->ld_length = cpu_to_be32(num + 1);
-					ld->ld_data1 = cpu_to_be32(num);
-					ld->ld_data2 = cpu_to_be32(0);
-					memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
-				}
-				magic = gfs2_check_magic(bh1);
-				*ptr++ = cpu_to_be64(bh1->b_blocknr);
-				*ptr++ = cpu_to_be64((__u64)magic);
-				clear_buffer_escaped(bh1);
-				if (unlikely(magic != 0))
-					set_buffer_escaped(bh1);
-				gfs2_log_lock(sdp);
-				if (++n >= num)
-					break;
-			} else if (!bh1) {
-				total_dbuf--;
-				sdp->sd_log_num_databuf--;
-				list_del_init(&bd1->bd_le.le_list);
-				if (bd1 == bd2) {
-					bd2 = NULL;
-					bd2 = list_prepare_entry(bd2,
-						&sdp->sd_log_le_databuf,
-						bd_le.le_list);
-                                }
-				kmem_cache_free(gfs2_bufdata_cachep, bd1);
-			}
+			magic = gfs2_check_magic(bh1);
+			*ptr++ = cpu_to_be64(bh1->b_blocknr);
+			*ptr++ = cpu_to_be64((__u64)magic);
+			clear_buffer_escaped(bh1);
+			if (unlikely(magic != 0))
+				set_buffer_escaped(bh1);
+			if (++n >= num)
+				break;
 		}
 		gfs2_log_unlock(sdp);
 		if (bh) {
@@ -727,34 +673,10 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 				break;
 		}
 		bh = NULL;
-		BUG_ON(total_dbuf < num);
-		total_dbuf -= num;
-		total_jdata -= num;
+		BUG_ON(total < num);
+		total -= num;
 	}
 	gfs2_log_unlock(sdp);
-
-	/* Wait on all ordered buffers */
-	while (!list_empty(&started)) {
-		gfs2_log_lock(sdp);
-		bd1 = list_entry(started.next, struct gfs2_bufdata,
-				 bd_le.le_list);
-		list_del_init(&bd1->bd_le.le_list);
-		sdp->sd_log_num_databuf--;
-		bh = bd1->bd_bh;
-		if (bh) {
-			bh->b_private = NULL;
-			get_bh(bh);
-			gfs2_log_unlock(sdp);
-			wait_on_buffer(bh);
-			brelse(bh);
-		} else
-			gfs2_log_unlock(sdp);
-
-		kmem_cache_free(gfs2_bufdata_cachep, bd1);
-	}
-
-	/* We've removed all the ordered write bufs here, so only jdata left */
-	gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
 }
 
 static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -838,11 +760,9 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 		bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
 		list_del_init(&bd->bd_le.le_list);
 		sdp->sd_log_num_databuf--;
-		sdp->sd_log_num_jdata--;
 		gfs2_unpin(sdp, bd->bd_bh, ai);
 	}
 	gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
-	gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
 }
 
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 8407d1db4ea..dd1ea491ddc 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -616,13 +616,50 @@ static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock)
 	return dblock;
 }
 
+static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
+{
+	struct gfs2_bufdata *bd;
+
+	lock_buffer(bh);
+	gfs2_log_lock(sdp);
+	clear_buffer_dirty(bh);
+	bd = bh->b_private;
+	if (bd) {
+		if (!list_empty(&bd->bd_le.le_list)) {
+			if (!buffer_pinned(bh))
+				list_del_init(&bd->bd_le.le_list);
+		}
+	}
+	bh->b_bdev = NULL;
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	gfs2_log_unlock(sdp);
+	unlock_buffer(bh);
+}
+
 static void gfs2_invalidatepage(struct page *page, unsigned long offset)
 {
+	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
+	struct buffer_head *bh, *head;
+	unsigned long pos = 0;
+
 	BUG_ON(!PageLocked(page));
 	if (offset == 0)
 		ClearPageChecked(page);
+	if (!page_has_buffers(page))
+		goto out;
 
-	block_invalidatepage(page, offset);
+	bh = head = page_buffers(page);
+	do {
+		if (offset <= pos)
+			gfs2_discard(sdp, bh);
+		pos += bh->b_size;
+		bh = bh->b_this_page;
+	} while (bh != head);
+out:
+	if (offset == 0)
+		try_to_release_page(page, 0);
 }
 
 /**
@@ -732,9 +769,14 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 		if (bd) {
 			gfs2_assert_warn(sdp, bd->bd_bh == bh);
 			gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr));
-			bd->bd_bh = NULL;
-			if (!list_empty(&bd->bd_le.le_list))
-				bd = NULL;
+			if (!list_empty(&bd->bd_le.le_list)) {
+				if (!buffer_pinned(bh))
+					list_del_init(&bd->bd_le.le_list);
+				else
+					bd = NULL;
+			}
+			if (bd)
+				bd->bd_bh = NULL;
 			bh->b_private = NULL;
 		}
 		gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 314c1134d12..35f3dfa9bfb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -82,6 +82,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
 	INIT_LIST_HEAD(&sdp->sd_log_le_rg);
 	INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
+	INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
 
 	mutex_init(&sdp->sd_log_reserve_mutex);
 	INIT_LIST_HEAD(&sdp->sd_ail1_list);
-- 
cgit v1.2.3


From 8475487befb29eeb038fef374a7433d276336a25 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Sun, 2 Sep 2007 10:55:29 +0100
Subject: [GFS2] Fix ordering of dirty/journal for ordered buffer unstuffing

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1e56f4de735..93fa427bb5f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -93,10 +93,10 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 		map_bh(bh, inode->i_sb, block);
 
 	set_buffer_uptodate(bh);
-	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
-		gfs2_trans_add_bh(ip->i_gl, bh, 0);
 	if (!gfs2_is_jdata(ip))
 		mark_buffer_dirty(bh);
+	if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip))
+		gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	if (release) {
 		unlock_page(page);
-- 
cgit v1.2.3


From 82e86087bb774cd54d47db4a7c771b5b29bea9ed Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Sun, 2 Sep 2007 15:39:43 +0100
Subject: [GFS2] Replace revoke structure with bufdata structure

Both the revoke structure and the bufdata structure are quite similar.
They are basically small tags which are put on lists. In addition to
which the revoke structure is always allocated when there is a bufdata
structure which is (or can be) freed. As such it should be possible to
reduce the number of frees and allocations by using the same structure
for both purposes.

This patch is the first step along that path. It replaces existing uses
of the revoke structure with the bufdata structure.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 13 +++++++------
 fs/gfs2/lops.c   | 10 +++++-----
 fs/gfs2/trans.c  | 20 ++++++++++----------
 3 files changed, 22 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 388dc1bd736..8aa5780862b 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -114,7 +114,13 @@ struct gfs2_bufdata {
 	struct buffer_head *bd_bh;
 	struct gfs2_glock *bd_gl;
 
-	struct list_head bd_list_tr;
+	union {
+		struct list_head list_tr;
+		u64 blkno;
+	} u;
+#define bd_list_tr u.list_tr
+#define bd_blkno u.blkno
+
 	struct gfs2_log_element bd_le;
 
 	struct gfs2_ail *bd_ail;
@@ -298,11 +304,6 @@ struct gfs2_file {
 	struct gfs2_holder f_fl_gh;
 };
 
-struct gfs2_revoke {
-	struct gfs2_log_element rv_le;
-	u64 rv_blkno;
-};
-
 struct gfs2_revoke_replay {
 	struct list_head rr_list;
 	u64 rr_blkno;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 7e2d4e692b5..cf6fe363155 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -357,7 +357,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	struct buffer_head *bh;
 	unsigned int offset;
 	struct list_head *head = &sdp->sd_log_le_revoke;
-	struct gfs2_revoke *rv;
+	struct gfs2_bufdata *bd;
 
 	if (!sdp->sd_log_num_revoke)
 		return;
@@ -376,8 +376,8 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	offset = sizeof(struct gfs2_log_descriptor);
 
 	while (!list_empty(head)) {
-		rv = list_entry(head->next, struct gfs2_revoke, rv_le.le_list);
-		list_del_init(&rv->rv_le.le_list);
+		bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
+		list_del_init(&bd->bd_le.le_list);
 		sdp->sd_log_num_revoke--;
 
 		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
@@ -392,8 +392,8 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 			offset = sizeof(struct gfs2_meta_header);
 		}
 
-		*(__be64 *)(bh->b_data + offset) = cpu_to_be64(rv->rv_blkno);
-		kfree(rv);
+		*(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
+		kfree(bd);
 
 		offset += sizeof(u64);
 	}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index f8dabf8446b..eadf96e0051 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -144,23 +144,23 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
 
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
 {
-	struct gfs2_revoke *rv = kmalloc(sizeof(struct gfs2_revoke),
-					 GFP_NOFS | __GFP_NOFAIL);
-	lops_init_le(&rv->rv_le, &gfs2_revoke_lops);
-	rv->rv_blkno = blkno;
-	lops_add(sdp, &rv->rv_le);
+	struct gfs2_bufdata *bd = kmalloc(sizeof(struct gfs2_bufdata),
+					  GFP_NOFS | __GFP_NOFAIL);
+	lops_init_le(&bd->bd_le, &gfs2_revoke_lops);
+	bd->bd_blkno = blkno;
+	lops_add(sdp, &bd->bd_le);
 }
 
 void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
 {
-	struct gfs2_revoke *rv;
+	struct gfs2_bufdata *bd;
 	int found = 0;
 
 	gfs2_log_lock(sdp);
 
-	list_for_each_entry(rv, &sdp->sd_log_le_revoke, rv_le.le_list) {
-		if (rv->rv_blkno == blkno) {
-			list_del(&rv->rv_le.le_list);
+	list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) {
+		if (bd->bd_blkno == blkno) {
+			list_del(&bd->bd_le.le_list);
 			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
 			sdp->sd_log_num_revoke--;
 			found = 1;
@@ -172,7 +172,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
 
 	if (found) {
 		struct gfs2_trans *tr = current->journal_info;
-		kfree(rv);
+		kfree(bd);
 		tr->tr_num_revoke_rm++;
 	}
 }
-- 
cgit v1.2.3


From 0820ab517e1b100ee3f9584ec27f93309689ebe7 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Sun, 2 Sep 2007 16:47:38 +0100
Subject: [GFS2] Use slab operations for all gfs2_bufdata allocations

The old revoke structure was allocated using kalloc/kfree but
there is a slab cache for gfs2_bufdata, so we should use that
now that the structures have been converted.

This is part two of the patch series to merge the revoke
and gfs2_bufdata structures.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lops.c  | 2 +-
 fs/gfs2/trans.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index cf6fe363155..4cbef4c23a6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -393,7 +393,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 
 		*(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
-		kfree(bd);
+		kmem_cache_free(gfs2_bufdata_cachep, bd);
 
 		offset += sizeof(u64);
 	}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index eadf96e0051..01cc27fefd8 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -144,7 +144,7 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
 
 void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
 {
-	struct gfs2_bufdata *bd = kmalloc(sizeof(struct gfs2_bufdata),
+	struct gfs2_bufdata *bd = kmem_cache_alloc(gfs2_bufdata_cachep,
 					  GFP_NOFS | __GFP_NOFAIL);
 	lops_init_le(&bd->bd_le, &gfs2_revoke_lops);
 	bd->bd_blkno = blkno;
@@ -172,7 +172,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
 
 	if (found) {
 		struct gfs2_trans *tr = current->journal_info;
-		kfree(bd);
+		kmem_cache_free(gfs2_bufdata_cachep, bd);
 		tr->tr_num_revoke_rm++;
 	}
 }
-- 
cgit v1.2.3


From 1ad38c437fa33f85ba4b6a85ea8c5478ee72d5bd Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 3 Sep 2007 11:01:33 +0100
Subject: [GFS2] Clean up gfs2_trans_add_revoke()

The following alters gfs2_trans_add_revoke() to take a struct
gfs2_bufdata as an argument. This eliminates the memory allocation which
was previously required by making use of the already existing struct
gfs2_bufdata. It makes some sanity checks to ensure that the
gfs2_bufdata has been removed from all the lists before its recycled as
a revoke structure. This saves one memory allocation and one free per
revoke structure.

Also as a result, and to simplify the locking, since there is no longer
any blocking code in gfs2_trans_add_revoke() we must hold the log lock
whenever this function is called. This reduces the amount of times we
take and unlock the log lock.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c   | 14 +++++---------
 fs/gfs2/log.c     |  4 ++--
 fs/gfs2/lops.c    |  3 ---
 fs/gfs2/meta_io.c | 35 ++++++++++++-----------------------
 fs/gfs2/trans.c   | 10 +++++-----
 fs/gfs2/trans.h   |  2 +-
 6 files changed, 25 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index b17346a355b..4670dcb2a87 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -41,7 +41,6 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	struct list_head *head = &gl->gl_ail_list;
 	struct gfs2_bufdata *bd;
 	struct buffer_head *bh;
-	u64 blkno;
 	int error;
 
 	blocks = atomic_read(&gl->gl_ail_count);
@@ -57,15 +56,12 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 		bd = list_entry(head->next, struct gfs2_bufdata,
 				bd_ail_gl_list);
 		bh = bd->bd_bh;
-		blkno = bh->b_blocknr;
-		gfs2_assert_withdraw(sdp, !buffer_busy(bh));
-
 		gfs2_remove_from_ail(NULL, bd);
-		gfs2_log_unlock(sdp);
-
-		gfs2_trans_add_revoke(sdp, blkno);
-
-		gfs2_log_lock(sdp);
+		bd->bd_bh = NULL;
+		bh->b_private = NULL;
+		bd->bd_blkno = bh->b_blocknr;
+		gfs2_assert_withdraw(sdp, !buffer_busy(bh));
+		gfs2_trans_add_revoke(sdp, bd);
 	}
 	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
 	gfs2_log_unlock(sdp);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 20fa528d457..4d04e6f1970 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -71,8 +71,8 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 void gfs2_remove_from_ail(struct address_space *mapping, struct gfs2_bufdata *bd)
 {
 	bd->bd_ail = NULL;
-	list_del(&bd->bd_ail_st_list);
-	list_del(&bd->bd_ail_gl_list);
+	list_del_init(&bd->bd_ail_st_list);
+	list_del_init(&bd->bd_ail_gl_list);
 	atomic_dec(&bd->bd_gl->gl_ail_count);
 	if (mapping)
 		gfs2_meta_cache_flush(GFS2_I(mapping->host));
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4cbef4c23a6..342c10e12af 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -343,11 +343,8 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 	tr = current->journal_info;
 	tr->tr_touched = 1;
 	tr->tr_num_revoke++;
-
-	gfs2_log_lock(sdp);
 	sdp->sd_log_num_revoke++;
 	list_add(&le->le_list, &sdp->sd_log_le_revoke);
-	gfs2_log_unlock(sdp);
 }
 
 static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index d762e4f7044..19097bc7c81 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -313,42 +313,31 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	while (blen) {
 		bh = getbuf(ip->i_gl, bstart, NO_CREATE);
 		if (bh) {
-			struct gfs2_bufdata *bd = bh->b_private;
+			struct gfs2_bufdata *bd;
 
+			lock_buffer(bh);
+			gfs2_log_lock(sdp);
+			bd = bh->b_private;
 			if (test_clear_buffer_pinned(bh)) {
 				struct gfs2_trans *tr = current->journal_info;
-				struct gfs2_inode *bh_ip =
-					GFS2_I(bh->b_page->mapping->host);
-
-				gfs2_log_lock(sdp);
 				list_del_init(&bd->bd_le.le_list);
 				gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
 				sdp->sd_log_num_buf--;
-				gfs2_log_unlock(sdp);
-				if (bh_ip->i_inode.i_private != NULL)
-					tr->tr_num_databuf_rm++;
-				else
-					tr->tr_num_buf_rm++;
+				tr->tr_num_buf_rm++;
 				brelse(bh);
 			}
 			if (bd) {
-				gfs2_log_lock(sdp);
 				if (bd->bd_ail) {
-					u64 blkno = bh->b_blocknr;
-					bd->bd_ail = NULL;
-					list_del(&bd->bd_ail_st_list);
-					list_del(&bd->bd_ail_gl_list);
-					atomic_dec(&bd->bd_gl->gl_ail_count);
-					brelse(bh);
-					gfs2_log_unlock(sdp);
-					gfs2_trans_add_revoke(sdp, blkno);
-				} else
-					gfs2_log_unlock(sdp);
+					gfs2_remove_from_ail(NULL, bd);
+					bh->b_private = NULL;
+					bd->bd_bh = NULL;
+					bd->bd_blkno = bh->b_blocknr;
+					gfs2_trans_add_revoke(sdp, bd);
+				}
 			}
-
-			lock_buffer(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_uptodate(bh);
+			gfs2_log_unlock(sdp);
 			unlock_buffer(bh);
 
 			brelse(bh);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 01cc27fefd8..717983e2c2a 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -142,12 +142,12 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
 	lops_add(sdp, &bd->bd_le);
 }
 
-void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno)
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 {
-	struct gfs2_bufdata *bd = kmem_cache_alloc(gfs2_bufdata_cachep,
-					  GFP_NOFS | __GFP_NOFAIL);
+	BUG_ON(!list_empty(&bd->bd_le.le_list));
+	BUG_ON(!list_empty(&bd->bd_ail_st_list));
+	BUG_ON(!list_empty(&bd->bd_ail_gl_list));
 	lops_init_le(&bd->bd_le, &gfs2_revoke_lops);
-	bd->bd_blkno = blkno;
 	lops_add(sdp, &bd->bd_le);
 }
 
@@ -160,7 +160,7 @@ void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno)
 
 	list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) {
 		if (bd->bd_blkno == blkno) {
-			list_del(&bd->bd_le.le_list);
+			list_del_init(&bd->bd_le.le_list);
 			gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
 			sdp->sd_log_num_revoke--;
 			found = 1;
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index 23d4cbe1de5..043d5f4b9c4 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp);
 
 void gfs2_trans_add_gl(struct gfs2_glock *gl);
 void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
-void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, u64 blkno);
+void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
 void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno);
 void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
 
-- 
cgit v1.2.3


From b4c20166dcfca106f0f416bfce200099ed76ab18 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Thu, 13 Sep 2007 23:35:27 -0500
Subject: [GFS2] flocks from same process trip kernel BUG at
 fs/gfs2/glock.c:1118!

This patch adds a new flag to the gfs2_holder structure GL_FLOCK.
It is set on holders of glocks representing flocks. This flag is
checked in add_to_queue() and a process is permitted to queue more
than one holder onto a glock if it is set. This solves the issue
of a process not being able to do multiple flocks on the same file.
Through a single descriptor, a process can now promote and demote
flocks. Through multiple descriptors a process can now queue
multiple flocks on the same file. There's still the problem of
a process deadlocking itself (because gfs2 blocking locks are not
interruptible) by queueing incompatible deadlock.

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c    | 43 +++++++++++++++++++++++++------------------
 fs/gfs2/glock.h    |  1 +
 fs/gfs2/ops_file.c | 13 ++++++-------
 3 files changed, 32 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 931368a385c..d631cad0aee 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1106,24 +1106,31 @@ static void add_to_queue(struct gfs2_holder *gh)
 	if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
 		BUG();
 
-	existing = find_holder_by_owner(&gl->gl_holders, gh->gh_owner_pid);
-	if (existing) {
-		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
-		printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
-		printk(KERN_INFO "lock type : %d lock state : %d\n",
-				existing->gh_gl->gl_name.ln_type, existing->gh_gl->gl_state);
-		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
-		printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
-		printk(KERN_INFO "lock type : %d lock state : %d\n",
-				gl->gl_name.ln_type, gl->gl_state);
-		BUG();
-	}
-
-	existing = find_holder_by_owner(&gl->gl_waiters3, gh->gh_owner_pid);
-	if (existing) {
-		print_symbol(KERN_WARNING "original: %s\n", existing->gh_ip);
-		print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
-		BUG();
+	if (!(gh->gh_flags & GL_FLOCK)) {
+		existing = find_holder_by_owner(&gl->gl_holders, 
+						gh->gh_owner_pid);
+		if (existing) {
+			print_symbol(KERN_WARNING "original: %s\n", 
+				     existing->gh_ip);
+			printk(KERN_INFO "pid : %d\n", existing->gh_owner_pid);
+			printk(KERN_INFO "lock type : %d lock state : %d\n",
+			       existing->gh_gl->gl_name.ln_type, 
+			       existing->gh_gl->gl_state);
+			print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+			printk(KERN_INFO "pid : %d\n", gh->gh_owner_pid);
+			printk(KERN_INFO "lock type : %d lock state : %d\n",
+			       gl->gl_name.ln_type, gl->gl_state);
+			BUG();
+		}
+		
+		existing = find_holder_by_owner(&gl->gl_waiters3, 
+						gh->gh_owner_pid);
+		if (existing) {
+			print_symbol(KERN_WARNING "original: %s\n", 
+				     existing->gh_ip);
+			print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
+			BUG();
+		}
 	}
 
 	if (gh->gh_flags & LM_FLAG_PRIORITY)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index f7a8e626aa0..b16f604eea9 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,6 +26,7 @@
 #define GL_SKIP			0x00000100
 #define GL_ATIME		0x00000200
 #define GL_NOCACHE		0x00000400
+#define GL_FLOCK		0x00000800
 #define GL_NOCANCEL		0x00001000
 
 #define GLR_TRYFAILED		13
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 94d76ace0b9..46a9e10ff17 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -571,7 +571,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 	int error = 0;
 
 	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
+	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE 
+		| GL_FLOCK;
 
 	mutex_lock(&fp->f_fl_mutex);
 
@@ -579,21 +580,19 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 	if (gl) {
 		if (fl_gh->gh_state == state)
 			goto out;
-		gfs2_glock_hold(gl);
 		flock_lock_file_wait(file,
 				     &(struct file_lock){.fl_type = F_UNLCK});
-		gfs2_glock_dq_uninit(fl_gh);
+		gfs2_glock_dq_wait(fl_gh);
+		gfs2_holder_reinit(state, flags, fl_gh);
 	} else {
 		error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
 				      ip->i_no_addr, &gfs2_flock_glops,
 				      CREATE, &gl);
 		if (error)
 			goto out;
+		gfs2_holder_init(gl, state, flags, fl_gh);
+		gfs2_glock_put(gl);
 	}
-
-	gfs2_holder_init(gl, state, flags, fl_gh);
-	gfs2_glock_put(gl);
-
 	error = gfs2_glock_nq(fl_gh);
 	if (error) {
 		gfs2_holder_uninit(fl_gh);
-- 
cgit v1.2.3


From 49e61f2ef6f7d1d0296e3e30d366b28e0ca595c2 Mon Sep 17 00:00:00 2001
From: Wendy Cheng <wcheng@redhat.com>
Date: Thu, 13 Sep 2007 17:52:42 -0400
Subject: [GFS2] Move inode deletion out of blocking_cb

Move inode deletion code out of blocking_cb handle_callback route to
avoid racy conditions that end up blocking lock_dlm1 thread. Fix
bugzilla 286821.

Signed-off-by: Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d631cad0aee..a37efe4aae6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -716,12 +716,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 		gl->gl_demote_time = jiffies;
 		if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
 		    gl->gl_object) {
-			struct inode *inode = igrab(gl->gl_object);
+			gfs2_glock_schedule_for_reclaim(gl);
 			spin_unlock(&gl->gl_spin);
-			if (inode) {
-				d_prune_aliases(inode);
-				iput(inode);
-			}
 			return;
 		}
 	} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
-- 
cgit v1.2.3


From d66f8277f53407754f50ae6bada68f1b68d04d48 Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Fri, 14 Sep 2007 08:49:21 +0100
Subject: [DLM] Make dlm_sendd cond_resched more

Under high recovery loads dlm_sendd can monopolise the CPU and cause soft lockups.

This one extra and one moved cond_resched() make it yield a little more during
such times keeping work moving.

Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/lowcomms.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 62a8a6ccd99..58bf3f5cdbe 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1264,14 +1264,15 @@ static void send_to_sock(struct connection *con)
 		if (len) {
 			ret = sendpage(con->sock, e->page, offset, len,
 				       msg_flags);
-			if (ret == -EAGAIN || ret == 0)
+			if (ret == -EAGAIN || ret == 0) {
+				cond_resched();
 				goto out;
+			}
 			if (ret <= 0)
 				goto send_error;
-		} else {
+		}
 			/* Don't starve people filling buffers */
 			cond_resched();
-		}
 
 		spin_lock(&con->writequeue_lock);
 		e->offset += ret;
-- 
cgit v1.2.3


From 55c0c4ac0be144014651b19e77c9b77f367955de Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Fri, 14 Sep 2007 09:27:59 -0500
Subject: [GFS2] GFS2: chmod hung - fix race in thread creation

The problem boiled down to a race between the gdlm_init_threads()
function initializing thread1 and its setting of blist = 1.
Essentially, "if (current == ls->thread1)" was checked by the thread
before the thread creator set ls->thread1.

Since thread1 is the only thread who is allowed to work on the
blocking queue, and since neither thread thought it was thread1, no one
was working on the queue.  So everything just sat.

This patch reuses the ls->async_lock spin_lock to fix the race,
and it fixes the problem.  I've done more than 2000 iterations of the
loop that was recreating the failure and it seems to work.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

--
---
 fs/gfs2/locking/dlm/thread.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 1aca51e4509..bd938f06481 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -268,20 +268,16 @@ static inline int check_drop(struct gdlm_ls *ls)
 	return 0;
 }
 
-static int gdlm_thread(void *data)
+static int gdlm_thread(void *data, int blist)
 {
 	struct gdlm_ls *ls = (struct gdlm_ls *) data;
 	struct gdlm_lock *lp = NULL;
-	int blist = 0;
 	uint8_t complete, blocking, submit, drop;
 	DECLARE_WAITQUEUE(wait, current);
 
 	/* Only thread1 is allowed to do blocking callbacks since gfs
 	   may wait for a completion callback within a blocking cb. */
 
-	if (current == ls->thread1)
-		blist = 1;
-
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&ls->thread_wait, &wait);
@@ -333,12 +329,22 @@ static int gdlm_thread(void *data)
 	return 0;
 }
 
+static int gdlm_thread1(void *data)
+{
+	return gdlm_thread(data, 1);
+}
+
+static int gdlm_thread2(void *data)
+{
+	return gdlm_thread(data, 0);
+}
+
 int gdlm_init_threads(struct gdlm_ls *ls)
 {
 	struct task_struct *p;
 	int error;
 
-	p = kthread_run(gdlm_thread, ls, "lock_dlm1");
+	p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
 	error = IS_ERR(p);
 	if (error) {
 		log_error("can't start lock_dlm1 thread %d", error);
@@ -346,7 +352,7 @@ int gdlm_init_threads(struct gdlm_ls *ls)
 	}
 	ls->thread1 = p;
 
-	p = kthread_run(gdlm_thread, ls, "lock_dlm2");
+	p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
 	error = IS_ERR(p);
 	if (error) {
 		log_error("can't start lock_dlm2 thread %d", error);
-- 
cgit v1.2.3


From 16615be18cadf53ee6f8a4f0bdd647f0753421b1 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 17 Sep 2007 10:59:52 +0100
Subject: [GFS2] Clean up journaled data writing

This patch cleans up the code for writing journaled data into the log.
It also removes the need to allocate a small "tag" structure for each
block written into the log. Instead we just keep count of the outstanding
I/O so that we can be sure that its all been written at the correct time.
Another result of this patch is that a number of ll_rw_block() calls
have become submit_bh() calls, closing some races at the same time.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h      |   9 +-
 fs/gfs2/log.c         | 145 +++++++++++++++++-------------
 fs/gfs2/log.h         |   1 +
 fs/gfs2/lops.c        | 242 ++++++++++++++++++++++++++------------------------
 fs/gfs2/meta_io.c     |  55 +++++++-----
 fs/gfs2/meta_io.h     |   3 +
 fs/gfs2/ops_address.c |  11 +--
 fs/gfs2/ops_fstype.c  |   3 +-
 fs/gfs2/ops_inode.c   |   7 +-
 fs/gfs2/ops_super.c   |  13 +--
 10 files changed, 269 insertions(+), 220 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8aa5780862b..eaddfb5a8e6 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -341,12 +341,6 @@ struct gfs2_quota_data {
 	unsigned long qd_last_touched;
 };
 
-struct gfs2_log_buf {
-	struct list_head lb_list;
-	struct buffer_head *lb_bh;
-	struct buffer_head *lb_real;
-};
-
 struct gfs2_trans {
 	unsigned long tr_ip;
 
@@ -631,7 +625,8 @@ struct gfs2_sbd {
 
 	unsigned long sd_log_flush_time;
 	struct rw_semaphore sd_log_flush_lock;
-	struct list_head sd_log_flush_list;
+	atomic_t sd_log_in_flight;
+	wait_queue_head_t sd_log_flush_wait;
 
 	unsigned int sd_log_flush_head;
 	u64 sd_log_flush_wrapped;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4d04e6f1970..ee704676b2f 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -104,11 +104,8 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 			gfs2_assert(sdp, bd->bd_ail == ai);
 
 			if (!buffer_busy(bh)) {
-				if (!buffer_uptodate(bh)) {
-					gfs2_log_unlock(sdp);
+				if (!buffer_uptodate(bh))
 					gfs2_io_error_bh(sdp, bh);
-					gfs2_log_lock(sdp);
-				}
 				list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
 				continue;
 			}
@@ -118,9 +115,16 @@ static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
 
 			list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
 
+			get_bh(bh);
 			gfs2_log_unlock(sdp);
-			wait_on_buffer(bh);
-			ll_rw_block(WRITE, 1, &bh);
+			lock_buffer(bh);
+			if (test_clear_buffer_dirty(bh)) {
+				bh->b_end_io = end_buffer_write_sync;
+				submit_bh(WRITE, bh);
+			} else {
+				unlock_buffer(bh);
+				brelse(bh);
+			}
 			gfs2_log_lock(sdp);
 
 			retry = 1;
@@ -446,10 +450,10 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
 	return tail;
 }
 
-static inline void log_incr_head(struct gfs2_sbd *sdp)
+void gfs2_log_incr_head(struct gfs2_sbd *sdp)
 {
 	if (sdp->sd_log_flush_head == sdp->sd_log_tail)
-		gfs2_assert_withdraw(sdp, sdp->sd_log_flush_head == sdp->sd_log_head);
+		BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head);
 
 	if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
 		sdp->sd_log_flush_head = 0;
@@ -457,6 +461,23 @@ static inline void log_incr_head(struct gfs2_sbd *sdp)
 	}
 }
 
+/**
+ * gfs2_log_write_endio - End of I/O for a log buffer
+ * @bh: The buffer head
+ * @uptodate: I/O Status
+ *
+ */
+
+static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate)
+{
+	struct gfs2_sbd *sdp = bh->b_private;
+	bh->b_private = NULL;
+
+	end_buffer_write_sync(bh, uptodate);
+	if (atomic_dec_and_test(&sdp->sd_log_in_flight))
+		wake_up(&sdp->sd_log_flush_wait);
+}
+
 /**
  * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
  * @sdp: The GFS2 superblock
@@ -467,24 +488,41 @@ static inline void log_incr_head(struct gfs2_sbd *sdp)
 struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
 {
 	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
-	struct gfs2_log_buf *lb;
 	struct buffer_head *bh;
 
-	lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
-	list_add(&lb->lb_list, &sdp->sd_log_flush_list);
-
-	bh = lb->lb_bh = sb_getblk(sdp->sd_vfs, blkno);
+	bh = sb_getblk(sdp->sd_vfs, blkno);
 	lock_buffer(bh);
 	memset(bh->b_data, 0, bh->b_size);
 	set_buffer_uptodate(bh);
 	clear_buffer_dirty(bh);
-	unlock_buffer(bh);
-
-	log_incr_head(sdp);
+	gfs2_log_incr_head(sdp);
+	atomic_inc(&sdp->sd_log_in_flight);
+	bh->b_private = sdp;
+	bh->b_end_io = gfs2_log_write_endio;
 
 	return bh;
 }
 
+/**
+ * gfs2_fake_write_endio - 
+ * @bh: The buffer head
+ * @uptodate: The I/O Status
+ *
+ */
+
+static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
+{
+	struct buffer_head *real_bh = bh->b_private;
+	struct gfs2_sbd *sdp = GFS2_SB(real_bh->b_page->mapping->host);
+
+	end_buffer_write_sync(bh, uptodate);
+	free_buffer_head(bh);
+	unlock_buffer(real_bh);
+	brelse(real_bh);
+	if (atomic_dec_and_test(&sdp->sd_log_in_flight))
+		wake_up(&sdp->sd_log_flush_wait);
+}
+
 /**
  * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
  * @sdp: the filesystem
@@ -497,22 +535,20 @@ struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
 				      struct buffer_head *real)
 {
 	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
-	struct gfs2_log_buf *lb;
 	struct buffer_head *bh;
 
-	lb = kzalloc(sizeof(struct gfs2_log_buf), GFP_NOFS | __GFP_NOFAIL);
-	list_add(&lb->lb_list, &sdp->sd_log_flush_list);
-	lb->lb_real = real;
-
-	bh = lb->lb_bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
+	bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
 	atomic_set(&bh->b_count, 1);
-	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate);
+	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
 	set_bh_page(bh, real->b_page, bh_offset(real));
 	bh->b_blocknr = blkno;
 	bh->b_size = sdp->sd_sb.sb_bsize;
 	bh->b_bdev = sdp->sd_vfs->s_bdev;
+	bh->b_private = real;
+	bh->b_end_io = gfs2_fake_write_endio;
 
-	log_incr_head(sdp);
+	gfs2_log_incr_head(sdp);
+	atomic_inc(&sdp->sd_log_in_flight);
 
 	return bh;
 }
@@ -579,45 +615,24 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 		gfs2_assert_withdraw(sdp, !pull);
 
 	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
-	log_incr_head(sdp);
+	gfs2_log_incr_head(sdp);
 }
 
 static void log_flush_commit(struct gfs2_sbd *sdp)
 {
-	struct list_head *head = &sdp->sd_log_flush_list;
-	struct gfs2_log_buf *lb;
-	struct buffer_head *bh;
-	int flushcount = 0;
-
-	while (!list_empty(head)) {
-		lb = list_entry(head->next, struct gfs2_log_buf, lb_list);
-		list_del(&lb->lb_list);
-		bh = lb->lb_bh;
-
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh))
-			gfs2_io_error_bh(sdp, bh);
-		if (lb->lb_real) {
-			while (atomic_read(&bh->b_count) != 1)  /* Grrrr... */
-				schedule();
-			free_buffer_head(bh);
-		} else
-			brelse(bh);
-		kfree(lb);
-		flushcount++;
+	DEFINE_WAIT(wait);
+
+	if (atomic_read(&sdp->sd_log_in_flight)) {
+		do {
+			prepare_to_wait(&sdp->sd_log_flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&sdp->sd_log_in_flight))
+				io_schedule();
+		} while(atomic_read(&sdp->sd_log_in_flight));
+		finish_wait(&sdp->sd_log_flush_wait, &wait);
 	}
 
-	/* If nothing was journaled, the header is unplanned and unwanted. */
-	if (flushcount) {
-		log_write_header(sdp, 0, 0);
-	} else {
-		unsigned int tail;
-		tail = current_tail(sdp);
-
-		gfs2_ail1_empty(sdp, 0);
-		if (sdp->sd_log_tail != tail)
-			log_pull_tail(sdp, tail);
-	}
+	log_write_header(sdp, 0, 0);
 }
 
 static void gfs2_ordered_write(struct gfs2_sbd *sdp)
@@ -698,10 +713,16 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	INIT_LIST_HEAD(&ai->ai_ail1_list);
 	INIT_LIST_HEAD(&ai->ai_ail2_list);
 
-	gfs2_assert_withdraw(sdp,
-			     sdp->sd_log_num_buf + sdp->sd_log_num_databuf ==
-			     sdp->sd_log_commited_buf +
-			     sdp->sd_log_commited_databuf);
+	if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
+		printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
+		       sdp->sd_log_commited_buf);
+		gfs2_assert_withdraw(sdp, 0);
+	}
+	if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
+		printk(KERN_INFO "GFS2: log databuf %u %u\n",
+		       sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
+		gfs2_assert_withdraw(sdp, 0);
+	}
 	gfs2_assert_withdraw(sdp,
 			sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
 
@@ -713,7 +734,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	lops_before_commit(sdp);
 	gfs2_ordered_wait(sdp);
 
-	if (!list_empty(&sdp->sd_log_flush_list))
+	if (sdp->sd_log_head != sdp->sd_log_flush_head)
 		log_flush_commit(sdp);
 	else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
 		gfs2_log_lock(sdp);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 639423561b2..dae28240062 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -52,6 +52,7 @@ int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags);
 
 int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
 void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
+void gfs2_log_incr_head(struct gfs2_sbd *sdp);
 
 struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
 struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 342c10e12af..6c27cea761c 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -91,6 +91,39 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	unlock_buffer(bh);
 }
 
+
+static inline struct gfs2_log_descriptor *bh_log_desc(struct buffer_head *bh)
+{
+	return (struct gfs2_log_descriptor *)bh->b_data;
+}
+
+static inline __be64 *bh_log_ptr(struct buffer_head *bh)
+{
+	struct gfs2_log_descriptor *ld = bh_log_desc(bh);
+	return (__force __be64 *)(ld + 1);
+}
+
+static inline __be64 *bh_ptr_end(struct buffer_head *bh)
+{
+	return (__force __be64 *)(bh->b_data + bh->b_size);
+}
+
+
+static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type)
+{
+	struct buffer_head *bh = gfs2_log_get_buf(sdp);
+	struct gfs2_log_descriptor *ld = bh_log_desc(bh);
+	ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+	ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
+	ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
+	ld->ld_type = cpu_to_be32(ld_type);
+	ld->ld_length = 0;
+	ld->ld_data1 = 0;
+	ld->ld_data2 = 0;
+	memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+	return bh;
+}
+
 static void __glock_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
 	struct gfs2_glock *gl;
@@ -181,7 +214,6 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 	struct gfs2_log_descriptor *ld;
 	struct gfs2_bufdata *bd1 = NULL, *bd2;
 	unsigned int total;
-	unsigned int offset = BUF_OFFSET;
 	unsigned int limit;
 	unsigned int num;
 	unsigned n;
@@ -198,18 +230,12 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 		if (total > limit)
 			num = limit;
 		gfs2_log_unlock(sdp);
-		bh = gfs2_log_get_buf(sdp);
+		bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA);
 		gfs2_log_lock(sdp);
-		ld = (struct gfs2_log_descriptor *)bh->b_data;
-		ptr = (__be64 *)(bh->b_data + offset);
-		ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
-		ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
-		ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
-		ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_METADATA);
+		ld = bh_log_desc(bh);
+		ptr = bh_log_ptr(bh);
 		ld->ld_length = cpu_to_be32(num + 1);
 		ld->ld_data1 = cpu_to_be32(num);
-		ld->ld_data2 = cpu_to_be32(0);
-		memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
 
 		n = 0;
 		list_for_each_entry_continue(bd1, &sdp->sd_log_le_buf,
@@ -220,17 +246,17 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 
 		gfs2_log_unlock(sdp);
-		set_buffer_dirty(bh);
-		ll_rw_block(WRITE, 1, &bh);
+		submit_bh(WRITE, bh);
 		gfs2_log_lock(sdp);
 
 		n = 0;
 		list_for_each_entry_continue(bd2, &sdp->sd_log_le_buf,
 					     bd_le.le_list) {
+			get_bh(bd2->bd_bh);
 			gfs2_log_unlock(sdp);
+			lock_buffer(bd2->bd_bh);
 			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
-			set_buffer_dirty(bh);
-			ll_rw_block(WRITE, 1, &bh);
+			submit_bh(WRITE, bh);
 			gfs2_log_lock(sdp);
 			if (++n >= num)
 				break;
@@ -359,17 +385,11 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	if (!sdp->sd_log_num_revoke)
 		return;
 
-	bh = gfs2_log_get_buf(sdp);
-	ld = (struct gfs2_log_descriptor *)bh->b_data;
-	ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
-	ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
-	ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
-	ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_REVOKE);
+	bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE);
+	ld = bh_log_desc(bh);
 	ld->ld_length = cpu_to_be32(gfs2_struct2blk(sdp, sdp->sd_log_num_revoke,
 						    sizeof(u64)));
 	ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
-	ld->ld_data2 = cpu_to_be32(0);
-	memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
 	offset = sizeof(struct gfs2_log_descriptor);
 
 	while (!list_empty(head)) {
@@ -378,8 +398,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 		sdp->sd_log_num_revoke--;
 
 		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
-			set_buffer_dirty(bh);
-			ll_rw_block(WRITE, 1, &bh);
+			submit_bh(WRITE, bh);
 
 			bh = gfs2_log_get_buf(sdp);
 			mh = (struct gfs2_meta_header *)bh->b_data;
@@ -396,8 +415,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	}
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 
-	set_buffer_dirty(bh);
-	ll_rw_block(WRITE, 1, &bh);
+	submit_bh(WRITE, bh);
 }
 
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -562,118 +580,110 @@ out:
 	unlock_buffer(bd->bd_bh);
 }
 
-static int gfs2_check_magic(struct buffer_head *bh)
+static void gfs2_check_magic(struct buffer_head *bh)
 {
-	struct page *page = bh->b_page;
 	void *kaddr;
 	__be32 *ptr;
-	int rv = 0;
 
-	kaddr = kmap_atomic(page, KM_USER0);
+	clear_buffer_escaped(bh);
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
 	ptr = kaddr + bh_offset(bh);
 	if (*ptr == cpu_to_be32(GFS2_MAGIC))
-		rv = 1;
+		set_buffer_escaped(bh);
 	kunmap_atomic(kaddr, KM_USER0);
-
-	return rv;
 }
 
-/**
- * databuf_lo_before_commit - Scan the data buffers, writing as we go
- *
- */
-
-static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
+static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			      struct list_head *list, struct list_head *done,
+			      unsigned int n)
 {
-	struct gfs2_bufdata *bd1 = NULL, *bd2;
-	struct buffer_head *bh = NULL,*bh1 = NULL;
+	struct buffer_head *bh1;
 	struct gfs2_log_descriptor *ld;
-	unsigned int limit;
-	unsigned int total;
-	unsigned int num, n;
-	__be64 *ptr = NULL;
-	int magic;
+	struct gfs2_bufdata *bd;
+	__be64 *ptr;
 
+	if (!bh)
+		return;
 
-	limit = databuf_limit(sdp);
+	ld = bh_log_desc(bh);
+	ld->ld_length = cpu_to_be32(n + 1);
+	ld->ld_data1 = cpu_to_be32(n);
 
+	ptr = bh_log_ptr(bh);
+	
+	get_bh(bh);
+	submit_bh(WRITE, bh);
 	gfs2_log_lock(sdp);
-	total = sdp->sd_log_num_databuf;
-	bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf,
-				       bd_le.le_list);
-	while(total) {
-		num = total;
-		if (num > limit)
-			num = limit;
-
-		gfs2_log_unlock(sdp);
-		bh = gfs2_log_get_buf(sdp);
-		gfs2_log_lock(sdp);
-
-		ld = (struct gfs2_log_descriptor *)bh->b_data;
-		ptr = (__be64 *)(bh->b_data + DATABUF_OFFSET);
-		ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
-		ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD);
-		ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD);
-		ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_JDATA);
-		ld->ld_length = cpu_to_be32(num + 1);
-		ld->ld_data1 = cpu_to_be32(num);
-		ld->ld_data2 = cpu_to_be32(0);
-		memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
-
-		n = 0;
-		list_for_each_entry_continue(bd1, &sdp->sd_log_le_databuf,
-					     bd_le.le_list) {
-			bh1 = bd1->bd_bh;
-
-			magic = gfs2_check_magic(bh1);
-			*ptr++ = cpu_to_be64(bh1->b_blocknr);
-			*ptr++ = cpu_to_be64((__u64)magic);
-			clear_buffer_escaped(bh1);
-			if (unlikely(magic != 0))
-				set_buffer_escaped(bh1);
-			if (++n >= num)
-				break;
+	while(!list_empty(list)) {
+		bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
+		list_move_tail(&bd->bd_le.le_list, done);
+		get_bh(bd->bd_bh);
+		while (be64_to_cpu(*ptr) != bd->bd_bh->b_blocknr) {
+			gfs2_log_incr_head(sdp);
+			ptr += 2;
 		}
 		gfs2_log_unlock(sdp);
-		if (bh) {
-			set_buffer_dirty(bh);
-			ll_rw_block(WRITE, 1, &bh);
-			bh = NULL;
-			ptr = NULL;
+		lock_buffer(bd->bd_bh);
+		if (buffer_escaped(bd->bd_bh)) {
+			void *kaddr;
+			bh1 = gfs2_log_get_buf(sdp);
+			kaddr = kmap_atomic(bd->bd_bh->b_page, KM_USER0);
+			memcpy(bh1->b_data, kaddr + bh_offset(bd->bd_bh),
+			       bh1->b_size);
+			kunmap_atomic(kaddr, KM_USER0);
+			*(__be32 *)bh1->b_data = 0;
+			clear_buffer_escaped(bd->bd_bh);
+			unlock_buffer(bd->bd_bh);
+			brelse(bd->bd_bh);
+		} else {
+			bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
 		}
-		n = 0;
+		submit_bh(WRITE, bh1);
 		gfs2_log_lock(sdp);
-		list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf,
-					     bd_le.le_list) {
-			if (!bd2->bd_bh)
-				continue;
-			/* copy buffer if it needs escaping */
+		ptr += 2;
+	}
+	gfs2_log_unlock(sdp);
+	brelse(bh);
+}
+
+/**
+ * databuf_lo_before_commit - Scan the data buffers, writing as we go
+ *
+ */
+
+static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
+{
+	struct gfs2_bufdata *bd = NULL;
+	struct buffer_head *bh = NULL;
+	unsigned int n = 0;
+	__be64 *ptr = NULL, *end = NULL;
+	LIST_HEAD(processed);
+	LIST_HEAD(in_progress);
+
+	gfs2_log_lock(sdp);
+	while (!list_empty(&sdp->sd_log_le_databuf)) {
+		if (ptr == end) {
 			gfs2_log_unlock(sdp);
-			if (unlikely(buffer_escaped(bd2->bd_bh))) {
-				void *kaddr;
-				struct page *page = bd2->bd_bh->b_page;
-				bh = gfs2_log_get_buf(sdp);
-				kaddr = kmap_atomic(page, KM_USER0);
-				memcpy(bh->b_data,
-				       kaddr + bh_offset(bd2->bd_bh),
-				       sdp->sd_sb.sb_bsize);
-				kunmap_atomic(kaddr, KM_USER0);
-				*(__be32 *)bh->b_data = 0;
-			} else {
-				bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
-			}
-			set_buffer_dirty(bh);
-			ll_rw_block(WRITE, 1, &bh);
+			gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
+			n = 0;
+			bh = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_JDATA);
+			ptr = bh_log_ptr(bh);
+			end = bh_ptr_end(bh) - 1;
 			gfs2_log_lock(sdp);
-			if (++n >= num)
-				break;
+			continue;
 		}
-		bh = NULL;
-		BUG_ON(total < num);
-		total -= num;
+		bd = list_entry(sdp->sd_log_le_databuf.next, struct gfs2_bufdata, bd_le.le_list);
+		list_move_tail(&bd->bd_le.le_list, &in_progress);
+		gfs2_check_magic(bd->bd_bh);
+		*ptr++ = cpu_to_be64(bd->bd_bh->b_blocknr);
+		*ptr++ = cpu_to_be64(buffer_escaped(bh) ? 1 : 0);
+		n++;
 	}
 	gfs2_log_unlock(sdp);
+	gfs2_write_blocks(sdp, bh, &in_progress, &processed, n);
+	gfs2_log_lock(sdp);
+	list_splice(&processed, &sdp->sd_log_le_databuf);
+	gfs2_log_unlock(sdp);
 }
 
 static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -807,10 +817,10 @@ const struct gfs2_log_operations gfs2_databuf_lops = {
 
 const struct gfs2_log_operations *gfs2_log_ops[] = {
 	&gfs2_glock_lops,
+	&gfs2_databuf_lops,
 	&gfs2_buf_lops,
-	&gfs2_revoke_lops,
 	&gfs2_rg_lops,
-	&gfs2_databuf_lops,
+	&gfs2_revoke_lops,
 	NULL,
 };
 
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 19097bc7c81..1d80f2d4212 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -297,6 +297,37 @@ void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 		unlock_page(bh->b_page);
 }
 
+void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(bh->b_page->mapping->host);
+	struct gfs2_bufdata *bd = bh->b_private;
+	if (test_clear_buffer_pinned(bh)) {
+		list_del_init(&bd->bd_le.le_list);
+		if (meta) {
+			gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
+			sdp->sd_log_num_buf--;
+			tr->tr_num_buf_rm++;
+		} else {
+			gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
+			sdp->sd_log_num_databuf--;
+			tr->tr_num_databuf_rm++;
+		}
+		tr->tr_touched = 1;
+		brelse(bh);
+	}
+	if (bd) {
+		if (bd->bd_ail) {
+			gfs2_remove_from_ail(NULL, bd);
+			bh->b_private = NULL;
+			bd->bd_bh = NULL;
+			bd->bd_blkno = bh->b_blocknr;
+			gfs2_trans_add_revoke(sdp, bd);
+		}
+	}
+	clear_buffer_dirty(bh);
+	clear_buffer_uptodate(bh);
+}
+
 /**
  * gfs2_meta_wipe - make inode's buffers so they aren't dirty/pinned anymore
  * @ip: the inode who owns the buffers
@@ -313,33 +344,11 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	while (blen) {
 		bh = getbuf(ip->i_gl, bstart, NO_CREATE);
 		if (bh) {
-			struct gfs2_bufdata *bd;
-
 			lock_buffer(bh);
 			gfs2_log_lock(sdp);
-			bd = bh->b_private;
-			if (test_clear_buffer_pinned(bh)) {
-				struct gfs2_trans *tr = current->journal_info;
-				list_del_init(&bd->bd_le.le_list);
-				gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
-				sdp->sd_log_num_buf--;
-				tr->tr_num_buf_rm++;
-				brelse(bh);
-			}
-			if (bd) {
-				if (bd->bd_ail) {
-					gfs2_remove_from_ail(NULL, bd);
-					bh->b_private = NULL;
-					bd->bd_bh = NULL;
-					bd->bd_blkno = bh->b_blocknr;
-					gfs2_trans_add_revoke(sdp, bd);
-				}
-			}
-			clear_buffer_dirty(bh);
-			clear_buffer_uptodate(bh);
+			gfs2_remove_from_journal(bh, current->journal_info, 1);
 			gfs2_log_unlock(sdp);
 			unlock_buffer(bh);
-
 			brelse(bh);
 		}
 
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index fd67f07cd60..b7048222ebb 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -51,6 +51,9 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
 void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
 			 int meta);
 
+void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr,
+			      int meta);
+
 void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen);
 
 void gfs2_meta_cache_flush(struct gfs2_inode *ip);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index dd1ea491ddc..b7baf183191 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -414,7 +414,8 @@ static int gfs2_prepare_write(struct file *file, struct page *page,
 	if (ind_blocks || data_blocks)
 		rblocks += RES_STATFS + RES_QUOTA;
 
-	error = gfs2_trans_begin(sdp, rblocks, 0);
+	error = gfs2_trans_begin(sdp, rblocks,
+				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
 	if (error)
 		goto out_trans_fail;
 
@@ -625,10 +626,10 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
 	clear_buffer_dirty(bh);
 	bd = bh->b_private;
 	if (bd) {
-		if (!list_empty(&bd->bd_le.le_list)) {
-			if (!buffer_pinned(bh))
-				list_del_init(&bd->bd_le.le_list);
-		}
+		if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh))
+			list_del_init(&bd->bd_le.le_list);
+		else
+			gfs2_remove_from_journal(bh, current->journal_info, 0);
 	}
 	bh->b_bdev = NULL;
 	clear_buffer_mapped(bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 35f3dfa9bfb..1ac9afa58c6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -89,7 +89,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	INIT_LIST_HEAD(&sdp->sd_ail2_list);
 
 	init_rwsem(&sdp->sd_log_flush_lock);
-	INIT_LIST_HEAD(&sdp->sd_log_flush_list);
+	atomic_set(&sdp->sd_log_in_flight, 0);
+	init_waitqueue_head(&sdp->sd_log_flush_wait);
 
 	INIT_LIST_HEAD(&sdp->sd_revoke_list);
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2cbe5a321e8..291f0c7eaa3 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -905,12 +905,17 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 
 	if (attr->ia_size != ip->i_di.di_size) {
-		error = vmtruncate(inode, attr->ia_size);
+		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 		if (error)
 			return error;
+		error = vmtruncate(inode, attr->ia_size);
+		gfs2_trans_end(sdp);
+		if (error) 
+			return error;
 	}
 
 	error = gfs2_truncatei(ip, attr->ia_size);
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 4316690d86f..950f31460e8 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -455,12 +455,15 @@ static void gfs2_delete_inode(struct inode *inode)
 	}
 
 	error = gfs2_dinode_dealloc(ip);
-	/*
-	 * Must do this before unlock to avoid trying to write back
-	 * potentially dirty data now that inode no longer exists
-	 * on disk.
-	 */
+	if (error)
+		goto out_unlock;
+
+	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
+	if (error)
+		goto out_unlock;
+	/* Needs to be done before glock release & also in a transaction */
 	truncate_inode_pages(&inode->i_data, 0);
+	gfs2_trans_end(sdp);
 
 out_unlock:
 	gfs2_glock_dq(&ip->i_iopen_gh);
-- 
cgit v1.2.3


From de986e859a29097fb9211b052d86a9a2c868f6cd Mon Sep 17 00:00:00 2001
From: Wendy Cheng <wcheng@redhat.com>
Date: Tue, 18 Sep 2007 09:19:13 -0400
Subject: [GFS2] Data corruption fix

* GFS2 has been using i_cache array to store its indirect meta blocks.
Its flush routine doesn't correctly clean up all the entries. The
problem would show while multiple nodes do simultaneous writes to the
same file. Upon glock exclusive lock transfer, if the file is a sparse
file with large file size where the indirect meta blocks span multiple
array entries with "zero" entries in between. The flush routine
prematurely stops the flushing that leaves old (stale) entries around.
This leads to several nasty issues, including data corruption.
* Fix gfs2_get_block_noalloc checking to correctly return EIO upon
unmapped buffer.

Signed-off-by: Wendy Cheng <wcheng@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/meta_io.c     | 8 ++++----
 fs/gfs2/ops_address.c | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 1d80f2d4212..4da423985e4 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -374,10 +374,10 @@ void gfs2_meta_cache_flush(struct gfs2_inode *ip)
 
 	for (x = 0; x < GFS2_MAX_META_HEIGHT; x++) {
 		bh_slot = &ip->i_cache[x];
-		if (!*bh_slot)
-			break;
-		brelse(*bh_slot);
-		*bh_slot = NULL;
+		if (*bh_slot) {
+			brelse(*bh_slot);
+			*bh_slot = NULL;
+		}
 	}
 
 	spin_unlock(&ip->i_spin);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index b7baf183191..4002f417dc1 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -90,7 +90,7 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
 	error = gfs2_block_map(inode, lblock, 0, bh_result);
 	if (error)
 		return error;
-	if (bh_result->b_blocknr == 0)
+	if (!buffer_mapped(bh_result))
 		return -EIO;
 	return 0;
 }
-- 
cgit v1.2.3


From 7a9f53b3c1875bef22ad4588e818bc046ef183da Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 18 Sep 2007 13:33:18 -0500
Subject: [GFS2] Alternate gfs2_iget to avoid looking up inodes being freed

There is a possible deadlock between two processes on the same node, where one
process is deleting an inode, and another process is looking for allocated but
unused inodes to delete in order to create more space.

process A does an iput() on inode X, and it's i_count drops to 0. This causes
iput_final() to be called, which puts an inode into state I_FREEING at
generic_delete_inode(). There no point between when iput_final() is called, and
when I_FREEING is set where GFS2 could acquire any glocks. Once I_FREEING is
set, no other process on that node can successfully look up that inode until
the delete finishes.

process B locks the the resource group for the same inode in get_local_rgrp(),
which is called by gfs2_inplace_reserve_i()

process A tries to lock the resource group for the inode in
gfs2_dinode_dealloc(), but it's already locked by process B

process B waits in find_inode for the inode to have the I_FREEING state cleared.

Deadlock.

This patch solves the problem by adding an alternative to gfs2_iget(),
gfs2_iget_skip(), that simply skips any inodes that are in the I_FREEING
state.o The alternate test function is just like the original one, except that
it fails if the inode is being freed, and sets a skipped flag. The alternate
set function is just like the original, except that it fails if the skipped
flag is set. Only try_rgrp_unlink() calls gfs2_iget_skip() instead of
gfs2_iget().

Signed-off-by: Benjamin E. Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dir.c        |  2 +-
 fs/gfs2/inode.c      | 58 ++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/gfs2/inode.h      |  3 ++-
 fs/gfs2/ops_export.c |  2 +-
 fs/gfs2/ops_fstype.c |  2 +-
 fs/gfs2/rgrp.c       |  2 +-
 6 files changed, 60 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 08c6dd0c671..9949bb746a5 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1502,7 +1502,7 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name)
 		inode = gfs2_inode_lookup(dir->i_sb, 
 				be16_to_cpu(dent->de_type),
 				be64_to_cpu(dent->de_inum.no_addr),
-				be64_to_cpu(dent->de_inum.no_formal_ino));
+				be64_to_cpu(dent->de_inum.no_formal_ino), 0);
 		brelse(bh);
 		return inode;
 	}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 013f00b90cc..5f6dc32946c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -77,6 +77,49 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr)
 	return iget5_locked(sb, hash, iget_test, iget_set, &no_addr);
 }
 
+struct gfs2_skip_data {
+	u64	no_addr;
+	int	skipped;
+};
+
+static int iget_skip_test(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_skip_data *data = opaque;
+
+	if (ip->i_no_addr == data->no_addr && inode->i_private != NULL){
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
+			data->skipped = 1;
+			return 0;
+		}
+		return 1;
+	}
+	return 0;
+}
+
+static int iget_skip_set(struct inode *inode, void *opaque)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_skip_data *data = opaque;
+
+	if (data->skipped)
+		return 1;
+	inode->i_ino = (unsigned long)(data->no_addr);
+	ip->i_no_addr = data->no_addr;
+	return 0;
+}
+
+static struct inode *gfs2_iget_skip(struct super_block *sb,
+				    u64 no_addr)
+{
+	struct gfs2_skip_data data;
+	unsigned long hash = (unsigned long)no_addr;
+
+	data.no_addr = no_addr;
+	data.skipped = 0;
+	return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data);
+}
+
 /**
  * GFS2 lookup code fills in vfs inode contents based on info obtained
  * from directory entry inside gfs2_inode_lookup(). This has caused issues
@@ -112,6 +155,7 @@ void gfs2_set_iop(struct inode *inode)
  * @sb: The super block
  * @no_addr: The inode number
  * @type: The type of the inode
+ * @skip_freeing: set this not return an inode if it is currently being freed.
  *
  * Returns: A VFS inode, or an error
  */
@@ -119,13 +163,19 @@ void gfs2_set_iop(struct inode *inode)
 struct inode *gfs2_inode_lookup(struct super_block *sb, 
 				unsigned int type,
 				u64 no_addr,
-				u64 no_formal_ino)
+				u64 no_formal_ino, int skip_freeing)
 {
-	struct inode *inode = gfs2_iget(sb, no_addr);
-	struct gfs2_inode *ip = GFS2_I(inode);
+	struct inode *inode;
+	struct gfs2_inode *ip;
 	struct gfs2_glock *io_gl;
 	int error;
 
+	if (skip_freeing)
+		inode = gfs2_iget_skip(sb, no_addr);
+	else
+		inode = gfs2_iget(sb, no_addr);
+	ip = GFS2_I(inode);
+
 	if (!inode)
 		return ERR_PTR(-ENOBUFS);
 
@@ -949,7 +999,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 
 	inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode),
 					inum.no_addr,
-					inum.no_formal_ino);
+					inum.no_formal_ino, 0);
 	if (IS_ERR(inode))
 		goto fail_gunlock2;
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 4517ac82c01..351ac87ab38 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -49,7 +49,8 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 void gfs2_inode_attr_in(struct gfs2_inode *ip);
 void gfs2_set_iop(struct inode *inode);
 struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
-				u64 no_addr, u64 no_formal_ino);
+				u64 no_addr, u64 no_formal_ino,
+				int skip_freeing);
 struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 
 int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index b8312edee0e..e2d1347796a 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -237,7 +237,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, void *inum_obj)
 
 	inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
 					inum->no_addr,
-					0);
+					0, 0);
 	if (!inode)
 		goto fail;
 	if (IS_ERR(inode)) {
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ac9afa58c6..17de58e83d9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -230,7 +230,7 @@ fail:
 static inline struct inode *gfs2_lookup_root(struct super_block *sb,
 					     u64 no_addr)
 {
-	return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0);
+	return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
 }
 
 static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d7f7ea0c9a..708c287e1d0 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -884,7 +884,7 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
 			continue;
 		*last_unlinked = no_addr;
 		inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN,
-					  no_addr, -1);
+					  no_addr, -1, 1);
 		if (!IS_ERR(inode))
 			return inode;
 	}
-- 
cgit v1.2.3


From 891ba6d4a5f9e6302bb6542592d73feb4d0d3687 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 20 Sep 2007 15:26:33 +0100
Subject: [GFS2] Don't try to remove buffers that don't exist

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 4002f417dc1..873a511ef2b 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -747,7 +747,7 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 	struct gfs2_bufdata *bd;
 
 	if (!page_has_buffers(page))
-		goto out;
+		return 0;
 
 	gfs2_log_lock(sdp);
 	head = bh = page_buffers(page);
@@ -787,7 +787,6 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 		bh = bh->b_this_page;
 	} while (bh != head);
 
-out:
 	return try_to_free_buffers(page);
 cannot_release:
 	gfs2_log_unlock(sdp);
-- 
cgit v1.2.3


From 5a60c532c9224babc172fafccc9e2fec6937af6f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Sep 2007 09:39:31 +0100
Subject: [GFS2] Get superblock a different way

The mapping may be NULL by the time the I/O has completed, so
we now get the superblock by a different route (via the bd and glock)
to avoid this problem.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Wendy Cheng <wcheng@redhat.com>
---
 fs/gfs2/log.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ee704676b2f..7df70247325 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -513,7 +513,8 @@ struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
 static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
 {
 	struct buffer_head *real_bh = bh->b_private;
-	struct gfs2_sbd *sdp = GFS2_SB(real_bh->b_page->mapping->host);
+	struct gfs2_bufdata *bd = real_bh->b_private;
+	struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd;
 
 	end_buffer_write_sync(bh, uptodate);
 	free_buffer_head(bh);
-- 
cgit v1.2.3


From b434eda6fda5bcdcc2dd918e5ffbf7184f2d4e17 Mon Sep 17 00:00:00 2001
From: Patrick Caulfield <pcaulfie@redhat.com>
Date: Mon, 1 Oct 2007 15:28:42 +0100
Subject: [DLM] don't overwrite castparam if it's NULL

If the castaddr passed to the userland API is NULL then don't overwrite the
existing castparam. This allows a different thread to cancel a lock request and
the CANCEL AST gets delivered to the original thread.

bz#306391 (for RHEL4) refers.

Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/lock.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 2082daf083d..031229f144f 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -4429,7 +4429,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 
 	if (lvb_in && ua->lksb.sb_lvbptr)
 		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
-	ua->castparam = ua_tmp->castparam;
+	if (ua_tmp->castparam)
+		ua->castparam = ua_tmp->castparam;
 	ua->user_lksb = ua_tmp->user_lksb;
 
 	error = set_unlock_args(flags, ua, &args);
@@ -4474,7 +4475,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 		goto out;
 
 	ua = (struct dlm_user_args *)lkb->lkb_astparam;
-	ua->castparam = ua_tmp->castparam;
+	if (ua_tmp->castparam)
+		ua->castparam = ua_tmp->castparam;
 	ua->user_lksb = ua_tmp->user_lksb;
 
 	error = set_unlock_args(flags, ua, &args);
-- 
cgit v1.2.3


From c36258b5925e6cf6bf72904635100593573bfcff Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 27 Sep 2007 15:53:38 -0500
Subject: [DLM] block dlm_recv in recovery transition

Introduce a per-lockspace rwsem that's held in read mode by dlm_recv
threads while working in the dlm.  This allows dlm_recv activity to be
suspended when the lockspace transitions to, from and between recovery
cycles.

The specific bug prompting this change is one where an in-progress
recovery cycle is aborted by a new recovery cycle.  While dlm_recv was
processing a recovery message, the recovery cycle was aborted and
dlm_recoverd began cleaning up.  dlm_recv decremented recover_locks_count
on an rsb after dlm_recoverd had reset it to zero.  This is fixed by
suspending dlm_recv (taking write lock on the rwsem) before aborting the
current recovery.

The transitions to/from normal and recovery modes are simplified by using
this new ability to block dlm_recv.  The switch from normal to recovery
mode means dlm_recv goes from processing locking messages, to saving them
for later, and vice versa.  Races are avoided by blocking dlm_recv when
setting the flag that switches between modes.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/dlm/dlm_internal.h |   1 +
 fs/dlm/lock.c         | 136 ++++++++++++++++++++++++++++++--------------------
 fs/dlm/lock.h         |   3 +-
 fs/dlm/lockspace.c    |   1 +
 fs/dlm/member.c       |  41 +++++++++------
 fs/dlm/midcomms.c     |  17 +------
 fs/dlm/rcom.c         |  36 +++----------
 fs/dlm/rcom.h         |   5 +-
 fs/dlm/recoverd.c     |  11 +++-
 fs/dlm/requestqueue.c |  58 +++++++++------------
 fs/dlm/requestqueue.h |   4 +-
 11 files changed, 161 insertions(+), 152 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 74901e981e1..d2fc2384c3b 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -491,6 +491,7 @@ struct dlm_ls {
 	uint64_t		ls_recover_seq;
 	struct dlm_recover	*ls_recover_args;
 	struct rw_semaphore	ls_in_recovery;	/* block local requests */
+	struct rw_semaphore	ls_recv_active;	/* block dlm_recv */
 	struct list_head	ls_requestqueue;/* queue remote requests */
 	struct mutex		ls_requestqueue_mutex;
 	char			*ls_recover_buf;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 031229f144f..3915b8e1414 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3638,55 +3638,8 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
 	dlm_put_lkb(lkb);
 }
 
-int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
+static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms)
 {
-	struct dlm_message *ms = (struct dlm_message *) hd;
-	struct dlm_ls *ls;
-	int error = 0;
-
-	if (!recovery)
-		dlm_message_in(ms);
-
-	ls = dlm_find_lockspace_global(hd->h_lockspace);
-	if (!ls) {
-		log_print("drop message %d from %d for unknown lockspace %d",
-			  ms->m_type, nodeid, hd->h_lockspace);
-		return -EINVAL;
-	}
-
-	/* recovery may have just ended leaving a bunch of backed-up requests
-	   in the requestqueue; wait while dlm_recoverd clears them */
-
-	if (!recovery)
-		dlm_wait_requestqueue(ls);
-
-	/* recovery may have just started while there were a bunch of
-	   in-flight requests -- save them in requestqueue to be processed
-	   after recovery.  we can't let dlm_recvd block on the recovery
-	   lock.  if dlm_recoverd is calling this function to clear the
-	   requestqueue, it needs to be interrupted (-EINTR) if another
-	   recovery operation is starting. */
-
-	while (1) {
-		if (dlm_locking_stopped(ls)) {
-			if (recovery) {
-				error = -EINTR;
-				goto out;
-			}
-			error = dlm_add_requestqueue(ls, nodeid, hd);
-			if (error == -EAGAIN)
-				continue;
-			else {
-				error = -EINTR;
-				goto out;
-			}
-		}
-
-		if (dlm_lock_recovery_try(ls))
-			break;
-		schedule();
-	}
-
 	switch (ms->m_type) {
 
 	/* messages sent to a master node */
@@ -3761,17 +3714,90 @@ int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
 		log_error(ls, "unknown message type %d", ms->m_type);
 	}
 
-	dlm_unlock_recovery(ls);
- out:
-	dlm_put_lockspace(ls);
 	dlm_astd_wake();
-	return error;
 }
 
+/* If the lockspace is in recovery mode (locking stopped), then normal
+   messages are saved on the requestqueue for processing after recovery is
+   done.  When not in recovery mode, we wait for dlm_recoverd to drain saved
+   messages off the requestqueue before we process new ones. This occurs right
+   after recovery completes when we transition from saving all messages on
+   requestqueue, to processing all the saved messages, to processing new
+   messages as they arrive. */
 
-/*
- * Recovery related
- */
+static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
+				int nodeid)
+{
+	if (dlm_locking_stopped(ls)) {
+		dlm_add_requestqueue(ls, nodeid, (struct dlm_header *) ms);
+	} else {
+		dlm_wait_requestqueue(ls);
+		_receive_message(ls, ms);
+	}
+}
+
+/* This is called by dlm_recoverd to process messages that were saved on
+   the requestqueue. */
+
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms)
+{
+	_receive_message(ls, ms);
+}
+
+/* This is called by the midcomms layer when something is received for
+   the lockspace.  It could be either a MSG (normal message sent as part of
+   standard locking activity) or an RCOM (recovery message sent as part of
+   lockspace recovery). */
+
+void dlm_receive_buffer(struct dlm_header *hd, int nodeid)
+{
+	struct dlm_message *ms = (struct dlm_message *) hd;
+	struct dlm_rcom *rc = (struct dlm_rcom *) hd;
+	struct dlm_ls *ls;
+	int type = 0;
+
+	switch (hd->h_cmd) {
+	case DLM_MSG:
+		dlm_message_in(ms);
+		type = ms->m_type;
+		break;
+	case DLM_RCOM:
+		dlm_rcom_in(rc);
+		type = rc->rc_type;
+		break;
+	default:
+		log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
+		return;
+	}
+
+	if (hd->h_nodeid != nodeid) {
+		log_print("invalid h_nodeid %d from %d lockspace %x",
+			  hd->h_nodeid, nodeid, hd->h_lockspace);
+		return;
+	}
+
+	ls = dlm_find_lockspace_global(hd->h_lockspace);
+	if (!ls) {
+		log_print("invalid h_lockspace %x from %d cmd %d type %d",
+			  hd->h_lockspace, nodeid, hd->h_cmd, type);
+
+		if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
+			dlm_send_ls_not_ready(nodeid, rc);
+		return;
+	}
+
+	/* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
+	   be inactive (in this ls) before transitioning to recovery mode */
+
+	down_read(&ls->ls_recv_active);
+	if (hd->h_cmd == DLM_MSG)
+		dlm_receive_message(ls, ms, nodeid);
+	else
+		dlm_receive_rcom(ls, rc, nodeid);
+	up_read(&ls->ls_recv_active);
+
+	dlm_put_lockspace(ls);
+}
 
 static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
 {
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 1720313c22d..ada04680a1e 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -16,7 +16,8 @@
 void dlm_print_rsb(struct dlm_rsb *r);
 void dlm_dump_rsb(struct dlm_rsb *r);
 void dlm_print_lkb(struct dlm_lkb *lkb);
-int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery);
+void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
+void dlm_receive_buffer(struct dlm_header *hd, int nodeid);
 int dlm_modes_compat(int mode1, int mode2);
 int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
 	unsigned int flags, struct dlm_rsb **r_ret);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 1dc72105ab1..628eaa669e6 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -519,6 +519,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	ls->ls_recover_seq = 0;
 	ls->ls_recover_args = NULL;
 	init_rwsem(&ls->ls_in_recovery);
+	init_rwsem(&ls->ls_recv_active);
 	INIT_LIST_HEAD(&ls->ls_requestqueue);
 	mutex_init(&ls->ls_requestqueue_mutex);
 	mutex_init(&ls->ls_clear_proc_locks);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index d09977528f6..e9cdcab306e 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -18,10 +18,6 @@
 #include "rcom.h"
 #include "config.h"
 
-/*
- * Following called by dlm_recoverd thread
- */
-
 static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
 {
 	struct dlm_member *memb = NULL;
@@ -250,18 +246,30 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 	return error;
 }
 
-/*
- * Following called from lockspace.c
- */
+/* Userspace guarantees that dlm_ls_stop() has completed on all nodes before
+   dlm_ls_start() is called on any of them to start the new recovery. */
 
 int dlm_ls_stop(struct dlm_ls *ls)
 {
 	int new;
 
 	/*
-	 * A stop cancels any recovery that's in progress (see RECOVERY_STOP,
-	 * dlm_recovery_stopped()) and prevents any new locks from being
-	 * processed (see RUNNING, dlm_locking_stopped()).
+	 * Prevent dlm_recv from being in the middle of something when we do
+	 * the stop.  This includes ensuring dlm_recv isn't processing a
+	 * recovery message (rcom), while dlm_recoverd is aborting and
+	 * resetting things from an in-progress recovery.  i.e. we want
+	 * dlm_recoverd to abort its recovery without worrying about dlm_recv
+	 * processing an rcom at the same time.  Stopping dlm_recv also makes
+	 * it easy for dlm_receive_message() to check locking stopped and add a
+	 * message to the requestqueue without races.
+	 */
+
+	down_write(&ls->ls_recv_active);
+
+	/*
+	 * Abort any recovery that's in progress (see RECOVERY_STOP,
+	 * dlm_recovery_stopped()) and tell any other threads running in the
+	 * dlm to quit any processing (see RUNNING, dlm_locking_stopped()).
 	 */
 
 	spin_lock(&ls->ls_recover_lock);
@@ -270,9 +278,15 @@ int dlm_ls_stop(struct dlm_ls *ls)
 	ls->ls_recover_seq++;
 	spin_unlock(&ls->ls_recover_lock);
 
+	/*
+	 * Let dlm_recv run again, now any normal messages will be saved on the
+	 * requestqueue for later.
+	 */
+
+	up_write(&ls->ls_recv_active);
+
 	/*
 	 * This in_recovery lock does two things:
-	 *
 	 * 1) Keeps this function from returning until all threads are out
 	 *    of locking routines and locking is truely stopped.
 	 * 2) Keeps any new requests from being processed until it's unlocked
@@ -284,9 +298,8 @@ int dlm_ls_stop(struct dlm_ls *ls)
 
 	/*
 	 * The recoverd suspend/resume makes sure that dlm_recoverd (if
-	 * running) has noticed the clearing of RUNNING above and quit
-	 * processing the previous recovery.  This will be true for all nodes
-	 * before any nodes start the new recovery.
+	 * running) has noticed RECOVERY_STOP above and quit processing the
+	 * previous recovery.
 	 */
 
 	dlm_recoverd_suspend(ls);
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index a5126e0c68a..f8c69dda16a 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,6 @@
 #include "dlm_internal.h"
 #include "lowcomms.h"
 #include "config.h"
-#include "rcom.h"
 #include "lock.h"
 #include "midcomms.h"
 
@@ -117,19 +116,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
 		offset &= (limit - 1);
 		len -= msglen;
 
-		switch (msg->h_cmd) {
-		case DLM_MSG:
-			dlm_receive_message(msg, nodeid, 0);
-			break;
-
-		case DLM_RCOM:
-			dlm_receive_rcom(msg, nodeid);
-			break;
-
-		default:
-			log_print("unknown msg type %x from %u: %u %u %u %u",
-				  msg->h_cmd, nodeid, msglen, len, offset, ret);
-		}
+		dlm_receive_buffer(msg, nodeid);
 	}
 
 	if (msg != (struct dlm_header *) __tmp)
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 188b91c027e..ae2fd97fa4a 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -386,7 +386,10 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	dlm_recover_process_copy(ls, rc_in);
 }
 
-static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+/* If the lockspace doesn't exist then still send a status message
+   back; it's possible that it just doesn't have its global_id yet. */
+
+int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
 {
 	struct dlm_rcom *rc;
 	struct rcom_config *rf;
@@ -446,28 +449,11 @@ static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
 	return rv;
 }
 
-/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
+/* Called by dlm_recv; corresponds to dlm_receive_message() but special
    recovery-only comms are sent through here. */
 
-void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
+void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
 {
-	struct dlm_rcom *rc = (struct dlm_rcom *) hd;
-	struct dlm_ls *ls;
-
-	dlm_rcom_in(rc);
-
-	/* If the lockspace doesn't exist then still send a status message
-	   back; it's possible that it just doesn't have its global_id yet. */
-
-	ls = dlm_find_lockspace_global(hd->h_lockspace);
-	if (!ls) {
-		log_print("lockspace %x from %d type %x not found",
-			  hd->h_lockspace, nodeid, rc->rc_type);
-		if (rc->rc_type == DLM_RCOM_STATUS)
-			send_ls_not_ready(nodeid, rc);
-		return;
-	}
-
 	if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
 		log_debug(ls, "ignoring recovery message %x from %d",
 			  rc->rc_type, nodeid);
@@ -477,12 +463,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
 	if (is_old_reply(ls, rc))
 		goto out;
 
-	if (nodeid != rc->rc_header.h_nodeid) {
-		log_error(ls, "bad rcom nodeid %d from %d",
-			  rc->rc_header.h_nodeid, nodeid);
-		goto out;
-	}
-
 	switch (rc->rc_type) {
 	case DLM_RCOM_STATUS:
 		receive_rcom_status(ls, rc);
@@ -520,6 +500,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
 		DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
 	}
  out:
-	dlm_put_lockspace(ls);
+	return;
 }
 
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index d7984321ff4..b09abd29ba3 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -18,7 +18,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid);
 int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
 int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
 int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
-void dlm_receive_rcom(struct dlm_header *hd, int nodeid);
+void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid);
+int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
 
 #endif
 
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 66575997861..4b89e20eebe 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -24,19 +24,28 @@
 
 
 /* If the start for which we're re-enabling locking (seq) has been superseded
-   by a newer stop (ls_recover_seq), we need to leave locking disabled. */
+   by a newer stop (ls_recover_seq), we need to leave locking disabled.
+
+   We suspend dlm_recv threads here to avoid the race where dlm_recv a) sees
+   locking stopped and b) adds a message to the requestqueue, but dlm_recoverd
+   enables locking and clears the requestqueue between a and b. */
 
 static int enable_locking(struct dlm_ls *ls, uint64_t seq)
 {
 	int error = -EINTR;
 
+	down_write(&ls->ls_recv_active);
+
 	spin_lock(&ls->ls_recover_lock);
 	if (ls->ls_recover_seq == seq) {
 		set_bit(LSFL_RUNNING, &ls->ls_flags);
+		/* unblocks processes waiting to enter the dlm */
 		up_write(&ls->ls_in_recovery);
 		error = 0;
 	}
 	spin_unlock(&ls->ls_recover_lock);
+
+	up_write(&ls->ls_recv_active);
 	return error;
 }
 
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 65008d79c96..0de04f17cce 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -20,7 +20,7 @@
 struct rq_entry {
 	struct list_head list;
 	int nodeid;
-	char request[1];
+	char request[0];
 };
 
 /*
@@ -30,42 +30,39 @@ struct rq_entry {
  * lockspace is enabled on some while still suspended on others.
  */
 
-int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
 {
 	struct rq_entry *e;
 	int length = hd->h_length;
-	int rv = 0;
 
 	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
 	if (!e) {
-		log_print("dlm_add_requestqueue: out of memory\n");
-		return 0;
+		log_print("dlm_add_requestqueue: out of memory len %d", length);
+		return;
 	}
 
 	e->nodeid = nodeid;
 	memcpy(e->request, hd, length);
 
-	/* We need to check dlm_locking_stopped() after taking the mutex to
-	   avoid a race where dlm_recoverd enables locking and runs
-	   process_requestqueue between our earlier dlm_locking_stopped check
-	   and this addition to the requestqueue. */
-
 	mutex_lock(&ls->ls_requestqueue_mutex);
-	if (dlm_locking_stopped(ls))
-		list_add_tail(&e->list, &ls->ls_requestqueue);
-	else {
-		log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid);
-		kfree(e);
-		rv = -EAGAIN;
-	}
+	list_add_tail(&e->list, &ls->ls_requestqueue);
 	mutex_unlock(&ls->ls_requestqueue_mutex);
-	return rv;
 }
 
+/*
+ * Called by dlm_recoverd to process normal messages saved while recovery was
+ * happening.  Normal locking has been enabled before this is called.  dlm_recv
+ * upon receiving a message, will wait for all saved messages to be drained
+ * here before processing the message it got.  If a new dlm_ls_stop() arrives
+ * while we're processing these saved messages, it may block trying to suspend
+ * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue.  In that
+ * case, we don't abort since locking_stopped is still 0.  If dlm_recv is not
+ * waiting for us, then this processing may be aborted due to locking_stopped.
+ */
+
 int dlm_process_requestqueue(struct dlm_ls *ls)
 {
 	struct rq_entry *e;
-	struct dlm_header *hd;
 	int error = 0;
 
 	mutex_lock(&ls->ls_requestqueue_mutex);
@@ -79,14 +76,7 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
 		e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
 		mutex_unlock(&ls->ls_requestqueue_mutex);
 
-		hd = (struct dlm_header *) e->request;
-		error = dlm_receive_message(hd, e->nodeid, 1);
-
-		if (error == -EINTR) {
-			/* entry is left on requestqueue */
-			log_debug(ls, "process_requestqueue abort eintr");
-			break;
-		}
+		dlm_receive_message_saved(ls, (struct dlm_message *)e->request);
 
 		mutex_lock(&ls->ls_requestqueue_mutex);
 		list_del(&e->list);
@@ -106,10 +96,12 @@ int dlm_process_requestqueue(struct dlm_ls *ls)
 
 /*
  * After recovery is done, locking is resumed and dlm_recoverd takes all the
- * saved requests and processes them as they would have been by dlm_recvd.  At
- * the same time, dlm_recvd will start receiving new requests from remote
- * nodes.  We want to delay dlm_recvd processing new requests until
- * dlm_recoverd has finished processing the old saved requests.
+ * saved requests and processes them as they would have been by dlm_recv.  At
+ * the same time, dlm_recv will start receiving new requests from remote nodes.
+ * We want to delay dlm_recv processing new requests until dlm_recoverd has
+ * finished processing the old saved requests.  We don't check for locking
+ * stopped here because dlm_ls_stop won't stop locking until it's suspended us
+ * (dlm_recv).
  */
 
 void dlm_wait_requestqueue(struct dlm_ls *ls)
@@ -118,8 +110,6 @@ void dlm_wait_requestqueue(struct dlm_ls *ls)
 		mutex_lock(&ls->ls_requestqueue_mutex);
 		if (list_empty(&ls->ls_requestqueue))
 			break;
-		if (dlm_locking_stopped(ls))
-			break;
 		mutex_unlock(&ls->ls_requestqueue_mutex);
 		schedule();
 	}
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
index 6a53ea03335..aba34fc05ee 100644
--- a/fs/dlm/requestqueue.h
+++ b/fs/dlm/requestqueue.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2007 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __REQUESTQUEUE_DOT_H__
 #define __REQUESTQUEUE_DOT_H__
 
-int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
 int dlm_process_requestqueue(struct dlm_ls *ls);
 void dlm_wait_requestqueue(struct dlm_ls *ls);
 void dlm_purge_requestqueue(struct dlm_ls *ls);
-- 
cgit v1.2.3


From 32da477a5bfe96b6dfc8960e0d22d89ca09fd10a Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 12 Sep 2007 11:37:03 +0200
Subject: [NET]: Don't implement dev_ifname32 inline

The current implementation of dev_ifname makes maintenance difficult
because updates to the implementation of the ioctl have to made in two
places.  So this patch updates dev_ifname32 to do a classic 32/64
structure conversion and call sys_ioctl like the rest of the
compat calls do.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/compat_ioctl.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 37310b0e810..d917e4a26a4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -324,22 +324,21 @@ struct ifconf32 {
 
 static int dev_ifname32(unsigned int fd, unsigned int cmd, unsigned long arg)
 {
-	struct net_device *dev;
-	struct ifreq32 ifr32;
+	struct ifreq __user *uifr;
 	int err;
 
-	if (copy_from_user(&ifr32, compat_ptr(arg), sizeof(ifr32)))
+	uifr = compat_alloc_user_space(sizeof(struct ifreq));
+	if (copy_in_user(uifr, compat_ptr(arg), sizeof(struct ifreq32)));
 		return -EFAULT;
 
-	dev = dev_get_by_index(ifr32.ifr_ifindex);
-	if (!dev)
-		return -ENODEV;
+	err = sys_ioctl(fd, SIOCGIFNAME, (unsigned long)uifr);
+	if (err)
+		return err;
 
-	strlcpy(ifr32.ifr_name, dev->name, sizeof(ifr32.ifr_name));
-	dev_put(dev);
-	
-	err = copy_to_user(compat_ptr(arg), &ifr32, sizeof(ifr32));
-	return (err ? -EFAULT : 0);
+	if (copy_in_user(compat_ptr(arg), uifr, sizeof(struct ifreq32)))
+		return -EFAULT;
+
+	return 0;
 }
 
 static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg)
-- 
cgit v1.2.3


From 457c4cbc5a3dde259d2a1f15d5f9785290397267 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 12 Sep 2007 12:01:34 +0200
Subject: [NET]: Make /proc/net per network namespace

This patch makes /proc/net per network namespace.  It modifies the global
variables proc_net and proc_net_stat to be per network namespace.
The proc_net file helpers are modified to take a network namespace argument,
and all of their callers are fixed to pass &init_net for that argument.
This ensures that all of the /proc/net files are only visible and
usable in the initial network namespace until the code behind them
has been updated to be handle multiple network namespaces.

Making /proc/net per namespace is necessary as at least some files
in /proc/net depend upon the set of network devices which is per
network namespace, and even more files in /proc/net have contents
that are relevant to a single network namespace.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/Makefile   | 1 +
 fs/proc/internal.h | 5 +++++
 fs/proc/root.c     | 8 +++-----
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bce38e3f06c..ebaba021354 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -11,6 +11,7 @@ proc-y       += inode.o root.o base.o generic.o array.o \
 		proc_tty.o proc_misc.o
 
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
+proc-$(CONFIG_NET)		+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
 proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index b215c3524fa..1820eb2ef76 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -16,6 +16,11 @@ extern int proc_sys_init(void);
 #else
 static inline void proc_sys_init(void) { }
 #endif
+#ifdef CONFIG_NET
+extern int proc_net_init(void);
+#else
+static inline int proc_net_init(void) { return 0; }
+#endif
 
 struct vmalloc_info {
 	unsigned long	used;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 41f17037f73..cf3046638b0 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -21,7 +21,7 @@
 
 #include "internal.h"
 
-struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver;
+struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver;
 
 static int proc_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
@@ -61,8 +61,8 @@ void __init proc_root_init(void)
 		return;
 	}
 	proc_misc_init();
-	proc_net = proc_mkdir("net", NULL);
-	proc_net_stat = proc_mkdir("net/stat", NULL);
+
+	proc_net_init();
 
 #ifdef CONFIG_SYSVIPC
 	proc_mkdir("sysvipc", NULL);
@@ -159,7 +159,5 @@ EXPORT_SYMBOL(create_proc_entry);
 EXPORT_SYMBOL(remove_proc_entry);
 EXPORT_SYMBOL(proc_root);
 EXPORT_SYMBOL(proc_root_fs);
-EXPORT_SYMBOL(proc_net);
-EXPORT_SYMBOL(proc_net_stat);
 EXPORT_SYMBOL(proc_bus);
 EXPORT_SYMBOL(proc_root_driver);
-- 
cgit v1.2.3


From b4b510290b056b86611757ce1175a230f1080f53 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 12 Sep 2007 13:05:38 +0200
Subject: [NET]: Support multiple network namespaces with netlink

Each netlink socket will live in exactly one network namespace,
this includes the controlling kernel sockets.

This patch updates all of the existing netlink protocols
to only support the initial network namespace.  Request
by clients in other namespaces will get -ECONREFUSED.
As they would if the kernel did not have the support for
that netlink protocol compiled in.

As each netlink protocol is updated to be multiple network
namespace safe it can register multiple kernel sockets
to acquire a presence in the rest of the network namespaces.

The implementation in af_netlink is a simple filter implementation
at hash table insertion and hash table look up time.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/ecryptfs/netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index fe9186312d7..056519cd92b 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -227,7 +227,7 @@ int ecryptfs_init_netlink(void)
 {
 	int rc;
 
-	ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0,
+	ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0,
 						 ecryptfs_receive_nl_message,
 						 NULL, THIS_MODULE);
 	if (!ecryptfs_nl_sock) {
-- 
cgit v1.2.3


From 881d966b48b035ab3f3aeaae0f3d3f9b584f45b2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 17 Sep 2007 11:56:21 -0700
Subject: [NET]: Make the device list and device lookups per namespace.

This patch makes most of the generic device layer network
namespace safe.  This patch makes dev_base_head a
network namespace variable, and then it picks up
a few associated variables.  The functions:
dev_getbyhwaddr
dev_getfirsthwbytype
dev_get_by_flags
dev_get_by_name
__dev_get_by_name
dev_get_by_index
__dev_get_by_index
dev_ioctl
dev_ethtool
dev_load
wireless_process_ioctl

were modified to take a network namespace argument, and
deal with it.

vlan_ioctl_set and brioctl_set were modified so their
hooks will receive a network namespace argument.

So basically anthing in the core of the network stack that was
affected to by the change of dev_base was modified to handle
multiple network namespaces.  The rest of the network stack was
simply modified to explicitly use &init_net the initial network
namespace.  This can be fixed when those components of the network
stack are modified to handle multiple network namespaces.

For now the ifindex generator is left global.

Fundametally ifindex numbers are per namespace, or else
we will have corner case problems with migration when
we get that far.

At the same time there are assumptions in the network stack
that the ifindex of a network device won't change.  Making
the ifindex number global seems a good compromise until
the network stack can cope with ifindex changes when
you change namespaces, and the like.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/afs/netdevices.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index fc27d4b52e5..49f18942306 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -8,6 +8,7 @@
 #include <linux/inetdevice.h>
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
+#include <net/net_namespace.h>
 #include "internal.h"
 
 /*
@@ -23,7 +24,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
 		BUG();
 
 	rtnl_lock();
-	dev = __dev_getfirstbyhwtype(ARPHRD_ETHER);
+	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
 	if (dev) {
 		memcpy(mac, dev->dev_addr, maclen);
 		ret = 0;
@@ -47,7 +48,7 @@ int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
 	ASSERT(maxbufs > 0);
 
 	rtnl_lock();
-	for_each_netdev(dev) {
+	for_each_netdev(&init_net, dev) {
 		if (dev->type == ARPHRD_LOOPBACK && !wantloopback)
 			continue;
 		idev = __in_dev_get_rtnl(dev);
-- 
cgit v1.2.3


From 3c12afe75f61d9402797d63941367962ca36fcc9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@kimchee.(none)>
Date: Wed, 12 Sep 2007 14:18:18 +0200
Subject: [NET]: Fix missed addition of fs/proc/proc_net.c

My bad.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 fs/proc/proc_net.c

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
new file mode 100644
index 00000000000..45dde2f7034
--- /dev/null
+++ b/fs/proc/proc_net.c
@@ -0,0 +1,192 @@
+/*
+ *  linux/fs/proc/net.c
+ *
+ *  Copyright (C) 2007
+ *
+ *  Author: Eric Biederman <ebiederm@xmission.com>
+ *
+ *  proc net directory handling functions
+ */
+
+#include <asm/uaccess.h>
+
+#include <linux/errno.h>
+#include <linux/time.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/bitops.h>
+#include <linux/smp_lock.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+#include "internal.h"
+
+
+struct proc_dir_entry *proc_net_create(struct net *net,
+	const char *name, mode_t mode, get_info_t *get_info)
+{
+	return create_proc_info_entry(name,mode, net->proc_net, get_info);
+}
+
+struct proc_dir_entry *proc_net_fops_create(struct net *net,
+	const char *name, mode_t mode, const struct file_operations *fops)
+{
+	struct proc_dir_entry *res;
+
+	res = create_proc_entry(name, mode, net->proc_net);
+	if (res)
+		res->proc_fops = fops;
+	return res;
+}
+
+void proc_net_remove(struct net *net, const char *name)
+{
+	remove_proc_entry(name, net->proc_net);
+}
+
+
+static struct proc_dir_entry *proc_net_shadow;
+
+static struct dentry *proc_net_shadow_dentry(struct dentry *parent,
+						struct proc_dir_entry *de)
+{
+	struct dentry *shadow = NULL;
+	struct inode *inode;
+	if (!de)
+		goto out;
+	de_get(de);
+	inode = proc_get_inode(parent->d_inode->i_sb, de->low_ino, de);
+	if (!inode)
+		goto out_de_put;
+	shadow = d_alloc_name(parent, de->name);
+	if (!shadow)
+		goto out_iput;
+	shadow->d_op = parent->d_op; /* proc_dentry_operations */
+	d_instantiate(shadow, inode);
+out:
+	return shadow;
+out_iput:
+	iput(inode);
+out_de_put:
+	de_put(de);
+	goto out;
+}
+
+static void *proc_net_follow_link(struct dentry *parent, struct nameidata *nd)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct dentry *shadow;
+	shadow = proc_net_shadow_dentry(parent, net->proc_net);
+	if (!shadow)
+		return ERR_PTR(-ENOENT);
+
+	dput(nd->dentry);
+	/* My dentry count is 1 and that should be enough as the
+	 * shadow dentry is thrown away immediately.
+	 */
+	nd->dentry = shadow;
+	return NULL;
+}
+
+static struct dentry *proc_net_lookup(struct inode *dir, struct dentry *dentry,
+				      struct nameidata *nd)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct dentry *shadow;
+
+	shadow = proc_net_shadow_dentry(nd->dentry, net->proc_net);
+	if (!shadow)
+		return ERR_PTR(-ENOENT);
+
+	dput(nd->dentry);
+	nd->dentry = shadow;
+
+	return shadow->d_inode->i_op->lookup(shadow->d_inode, dentry, nd);
+}
+
+static int proc_net_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct dentry *shadow;
+	int ret;
+
+	shadow = proc_net_shadow_dentry(dentry->d_parent, net->proc_net);
+	if (!shadow)
+		return -ENOENT;
+	ret = shadow->d_inode->i_op->setattr(shadow, iattr);
+	dput(shadow);
+	return ret;
+}
+
+static const struct file_operations proc_net_dir_operations = {
+	.read			= generic_read_dir,
+};
+
+static struct inode_operations proc_net_dir_inode_operations = {
+	.follow_link	= proc_net_follow_link,
+	.lookup		= proc_net_lookup,
+	.setattr	= proc_net_setattr,
+};
+
+static int proc_net_ns_init(struct net *net)
+{
+	struct proc_dir_entry *root, *netd, *net_statd;
+	int err;
+
+	err = -ENOMEM;
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root)
+		goto out;
+
+	err = -EEXIST;
+	netd = proc_mkdir("net", root);
+	if (!netd)
+		goto free_root;
+
+	err = -EEXIST;
+	net_statd = proc_mkdir("stat", netd);
+	if (!net_statd)
+		goto free_net;
+
+	root->data = net;
+	netd->data = net;
+	net_statd->data = net;
+
+	net->proc_net_root = root;
+	net->proc_net = netd;
+	net->proc_net_stat = net_statd;
+	err = 0;
+
+out:
+	return err;
+free_net:
+	remove_proc_entry("net", root);
+free_root:
+	kfree(root);
+	goto out;
+}
+
+static void proc_net_ns_exit(struct net *net)
+{
+	remove_proc_entry("stat", net->proc_net);
+	remove_proc_entry("net", net->proc_net_root);
+	kfree(net->proc_net_root);
+}
+
+struct pernet_operations proc_net_ns_ops = {
+	.init = proc_net_ns_init,
+	.exit = proc_net_ns_exit,
+};
+
+int proc_net_init(void)
+{
+	proc_net_shadow = proc_mkdir("net", NULL);
+	proc_net_shadow->proc_iops = &proc_net_dir_inode_operations;
+	proc_net_shadow->proc_fops = &proc_net_dir_operations;
+
+	return register_pernet_subsys(&proc_net_ns_ops);
+}
-- 
cgit v1.2.3


From 36ac3135f5e824942fada4efa3204066b4b40ab1 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <dlezcano@fr.ibm.com>
Date: Wed, 12 Sep 2007 14:51:47 +0200
Subject: [NETNS]: Fix export symbols.

Add the appropriate EXPORT_SYMBOLS for proc_net_create,
proc_net_fops_create and proc_net_remove to fix errors when
compiling allmodconfig

Signed-off-by: Mark Nelson <markn@au1.ibm.com>
Acked-by: Benjamin Thery <benjamin.thery@bull.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 45dde2f7034..358930a3142 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -31,6 +31,7 @@ struct proc_dir_entry *proc_net_create(struct net *net,
 {
 	return create_proc_info_entry(name,mode, net->proc_net, get_info);
 }
+EXPORT_SYMBOL_GPL(proc_net_create);
 
 struct proc_dir_entry *proc_net_fops_create(struct net *net,
 	const char *name, mode_t mode, const struct file_operations *fops)
@@ -42,12 +43,13 @@ struct proc_dir_entry *proc_net_fops_create(struct net *net,
 		res->proc_fops = fops;
 	return res;
 }
+EXPORT_SYMBOL_GPL(proc_net_fops_create);
 
 void proc_net_remove(struct net *net, const char *name)
 {
 	remove_proc_entry(name, net->proc_net);
 }
-
+EXPORT_SYMBOL_GPL(proc_net_remove);
 
 static struct proc_dir_entry *proc_net_shadow;
 
-- 
cgit v1.2.3


From 077130c0cf7d5ba1992f5b51b96136d7b1c8aad5 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 13 Sep 2007 09:18:57 +0200
Subject: [NET]: Fix race when opening a proc file while a network namespace is
 exiting.

The problem:  proc_net files remember which network namespace the are
against but do not remember hold a reference count (as that would pin
the network namespace).   So we currently have a small window where
the reference count on a network namespace may be incremented when opening
a /proc file when it has already gone to zero.

To fix this introduce maybe_get_net and get_proc_net.

maybe_get_net increments the network namespace reference count only if it is
greater then zero, ensuring we don't increment a reference count after it
has gone to zero.

get_proc_net handles all of the magic to go from a proc inode to the network
namespace instance and call maybe_get_net on it.

PROC_NET the old accessor is removed so that we don't get confused and use
the wrong helper function.

Then I fix up the callers to use get_proc_net and handle the case case
where get_proc_net returns NULL.  In that case I return -ENXIO because
effectively the network namespace has already gone away so the files
we are trying to access don't exist anymore.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 358930a3142..85cc8e8bb86 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -51,6 +51,12 @@ void proc_net_remove(struct net *net, const char *name)
 }
 EXPORT_SYMBOL_GPL(proc_net_remove);
 
+struct net *get_proc_net(const struct inode *inode)
+{
+	return maybe_get_net(PDE_NET(PDE(inode)));
+}
+EXPORT_SYMBOL_GPL(get_proc_net);
+
 static struct proc_dir_entry *proc_net_shadow;
 
 static struct dentry *proc_net_shadow_dentry(struct dentry *parent,
-- 
cgit v1.2.3


From 4665079cbb2a3e17de82f2ab2940b9f97f37d65e Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 8 Oct 2007 20:38:39 -0700
Subject: [NETNS]: Move some code into __init section when CONFIG_NET_NS=n

With the net namespaces many code leaved the __init section,
thus making the kernel occupy more memory than it did before.
Since we have a config option that prohibits the namespace
creation, the functions that initialize/finalize some netns
stuff are simply not needed and can be freed after the boot.

Currently, this is almost not noticeable, since few calls
are no longer in __init, but when the namespaces will be
merged it will be possible to free more code. I propose to
use the __net_init, __net_exit and __net_initdata "attributes"
for functions/variables that are not used if the CONFIG_NET_NS
is not set to save more space in memory.

The exiting functions cannot just reside in the __exit section,
as noticed by David, since the init section will have
references on it and the compilation will fail due to modpost
checks. These references can exist, since the init namespace
never dies and the exit callbacks are never called. So I
introduce the __exit_refok attribute just like it is already
done with the __init_refok.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/proc/proc_net.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 85cc8e8bb86..2e91fb756e9 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -140,7 +140,7 @@ static struct inode_operations proc_net_dir_inode_operations = {
 	.setattr	= proc_net_setattr,
 };
 
-static int proc_net_ns_init(struct net *net)
+static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *root, *netd, *net_statd;
 	int err;
@@ -178,19 +178,19 @@ free_root:
 	goto out;
 }
 
-static void proc_net_ns_exit(struct net *net)
+static __net_exit void proc_net_ns_exit(struct net *net)
 {
 	remove_proc_entry("stat", net->proc_net);
 	remove_proc_entry("net", net->proc_net_root);
 	kfree(net->proc_net_root);
 }
 
-struct pernet_operations proc_net_ns_ops = {
+struct pernet_operations __net_initdata proc_net_ns_ops = {
 	.init = proc_net_ns_init,
 	.exit = proc_net_ns_exit,
 };
 
-int proc_net_init(void)
+int __init proc_net_init(void)
 {
 	proc_net_shadow = proc_mkdir("net", NULL);
 	proc_net_shadow->proc_iops = &proc_net_dir_inode_operations;
-- 
cgit v1.2.3


From 39699037a5c94d7cd1363dfe48a50c78c643fd9a Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 10 Oct 2007 02:28:42 -0700
Subject: [FS] seq_file: Introduce the seq_open_private()

This function allocates the zeroed chunk of memory and
call seq_open(). The __seq_open_private() helper returns
the allocated memory to make it possible for the caller
to initialize it.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/seq_file.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index bbb19be260c..ca71c115bda 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -429,6 +429,39 @@ int seq_release_private(struct inode *inode, struct file *file)
 }
 EXPORT_SYMBOL(seq_release_private);
 
+void *__seq_open_private(struct file *f, const struct seq_operations *ops,
+		int psize)
+{
+	int rc;
+	void *private;
+	struct seq_file *seq;
+
+	private = kzalloc(psize, GFP_KERNEL);
+	if (private == NULL)
+		goto out;
+
+	rc = seq_open(f, ops);
+	if (rc < 0)
+		goto out_free;
+
+	seq = f->private_data;
+	seq->private = private;
+	return private;
+
+out_free:
+	kfree(private);
+out:
+	return NULL;
+}
+EXPORT_SYMBOL(__seq_open_private);
+
+int seq_open_private(struct file *filp, const struct seq_operations *ops,
+		int psize)
+{
+	return __seq_open_private(filp, ops, psize) ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(seq_open_private);
+
 int seq_putc(struct seq_file *m, char c)
 {
 	if (m->count < m->size) {
-- 
cgit v1.2.3


From cd40b7d3983c708aabe3d3008ec64ffce56d33b0 Mon Sep 17 00:00:00 2001
From: "Denis V. Lunev" <den@openvz.org>
Date: Wed, 10 Oct 2007 21:15:29 -0700
Subject: [NET]: make netlink user -> kernel interface synchronious

This patch make processing netlink user -> kernel messages synchronious.
This change was inspired by the talk with Alexey Kuznetsov about current
netlink messages processing. He says that he was badly wrong when introduced
asynchronious user -> kernel communication.

The call netlink_unicast is the only path to send message to the kernel
netlink socket. But, unfortunately, it is also used to send data to the
user.

Before this change the user message has been attached to the socket queue
and sk->sk_data_ready was called. The process has been blocked until all
pending messages were processed. The bad thing is that this processing
may occur in the arbitrary process context.

This patch changes nlk->data_ready callback to get 1 skb and force packet
processing right in the netlink_unicast.

Kernel -> user path in netlink_unicast remains untouched.

EINTR processing for in netlink_run_queue was changed. It forces rtnl_lock
drop, but the process remains in the cycle until the message will be fully
processed. So, there is no need to use this kludges now.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Acked-by: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/ecryptfs/netlink.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index 056519cd92b..9aa345121e0 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -165,22 +165,10 @@ static int ecryptfs_process_nl_quit(struct sk_buff *skb)
  * it to its desired netlink context element and wake up the process
  * that is waiting for a response.
  */
-static void ecryptfs_receive_nl_message(struct sock *sk, int len)
+static void ecryptfs_receive_nl_message(struct sk_buff *skb)
 {
-	struct sk_buff *skb;
 	struct nlmsghdr *nlh;
-	int rc = 0;	/* skb_recv_datagram requires this */
 
-receive:
-	skb = skb_recv_datagram(sk, 0, 0, &rc);
-	if (rc == -EINTR)
-		goto receive;
-	else if (rc < 0) {
-		ecryptfs_printk(KERN_ERR, "Error occurred while "
-				"receiving eCryptfs netlink message; "
-				"rc = [%d]\n", rc);
-		return;
-	}
 	nlh = nlmsg_hdr(skb);
 	if (!NLMSG_OK(nlh, skb->len)) {
 		ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
-- 
cgit v1.2.3


From e30408b2a99cb7b8bf529c7dc2328a19d71894cf Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Fri, 12 Oct 2007 00:01:21 -0400
Subject: JFS: fix bio-related build breakage

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jfs/jfs_logmgr.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 57c3b8ac36b..ccfd0294405 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2162,7 +2162,7 @@ static void lbmStartIO(struct lbuf * bp)
 	/* check if journaling to disk has been disabled */
 	if (log->no_integrity) {
 		bio->bi_size = 0;
-		lbmIODone(bio, 0, 0);
+		lbmIODone(bio, 0);
 	} else {
 		submit_bio(WRITE_SYNC, bio);
 		INCREMENT(lmStat.submitted);
@@ -2234,8 +2234,6 @@ static void lbmIODone(struct bio *bio, int error)
 
 		/* wakeup I/O initiator */
 		LCACHE_WAKEUP(&bp->l_ioevent);
-
-		return 0;
 	}
 
 	/*
@@ -2260,7 +2258,6 @@ static void lbmIODone(struct bio *bio, int error)
 	if (bp->l_flag & lbmDIRECT) {
 		LCACHE_WAKEUP(&bp->l_ioevent);
 		LCACHE_UNLOCK(flags);
-		return 0;
 	}
 
 	tail = log->wqueue;
@@ -2339,8 +2336,6 @@ static void lbmIODone(struct bio *bio, int error)
 
 		LCACHE_UNLOCK(flags);	/* unlock+enable */
 	}
-
-	return 0;
 }
 
 int jfsIOWait(void *arg)
-- 
cgit v1.2.3


From 782e3b3b3804c38d5130c7f21d7ec7bf6709023f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Fri, 12 Oct 2007 07:17:47 +0100
Subject: Fix up more bio fallout

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/gfs2/super.c              | 1 -
 fs/jfs/jfs_metapage.c        | 3 ---
 fs/ocfs2/cluster/heartbeat.c | 3 +--
 fs/xfs/linux-2.6/xfs_aops.c  | 3 +--
 fs/xfs/linux-2.6/xfs_buf.c   | 3 +--
 5 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2473e2a86d1..a2da76b5ae4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -169,7 +169,6 @@ static void end_bio_io_page(struct bio *bio, int error)
 	else
 		printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
 	unlock_page(page);
-	return 0;
 }
 
 static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 1332adc0b9f..941369c1ac8 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -291,8 +291,6 @@ static void metapage_read_end_io(struct bio *bio, int err)
 
 	dec_io(page, last_read_complete);
 	bio_put(bio);
-
-	return 0;
 }
 
 static void remove_from_logsync(struct metapage *mp)
@@ -349,7 +347,6 @@ static void metapage_write_end_io(struct bio *bio, int err)
 	}
 	dec_io(page, last_write_complete);
 	bio_put(bio);
-	return 0;
 }
 
 static int metapage_writepage(struct page *page, struct writeback_control *wbc)
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index da2c2b442b4..f14b541fab9 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,7 +216,7 @@ static void o2hb_wait_on_io(struct o2hb_region *reg,
 	wait_for_completion(&wc->wc_io_complete);
 }
 
-static int o2hb_bio_end_io(struct bio *bio,
+static void o2hb_bio_end_io(struct bio *bio,
 			   int error)
 {
 	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
@@ -228,7 +228,6 @@ static int o2hb_bio_end_io(struct bio *bio,
 
 	o2hb_bio_wait_dec(wc, 1);
 	bio_put(bio);
-	return 0;
 }
 
 /* Setup a Bio to cover I/O against num_slots slots starting at
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 3f13519436a..6f4c29e9c3d 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -323,7 +323,7 @@ xfs_iomap_valid(
 /*
  * BIO completion handler for buffered IO.
  */
-STATIC int
+STATIC void
 xfs_end_bio(
 	struct bio		*bio,
 	int			error)
@@ -339,7 +339,6 @@ xfs_end_bio(
 	bio_put(bio);
 
 	xfs_finish_ioend(ioend, 0);
-	return 0;
 }
 
 STATIC void
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6a75f4d984a..39f44ee572e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1103,7 +1103,7 @@ _xfs_buf_ioend(
 	}
 }
 
-STATIC int
+STATIC void
 xfs_buf_bio_end_io(
 	struct bio		*bio,
 	int			error)
@@ -1139,7 +1139,6 @@ xfs_buf_bio_end_io(
 
 	_xfs_buf_ioend(bp, 1);
 	bio_put(bio);
-	return 0;
 }
 
 STATIC void
-- 
cgit v1.2.3


From bfab36e81611e60573b84eb4e4b4c8d8545b2320 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cam.ac.uk>
Date: Fri, 12 Oct 2007 09:37:15 +0100
Subject: NTFS: Fix a mount time deadlock.

Big thanks go to Mathias Kolehmainen for reporting the bug, providing
debug output and testing the patches I sent him to get it working.

The fix was to stop calling ntfs_attr_set() at mount time as that causes
balance_dirty_pages_ratelimited() to be called which on systems with
little memory actually tries to go and balance the dirty pages which tries
to take the s_umount semaphore but because we are still in fill_super()
across which the VFS holds s_umount for writing this results in a
deadlock.

We now do the dirty work by hand by submitting individual buffers.  This
has the annoying "feature" that mounting can take a few seconds if the
journal is large as we have clear it all.  One day someone should improve
on this by deferring the journal clearing to a helper kernel thread so it
can be done in the background but I don't have time for this at the moment
and the current solution works fine so I am leaving it like this for now.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ntfs/ChangeLog |  12 +++++
 fs/ntfs/Makefile  |   2 +-
 fs/ntfs/aops.c    |  22 ++++-----
 fs/ntfs/attrib.c  |   8 ++-
 fs/ntfs/file.c    |  36 +++++++-------
 fs/ntfs/inode.c   |   3 --
 fs/ntfs/logfile.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++------
 fs/ntfs/runlist.c |   4 +-
 8 files changed, 178 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index af4ef808fa9..345798ebd36 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -17,6 +17,18 @@ ToDo/Notes:
 	  happen is unclear however so it is worth waiting until someone hits
 	  the problem.
 
+2.1.29 - Fix a deadlock at mount time.
+
+	- During mount the VFS holds s_umount lock on the superblock.  So when
+	  we try to empty the journal $LogFile contents by calling
+	  ntfs_attr_set() when the machine does not have much memory and the
+	  journal is large ntfs_attr_set() results in the VM trying to balance
+	  dirty pages which in turn tries to that the s_umount lock and thus we
+	  get a deadlock.  The solution is to not use ntfs_attr_set() and
+	  instead do the zeroing by hand at the block level rather than page
+	  cache level.
+	- Fix sparse warnings.
+
 2.1.28 - Fix a deadlock.
 
 	- Fix deadlock in fs/ntfs/inode.c::ntfs_put_inode().  Thanks to Sergey
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 82550838556..58b6be99254 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
 	     index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
 	     unistr.o upcase.o
 
-EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.28\"
+EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.29\"
 
 ifeq ($(CONFIG_NTFS_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 6e5c2534f4b..cfdc7900d27 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
  * aops.c - NTFS kernel address space operations and page cache handling.
  *	    Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -396,7 +396,7 @@ static int ntfs_readpage(struct file *file, struct page *page)
 	loff_t i_size;
 	struct inode *vi;
 	ntfs_inode *ni, *base_ni;
-	u8 *kaddr;
+	u8 *addr;
 	ntfs_attr_search_ctx *ctx;
 	MFT_RECORD *mrec;
 	unsigned long flags;
@@ -491,15 +491,15 @@ retry_readpage:
 		/* Race with shrinking truncate. */
 		attr_len = i_size;
 	}
-	kaddr = kmap_atomic(page, KM_USER0);
+	addr = kmap_atomic(page, KM_USER0);
 	/* Copy the data to the page. */
-	memcpy(kaddr, (u8*)ctx->attr +
+	memcpy(addr, (u8*)ctx->attr +
 			le16_to_cpu(ctx->attr->data.resident.value_offset),
 			attr_len);
 	/* Zero the remainder of the page. */
-	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
 	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
+	kunmap_atomic(addr, KM_USER0);
 put_unm_err_out:
 	ntfs_attr_put_search_ctx(ctx);
 unm_err_out:
@@ -1344,7 +1344,7 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 	loff_t i_size;
 	struct inode *vi = page->mapping->host;
 	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
-	char *kaddr;
+	char *addr;
 	ntfs_attr_search_ctx *ctx = NULL;
 	MFT_RECORD *m = NULL;
 	u32 attr_len;
@@ -1484,14 +1484,14 @@ retry_writepage:
 		/* Shrinking cannot fail. */
 		BUG_ON(err);
 	}
-	kaddr = kmap_atomic(page, KM_USER0);
+	addr = kmap_atomic(page, KM_USER0);
 	/* Copy the data from the page to the mft record. */
 	memcpy((u8*)ctx->attr +
 			le16_to_cpu(ctx->attr->data.resident.value_offset),
-			kaddr, attr_len);
+			addr, attr_len);
 	/* Zero out of bounds area in the page cache page. */
-	memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
-	kunmap_atomic(kaddr, KM_USER0);
+	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	kunmap_atomic(addr, KM_USER0);
 	flush_dcache_page(page);
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	/* We are done with the page. */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1c08fefe487..92dabdcf2b8 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,7 +1,7 @@
 /**
  * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -2500,7 +2500,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	struct page *page;
 	u8 *kaddr;
 	pgoff_t idx, end;
-	unsigned int start_ofs, end_ofs, size;
+	unsigned start_ofs, end_ofs, size;
 
 	ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
 			(long long)ofs, (long long)cnt, val);
@@ -2548,6 +2548,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		kunmap_atomic(kaddr, KM_USER0);
 		set_page_dirty(page);
 		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
 		if (idx == end)
 			goto done;
 		idx++;
@@ -2604,6 +2606,8 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		kunmap_atomic(kaddr, KM_USER0);
 		set_page_dirty(page);
 		page_cache_release(page);
+		balance_dirty_pages_ratelimited(mapping);
+		cond_resched();
 	}
 done:
 	ntfs_debug("Done.");
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ffcc504a166..c814204d4ea 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
  * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -26,7 +26,6 @@
 #include <linux/swap.h>
 #include <linux/uio.h>
 #include <linux/writeback.h>
-#include <linux/sched.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -362,7 +361,7 @@ static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
 	volatile char c;
 
 	/* Set @end to the first byte outside the last page we care about. */
-	end = (const char __user*)PAGE_ALIGN((ptrdiff_t __user)uaddr + bytes);
+	end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
 
 	while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
 		;
@@ -532,7 +531,8 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
 	blocksize_bits = vol->sb->s_blocksize_bits;
 	u = 0;
 	do {
-		struct page *page = pages[u];
+		page = pages[u];
+		BUG_ON(!page);
 		/*
 		 * create_empty_buffers() will create uptodate/dirty buffers if
 		 * the page is uptodate/dirty.
@@ -1291,7 +1291,7 @@ static inline size_t ntfs_copy_from_user(struct page **pages,
 		size_t bytes)
 {
 	struct page **last_page = pages + nr_pages;
-	char *kaddr;
+	char *addr;
 	size_t total = 0;
 	unsigned len;
 	int left;
@@ -1300,13 +1300,13 @@ static inline size_t ntfs_copy_from_user(struct page **pages,
 		len = PAGE_CACHE_SIZE - ofs;
 		if (len > bytes)
 			len = bytes;
-		kaddr = kmap_atomic(*pages, KM_USER0);
-		left = __copy_from_user_inatomic(kaddr + ofs, buf, len);
-		kunmap_atomic(kaddr, KM_USER0);
+		addr = kmap_atomic(*pages, KM_USER0);
+		left = __copy_from_user_inatomic(addr + ofs, buf, len);
+		kunmap_atomic(addr, KM_USER0);
 		if (unlikely(left)) {
 			/* Do it the slow way. */
-			kaddr = kmap(*pages);
-			left = __copy_from_user(kaddr + ofs, buf, len);
+			addr = kmap(*pages);
+			left = __copy_from_user(addr + ofs, buf, len);
 			kunmap(*pages);
 			if (unlikely(left))
 				goto err_out;
@@ -1408,26 +1408,26 @@ static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
 		size_t *iov_ofs, size_t bytes)
 {
 	struct page **last_page = pages + nr_pages;
-	char *kaddr;
+	char *addr;
 	size_t copied, len, total = 0;
 
 	do {
 		len = PAGE_CACHE_SIZE - ofs;
 		if (len > bytes)
 			len = bytes;
-		kaddr = kmap_atomic(*pages, KM_USER0);
-		copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
+		addr = kmap_atomic(*pages, KM_USER0);
+		copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
 				*iov, *iov_ofs, len);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(addr, KM_USER0);
 		if (unlikely(copied != len)) {
 			/* Do it the slow way. */
-			kaddr = kmap(*pages);
-			copied = __ntfs_copy_from_user_iovec_inatomic(kaddr + ofs,
+			addr = kmap(*pages);
+			copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
 					*iov, *iov_ofs, len);
 			/*
 			 * Zero the rest of the target like __copy_from_user().
 			 */
-			memset(kaddr + ofs + copied, 0, len - copied);
+			memset(addr + ofs + copied, 0, len - copied);
 			kunmap(*pages);
 			if (unlikely(copied != len))
 				goto err_out;
@@ -1735,8 +1735,6 @@ static int ntfs_commit_pages_after_write(struct page **pages,
 	read_unlock_irqrestore(&ni->size_lock, flags);
 	BUG_ON(initialized_size != i_size);
 	if (end > initialized_size) {
-		unsigned long flags;
-
 		write_lock_irqsave(&ni->size_lock, flags);
 		ni->initialized_size = end;
 		i_size_write(vi, end);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index b532a730cec..e9da092e277 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -34,7 +34,6 @@
 #include "dir.h"
 #include "debug.h"
 #include "inode.h"
-#include "attrib.h"
 #include "lcnalloc.h"
 #include "malloc.h"
 #include "mft.h"
@@ -2500,8 +2499,6 @@ retry_truncate:
 	/* Resize the attribute record to best fit the new attribute size. */
 	if (new_size < vol->mft_record_size &&
 			!ntfs_resident_attr_value_resize(m, a, new_size)) {
-		unsigned long flags;
-
 		/* The resize succeeded! */
 		flush_dcache_mft_record_page(ctx->ntfs_ino);
 		mark_mft_record_dirty(ctx->ntfs_ino);
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index acfed325f4e..d7932e95b1f 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -1,7 +1,7 @@
 /*
  * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2002-2005 Anton Altaparmakov
+ * Copyright (c) 2002-2007 Anton Altaparmakov
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -724,24 +724,139 @@ bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
  */
 bool ntfs_empty_logfile(struct inode *log_vi)
 {
-	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+	VCN vcn, end_vcn;
+	ntfs_inode *log_ni = NTFS_I(log_vi);
+	ntfs_volume *vol = log_ni->vol;
+	struct super_block *sb = vol->sb;
+	runlist_element *rl;
+	unsigned long flags;
+	unsigned block_size, block_size_bits;
+	int err;
+	bool should_wait = true;
 
 	ntfs_debug("Entering.");
-	if (!NVolLogFileEmpty(vol)) {
-		int err;
-		
-		err = ntfs_attr_set(NTFS_I(log_vi), 0, i_size_read(log_vi),
-				0xff);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to fill $LogFile with "
-					"0xff bytes (error code %i).", err);
-			return false;
-		}
-		/* Set the flag so we do not have to do it again on remount. */
-		NVolSetLogFileEmpty(vol);
+	if (NVolLogFileEmpty(vol)) {
+		ntfs_debug("Done.");
+		return true;
 	}
+	/*
+	 * We cannot use ntfs_attr_set() because we may be still in the middle
+	 * of a mount operation.  Thus we do the emptying by hand by first
+	 * zapping the page cache pages for the $LogFile/$DATA attribute and
+	 * then emptying each of the buffers in each of the clusters specified
+	 * by the runlist by hand.
+	 */
+	block_size = sb->s_blocksize;
+	block_size_bits = sb->s_blocksize_bits;
+	vcn = 0;
+	read_lock_irqsave(&log_ni->size_lock, flags);
+	end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
+			vol->cluster_size_bits;
+	read_unlock_irqrestore(&log_ni->size_lock, flags);
+	truncate_inode_pages(log_vi->i_mapping, 0);
+	down_write(&log_ni->runlist.lock);
+	rl = log_ni->runlist.rl;
+	if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
+map_vcn:
+		err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
+		if (err) {
+			ntfs_error(sb, "Failed to map runlist fragment (error "
+					"%d).", -err);
+			goto err;
+		}
+		rl = log_ni->runlist.rl;
+		BUG_ON(!rl || vcn < rl->vcn || !rl->length);
+	}
+	/* Seek to the runlist element containing @vcn. */
+	while (rl->length && vcn >= rl[1].vcn)
+		rl++;
+	do {
+		LCN lcn;
+		sector_t block, end_block;
+		s64 len;
+
+		/*
+		 * If this run is not mapped map it now and start again as the
+		 * runlist will have been updated.
+		 */
+		lcn = rl->lcn;
+		if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
+			vcn = rl->vcn;
+			goto map_vcn;
+		}
+		/* If this run is not valid abort with an error. */
+		if (unlikely(!rl->length || lcn < LCN_HOLE))
+			goto rl_err;
+		/* Skip holes. */
+		if (lcn == LCN_HOLE)
+			continue;
+		block = lcn << vol->cluster_size_bits >> block_size_bits;
+		len = rl->length;
+		if (rl[1].vcn > end_vcn)
+			len = end_vcn - rl->vcn;
+		end_block = (lcn + len) << vol->cluster_size_bits >>
+				block_size_bits;
+		/* Iterate over the blocks in the run and empty them. */
+		do {
+			struct buffer_head *bh;
+
+			/* Obtain the buffer, possibly not uptodate. */
+			bh = sb_getblk(sb, block);
+			BUG_ON(!bh);
+			/* Setup buffer i/o submission. */
+			lock_buffer(bh);
+			bh->b_end_io = end_buffer_write_sync;
+			get_bh(bh);
+			/* Set the entire contents of the buffer to 0xff. */
+			memset(bh->b_data, -1, block_size);
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+			if (buffer_dirty(bh))
+				clear_buffer_dirty(bh);
+			/*
+			 * Submit the buffer and wait for i/o to complete but
+			 * only for the first buffer so we do not miss really
+			 * serious i/o errors.  Once the first buffer has
+			 * completed ignore errors afterwards as we can assume
+			 * that if one buffer worked all of them will work.
+			 */
+			submit_bh(WRITE, bh);
+			if (should_wait) {
+				should_wait = false;
+				wait_on_buffer(bh);
+				if (unlikely(!buffer_uptodate(bh)))
+					goto io_err;
+			}
+			brelse(bh);
+		} while (++block < end_block);
+	} while ((++rl)->vcn < end_vcn);
+	up_write(&log_ni->runlist.lock);
+	/*
+	 * Zap the pages again just in case any got instantiated whilst we were
+	 * emptying the blocks by hand.  FIXME: We may not have completed
+	 * writing to all the buffer heads yet so this may happen too early.
+	 * We really should use a kernel thread to do the emptying
+	 * asynchronously and then we can also set the volume dirty and output
+	 * an error message if emptying should fail.
+	 */
+	truncate_inode_pages(log_vi->i_mapping, 0);
+	/* Set the flag so we do not have to do it again on remount. */
+	NVolSetLogFileEmpty(vol);
 	ntfs_debug("Done.");
 	return true;
+io_err:
+	ntfs_error(sb, "Failed to write buffer.  Unmount and run chkdsk.");
+	goto dirty_err;
+rl_err:
+	ntfs_error(sb, "Runlist is corrupt.  Unmount and run chkdsk.");
+dirty_err:
+	NVolSetErrors(vol);
+	err = -EIO;
+err:
+	up_write(&log_ni->runlist.lock);
+	ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
+			-err);
+	return false;
 }
 
 #endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 9afd72c7ad0..56a9a6d25a2 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1,7 +1,7 @@
 /**
  * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2001-2007 Anton Altaparmakov
  * Copyright (c) 2002-2005 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -1714,7 +1714,7 @@ extend_hole:
 					sizeof(*rl));
 		/* Adjust the beginning of the tail if necessary. */
 		if (end > rl->vcn) {
-			s64 delta = end - rl->vcn;
+			delta = end - rl->vcn;
 			rl->vcn = end;
 			rl->length -= delta;
 			/* Only adjust the lcn if it is real. */
-- 
cgit v1.2.3


From c77534f6fb6d58e27d923b6825ea3b0ef485ab26 Mon Sep 17 00:00:00 2001
From: Tao Mao <tao.ma@oracle.com>
Date: Tue, 28 Aug 2007 17:22:33 -0700
Subject: ocfs2: remove mostly unused field from insert structure

ocfs2_insert_type->ins_free_records was only used in one place, and was set
incorrectly in most places. We can free up some memory and lose some code by
removing this.

* Small warning fixup contributed by Andrew Mortom <akpm@linux-foundation.org>

Signed-off-by: Tao Mao <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 29 ++++++-----------------------
 1 file changed, 6 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 778a850b463..33d5cab5e69 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -354,7 +354,6 @@ struct ocfs2_insert_type {
 	enum ocfs2_append_type	ins_appending;
 	enum ocfs2_contig_type	ins_contig;
 	int			ins_contig_index;
-	int			ins_free_records;
 	int			ins_tree_depth;
 };
 
@@ -3593,6 +3592,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 				    struct buffer_head *di_bh,
 				    struct buffer_head **last_eb_bh,
 				    struct ocfs2_extent_rec *insert_rec,
+				    int *free_records,
 				    struct ocfs2_insert_type *insert)
 {
 	int ret;
@@ -3633,7 +3633,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 	 * XXX: This test is simplistic, we can search for empty
 	 * extent records too.
 	 */
-	insert->ins_free_records = le16_to_cpu(el->l_count) -
+	*free_records = le16_to_cpu(el->l_count) -
 		le16_to_cpu(el->l_next_free_rec);
 
 	if (!insert->ins_tree_depth) {
@@ -3730,6 +3730,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
+	int uninitialized_var(free_records);
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
@@ -3752,7 +3753,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	rec.e_flags = flags;
 
 	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
-					  &insert);
+					  &free_records, &insert);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -3762,9 +3763,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	     "Insert.contig_index: %d, Insert.free_records: %d, "
 	     "Insert.tree_depth: %d\n",
 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
-	     insert.ins_free_records, insert.ins_tree_depth);
+	     free_records, insert.ins_tree_depth);
 
-	if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
+	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
 		status = ocfs2_grow_tree(inode, handle, fe_bh,
 					 &insert.ins_tree_depth, &last_eb_bh,
 					 meta_ac);
@@ -3847,26 +3848,17 @@ leftright:
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		int old_depth = depth;
-
 		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
 				      meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-
-		if (old_depth != depth) {
-			eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
-			rightmost_el = &eb->h_list;
-		}
 	}
 
 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
 	insert.ins_appending = APPEND_NONE;
 	insert.ins_contig = CONTIG_NONE;
-	insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
-		- le16_to_cpu(rightmost_el->l_next_free_rec);
 	insert.ins_tree_depth = depth;
 
 	insert_range = le32_to_cpu(split_rec.e_cpos) +
@@ -4180,27 +4172,18 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
 
 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
 	    le16_to_cpu(rightmost_el->l_count)) {
-		int old_depth = depth;
-
 		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
 				      meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-
-		if (old_depth != depth) {
-			eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
-			rightmost_el = &eb->h_list;
-		}
 	}
 
 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
 	insert.ins_appending = APPEND_NONE;
 	insert.ins_contig = CONTIG_NONE;
 	insert.ins_split = SPLIT_RIGHT;
-	insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
-		- le16_to_cpu(rightmost_el->l_next_free_rec);
 	insert.ins_tree_depth = depth;
 
 	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
-- 
cgit v1.2.3


From 518d7269f3c9129ae51d5f804edff998ab945a40 Mon Sep 17 00:00:00 2001
From: Tao Mao <tao.ma@oracle.com>
Date: Tue, 28 Aug 2007 17:25:35 -0700
Subject: ocfs2: remove unused variable

delete_tail_recs in ocfs2_try_to_merge_extent() was only ever set, remove
it.

Signed-off-by: Tao Mao <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 33d5cab5e69..c078b3b1f01 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2807,36 +2807,28 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 				     struct ocfs2_merge_ctxt *ctxt)
 
 {
-	int ret = 0, delete_tail_recs = 0;
+	int ret = 0;
 	struct ocfs2_extent_list *el = path_leaf_el(left_path);
 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
 
 	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
 
-	if (ctxt->c_split_covers_rec) {
-		delete_tail_recs++;
-
-		if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
-		    ctxt->c_has_empty_extent)
-			delete_tail_recs++;
-
-		if (ctxt->c_has_empty_extent) {
-			/*
-			 * The merge code will need to create an empty
-			 * extent to take the place of the newly
-			 * emptied slot. Remove any pre-existing empty
-			 * extents - having more than one in a leaf is
-			 * illegal.
-			 */
-			ret = ocfs2_rotate_tree_left(inode, handle, left_path,
-						     dealloc);
-			if (ret) {
-				mlog_errno(ret);
-				goto out;
-			}
-			split_index--;
-			rec = &el->l_recs[split_index];
+	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
+		/*
+		 * The merge code will need to create an empty
+		 * extent to take the place of the newly
+		 * emptied slot. Remove any pre-existing empty
+		 * extents - having more than one in a leaf is
+		 * illegal.
+		 */
+		ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+					     dealloc);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
 		}
+		split_index--;
+		rec = &el->l_recs[split_index];
 	}
 
 	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
-- 
cgit v1.2.3


From 015452b15ff6c2d9fa1f82f28d61e7a66e2df86a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 12 Sep 2007 10:21:22 -0700
Subject: ocfs2: Remove unused structure field

c_used_tail_recs in struct ocfs2_merge_ctxt is only ever set, so we can
remove it.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index c078b3b1f01..c91706f949f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -361,7 +361,6 @@ struct ocfs2_merge_ctxt {
 	enum ocfs2_contig_type	c_contig_type;
 	int			c_has_empty_extent;
 	int			c_split_covers_rec;
-	int			c_used_tail_recs;
 };
 
 /*
@@ -3999,11 +3998,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	} else
 		rightmost_el = path_root_el(path);
 
-	ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
-	if (ctxt.c_used_tail_recs > 0 &&
-	    ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
-		ctxt.c_used_tail_recs--;
-
 	if (rec->e_cpos == split_rec->e_cpos &&
 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
 		ctxt.c_split_covers_rec = 1;
@@ -4012,10 +4006,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 
 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
 
-	mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
-	     "has_empty: %u, split_covers: %u\n", split_index,
-	     ctxt.c_contig_type, ctxt.c_used_tail_recs,
-	     ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
+	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
+	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
+	     ctxt.c_split_covers_rec);
 
 	if (ctxt.c_contig_type == CONTIG_NONE) {
 		if (ctxt.c_split_covers_rec)
-- 
cgit v1.2.3


From 19b613d41051296be628581e7e21b847e9eaba80 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 4 Oct 2007 14:47:09 -0700
Subject: ocfs2: Clear slot map when umounting a local volume

This is technically harmless (recovery will clean it out later), but leaves
a bogus entry in the slot_map which really shouldn't be there.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c034b5129c1..19436d1ff57 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1209,12 +1209,13 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 		tmp = ocfs2_request_umount_vote(osb);
 		if (tmp < 0)
 			mlog_errno(tmp);
+	}
 
-		if (osb->slot_num != OCFS2_INVALID_SLOT)
-			ocfs2_put_slot(osb);
+	if (osb->slot_num != OCFS2_INVALID_SLOT)
+		ocfs2_put_slot(osb);
 
+	if (osb->dlm)
 		ocfs2_super_unlock(osb, 1);
-	}
 
 	ocfs2_release_system_inodes(osb);
 
-- 
cgit v1.2.3


From d550071c03f129a60dfad60d23dab73f894129a9 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 6 Sep 2007 13:34:16 -0700
Subject: ocfs2: Implement show_options()

Implement sops->show_options() so as to allow /proc/mounts to show the mount
options.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 19436d1ff57..e5ac0714dab 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -39,6 +39,7 @@
 #include <linux/parser.h>
 #include <linux/crc32.h>
 #include <linux/debugfs.h>
+#include <linux/mount.h>
 
 #include <cluster/nodemanager.h>
 
@@ -91,6 +92,7 @@ struct mount_options
 static int ocfs2_parse_options(struct super_block *sb, char *options,
 			       struct mount_options *mopt,
 			       int is_remount);
+static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
 static void ocfs2_put_super(struct super_block *sb);
 static int ocfs2_mount_volume(struct super_block *sb);
 static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -133,6 +135,7 @@ static const struct super_operations ocfs2_sops = {
 	.write_super	= ocfs2_write_super,
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
+	.show_options   = ocfs2_show_options,
 };
 
 enum {
@@ -830,6 +833,41 @@ bail:
 	return status;
 }
 
+static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+	struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
+	unsigned long opts = osb->s_mount_opt;
+
+	if (opts & OCFS2_MOUNT_HB_LOCAL)
+		seq_printf(s, ",_netdev,heartbeat=local");
+	else
+		seq_printf(s, ",heartbeat=none");
+
+	if (opts & OCFS2_MOUNT_NOINTR)
+		seq_printf(s, ",nointr");
+
+	if (opts & OCFS2_MOUNT_DATA_WRITEBACK)
+		seq_printf(s, ",data=writeback");
+	else
+		seq_printf(s, ",data=ordered");
+
+	if (opts & OCFS2_MOUNT_BARRIER)
+		seq_printf(s, ",barrier=1");
+
+	if (opts & OCFS2_MOUNT_ERRORS_PANIC)
+		seq_printf(s, ",errors=panic");
+	else
+		seq_printf(s, ",errors=remount-ro");
+
+	if (osb->preferred_slot != OCFS2_INVALID_SLOT)
+		seq_printf(s, ",preferred_slot=%d", osb->preferred_slot);
+
+	if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM)
+		seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum);
+
+	return 0;
+}
+
 static int __init ocfs2_init(void)
 {
 	int status;
-- 
cgit v1.2.3


From bddb8eb37f1460fc19e1af16010c9ad4ca3717a6 Mon Sep 17 00:00:00 2001
From: Denis Cheng <crquan@gmail.com>
Date: Thu, 27 Sep 2007 02:10:04 +0800
Subject: [PATCH] fs/ocfs2/: removed unneeded initial value and function's
 return value

Signed-off-by: Denis Cheng <crquan@gmail.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/super.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e5ac0714dab..0e2a1b45bf9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -107,7 +107,7 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
 
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
 static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
-static int ocfs2_release_system_inodes(struct ocfs2_super *osb);
+static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
 static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
 static int ocfs2_check_volume(struct ocfs2_super *osb);
 static int ocfs2_verify_volume(struct ocfs2_dinode *di,
@@ -180,7 +180,7 @@ static void ocfs2_write_super(struct super_block *sb)
 
 static int ocfs2_sync_fs(struct super_block *sb, int wait)
 {
-	int status = 0;
+	int status;
 	tid_t target;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
@@ -278,9 +278,9 @@ bail:
 	return status;
 }
 
-static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
+static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 {
-	int status = 0, i;
+	int i;
 	struct inode *inode;
 
 	mlog_entry_void();
@@ -305,8 +305,7 @@ static int ocfs2_release_system_inodes(struct ocfs2_super *osb)
 		osb->root_inode = NULL;
 	}
 
-	mlog_exit(status);
-	return status;
+	mlog_exit(0);
 }
 
 /* We're allocating fs objects, use GFP_NOFS */
@@ -456,7 +455,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
 			  struct buffer_head **bh,
 			  int *sector_size)
 {
-	int status = 0, tmpstat;
+	int status, tmpstat;
 	struct ocfs1_vol_disk_hdr *hdr;
 	struct ocfs2_dinode *di;
 	int blksize;
@@ -1314,7 +1313,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 				  struct buffer_head *bh,
 				  int sector_size)
 {
-	int status = 0;
+	int status;
 	int i, cbits, bbits;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 	struct inode *inode = NULL;
@@ -1635,7 +1634,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 
 static int ocfs2_check_volume(struct ocfs2_super *osb)
 {
-	int status = 0;
+	int status;
 	int dirty;
 	int local;
 	struct ocfs2_dinode *local_alloc = NULL; /* only used if we
-- 
cgit v1.2.3


From 92e91ce2a30b2af53ebf077512801dc01e75cca5 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 20 Sep 2007 11:19:20 -0700
Subject: ocfs2: Sync ocfs2_fs.h with ocfs2-tools

ocfs2-tools added some on-disk fields and flags which are used by
tunefs.ocfs2.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/ocfs2_fs.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 82f8a75b207..bf10a545383 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -110,6 +110,17 @@
 /* Support for sparse allocation in b-trees */
 #define OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC	0x0010
 
+/*
+ * Tunefs sets this incompat flag before starting an operation which
+ * would require cleanup on abort. This is done to protect users from
+ * inadvertently mounting the fs after an aborted run without
+ * fsck-ing.
+ *
+ * s_tunefs_flags on the super block describes precisely which
+ * operations were in progress.
+ */
+#define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG	0x0020
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -129,6 +140,11 @@
 /* the max backup superblock nums */
 #define OCFS2_MAX_BACKUP_SUPERBLOCKS	6
 
+/*
+ * Flags on ocfs2_super_block.s_tunefs_flags
+ */
+#define OCFS2_TUNEFS_INPROG_REMOVE_SLOT		0x0001	/* Removing slots */
+
 /*
  * Flags on ocfs2_dinode.i_flags
  */
@@ -447,8 +463,8 @@ struct ocfs2_super_block {
 	__le32 s_clustersize_bits;	/* Clustersize for this fs */
 /*40*/	__le16 s_max_slots;		/* Max number of simultaneous mounts
 					   before tunefs required */
-	__le16 s_reserved1;
-	__le32 s_reserved2;
+	__le16 s_tunefs_flag;
+	__le32 s_reserved1;
 	__le64 s_first_cluster_group;	/* Block offset of 1st cluster
 					 * group header */
 /*50*/	__u8  s_label[OCFS2_MAX_VOL_LABEL_LEN];	/* Label for mounting, etc. */
-- 
cgit v1.2.3


From 65ed39d6ca78f07d2958814e08440e4264b6b488 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 28 Aug 2007 17:13:23 -0700
Subject: ocfs2: move nonsparse hole-filling into ocfs2_write_begin()

By doing this, we can remove any higher level logic which has to have
knowledge of btree functionality - any callers of ocfs2_write_begin() can
now expect it to do anything necessary to prepare the inode for new data.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c |  40 +++++++++--
 fs/ocfs2/file.c | 217 ++++++++++++++++++++------------------------------------
 fs/ocfs2/file.h |   2 +
 3 files changed, 115 insertions(+), 144 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f37f25c931f..fae07672eb1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -301,12 +301,8 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 {
 	int ret;
 
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-
 	ret = block_prepare_write(page, from, to, ocfs2_get_block);
 
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
 	return ret;
 }
 
@@ -1360,6 +1356,36 @@ out:
 	return ret;
 }
 
+/*
+ * This function only does anything for file systems which can't
+ * handle sparse files.
+ *
+ * What we want to do here is fill in any hole between the current end
+ * of allocation and the end of our write. That way the rest of the
+ * write path can treat it as an non-allocating write, which has no
+ * special case code for sparse/nonsparse files.
+ */
+static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
+					unsigned len,
+					struct ocfs2_write_ctxt *wc)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t newsize = pos + len;
+
+	if (ocfs2_sparse_alloc(osb))
+		return 0;
+
+	if (newsize <= i_size_read(inode))
+		return 0;
+
+	ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+	if (ret)
+		mlog_errno(ret);
+
+	return ret;
+}
+
 int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
@@ -1381,6 +1407,12 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		return ret;
 	}
 
+	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
 					&extents_to_split);
 	if (ret) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f3bc3658e7a..781ba6c4ef8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -779,25 +779,6 @@ leave:
 	return status;
 }
 
-static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
-				   u32 clusters_to_add, int mark_unwritten)
-{
-	int ret;
-
-	/*
-	 * The alloc sem blocks peope in read/write from reading our
-	 * allocation until we're done changing it. We depend on
-	 * i_mutex to block other extend/truncate calls while we're
-	 * here.
-	 */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-	ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
-					mark_unwritten);
-	up_write(&OCFS2_I(inode)->ip_alloc_sem);
-
-	return ret;
-}
-
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->prepare_write() and
@@ -889,25 +870,47 @@ out:
 	return ret;
 }
 
-/* 
- * A tail_to_skip value > 0 indicates that we're being called from
- * ocfs2_file_aio_write(). This has the following implications:
- *
- * - we don't want to update i_size
- * - di_bh will be NULL, which is fine because it's only used in the
- *   case where we want to update i_size.
- * - ocfs2_zero_extend() will then only be filling the hole created
- *   between i_size and the start of the write.
- */
+int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+{
+	int ret;
+	u32 clusters_to_add;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
+	if (clusters_to_add < oi->ip_clusters)
+		clusters_to_add = 0;
+	else
+		clusters_to_add -= oi->ip_clusters;
+
+	if (clusters_to_add) {
+		ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
+						clusters_to_add, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Call this even if we don't add any clusters to the tree. We
+	 * still need to zero the area between the old i_size and the
+	 * new i_size.
+	 */
+	ret = ocfs2_zero_extend(inode, zero_to);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
 static int ocfs2_extend_file(struct inode *inode,
 			     struct buffer_head *di_bh,
-			     u64 new_i_size,
-			     size_t tail_to_skip)
+			     u64 new_i_size)
 {
 	int ret = 0;
-	u32 clusters_to_add = 0;
 
-	BUG_ON(!tail_to_skip && !di_bh);
+	BUG_ON(!di_bh);
 
 	/* setattr sometimes calls us like this. */
 	if (new_i_size == 0)
@@ -917,13 +920,8 @@ static int ocfs2_extend_file(struct inode *inode,
   		goto out;
 	BUG_ON(new_i_size < i_size_read(inode));
 
-	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-		BUG_ON(tail_to_skip != 0);
+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
 		goto out_update_size;
-	}
-
-	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
-		OCFS2_I(inode)->ip_clusters;
 
 	/* 
 	 * protect the pages that ocfs2_zero_extend is going to be
@@ -938,35 +936,25 @@ static int ocfs2_extend_file(struct inode *inode,
 		goto out;
 	}
 
-	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode,
-					      OCFS2_I(inode)->ip_clusters,
-					      clusters_to_add, 0);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out_unlock;
-		}
-	}
-
 	/*
-	 * Call this even if we don't add any clusters to the tree. We
-	 * still need to zero the area between the old i_size and the
-	 * new i_size.
+	 * The alloc sem blocks people in read/write from reading our
+	 * allocation until we're done changing it. We depend on
+	 * i_mutex to block other extend/truncate calls while we're
+	 * here.
 	 */
-	ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
 	}
 
 out_update_size:
-	if (!tail_to_skip) {
-		/* We're being called from ocfs2_setattr() which wants
-		 * us to update i_size */
-		ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
-		if (ret < 0)
-			mlog_errno(ret);
-	}
+	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+	if (ret < 0)
+		mlog_errno(ret);
 
 out_unlock:
 	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
@@ -1035,7 +1023,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		if (i_size_read(inode) > attr->ia_size)
 			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
 		else
-			status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
+			status = ocfs2_extend_file(inode, bh, attr->ia_size);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
@@ -1713,15 +1701,13 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 int appending,
 					 int *direct_io)
 {
-	int ret = 0, meta_level = appending;
+	int ret = 0, meta_level = 0;
 	struct inode *inode = dentry->d_inode;
-	u32 clusters;
-	loff_t newsize, saved_pos;
+	loff_t saved_pos, end;
 
 	/* 
-	 * We sample i_size under a read level meta lock to see if our write
-	 * is extending the file, if it is we back off and get a write level
-	 * meta lock.
+	 * We start with a read level meta lock and only jump to an ex
+	 * if we need to make modifications here.
 	 */
 	for(;;) {
 		ret = ocfs2_meta_lock(inode, NULL, meta_level);
@@ -1763,87 +1749,38 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 			saved_pos = *ppos;
 		}
 
-		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
-			loff_t end = saved_pos + count;
-
-			/*
-			 * Skip the O_DIRECT checks if we don't need
-			 * them.
-			 */
-			if (!direct_io || !(*direct_io))
-				break;
-
-			/*
-			 * Allowing concurrent direct writes means
-			 * i_size changes wouldn't be synchronized, so
-			 * one node could wind up truncating another
-			 * nodes writes.
-			 */
-			if (end > i_size_read(inode)) {
-				*direct_io = 0;
-				break;
-			}
+		end = saved_pos + count;
 
-			/*
-			 * We don't fill holes during direct io, so
-			 * check for them here. If any are found, the
-			 * caller will have to retake some cluster
-			 * locks and initiate the io as buffered.
-			 */
-			ret = ocfs2_check_range_for_holes(inode, saved_pos,
-							  count);
-			if (ret == 1) {
-				*direct_io = 0;
-				ret = 0;
-			} else if (ret < 0)
-				mlog_errno(ret);
+		/*
+		 * Skip the O_DIRECT checks if we don't need
+		 * them.
+		 */
+		if (!direct_io || !(*direct_io))
 			break;
-		}
 
 		/*
-		 * The rest of this loop is concerned with legacy file
-		 * systems which don't support sparse files.
+		 * Allowing concurrent direct writes means
+		 * i_size changes wouldn't be synchronized, so
+		 * one node could wind up truncating another
+		 * nodes writes.
 		 */
-
-		newsize = count + saved_pos;
-
-		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
-		     (long long) saved_pos, (long long) newsize,
-		     (long long) i_size_read(inode));
-
-		/* No need for a higher level metadata lock if we're
-		 * never going past i_size. */
-		if (newsize <= i_size_read(inode))
+		if (end > i_size_read(inode)) {
+			*direct_io = 0;
 			break;
-
-		if (meta_level == 0) {
-			ocfs2_meta_unlock(inode, meta_level);
-			meta_level = 1;
-			continue;
 		}
 
-		spin_lock(&OCFS2_I(inode)->ip_lock);
-		clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
-			OCFS2_I(inode)->ip_clusters;
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-		mlog(0, "Writing at EOF, may need more allocation: "
-		     "i_size = %lld, newsize = %lld, need %u clusters\n",
-		     (long long) i_size_read(inode), (long long) newsize,
-		     clusters);
-
-		/* We only want to continue the rest of this loop if
-		 * our extend will actually require more
-		 * allocation. */
-		if (!clusters)
-			break;
-
-		ret = ocfs2_extend_file(inode, NULL, newsize, count);
-		if (ret < 0) {
-			if (ret != -ENOSPC)
-				mlog_errno(ret);
-			goto out_unlock;
-		}
+		/*
+		 * We don't fill holes during direct io, so
+		 * check for them here. If any are found, the
+		 * caller will have to retake some cluster
+		 * locks and initiate the io as buffered.
+		 */
+		ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
+		if (ret == 1) {
+			*direct_io = 0;
+			ret = 0;
+		} else if (ret < 0)
+			mlog_errno(ret);
 		break;
 	}
 
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 36fe27f268e..066f14add3a 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -47,6 +47,8 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
+			  u64 zero_to);
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-- 
cgit v1.2.3


From 1d410a6e337a0d2d5543ad1d9bccb670a7a05312 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 7 Sep 2007 14:20:45 -0700
Subject: ocfs2: Small refactor of truncate zeroing code

We'll want to reuse most of this when pushing inline data back out to an
extent. Keeping this part as a seperate patch helps to keep the upcoming
changes for write support uncluttered.

The core portion of ocfs2_zero_cluster_pages() responsible for making sure a
page is mapped and properly dirtied is abstracted out into it's own
function, ocfs2_map_and_dirty_page(). Actual functionality doesn't change,
though zeroing becomes optional.

We also turn part of ocfs2_free_write_ctxt() into  a common function for
unlocking and freeing a page array. This operation is very common (and
uniform) for Ocfs2 cluster sizes greater than page size, so it makes sense
to keep the code in one place.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c | 151 ++++++++++++++++++++++++++-----------------------------
 fs/ocfs2/aops.c  |  20 +++++---
 fs/ocfs2/aops.h  |   2 +
 3 files changed, 86 insertions(+), 87 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index c91706f949f..c81bfdfb992 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5633,12 +5633,50 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
 	return ocfs2_journal_dirty_data(handle, bh);
 }
 
+static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+				     unsigned int from, unsigned int to,
+				     struct page *page, int zero, u64 *phys)
+{
+	int ret, partial = 0;
+
+	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
+	if (ret)
+		mlog_errno(ret);
+
+	if (zero)
+		zero_user_page(page, from, to - from, KM_USER0);
+
+	/*
+	 * Need to set the buffers we zero'd into uptodate
+	 * here if they aren't - ocfs2_map_page_blocks()
+	 * might've skipped some
+	 */
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle,
+					page_buffers(page),
+					from, to, &partial,
+					ocfs2_ordered_zero_func);
+		if (ret < 0)
+			mlog_errno(ret);
+	} else {
+		ret = walk_page_buffers(handle, page_buffers(page),
+					from, to, &partial,
+					ocfs2_writeback_zero_func);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	if (!partial)
+		SetPageUptodate(page);
+
+	flush_dcache_page(page);
+}
+
 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
 				     loff_t end, struct page **pages,
 				     int numpages, u64 phys, handle_t *handle)
 {
-	int i, ret, partial = 0;
-	void *kaddr;
+	int i;
 	struct page *page;
 	unsigned int from, to = PAGE_CACHE_SIZE;
 	struct super_block *sb = inode->i_sb;
@@ -5659,87 +5697,31 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
 		BUG_ON(from > PAGE_CACHE_SIZE);
 		BUG_ON(to > PAGE_CACHE_SIZE);
 
-		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
-		if (ret)
-			mlog_errno(ret);
-
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + from, 0, to - from);
-		kunmap_atomic(kaddr, KM_USER0);
-
-		/*
-		 * Need to set the buffers we zero'd into uptodate
-		 * here if they aren't - ocfs2_map_page_blocks()
-		 * might've skipped some
-		 */
-		if (ocfs2_should_order_data(inode)) {
-			ret = walk_page_buffers(handle,
-						page_buffers(page),
-						from, to, &partial,
-						ocfs2_ordered_zero_func);
-			if (ret < 0)
-				mlog_errno(ret);
-		} else {
-			ret = walk_page_buffers(handle, page_buffers(page),
-						from, to, &partial,
-						ocfs2_writeback_zero_func);
-			if (ret < 0)
-				mlog_errno(ret);
-		}
-
-		if (!partial)
-			SetPageUptodate(page);
-
-		flush_dcache_page(page);
+		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
+					 &phys);
 
 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
 	}
 out:
-	if (pages) {
-		for (i = 0; i < numpages; i++) {
-			page = pages[i];
-			unlock_page(page);
-			mark_page_accessed(page);
-			page_cache_release(page);
-		}
-	}
+	if (pages)
+		ocfs2_unlock_and_free_pages(pages, numpages);
 }
 
 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
-				struct page **pages, int *num, u64 *phys)
+				struct page **pages, int *num)
 {
-	int i, numpages = 0, ret = 0;
-	unsigned int ext_flags;
+	int numpages, ret = 0;
 	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index;
 	loff_t last_page_bytes;
 
-	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
 	BUG_ON(start > end);
 
-	if (start == end)
-		goto out;
-
 	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
 	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
 
-	ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
-					  phys, NULL, &ext_flags);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	/* Tail is a hole. */
-	if (*phys == 0)
-		goto out;
-
-	/* Tail is marked as unwritten, we can count on write to zero
-	 * in that case. */
-	if (ext_flags & OCFS2_EXT_UNWRITTEN)
-		goto out;
-
+	numpages = 0;
 	last_page_bytes = PAGE_ALIGN(end);
 	index = start >> PAGE_CACHE_SHIFT;
 	do {
@@ -5756,14 +5738,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
 
 out:
 	if (ret != 0) {
-		if (pages) {
-			for (i = 0; i < numpages; i++) {
-				if (pages[i]) {
-					unlock_page(pages[i]);
-					page_cache_release(pages[i]);
-				}
-			}
-		}
+		if (pages)
+			ocfs2_unlock_and_free_pages(pages, numpages);
 		numpages = 0;
 	}
 
@@ -5784,18 +5760,20 @@ out:
 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
 				  u64 range_start, u64 range_end)
 {
-	int ret, numpages;
+	int ret = 0, numpages;
 	struct page **pages = NULL;
 	u64 phys;
+	unsigned int ext_flags;
+	struct super_block *sb = inode->i_sb;
 
 	/*
 	 * File systems which don't support sparse files zero on every
 	 * extend.
 	 */
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
 		return 0;
 
-	pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+	pages = kcalloc(ocfs2_pages_per_cluster(sb),
 			sizeof(struct page *), GFP_NOFS);
 	if (pages == NULL) {
 		ret = -ENOMEM;
@@ -5803,15 +5781,30 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
 		goto out;
 	}
 
-	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
-				   &numpages, &phys);
+	if (range_start == range_end)
+		goto out;
+
+	ret = ocfs2_extent_map_get_blocks(inode,
+					  range_start >> sb->s_blocksize_bits,
+					  &phys, NULL, &ext_flags);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	if (numpages == 0)
+	/*
+	 * Tail is a hole, or is marked unwritten. In either case, we
+	 * can count on read and write to return/push zero's.
+	 */
+	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
+		goto out;
+
+	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
+				   &numpages);
+	if (ret) {
+		mlog_errno(ret);
 		goto out;
+	}
 
 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
 				 numpages, phys, handle);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index fae07672eb1..8416e383197 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -830,18 +830,22 @@ struct ocfs2_write_ctxt {
 	struct ocfs2_cached_dealloc_ctxt w_dealloc;
 };
 
-static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
 {
 	int i;
 
-	for(i = 0; i < wc->w_num_pages; i++) {
-		if (wc->w_pages[i] == NULL)
-			continue;
-
-		unlock_page(wc->w_pages[i]);
-		mark_page_accessed(wc->w_pages[i]);
-		page_cache_release(wc->w_pages[i]);
+	for(i = 0; i < num_pages; i++) {
+		if (pages[i]) {
+			unlock_page(pages[i]);
+			mark_page_accessed(pages[i]);
+			page_cache_release(pages[i]);
+		}
 	}
+}
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 
 	brelse(wc->w_di_bh);
 	kfree(wc);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 389579bd64e..b4fa37d40db 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -34,6 +34,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 			  struct inode *inode, unsigned int from,
 			  unsigned int to, int new);
 
+void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages);
+
 int walk_page_buffers(	handle_t *handle,
 			struct buffer_head *head,
 			unsigned from,
-- 
cgit v1.2.3


From 316f4b9f98a353ac1be93199694fd97272378815 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 7 Sep 2007 18:21:26 -0700
Subject: ocfs2: Move directory manipulation code into dir.c

The code for adding, removing, deleting directory entries was splattered all
over namei.c. I'd rather have this all centralized, so that it's easier to
make changes for inline dir data, and eventually indexed directories.

None of the code in any of the functions was changed. I only removed the
static keyword from some prototypes so that they could be exported.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c     | 430 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/dir.h     |  44 +++++-
 fs/ocfs2/journal.c |   2 +-
 fs/ocfs2/namei.c   | 433 -----------------------------------------------------
 fs/ocfs2/namei.h   |  19 ---
 5 files changed, 461 insertions(+), 467 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 0d5fdde959c..8e0ae022b2e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -55,10 +55,16 @@
 #include "journal.h"
 #include "namei.h"
 #include "suballoc.h"
+#include "super.h"
 #include "uptodate.h"
 
 #include "buffer_head_io.h"
 
+#define NAMEI_RA_CHUNKS  2
+#define NAMEI_RA_BLOCKS  4
+#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+
 static unsigned char ocfs2_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
@@ -67,6 +73,347 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct inode *dir,
 			    struct buffer_head *parent_fe_bh,
 			    struct buffer_head **new_de_bh);
+static int ocfs2_do_extend_dir(struct super_block *sb,
+			       handle_t *handle,
+			       struct inode *dir,
+			       struct buffer_head *parent_fe_bh,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct buffer_head **new_bh);
+
+int ocfs2_check_dir_entry(struct inode * dir,
+			  struct ocfs2_dir_entry * de,
+			  struct buffer_head * bh,
+			  unsigned long offset)
+{
+	const char *error_msg = NULL;
+	const int rlen = le16_to_cpu(de->rec_len);
+
+	if (rlen < OCFS2_DIR_REC_LEN(1))
+		error_msg = "rec_len is smaller than minimal";
+	else if (rlen % 4 != 0)
+		error_msg = "rec_len % 4 != 0";
+	else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
+		error_msg = "rec_len is too small for name_len";
+	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+		error_msg = "directory entry across blocks";
+
+	if (error_msg != NULL)
+		mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
+		     "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
+		     (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
+		     offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
+		     de->name_len);
+	return error_msg == NULL ? 1 : 0;
+}
+
+static inline int ocfs2_match(int len,
+			      const char * const name,
+			      struct ocfs2_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+/*
+ * Returns 0 if not found, -1 on failure, and 1 on success
+ */
+static int inline ocfs2_search_dirblock(struct buffer_head *bh,
+					struct inode *dir,
+					const char *name, int namelen,
+					unsigned long offset,
+					struct ocfs2_dir_entry **res_dir)
+{
+	struct ocfs2_dir_entry *de;
+	char *dlimit, *de_buf;
+	int de_len;
+	int ret = 0;
+
+	mlog_entry_void();
+
+	de_buf = bh->b_data;
+	dlimit = de_buf + dir->i_sb->s_blocksize;
+
+	while (de_buf < dlimit) {
+		/* this code is executed quadratically often */
+		/* do minimal checking `by hand' */
+
+		de = (struct ocfs2_dir_entry *) de_buf;
+
+		if (de_buf + namelen <= dlimit &&
+		    ocfs2_match(namelen, name, de)) {
+			/* found a match - just to be sure, do a full check */
+			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
+				ret = -1;
+				goto bail;
+			}
+			*res_dir = de;
+			ret = 1;
+			goto bail;
+		}
+
+		/* prevent looping on a bad block */
+		de_len = le16_to_cpu(de->rec_len);
+		if (de_len <= 0) {
+			ret = -1;
+			goto bail;
+		}
+
+		de_buf += de_len;
+		offset += de_len;
+	}
+
+bail:
+	mlog_exit(ret);
+	return ret;
+}
+
+struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir)
+{
+	struct super_block *sb;
+	struct buffer_head *bh_use[NAMEI_RA_SIZE];
+	struct buffer_head *bh, *ret = NULL;
+	unsigned long start, block, b;
+	int ra_max = 0;		/* Number of bh's in the readahead
+				   buffer, bh_use[] */
+	int ra_ptr = 0;		/* Current index into readahead
+				   buffer */
+	int num = 0;
+	int nblocks, i, err;
+
+	mlog_entry_void();
+
+	*res_dir = NULL;
+	sb = dir->i_sb;
+
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	start = OCFS2_I(dir)->ip_dir_start_lookup;
+	if (start >= nblocks)
+		start = 0;
+	block = start;
+
+restart:
+	do {
+		/*
+		 * We deal with the read-ahead logic here.
+		 */
+		if (ra_ptr >= ra_max) {
+			/* Refill the readahead buffer */
+			ra_ptr = 0;
+			b = block;
+			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+				/*
+				 * Terminate if we reach the end of the
+				 * directory and must wrap, or if our
+				 * search has finished at this block.
+				 */
+				if (b >= nblocks || (num && block == start)) {
+					bh_use[ra_max] = NULL;
+					break;
+				}
+				num++;
+
+				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh_use[ra_max] = bh;
+			}
+		}
+		if ((bh = bh_use[ra_ptr++]) == NULL)
+			goto next;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh)) {
+			/* read error, skip block & hope for the best */
+			ocfs2_error(dir->i_sb, "reading directory %llu, "
+				    "offset %lu\n",
+				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+				    block);
+			brelse(bh);
+			goto next;
+		}
+		i = ocfs2_search_dirblock(bh, dir, name, namelen,
+					  block << sb->s_blocksize_bits,
+					  res_dir);
+		if (i == 1) {
+			OCFS2_I(dir)->ip_dir_start_lookup = block;
+			ret = bh;
+			goto cleanup_and_exit;
+		} else {
+			brelse(bh);
+			if (i < 0)
+				goto cleanup_and_exit;
+		}
+	next:
+		if (++block >= nblocks)
+			block = 0;
+	} while (block != start);
+
+	/*
+	 * If the directory has grown while we were searching, then
+	 * search the last part of the directory before giving up.
+	 */
+	block = nblocks;
+	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
+	if (block < nblocks) {
+		start = 0;
+		goto restart;
+	}
+
+cleanup_and_exit:
+	/* Clean up the read-ahead blocks */
+	for (; ra_ptr < ra_max; ra_ptr++)
+		brelse(bh_use[ra_ptr]);
+
+	mlog_exit_ptr(ret);
+	return ret;
+}
+
+/*
+ * ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+int ocfs2_delete_entry(handle_t *handle,
+		       struct inode *dir,
+		       struct ocfs2_dir_entry *de_del,
+		       struct buffer_head *bh)
+{
+	struct ocfs2_dir_entry *de, *pde;
+	int i, status = -ENOENT;
+
+	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+
+	i = 0;
+	pde = NULL;
+	de = (struct ocfs2_dir_entry *) bh->b_data;
+	while (i < bh->b_size) {
+		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
+			status = -EIO;
+			mlog_errno(status);
+			goto bail;
+		}
+		if (de == de_del)  {
+			status = ocfs2_journal_access(handle, dir, bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				status = -EIO;
+				mlog_errno(status);
+				goto bail;
+			}
+			if (pde)
+				pde->rec_len =
+					cpu_to_le16(le16_to_cpu(pde->rec_len) +
+						    le16_to_cpu(de->rec_len));
+			else
+				de->inode = 0;
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, bh);
+			goto bail;
+		}
+		i += le16_to_cpu(de->rec_len);
+		pde = de;
+		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
+	}
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/* we don't always have a dentry for what we want to add, so people
+ * like orphan dir can call this instead.
+ *
+ * If you pass me insert_bh, I'll skip the search of the other dir
+ * blocks and put the record in there.
+ */
+int __ocfs2_add_entry(handle_t *handle,
+		      struct inode *dir,
+		      const char *name, int namelen,
+		      struct inode *inode, u64 blkno,
+		      struct buffer_head *parent_fe_bh,
+		      struct buffer_head *insert_bh)
+{
+	unsigned long offset;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de, *de1;
+	struct super_block *sb;
+	int retval, status;
+
+	mlog_entry_void();
+
+	sb = dir->i_sb;
+
+	if (!namelen)
+		return -EINVAL;
+
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
+	offset = 0;
+	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
+	while (1) {
+		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
+		/* These checks should've already been passed by the
+		 * prepare function, but I guess we can leave them
+		 * here anyway. */
+		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
+			retval = -ENOENT;
+			goto bail;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			retval = -EEXIST;
+			goto bail;
+		}
+		if (((le64_to_cpu(de->inode) == 0) &&
+		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
+		    (le16_to_cpu(de->rec_len) >=
+		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
+			if (retval < 0) {
+				mlog_errno(retval);
+				goto bail;
+			}
+
+			status = ocfs2_journal_access(handle, dir, insert_bh,
+						      OCFS2_JOURNAL_ACCESS_WRITE);
+			/* By now the buffer is marked for journaling */
+			offset += le16_to_cpu(de->rec_len);
+			if (le64_to_cpu(de->inode)) {
+				de1 = (struct ocfs2_dir_entry *)((char *) de +
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de1->rec_len =
+					cpu_to_le16(le16_to_cpu(de->rec_len) -
+					OCFS2_DIR_REC_LEN(de->name_len));
+				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+				de = de1;
+			}
+			de->file_type = OCFS2_FT_UNKNOWN;
+			if (blkno) {
+				de->inode = cpu_to_le64(blkno);
+				ocfs2_set_de_type(de, inode->i_mode);
+			} else
+				de->inode = 0;
+			de->name_len = namelen;
+			memcpy(de->name, name, namelen);
+
+			dir->i_version++;
+			status = ocfs2_journal_dirty(handle, insert_bh);
+			retval = 0;
+			goto bail;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
+	}
+
+	/* when you think about it, the assert above should prevent us
+	 * from ever getting here. */
+	retval = -ENOSPC;
+bail:
+
+	mlog_exit(retval);
+	return retval;
+}
+
 /*
  * ocfs2_readdir()
  *
@@ -347,14 +694,83 @@ int ocfs2_empty_dir(struct inode *inode)
 	return 1;
 }
 
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+		       handle_t *handle,
+		       struct inode *parent,
+		       struct inode *inode,
+		       struct buffer_head *fe_bh,
+		       struct ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de = NULL;
+
+	mlog_entry_void();
+
+	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
+				     data_ac, NULL, &new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+	status = ocfs2_journal_access(handle, inode, new_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
+
+	de = (struct ocfs2_dir_entry *) new_bh->b_data;
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
+				  OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	status = ocfs2_journal_dirty(handle, new_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	i_size_write(inode, inode->i_sb->s_blocksize);
+	inode->i_nlink = 2;
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	status = 0;
+bail:
+	if (new_bh)
+		brelse(new_bh);
+
+	mlog_exit(status);
+	return status;
+}
+
 /* returns a bh of the 1st new block in the allocation. */
-int ocfs2_do_extend_dir(struct super_block *sb,
-			handle_t *handle,
-			struct inode *dir,
-			struct buffer_head *parent_fe_bh,
-			struct ocfs2_alloc_context *data_ac,
-			struct ocfs2_alloc_context *meta_ac,
-			struct buffer_head **new_bh)
+static int ocfs2_do_extend_dir(struct super_block *sb,
+			       handle_t *handle,
+			       struct inode *dir,
+			       struct buffer_head *parent_fe_bh,
+			       struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct buffer_head **new_bh)
 {
 	int status;
 	int extend;
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 3f67e146864..7bf9c0a01cd 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -26,6 +26,31 @@
 #ifndef OCFS2_DIR_H
 #define OCFS2_DIR_H
 
+struct buffer_head *ocfs2_find_entry(const char *name,
+				     int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir);
+int ocfs2_delete_entry(handle_t *handle,
+		       struct inode *dir,
+		       struct ocfs2_dir_entry *de_del,
+		       struct buffer_head *bh);
+int __ocfs2_add_entry(handle_t *handle,
+		      struct inode *dir,
+		      const char *name, int namelen,
+		      struct inode *inode, u64 blkno,
+		      struct buffer_head *parent_fe_bh,
+		      struct buffer_head *insert_bh);
+static inline int ocfs2_add_entry(handle_t *handle,
+				  struct dentry *dentry,
+				  struct inode *inode, u64 blkno,
+				  struct buffer_head *parent_fe_bh,
+				  struct buffer_head *insert_bh)
+{
+	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
+				 dentry->d_name.name, dentry->d_name.len,
+				 inode, blkno, parent_fe_bh, insert_bh);
+}
+
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
@@ -44,11 +69,16 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 int namelen,
 				 struct buffer_head **ret_de_bh);
 struct ocfs2_alloc_context;
-int ocfs2_do_extend_dir(struct super_block *sb,
-			handle_t *handle,
-			struct inode *dir,
-			struct buffer_head *parent_fe_bh,
-			struct ocfs2_alloc_context *data_ac,
-			struct ocfs2_alloc_context *meta_ac,
-			struct buffer_head **new_bh);
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+		       handle_t *handle,
+		       struct inode *parent,
+		       struct inode *inode,
+		       struct buffer_head *fe_bh,
+		       struct ocfs2_alloc_context *data_ac);
+
+int ocfs2_check_dir_entry(struct inode *dir,
+			  struct ocfs2_dir_entry *de,
+			  struct buffer_head *bh,
+			  unsigned long offset);
+
 #endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dbfb20bb27e..8bbfc80e5c5 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,13 +35,13 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
 #include "localalloc.h"
-#include "namei.h"
 #include "slot_map.h"
 #include "super.h"
 #include "vote.h"
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 701e6d04ed5..aae6c0bf669 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -64,29 +64,6 @@
 
 #include "buffer_head_io.h"
 
-#define NAMEI_RA_CHUNKS  2
-#define NAMEI_RA_BLOCKS  4
-#define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
-
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
-					struct inode *dir,
-					const char *name, int namelen,
-					unsigned long offset,
-					struct ocfs2_dir_entry **res_dir);
-
-static int ocfs2_delete_entry(handle_t *handle,
-			      struct inode *dir,
-			      struct ocfs2_dir_entry *de_del,
-			      struct buffer_head *bh);
-
-static int __ocfs2_add_entry(handle_t *handle,
-			     struct inode *dir,
-			     const char *name, int namelen,
-			     struct inode *inode, u64 blkno,
-			     struct buffer_head *parent_fe_bh,
-			     struct buffer_head *insert_bh);
-
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct dentry *dentry, int mode,
@@ -97,13 +74,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac);
 
-static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
-			      handle_t *handle,
-			      struct inode *parent,
-			      struct inode *inode,
-			      struct buffer_head *fe_bh,
-			      struct ocfs2_alloc_context *data_ac);
-
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
 				    struct inode *inode,
@@ -123,17 +93,6 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 				     struct inode *inode,
 				     const char *symname);
 
-static inline int ocfs2_add_entry(handle_t *handle,
-				  struct dentry *dentry,
-				  struct inode *inode, u64 blkno,
-				  struct buffer_head *parent_fe_bh,
-				  struct buffer_head *insert_bh)
-{
-	return __ocfs2_add_entry(handle, dentry->d_parent->d_inode,
-				 dentry->d_name.name, dentry->d_name.len,
-				 inode, blkno, parent_fe_bh, insert_bh);
-}
-
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
 
@@ -232,75 +191,6 @@ bail:
 	return ret;
 }
 
-static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
-			      handle_t *handle,
-			      struct inode *parent,
-			      struct inode *inode,
-			      struct buffer_head *fe_bh,
-			      struct ocfs2_alloc_context *data_ac)
-{
-	int status;
-	struct buffer_head *new_bh = NULL;
-	struct ocfs2_dir_entry *de = NULL;
-
-	mlog_entry_void();
-
-	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
-				     data_ac, NULL, &new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	ocfs2_set_new_buffer_uptodate(inode, new_bh);
-
-	status = ocfs2_journal_access(handle, inode, new_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
-
-	de = (struct ocfs2_dir_entry *) new_bh->b_data;
-	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
-	de->name_len = 1;
-	de->rec_len =
-		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
-	strcpy(de->name, ".");
-	ocfs2_set_de_type(de, S_IFDIR);
-	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
-	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
-	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
-				  OCFS2_DIR_REC_LEN(1));
-	de->name_len = 2;
-	strcpy(de->name, "..");
-	ocfs2_set_de_type(de, S_IFDIR);
-
-	status = ocfs2_journal_dirty(handle, new_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	i_size_write(inode, inode->i_sb->s_blocksize);
-	inode->i_nlink = 2;
-	inode->i_blocks = ocfs2_inode_sector_count(inode);
-	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	status = 0;
-bail:
-	if (new_bh)
-		brelse(new_bh);
-
-	mlog_exit(status);
-	return status;
-}
-
 static int ocfs2_mknod(struct inode *dir,
 		       struct dentry *dentry,
 		       int mode,
@@ -1767,329 +1657,6 @@ bail:
 	return status;
 }
 
-int ocfs2_check_dir_entry(struct inode * dir,
-			  struct ocfs2_dir_entry * de,
-			  struct buffer_head * bh,
-			  unsigned long offset)
-{
-	const char *error_msg = NULL;
-	const int rlen = le16_to_cpu(de->rec_len);
-
-	if (rlen < OCFS2_DIR_REC_LEN(1))
-		error_msg = "rec_len is smaller than minimal";
-	else if (rlen % 4 != 0)
-		error_msg = "rec_len % 4 != 0";
-	else if (rlen < OCFS2_DIR_REC_LEN(de->name_len))
-		error_msg = "rec_len is too small for name_len";
-	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
-		error_msg = "directory entry across blocks";
-
-	if (error_msg != NULL)
-		mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
-		     "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
-		     (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
-		     offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
-		     de->name_len);
-	return error_msg == NULL ? 1 : 0;
-}
-
-/* we don't always have a dentry for what we want to add, so people
- * like orphan dir can call this instead.
- *
- * If you pass me insert_bh, I'll skip the search of the other dir
- * blocks and put the record in there.
- */
-static int __ocfs2_add_entry(handle_t *handle,
-			     struct inode *dir,
-			     const char *name, int namelen,
-			     struct inode *inode, u64 blkno,
-			     struct buffer_head *parent_fe_bh,
-			     struct buffer_head *insert_bh)
-{
-	unsigned long offset;
-	unsigned short rec_len;
-	struct ocfs2_dir_entry *de, *de1;
-	struct super_block *sb;
-	int retval, status;
-
-	mlog_entry_void();
-
-	sb = dir->i_sb;
-
-	if (!namelen)
-		return -EINVAL;
-
-	rec_len = OCFS2_DIR_REC_LEN(namelen);
-	offset = 0;
-	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
-	while (1) {
-		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
-		/* These checks should've already been passed by the
-		 * prepare function, but I guess we can leave them
-		 * here anyway. */
-		if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) {
-			retval = -ENOENT;
-			goto bail;
-		}
-		if (ocfs2_match(namelen, name, de)) {
-			retval = -EEXIST;
-			goto bail;
-		}
-		if (((le64_to_cpu(de->inode) == 0) &&
-		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
-		    (le16_to_cpu(de->rec_len) >=
-		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
-			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
-			if (retval < 0) {
-				mlog_errno(retval);
-				goto bail;
-			}
-
-			status = ocfs2_journal_access(handle, dir, insert_bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
-			/* By now the buffer is marked for journaling */
-			offset += le16_to_cpu(de->rec_len);
-			if (le64_to_cpu(de->inode)) {
-				de1 = (struct ocfs2_dir_entry *)((char *) de +
-					OCFS2_DIR_REC_LEN(de->name_len));
-				de1->rec_len =
-					cpu_to_le16(le16_to_cpu(de->rec_len) -
-					OCFS2_DIR_REC_LEN(de->name_len));
-				de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
-				de = de1;
-			}
-			de->file_type = OCFS2_FT_UNKNOWN;
-			if (blkno) {
-				de->inode = cpu_to_le64(blkno);
-				ocfs2_set_de_type(de, inode->i_mode);
-			} else
-				de->inode = 0;
-			de->name_len = namelen;
-			memcpy(de->name, name, namelen);
-
-			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, insert_bh);
-			retval = 0;
-			goto bail;
-		}
-		offset += le16_to_cpu(de->rec_len);
-		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
-	}
-
-	/* when you think about it, the assert above should prevent us
-	 * from ever getting here. */
-	retval = -ENOSPC;
-bail:
-
-	mlog_exit(retval);
-	return retval;
-}
-
-
-/*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
- */
-static int ocfs2_delete_entry(handle_t *handle,
-			      struct inode *dir,
-			      struct ocfs2_dir_entry *de_del,
-			      struct buffer_head *bh)
-{
-	struct ocfs2_dir_entry *de, *pde;
-	int i, status = -ENOENT;
-
-	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
-
-	i = 0;
-	pde = NULL;
-	de = (struct ocfs2_dir_entry *) bh->b_data;
-	while (i < bh->b_size) {
-		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
-			status = -EIO;
-			mlog_errno(status);
-			goto bail;
-		}
-		if (de == de_del)  {
-			status = ocfs2_journal_access(handle, dir, bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
-			if (status < 0) {
-				status = -EIO;
-				mlog_errno(status);
-				goto bail;
-			}
-			if (pde)
-				pde->rec_len =
-					cpu_to_le16(le16_to_cpu(pde->rec_len) +
-						    le16_to_cpu(de->rec_len));
-			else
-				de->inode = 0;
-			dir->i_version++;
-			status = ocfs2_journal_dirty(handle, bh);
-			goto bail;
-		}
-		i += le16_to_cpu(de->rec_len);
-		pde = de;
-		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
-	}
-bail:
-	mlog_exit(status);
-	return status;
-}
-
-/*
- * Returns 0 if not found, -1 on failure, and 1 on success
- */
-static int inline ocfs2_search_dirblock(struct buffer_head *bh,
-					struct inode *dir,
-					const char *name, int namelen,
-					unsigned long offset,
-					struct ocfs2_dir_entry **res_dir)
-{
-	struct ocfs2_dir_entry *de;
-	char *dlimit, *de_buf;
-	int de_len;
-	int ret = 0;
-
-	mlog_entry_void();
-
-	de_buf = bh->b_data;
-	dlimit = de_buf + dir->i_sb->s_blocksize;
-
-	while (de_buf < dlimit) {
-		/* this code is executed quadratically often */
-		/* do minimal checking `by hand' */
-
-		de = (struct ocfs2_dir_entry *) de_buf;
-
-		if (de_buf + namelen <= dlimit &&
-		    ocfs2_match(namelen, name, de)) {
-			/* found a match - just to be sure, do a full check */
-			if (!ocfs2_check_dir_entry(dir, de, bh, offset)) {
-				ret = -1;
-				goto bail;
-			}
-			*res_dir = de;
-			ret = 1;
-			goto bail;
-		}
-
-		/* prevent looping on a bad block */
-		de_len = le16_to_cpu(de->rec_len);
-		if (de_len <= 0) {
-			ret = -1;
-			goto bail;
-		}
-
-		de_buf += de_len;
-		offset += de_len;
-	}
-
-bail:
-	mlog_exit(ret);
-	return ret;
-}
-
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir)
-{
-	struct super_block *sb;
-	struct buffer_head *bh_use[NAMEI_RA_SIZE];
-	struct buffer_head *bh, *ret = NULL;
-	unsigned long start, block, b;
-	int ra_max = 0;		/* Number of bh's in the readahead
-				   buffer, bh_use[] */
-	int ra_ptr = 0;		/* Current index into readahead
-				   buffer */
-	int num = 0;
-	int nblocks, i, err;
-
-	mlog_entry_void();
-
-	*res_dir = NULL;
-	sb = dir->i_sb;
-
-	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
-	start = OCFS2_I(dir)->ip_dir_start_lookup;
-	if (start >= nblocks)
-		start = 0;
-	block = start;
-
-restart:
-	do {
-		/*
-		 * We deal with the read-ahead logic here.
-		 */
-		if (ra_ptr >= ra_max) {
-			/* Refill the readahead buffer */
-			ra_ptr = 0;
-			b = block;
-			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
-				/*
-				 * Terminate if we reach the end of the
-				 * directory and must wrap, or if our
-				 * search has finished at this block.
-				 */
-				if (b >= nblocks || (num && block == start)) {
-					bh_use[ra_max] = NULL;
-					break;
-				}
-				num++;
-
-				bh = ocfs2_bread(dir, b++, &err, 1);
-				bh_use[ra_max] = bh;
-			}
-		}
-		if ((bh = bh_use[ra_ptr++]) == NULL)
-			goto next;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			/* read error, skip block & hope for the best */
-			ocfs2_error(dir->i_sb, "reading directory %llu, "
-				    "offset %lu\n",
-				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
-				    block);
-			brelse(bh);
-			goto next;
-		}
-		i = ocfs2_search_dirblock(bh, dir, name, namelen,
-					  block << sb->s_blocksize_bits,
-					  res_dir);
-		if (i == 1) {
-			OCFS2_I(dir)->ip_dir_start_lookup = block;
-			ret = bh;
-			goto cleanup_and_exit;
-		} else {
-			brelse(bh);
-			if (i < 0)
-				goto cleanup_and_exit;
-		}
-	next:
-		if (++block >= nblocks)
-			block = 0;
-	} while (block != start);
-
-	/*
-	 * If the directory has grown while we were searching, then
-	 * search the last part of the directory before giving up.
-	 */
-	block = nblocks;
-	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
-	if (block < nblocks) {
-		start = 0;
-		goto restart;
-	}
-
-cleanup_and_exit:
-	/* Clean up the read-ahead blocks */
-	for (; ra_ptr < ra_max; ra_ptr++)
-		brelse(bh_use[ra_ptr]);
-
-	mlog_exit_ptr(ret);
-	return ret;
-}
-
 static int ocfs2_blkno_stringify(u64 blkno, char *name)
 {
 	int status, namelen;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 0975c7b7212..688aef64c87 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -30,29 +30,10 @@ extern const struct inode_operations ocfs2_dir_iops;
 
 struct dentry *ocfs2_get_parent(struct dentry *child);
 
-int ocfs2_check_dir_entry (struct inode *dir,
-			   struct ocfs2_dir_entry *de,
-			   struct buffer_head *bh,
-			   unsigned long offset);
-struct buffer_head *ocfs2_find_entry(const char *name,
-				     int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir);
 int ocfs2_orphan_del(struct ocfs2_super *osb,
 		     handle_t *handle,
 		     struct inode *orphan_dir_inode,
 		     struct inode *inode,
 		     struct buffer_head *orphan_dir_bh);
 
-static inline int ocfs2_match(int len,
-			      const char * const name,
-			      struct ocfs2_dir_entry *de)
-{
-	if (len != de->name_len)
-		return 0;
-	if (!de->inode)
-		return 0;
-	return !memcmp(name, de->name, len);
-}
-
 #endif /* OCFS2_NAMEI_H */
-- 
cgit v1.2.3


From b8bc5f4fde376c9eee524a9a2b7e85560e604e85 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 10 Sep 2007 17:17:52 -0700
Subject: ocfs2: Abstract out core dir listing functionality

Put this in it's own function so that the functionality can be overridden.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c | 103 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 56 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8e0ae022b2e..d1f92fd2ed9 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -414,11 +414,8 @@ bail:
 	return retval;
 }
 
-/*
- * ocfs2_readdir()
- *
- */
-int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int ocfs2_dir_foreach_blk(struct inode *inode, unsigned long *f_version,
+				 loff_t *f_pos, void *priv, filldir_t filldir)
 {
 	int error = 0;
 	unsigned long offset, blk, last_ra_blk = 0;
@@ -426,45 +423,23 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct buffer_head * bh, * tmp;
 	struct ocfs2_dir_entry * de;
 	int err;
-	struct inode *inode = filp->f_path.dentry->d_inode;
 	struct super_block * sb = inode->i_sb;
 	unsigned int ra_sectors = 16;
-	int lock_level = 0;
-
-	mlog_entry("dirino=%llu\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	stored = 0;
 	bh = NULL;
 
-	error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
-	if (lock_level && error >= 0) {
-		/* We release EX lock which used to update atime
-		 * and get PR lock again to reduce contention
-		 * on commonly accessed directories. */
-		ocfs2_meta_unlock(inode, 1);
-		lock_level = 0;
-		error = ocfs2_meta_lock(inode, NULL, 0);
-	}
-	if (error < 0) {
-		if (error != -ENOENT)
-			mlog_errno(error);
-		/* we haven't got any yet, so propagate the error. */
-		stored = error;
-		goto bail_nolock;
-	}
-
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = (*f_pos) & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < i_size_read(inode)) {
-		blk = (filp->f_pos) >> sb->s_blocksize_bits;
+	while (!error && !stored && *f_pos < i_size_read(inode)) {
+		blk = (*f_pos) >> sb->s_blocksize_bits;
 		bh = ocfs2_bread(inode, blk, &err, 0);
 		if (!bh) {
 			mlog(ML_ERROR,
 			     "directory #%llu contains a hole at offset %lld\n",
 			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			     filp->f_pos);
-			filp->f_pos += sb->s_blocksize - offset;
+			     *f_pos);
+			*f_pos += sb->s_blocksize - offset;
 			continue;
 		}
 
@@ -490,7 +465,7 @@ revalidate:
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (*f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ocfs2_dir_entry *) (bh->b_data + i);
 				/* It's too expensive to do a full
@@ -505,21 +480,20 @@ revalidate:
 				i += le16_to_cpu(de->rec_len);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			*f_pos = ((*f_pos) & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			*f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < i_size_read(inode)
+		while (!error && *f_pos < i_size_read(inode)
 		       && offset < sb->s_blocksize) {
 			de = (struct ocfs2_dir_entry *) (bh->b_data + offset);
 			if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
 				/* On error, skip the f_pos to the
 				   next block. */
-				filp->f_pos = (filp->f_pos |
-					       (sb->s_blocksize - 1)) + 1;
+				*f_pos = ((*f_pos) | (sb->s_blocksize - 1)) + 1;
 				brelse(bh);
-				goto bail;
+				goto out;
 			}
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -530,36 +504,71 @@ revalidate:
 				 * not the directory has been modified
 				 * during the copy operation.
 				 */
-				unsigned long version = filp->f_version;
+				unsigned long version = *f_version;
 				unsigned char d_type = DT_UNKNOWN;
 
 				if (de->file_type < OCFS2_FT_MAX)
 					d_type = ocfs2_filetype_table[de->file_type];
-				error = filldir(dirent, de->name,
+				error = filldir(priv, de->name,
 						de->name_len,
-						filp->f_pos,
+						*f_pos,
 						ino_from_blkno(sb, le64_to_cpu(de->inode)),
 						d_type);
 				if (error)
 					break;
-				if (version != filp->f_version)
+				if (version != *f_version)
 					goto revalidate;
 				stored ++;
 			}
-			filp->f_pos += le16_to_cpu(de->rec_len);
+			*f_pos += le16_to_cpu(de->rec_len);
 		}
 		offset = 0;
 		brelse(bh);
 	}
 
 	stored = 0;
-bail:
+out:
+	return stored;
+}
+
+/*
+ * ocfs2_readdir()
+ *
+ */
+int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
+{
+	int error = 0;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int lock_level = 0;
+
+	mlog_entry("dirino=%llu\n",
+		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	error = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+	if (lock_level && error >= 0) {
+		/* We release EX lock which used to update atime
+		 * and get PR lock again to reduce contention
+		 * on commonly accessed directories. */
+		ocfs2_meta_unlock(inode, 1);
+		lock_level = 0;
+		error = ocfs2_meta_lock(inode, NULL, 0);
+	}
+	if (error < 0) {
+		if (error != -ENOENT)
+			mlog_errno(error);
+		/* we haven't got any yet, so propagate the error. */
+		goto bail_nolock;
+	}
+
+	error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
+				      dirent, filldir);
+
 	ocfs2_meta_unlock(inode, lock_level);
 
 bail_nolock:
-	mlog_exit(stored);
+	mlog_exit(error);
 
-	return stored;
+	return error;
 }
 
 /*
-- 
cgit v1.2.3


From 7e8536797d4508ddc790cc3af6a281db1582d485 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 10 Sep 2007 17:30:26 -0700
Subject: ocfs2: Pass raw u64 to filldir

filldir_t can take this, so don't turn de->inode into a 32 bit value. Right
now this doesn't make a difference since no ocfs2 inodes overflow that, but
it could be a nasty surprise later on if some kernel code is calling
ocfs2_dir_foreach_blk() and expecting real inode numbers back...

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d1f92fd2ed9..dbfa6f66291 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -512,7 +512,7 @@ revalidate:
 				error = filldir(priv, de->name,
 						de->name_len,
 						*f_pos,
-						ino_from_blkno(sb, le64_to_cpu(de->inode)),
+						le64_to_cpu(de->inode),
 						d_type);
 				if (error)
 					break;
-- 
cgit v1.2.3


From 5eae5b96fc86e6c85f5f90e90fe9e6966f1fec63 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 10 Sep 2007 17:50:51 -0700
Subject: ocfs2: Remove open coded readdir()

ocfs2_queue_orphans() has an open coded readdir loop which can easily just
use a directory accessor function.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c     |  28 +++++++++++--
 fs/ocfs2/dir.h     |   7 +---
 fs/ocfs2/journal.c | 118 ++++++++++++++++++++---------------------------------
 3 files changed, 70 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index dbfa6f66291..a75c340fc68 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -81,10 +81,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
-int ocfs2_check_dir_entry(struct inode * dir,
-			  struct ocfs2_dir_entry * de,
-			  struct buffer_head * bh,
-			  unsigned long offset)
+static int ocfs2_check_dir_entry(struct inode * dir,
+				 struct ocfs2_dir_entry * de,
+				 struct buffer_head * bh,
+				 unsigned long offset)
 {
 	const char *error_msg = NULL;
 	const int rlen = le16_to_cpu(de->rec_len);
@@ -531,6 +531,26 @@ out:
 	return stored;
 }
 
+/*
+ * This is intended to be called from inside other kernel functions,
+ * so we fake some arguments.
+ */
+int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
+		      filldir_t filldir)
+{
+	int ret = 0;
+	unsigned long version = inode->i_version;
+
+	while (*f_pos < i_size_read(inode)) {
+		ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
+					    filldir);
+		if (ret)
+			break;
+	}
+
+	return 0;
+}
+
 /*
  * ocfs2_readdir()
  *
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 7bf9c0a01cd..075d0e9fd45 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -62,6 +62,8 @@ int ocfs2_find_files_on_disk(const char *name,
 			     struct buffer_head **dirent_bh,
 			     struct ocfs2_dir_entry **dirent);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
+int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
+		      filldir_t filldir);
 int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 				 struct inode *dir,
 				 struct buffer_head *parent_fe_bh,
@@ -76,9 +78,4 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       struct buffer_head *fe_bh,
 		       struct ocfs2_alloc_context *data_ac);
 
-int ocfs2_check_dir_entry(struct inode *dir,
-			  struct ocfs2_dir_entry *de,
-			  struct buffer_head *bh,
-			  unsigned long offset);
-
 #endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 8bbfc80e5c5..f9d01e25298 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1213,17 +1213,49 @@ bail:
 	return status;
 }
 
+struct ocfs2_orphan_filldir_priv {
+	struct inode		*head;
+	struct ocfs2_super	*osb;
+};
+
+static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
+				loff_t pos, u64 ino, unsigned type)
+{
+	struct ocfs2_orphan_filldir_priv *p = priv;
+	struct inode *iter;
+
+	if (name_len == 1 && !strncmp(".", name, 1))
+		return 0;
+	if (name_len == 2 && !strncmp("..", name, 2))
+		return 0;
+
+	/* Skip bad inodes so that recovery can continue */
+	iter = ocfs2_iget(p->osb, ino,
+			  OCFS2_FI_FLAG_ORPHAN_RECOVERY);
+	if (IS_ERR(iter))
+		return 0;
+
+	mlog(0, "queue orphan %llu\n",
+	     (unsigned long long)OCFS2_I(iter)->ip_blkno);
+	/* No locking is required for the next_orphan queue as there
+	 * is only ever a single process doing orphan recovery. */
+	OCFS2_I(iter)->ip_next_orphan = p->head;
+	p->head = iter;
+
+	return 0;
+}
+
 static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 			       int slot,
 			       struct inode **head)
 {
 	int status;
 	struct inode *orphan_dir_inode = NULL;
-	struct inode *iter;
-	unsigned long offset, blk, local;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_dir_entry *de;
-	struct super_block *sb = osb->sb;
+	struct ocfs2_orphan_filldir_priv priv;
+	loff_t pos = 0;
+
+	priv.osb = osb;
+	priv.head = *head;
 
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
@@ -1241,77 +1273,15 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	offset = 0;
-	iter = NULL;
-	while(offset < i_size_read(orphan_dir_inode)) {
-		blk = offset >> sb->s_blocksize_bits;
-
-		bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0);
-		if (!bh)
-			status = -EINVAL;
-		if (status < 0) {
-			if (bh)
-				brelse(bh);
-			mlog_errno(status);
-			goto out_unlock;
-		}
-
-		local = 0;
-		while(offset < i_size_read(orphan_dir_inode)
-		      && local < sb->s_blocksize) {
-			de = (struct ocfs2_dir_entry *) (bh->b_data + local);
-
-			if (!ocfs2_check_dir_entry(orphan_dir_inode,
-						  de, bh, local)) {
-				status = -EINVAL;
-				mlog_errno(status);
-				brelse(bh);
-				goto out_unlock;
-			}
-
-			local += le16_to_cpu(de->rec_len);
-			offset += le16_to_cpu(de->rec_len);
-
-			/* I guess we silently fail on no inode? */
-			if (!le64_to_cpu(de->inode))
-				continue;
-			if (de->file_type > OCFS2_FT_MAX) {
-				mlog(ML_ERROR,
-				     "block %llu contains invalid de: "
-				     "inode = %llu, rec_len = %u, "
-				     "name_len = %u, file_type = %u, "
-				     "name='%.*s'\n",
-				     (unsigned long long)bh->b_blocknr,
-				     (unsigned long long)le64_to_cpu(de->inode),
-				     le16_to_cpu(de->rec_len),
-				     de->name_len,
-				     de->file_type,
-				     de->name_len,
-				     de->name);
-				continue;
-			}
-			if (de->name_len == 1 && !strncmp(".", de->name, 1))
-				continue;
-			if (de->name_len == 2 && !strncmp("..", de->name, 2))
-				continue;
-
-			iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
-					  OCFS2_FI_FLAG_ORPHAN_RECOVERY);
-			if (IS_ERR(iter))
-				continue;
-
-			mlog(0, "queue orphan %llu\n",
-			     (unsigned long long)OCFS2_I(iter)->ip_blkno);
-			/* No locking is required for the next_orphan
-			 * queue as there is only ever a single
-			 * process doing orphan recovery. */
-			OCFS2_I(iter)->ip_next_orphan = *head;
-			*head = iter;
-		}
-		brelse(bh);
+	status = ocfs2_dir_foreach(orphan_dir_inode, &pos, &priv,
+				   ocfs2_orphan_filldir);
+	if (status) {
+		mlog_errno(status);
+		goto out;
 	}
 
-out_unlock:
+	*head = priv.head;
+
 	ocfs2_meta_unlock(orphan_dir_inode, 0);
 out:
 	mutex_unlock(&orphan_dir_inode->i_mutex);
-- 
cgit v1.2.3


From 0bfbbf62a8b5a129ba2c689283bfece80a601aba Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 12 Sep 2007 11:19:00 -0700
Subject: ocfs2: Implement ocfs2_empty_dir() as a caller of ocfs2_dir_foreach()

We can preserve the behavior of ocfs2_empty_dir(), while getting rid of the
open coded directory walk by just providing a smart filldir callback. This
also automatically gets to use the dir readahead code, though in this case
any advantage is minor at best.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c | 96 +++++++++++++++++++++++++++-------------------------------
 fs/ocfs2/dir.h |  2 +-
 2 files changed, 46 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index a75c340fc68..4bb54406354 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -660,67 +660,61 @@ bail:
 	return ret;
 }
 
+struct ocfs2_empty_dir_priv {
+	unsigned seen_dot;
+	unsigned seen_dot_dot;
+	unsigned seen_other;
+};
+static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len,
+				   loff_t pos, u64 ino, unsigned type)
+{
+	struct ocfs2_empty_dir_priv *p = priv;
+
+	/*
+	 * Check the positions of "." and ".." records to be sure
+	 * they're in the correct place.
+	 */
+	if (name_len == 1 && !strncmp(".", name, 1) && pos == 0) {
+		p->seen_dot = 1;
+		return 0;
+	}
+
+	if (name_len == 2 && !strncmp("..", name, 2) &&
+	    pos == OCFS2_DIR_REC_LEN(1)) {
+		p->seen_dot_dot = 1;
+		return 0;
+	}
+
+	p->seen_other = 1;
+	return 1;
+}
 /*
  * routine to check that the specified directory is empty (for rmdir)
+ *
+ * Returns 1 if dir is empty, zero otherwise.
  */
 int ocfs2_empty_dir(struct inode *inode)
 {
-	unsigned long offset;
-	struct buffer_head * bh;
-	struct ocfs2_dir_entry * de, * de1;
-	struct super_block * sb;
-	int err;
+	int ret;
+	loff_t start = 0;
+	struct ocfs2_empty_dir_priv priv;
 
-	sb = inode->i_sb;
-	if ((i_size_read(inode) <
-	     (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) ||
-	    !(bh = ocfs2_bread(inode, 0, &err, 0))) {
-	    	mlog(ML_ERROR, "bad directory (dir #%llu) - no data block\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		return 1;
-	}
+	memset(&priv, 0, sizeof(priv));
 
-	de = (struct ocfs2_dir_entry *) bh->b_data;
-	de1 = (struct ocfs2_dir_entry *)
-			((char *)de + le16_to_cpu(de->rec_len));
-	if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) ||
-			!le64_to_cpu(de1->inode) ||
-			strcmp(".", de->name) ||
-			strcmp("..", de1->name)) {
-	    	mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
+	ret = ocfs2_dir_foreach(inode, &start, &priv, ocfs2_empty_dir_filldir);
+	if (ret)
+		mlog_errno(ret);
+
+	if (!priv.seen_dot || !priv.seen_dot_dot) {
+		mlog(ML_ERROR, "bad directory (dir #%llu) - no `.' or `..'\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		brelse(bh);
+		/*
+		 * XXX: Is it really safe to allow an unlink to continue?
+		 */
 		return 1;
 	}
-	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
-	de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len));
-	while (offset < i_size_read(inode) ) {
-		if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) {
-			brelse(bh);
-			bh = ocfs2_bread(inode,
-					 offset >> sb->s_blocksize_bits, &err, 0);
-			if (!bh) {
-				mlog(ML_ERROR, "dir %llu has a hole at %lu\n",
-				     (unsigned long long)OCFS2_I(inode)->ip_blkno, offset);
-				offset += sb->s_blocksize;
-				continue;
-			}
-			de = (struct ocfs2_dir_entry *) bh->b_data;
-		}
-		if (!ocfs2_check_dir_entry(inode, de, bh, offset)) {
-			brelse(bh);
-			return 1;
-		}
-		if (le64_to_cpu(de->inode)) {
-			brelse(bh);
-			return 0;
-		}
-		offset += le16_to_cpu(de->rec_len);
-		de = (struct ocfs2_dir_entry *)
-			((char *)de + le16_to_cpu(de->rec_len));
-	}
-	brelse(bh);
-	return 1;
+
+	return !priv.seen_other;
 }
 
 int ocfs2_fill_new_dir(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 075d0e9fd45..3e65f91b034 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -54,7 +54,7 @@ static inline int ocfs2_add_entry(handle_t *handle,
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
 			      int namelen);
-int ocfs2_empty_dir(struct inode *inode);  /* FIXME: to namei.c */
+int ocfs2_empty_dir(struct inode *inode);
 int ocfs2_find_files_on_disk(const char *name,
 			     int namelen,
 			     u64 *blkno,
-- 
cgit v1.2.3


From be94d11704ef79030fd2e6a0c41b4a7f65f9e860 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 11 Sep 2007 15:22:06 -0700
Subject: ocfs2: Provide convenience function for ino lookup

A couple paths which needed to just match a parent dir + name pair to an
inode number were a bit messy because they had to deal with
ocfs2_find_files_on_disk() which returns a larger number of values. Provide
a convenience function, ocfs2_lookup_ino_from_name() which internalizes all
the extra accounting.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c     | 17 +++++++++++++++++
 fs/ocfs2/dir.h     |  2 ++
 fs/ocfs2/export.c  |  8 +-------
 fs/ocfs2/namei.c   |  9 ++-------
 fs/ocfs2/sysfile.c | 10 +++-------
 5 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 4bb54406354..4791683a119 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -628,6 +628,23 @@ leave:
 	return status;
 }
 
+/*
+ * Convenience function for callers which just want the block number
+ * mapped to a name and don't require the full dirent info, etc.
+ */
+int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
+			       int namelen, u64 *blkno)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_dir_entry *dirent = NULL;
+
+	ret = ocfs2_find_files_on_disk(name, namelen, blkno, dir, &bh, &dirent);
+	brelse(bh);
+
+	return ret;
+}
+
 /* Check for a name within a directory.
  *
  * Return 0 if the name does not exist
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index 3e65f91b034..d03eaaa5cfd 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -61,6 +61,8 @@ int ocfs2_find_files_on_disk(const char *name,
 			     struct inode *inode,
 			     struct buffer_head **dirent_bh,
 			     struct ocfs2_dir_entry **dirent);
+int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
+			       int namelen, u64 *blkno);
 int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir);
 int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
 		      filldir_t filldir);
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index bc48177bd18..c3bbc198f9c 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -88,8 +88,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	struct dentry *parent;
 	struct inode *inode;
 	struct inode *dir = child->d_inode;
-	struct buffer_head *dirent_bh = NULL;
-	struct ocfs2_dir_entry *dirent;
 
 	mlog_entry("(0x%p, '%.*s')\n", child,
 		   child->d_name.len, child->d_name.name);
@@ -105,8 +103,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 		goto bail;
 	}
 
-	status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh,
-					  &dirent);
+	status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno);
 	if (status < 0) {
 		parent = ERR_PTR(-ENOENT);
 		goto bail_unlock;
@@ -131,9 +128,6 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 bail_unlock:
 	ocfs2_meta_unlock(dir, 0);
 
-	if (dirent_bh)
-		brelse(dirent_bh);
-
 bail:
 	mlog_exit_ptr(parent);
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index aae6c0bf669..98aeebc2c9f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -101,10 +101,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 {
 	int status;
 	u64 blkno;
-	struct buffer_head *dirent_bh = NULL;
 	struct inode *inode = NULL;
 	struct dentry *ret;
-	struct ocfs2_dir_entry *dirent;
 	struct ocfs2_inode_info *oi;
 
 	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
@@ -126,9 +124,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 		goto bail;
 	}
 
-	status = ocfs2_find_files_on_disk(dentry->d_name.name,
-					  dentry->d_name.len, &blkno,
-					  dir, &dirent_bh, &dirent);
+	status = ocfs2_lookup_ino_from_name(dir, dentry->d_name.name,
+					    dentry->d_name.len, &blkno);
 	if (status < 0)
 		goto bail_add;
 
@@ -183,8 +180,6 @@ bail_unlock:
 	ocfs2_meta_unlock(dir, 0);
 
 bail:
-	if (dirent_bh)
-		brelse(dirent_bh);
 
 	mlog_exit_ptr(ret);
 
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 5df6e35d09b..fd2e846e3e6 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -100,17 +100,14 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 	char namebuf[40];
 	struct inode *inode = NULL;
 	u64 blkno;
-	struct buffer_head *dirent_bh = NULL;
-	struct ocfs2_dir_entry *de = NULL;
 	int status = 0;
 
 	ocfs2_sprintf_system_inode_name(namebuf,
 					sizeof(namebuf),
 					type, slot);
 
-	status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf),
-					  &blkno, osb->sys_root_inode,
-					  &dirent_bh, &de);
+	status = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
+					    strlen(namebuf), &blkno);
 	if (status < 0) {
 		goto bail;
 	}
@@ -122,8 +119,7 @@ static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 		goto bail;
 	}
 bail:
-	if (dirent_bh)
-		brelse(dirent_bh);
+
 	return inode;
 }
 
-- 
cgit v1.2.3


From 38760e243249f03b4c6d78ca624dd846a2681b67 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 11 Sep 2007 17:21:56 -0700
Subject: ocfs2: Rename cleanups

ocfs2_rename() does direct manipulation of the dirent it's gotten back from
a directory search. Wrap this manipulation inside of a function so that we
can transparently change directory update behavior in the future. As an
added bonus, this gets rid of an ugly macro.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c   | 22 ++++++++++++++++++++++
 fs/ocfs2/dir.h   |  3 +++
 fs/ocfs2/namei.c | 53 ++++++++++++++++++++++-------------------------------
 3 files changed, 47 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 4791683a119..31db7e3881b 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -271,6 +271,28 @@ cleanup_and_exit:
 	return ret;
 }
 
+int ocfs2_update_entry(struct inode *dir, handle_t *handle,
+		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct inode *new_entry_inode)
+{
+	int ret;
+
+	ret = ocfs2_journal_access(handle, dir, de_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	de->inode = cpu_to_le64(OCFS2_I(new_entry_inode)->ip_blkno);
+	ocfs2_set_de_type(de, new_entry_inode->i_mode);
+
+	ocfs2_journal_dirty(handle, de_bh);
+
+out:
+	return ret;
+}
+
 /*
  * ocfs2_delete_entry deletes a directory entry by merging it with the
  * previous entry
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index d03eaaa5cfd..ce48b9080d8 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -50,6 +50,9 @@ static inline int ocfs2_add_entry(handle_t *handle,
 				 dentry->d_name.name, dentry->d_name.len,
 				 inode, blkno, parent_fe_bh, insert_bh);
 }
+int ocfs2_update_entry(struct inode *dir, handle_t *handle,
+		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
+		       struct inode *new_entry_inode);
 
 int ocfs2_check_dir_for_entry(struct inode *dir,
 			      const char *name,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 98aeebc2c9f..d2f03355d12 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -933,11 +933,6 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2)
 		ocfs2_meta_unlock(inode2, 1);
 }
 
-#define PARENT_INO(buffer) \
-	((struct ocfs2_dir_entry *) \
-	 ((char *)buffer + \
-	  le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode
-
 static int ocfs2_rename(struct inode *old_dir,
 			struct dentry *old_dentry,
 			struct inode *new_dir,
@@ -959,8 +954,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
-	struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry
-							       // and new_dentry
+	struct ocfs2_dir_entry *old_inode_dot_dot_de = NULL, *old_de = NULL,
+		*new_de = NULL;
 	struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above
 	struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir,
 						    // this is the 1st dirent bh
@@ -1044,19 +1039,26 @@ static int ocfs2_rename(struct inode *old_dir,
 	}
 
 	if (S_ISDIR(old_inode->i_mode)) {
-		status = -EIO;
-		old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0);
-		if (!old_inode_de_bh)
+		u64 old_inode_parent;
+
+		status = ocfs2_find_files_on_disk("..", 2, &old_inode_parent,
+						  old_inode, &old_inode_de_bh,
+						  &old_inode_dot_dot_de);
+		if (status) {
+			status = -EIO;
 			goto bail;
+		}
 
-		status = -EIO;
-		if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) !=
-		    OCFS2_I(old_dir)->ip_blkno)
+		if (old_inode_parent != OCFS2_I(old_dir)->ip_blkno) {
+			status = -EIO;
 			goto bail;
-		status = -EMLINK;
-		if (!new_inode && new_dir!=old_dir &&
-		    new_dir->i_nlink >= OCFS2_LINK_MAX)
+		}
+
+		if (!new_inode && new_dir != old_dir &&
+		    new_dir->i_nlink >= OCFS2_LINK_MAX) {
+			status = -EMLINK;
 			goto bail;
+		}
 	}
 
 	status = -ENOENT;
@@ -1206,20 +1208,13 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 
 		/* change the dirent to point to the correct inode */
-		status = ocfs2_journal_access(handle, new_dir, new_de_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_update_entry(new_dir, handle, new_de_bh,
+					    new_de, old_inode);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
-		new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno);
-		new_de->file_type = old_de->file_type;
 		new_dir->i_version++;
-		status = ocfs2_journal_dirty(handle, new_de_bh);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
 
 		if (S_ISDIR(new_inode->i_mode))
 			newfe->i_links_count = 0;
@@ -1268,12 +1263,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	}
 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
 	if (old_inode_de_bh) {
-		status = ocfs2_journal_access(handle, old_inode,
-					     old_inode_de_bh,
-					     OCFS2_JOURNAL_ACCESS_WRITE);
-		PARENT_INO(old_inode_de_bh->b_data) =
-			cpu_to_le64(OCFS2_I(new_dir)->ip_blkno);
-		status = ocfs2_journal_dirty(handle, old_inode_de_bh);
+		status = ocfs2_update_entry(old_inode, handle, old_inode_de_bh,
+					    old_inode_dot_dot_de, new_dir);
 		old_dir->i_nlink--;
 		if (new_inode) {
 			new_inode->i_nlink--;
-- 
cgit v1.2.3


From 8553cf4f360d6fc4913a0bdd3b22dd7b5bb9a3be Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 13 Sep 2007 16:29:01 -0700
Subject: ocfs2: Cleanup dirent size check

The check to see if a new dirent would fit in an old one is pretty ugly, and
it's done at least twice. Clean things up by putting this in it's own
easier-to-read function.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 31db7e3881b..f2e2ffbf6c9 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -343,6 +343,31 @@ bail:
 	return status;
 }
 
+/*
+ * Check whether 'de' has enough room to hold an entry of
+ * 'new_rec_len' bytes.
+ */
+static inline int ocfs2_dirent_would_fit(struct ocfs2_dir_entry *de,
+					 unsigned int new_rec_len)
+{
+	unsigned int de_really_used;
+
+	/* Check whether this is an empty record with enough space */
+	if (le64_to_cpu(de->inode) == 0 &&
+	    le16_to_cpu(de->rec_len) >= new_rec_len)
+		return 1;
+
+	/*
+	 * Record might have free space at the end which we can
+	 * use.
+	 */
+	de_really_used = OCFS2_DIR_REC_LEN(de->name_len);
+	if (le16_to_cpu(de->rec_len) >= (de_really_used + new_rec_len))
+	    return 1;
+
+	return 0;
+}
+
 /* we don't always have a dentry for what we want to add, so people
  * like orphan dir can call this instead.
  *
@@ -385,10 +410,8 @@ int __ocfs2_add_entry(handle_t *handle,
 			retval = -EEXIST;
 			goto bail;
 		}
-		if (((le64_to_cpu(de->inode) == 0) &&
-		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
-		    (le16_to_cpu(de->rec_len) >=
-		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+
+		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
 			if (retval < 0) {
@@ -1078,10 +1101,7 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 			status = -EEXIST;
 			goto bail;
 		}
-		if (((le64_to_cpu(de->inode) == 0) &&
-		     (le16_to_cpu(de->rec_len) >= rec_len)) ||
-		    (le16_to_cpu(de->rec_len) >=
-		     (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) {
+		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			/* Ok, we found a spot. Return this bh and let
 			 * the caller actually fill it in. */
 			*ret_de_bh = bh;
-- 
cgit v1.2.3


From 15b1e36bdb487d67ef924a37b0967453143be53a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 7 Sep 2007 13:58:15 -0700
Subject: ocfs2: Structure updates for inline data

Add the disk, network and memory structures needed to support data in inode.

Struct ocfs2_inline_data is defined and embedded in ocfs2_dinode for storing
inline data.

A new inode field, i_dyn_features, is added to facilitate tracking of
dynamic inode state. Since it will be used often, we want to mirror it on
ocfs2_inode_info, and transfer it via the meta data lvb.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlmglue.c  |  2 ++
 fs/ocfs2/dlmglue.h  |  4 ++--
 fs/ocfs2/inode.c    |  3 +++
 fs/ocfs2/inode.h    |  1 +
 fs/ocfs2/ocfs2.h    |  7 +++++++
 fs/ocfs2/ocfs2_fs.h | 44 ++++++++++++++++++++++++++++++++++++++++++--
 6 files changed, 57 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f71250ed166..41c76ff2fcf 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1482,6 +1482,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 	lvb->lvb_imtime_packed =
 		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
 	lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
+	lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features);
 	lvb->lvb_igeneration = cpu_to_be32(inode->i_generation);
 
 out:
@@ -1515,6 +1516,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
 
 	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
+	oi->ip_dyn_features = be16_to_cpu(lvb->lvb_idynfeatures);
 	ocfs2_set_inode_flags(inode);
 
 	/* fast-symlinks are a special case */
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 492bad32a8c..87a785e4120 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -29,12 +29,12 @@
 
 #include "dcache.h"
 
-#define OCFS2_LVB_VERSION 4
+#define OCFS2_LVB_VERSION 5
 
 struct ocfs2_meta_lvb {
 	__u8         lvb_version;
 	__u8         lvb_reserved0;
-	__be16       lvb_reserved1;
+	__be16       lvb_idynfeatures;
 	__be32       lvb_iclusters;
 	__be32       lvb_iuid;
 	__be32       lvb_igid;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c53a6763bbb..c8923bab422 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -241,6 +241,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
 
 	inode->i_version = 1;
 	inode->i_generation = le32_to_cpu(fe->i_generation);
@@ -1220,6 +1221,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
 	ocfs2_get_inode_flags(OCFS2_I(inode));
 	fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
+	fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	fe->i_size = cpu_to_le64(i_size_read(inode));
@@ -1257,6 +1259,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+	OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features);
 	ocfs2_set_inode_flags(inode);
 	i_size_write(inode, le64_to_cpu(fe->i_size));
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a41d0817121..70e881c5553 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -51,6 +51,7 @@ struct ocfs2_inode_info
 
 	u32				ip_flags; /* see below */
 	u32				ip_attr; /* inode attributes */
+	u16				ip_dyn_features;
 
 	/* protected by recovery_lock. */
 	struct inode			*ip_next_orphan;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 58307853fb4..60a23e1906b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -319,6 +319,13 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bf10a545383..6ef876759a7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -87,7 +87,8 @@
 
 #define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
 #define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
-					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
+					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
+					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
 
 /*
@@ -121,6 +122,9 @@
  */
 #define OCFS2_FEATURE_INCOMPAT_TUNEFS_INPROG	0x0020
 
+/* Support for data packed into inode blocks */
+#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA	0x0040
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -162,6 +166,17 @@
 #define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
 #define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
 
+/*
+ * Flags on ocfs2_dinode.i_dyn_features
+ *
+ * These can change much more often than i_flags. When adding flags,
+ * keep in mind that i_dyn_features is only 16 bits wide.
+ */
+#define OCFS2_INLINE_DATA_FL	(0x0001)	/* Data stored in inode block */
+#define OCFS2_HAS_XATTR_FL	(0x0002)
+#define OCFS2_INLINE_XATTR_FL	(0x0004)
+#define OCFS2_INDEXED_DIR_FL	(0x0008)
+
 /* Inode attributes, keep in sync with EXT2 */
 #define OCFS2_SECRM_FL		(0x00000001)	/* Secure deletion */
 #define OCFS2_UNRM_FL		(0x00000002)	/* Undelete */
@@ -486,6 +501,19 @@ struct ocfs2_local_alloc
 /*10*/	__u8   la_bitmap[0];
 };
 
+/*
+ * Data-in-inode header. This is only used if i_dyn_features has
+ * OCFS2_INLINE_DATA_FL set.
+ */
+struct ocfs2_inline_data
+{
+/*00*/	__le16	id_count;	/* Number of bytes that can be used
+				 * for data, starting at id_data */
+	__le16	id_reserved0;
+	__le32	id_reserved1;
+	__u8	id_data[0];	/* Start of user data */
+};
+
 /*
  * On disk inode for OCFS2
  */
@@ -518,7 +546,7 @@ struct ocfs2_dinode {
 	__le32 i_attr;
 	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
 					   was set in i_flags */
-	__le16 i_reserved1;
+	__le16 i_dyn_features;
 /*70*/	__le64 i_reserved2[8];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
@@ -544,6 +572,7 @@ struct ocfs2_dinode {
 		struct ocfs2_chain_list		i_chain;
 		struct ocfs2_extent_list	i_list;
 		struct ocfs2_truncate_log	i_dealloc;
+		struct ocfs2_inline_data	i_data;
 		__u8               		i_symlink[0];
 	} id2;
 /* Actual on-disk size is one block */
@@ -593,6 +622,12 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 		 offsetof(struct ocfs2_dinode, id2.i_symlink);
 }
 
+static inline int ocfs2_max_inline_data(struct super_block *sb)
+{
+	return sb->s_blocksize -
+		offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
 static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
 {
 	int size;
@@ -672,6 +707,11 @@ static inline int ocfs2_fast_symlink_chars(int blocksize)
 	return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink);
 }
 
+static inline int ocfs2_max_inline_data(int blocksize)
+{
+	return blocksize - offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
 static inline int ocfs2_extent_recs_per_inode(int blocksize)
 {
 	int size;
-- 
cgit v1.2.3


From 6798d35a31c413bbb3f83bbaa844bd2598168ccc Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 7 Sep 2007 14:05:51 -0700
Subject: ocfs2: Read support for inline data

This hooks up ocfs2_readpage() to populate a page with data from an inode
block. Direct IO reads from inline data are modified to fall back to
buffered I/O. Appropriate checks are also placed in the extent map code to
avoid reading an extent list when inline data might be stored.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/aops.c       | 80 ++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/extent_map.c |  6 ++++
 2 files changed, 82 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8416e383197..fef0186a91c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -206,9 +206,70 @@ bail:
 	return err;
 }
 
+static int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+				  struct buffer_head *di_bh)
+{
+	void *kaddr;
+	unsigned int size;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
+		ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		return -EROFS;
+	}
+
+	size = i_size_read(inode);
+
+	if (size > PAGE_CACHE_SIZE ||
+	    size > ocfs2_max_inline_data(inode->i_sb)) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has with inline data has bad size: %u",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno, size);
+		return -EROFS;
+	}
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	if (size)
+		memcpy(kaddr, di->id2.i_data.id_data, size);
+	/* Clear the remaining part of the page */
+	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	SetPageUptodate(page);
+
+	return 0;
+}
+
+static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+
+	ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
+			       OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_inline_data(inode, page, di_bh);
+out:
+	unlock_page(page);
+
+	brelse(di_bh);
+	return ret;
+}
+
 static int ocfs2_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int ret, unlock = 1;
 
@@ -222,7 +283,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 		goto out;
 	}
 
-	if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) {
+	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
 		ret = AOP_TRUNCATED_PAGE;
 		goto out_meta_unlock;
 	}
@@ -252,7 +313,10 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 		goto out_alloc;
 	}
 
-	ret = block_read_full_page(page, ocfs2_get_block);
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		ret = ocfs2_readpage_inline(inode, page);
+	else
+		ret = block_read_full_page(page, ocfs2_get_block);
 	unlock = 0;
 
 	ocfs2_data_unlock(inode, 0);
@@ -397,7 +461,9 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+						  NULL);
 
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -411,7 +477,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		goto bail;
 	}
 
-
 bail:
 	status = err ? 0 : p_blkno;
 
@@ -566,6 +631,13 @@ static ssize_t ocfs2_direct_IO(int rw,
 
 	mlog_entry_void();
 
+	/*
+	 * Fallback to buffered I/O if we see an inode without
+	 * extents.
+	 */
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
 	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
 		/*
 		 * We get PR data locks even for O_DIRECT.  This
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 03c1d365c78..c58668a326f 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -387,6 +387,12 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	struct ocfs2_extent_rec *rec;
 	u32 coff;
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = -ERANGE;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
 				      num_clusters, extent_flags);
 	if (ret == 0)
-- 
cgit v1.2.3


From 1afc32b952335f665327a1a9001ba1b44bb76fd9 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 7 Sep 2007 14:46:51 -0700
Subject: ocfs2: Write support for inline data

This fixes up write, truncate, mmap, and RESVSP/UNRESVP to understand inline
inode data.

For the most part, the changes to the core write code can be relied on to do
the heavy lifting. Any code calling ocfs2_write_begin (including shared
writeable mmap) can count on it doing the right thing with respect to
growing inline data to an extent tree.

Size reducing truncates, including UNRESVP can simply zero that portion of
the inode block being removed. Size increasing truncatesm, including RESVP
have to be a little bit smarter and grow the inode to an extent tree if
necessary.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c   | 245 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h   |   6 ++
 fs/ocfs2/aops.c    | 173 ++++++++++++++++++++++++++++++++++++-
 fs/ocfs2/aops.h    |   4 +
 fs/ocfs2/file.c    |  99 ++++++++++++++++++++--
 fs/ocfs2/inode.c   |   4 +
 fs/ocfs2/journal.h |   3 +
 7 files changed, 526 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index c81bfdfb992..72cefe25382 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3726,6 +3726,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
 
+	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+
 	mlog(0, "add %u clusters at position %u to inode %llu\n",
 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -5826,6 +5828,174 @@ out:
 	return ret;
 }
 
+static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
+{
+	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+
+	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
+}
+
+void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
+{
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_inline_data *idata = &di->id2.i_data;
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	/*
+	 * We clear the entire i_data structure here so that all
+	 * fields can be properly initialized.
+	 */
+	ocfs2_zero_dinode_id2(inode, di);
+
+	idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
+}
+
+int ocfs2_convert_inline_data_to_extents(struct inode *inode,
+					 struct buffer_head *di_bh)
+{
+	int ret, i, has_data, num_pages = 0;
+	handle_t *handle;
+	u64 uninitialized_var(block);
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct page **pages = NULL;
+	loff_t end = osb->s_clustersize;
+
+	has_data = i_size_read(inode) ? 1 : 0;
+
+	if (has_data) {
+		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
+				sizeof(struct page *), GFP_NOFS);
+		if (pages == NULL) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	if (has_data) {
+		u32 bit_off, num;
+		unsigned int page_end;
+		u64 phys;
+
+		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+					   &num);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		/*
+		 * Save two copies, one for insert, and one that can
+		 * be changed by ocfs2_map_and_dirty_page() below.
+		 */
+		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+		/*
+		 * Non sparse file systems zero on extend, so no need
+		 * to do that now.
+		 */
+		if (!ocfs2_sparse_alloc(osb) &&
+		    PAGE_CACHE_SIZE < osb->s_clustersize)
+			end = PAGE_CACHE_SIZE;
+
+		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		/*
+		 * This should populate the 1st page for us and mark
+		 * it up to date.
+		 */
+		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		page_end = PAGE_CACHE_SIZE;
+		if (PAGE_CACHE_SIZE > osb->s_clustersize)
+			page_end = osb->s_clustersize;
+
+		for (i = 0; i < num_pages; i++)
+			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
+						 pages[i], i > 0, &phys);
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_zero_dinode_id2(inode, di);
+
+	el->l_tree_depth = 0;
+	el->l_next_free_rec = 0;
+	el->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+	if (has_data) {
+		/*
+		 * An error at this point should be extremely rare. If
+		 * this proves to be false, we could always re-build
+		 * the in-inode data from our pages.
+		 */
+		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
+					  0, block, 1, 0, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
+	}
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+out:
+	if (pages) {
+		ocfs2_unlock_and_free_pages(pages, num_pages);
+		kfree(pages);
+	}
+
+	return ret;
+}
+
 /*
  * It is expected, that by the time you call this function,
  * inode->i_size and fe->i_size have been adjusted.
@@ -6051,6 +6221,81 @@ bail:
 	return status;
 }
 
+/*
+ * 'start' is inclusive, 'end' is not.
+ */
+int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
+			  unsigned int start, unsigned int end, int trunc)
+{
+	int ret;
+	unsigned int numbytes;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inline_data *idata = &di->id2.i_data;
+
+	if (end > i_size_read(inode))
+		end = i_size_read(inode);
+
+	BUG_ON(start >= end);
+
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
+	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
+	    !ocfs2_supports_inline_data(osb)) {
+		ocfs2_error(inode->i_sb,
+			    "Inline data flags for inode %llu don't agree! "
+			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    le16_to_cpu(di->i_dyn_features),
+			    OCFS2_I(inode)->ip_dyn_features,
+			    osb->s_feature_incompat);
+		ret = -EROFS;
+		goto out;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	numbytes = end - start;
+	memset(idata->id_data + start, 0, numbytes);
+
+	/*
+	 * No need to worry about the data page here - it's been
+	 * truncated already and inline data doesn't need it for
+	 * pushing zero's to disk, so we'll let readpage pick it up
+	 * later.
+	 */
+	if (trunc) {
+		i_size_write(inode, start);
+		di->i_size = cpu_to_le64(start);
+	}
+
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	ocfs2_journal_dirty(handle, di_bh);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	return ret;
+}
+
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 {
 	/*
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 990df48ae8d..826e0a6cf5c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -62,6 +62,10 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
 	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
 }
 
+void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di);
+int ocfs2_convert_inline_data_to_extents(struct inode *inode,
+					 struct buffer_head *di_bh);
+
 int ocfs2_truncate_log_init(struct ocfs2_super *osb);
 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb);
 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
@@ -115,6 +119,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct inode *inode,
 			  struct buffer_head *fe_bh,
 			  struct ocfs2_truncate_context *tc);
+int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
+			  unsigned int start, unsigned int end, int trunc);
 
 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
 		    u32 cpos, struct buffer_head **leaf_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index fef0186a91c..34d10452c56 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -206,8 +206,8 @@ bail:
 	return err;
 }
 
-static int ocfs2_read_inline_data(struct inode *inode, struct page *page,
-				  struct buffer_head *di_bh)
+int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+			   struct buffer_head *di_bh)
 {
 	void *kaddr;
 	unsigned int size;
@@ -1432,6 +1432,130 @@ out:
 	return ret;
 }
 
+static int ocfs2_write_begin_inline(struct address_space *mapping,
+				    struct inode *inode,
+				    struct ocfs2_write_ctxt *wc)
+{
+	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct page *page;
+	handle_t *handle;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+	page = find_or_create_page(mapping, 0, GFP_NOFS);
+	if (!page) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+	/*
+	 * If we don't set w_num_pages then this page won't get unlocked
+	 * and freed on cleanup of the write context.
+	 */
+	wc->w_pages[0] = wc->w_target_page = page;
+	wc->w_num_pages = 1;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		ocfs2_commit_trans(osb, handle);
+
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		ocfs2_set_inode_data_inline(inode, di);
+
+	if (!PageUptodate(page)) {
+		ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
+		if (ret) {
+			ocfs2_commit_trans(osb, handle);
+
+			goto out;
+		}
+	}
+
+	wc->w_handle = handle;
+out:
+	return ret;
+}
+
+int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	if (new_size < le16_to_cpu(di->id2.i_data.id_count))
+		return 1;
+	return 0;
+}
+
+static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
+					  struct inode *inode, loff_t pos,
+					  unsigned len, struct page *mmap_page,
+					  struct ocfs2_write_ctxt *wc)
+{
+	int ret, written = 0;
+	loff_t end = pos + len;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
+	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
+	     oi->ip_dyn_features);
+
+	/*
+	 * Handle inodes which already have inline data 1st.
+	 */
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		if (mmap_page == NULL &&
+		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
+			goto do_inline_write;
+
+		/*
+		 * The write won't fit - we have to give this inode an
+		 * inline extent list now.
+		 */
+		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
+		if (ret)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Check whether the inode can accept inline data.
+	 */
+	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
+		return 0;
+
+	/*
+	 * Check whether the write can fit.
+	 */
+	if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
+		return 0;
+
+do_inline_write:
+	ret = ocfs2_write_begin_inline(mapping, inode, wc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * This signals to the caller that the data can be written
+	 * inline.
+	 */
+	written = 1;
+out:
+	return written ? written : ret;
+}
+
 /*
  * This function only does anything for file systems which can't
  * handle sparse files.
@@ -1483,6 +1607,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		return ret;
 	}
 
+	if (ocfs2_supports_inline_data(osb)) {
+		ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
+						     mmap_page, wc);
+		if (ret == 1) {
+			ret = 0;
+			goto success;
+		}
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
 	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
 	if (ret) {
 		mlog_errno(ret);
@@ -1570,6 +1707,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+success:
 	*pagep = wc->w_target_page;
 	*fsdata = wc;
 	return 0;
@@ -1637,6 +1775,31 @@ out_fail:
 	return ret;
 }
 
+static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
+				   unsigned len, unsigned *copied,
+				   struct ocfs2_dinode *di,
+				   struct ocfs2_write_ctxt *wc)
+{
+	void *kaddr;
+
+	if (unlikely(*copied < len)) {
+		if (!PageUptodate(wc->w_target_page)) {
+			*copied = 0;
+			return;
+		}
+	}
+
+	kaddr = kmap_atomic(wc->w_target_page, KM_USER0);
+	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	mlog(0, "Data written to inode at offset %llu. "
+	     "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
+	     (unsigned long long)pos, *copied,
+	     le16_to_cpu(di->id2.i_data.id_count),
+	     le16_to_cpu(di->i_dyn_features));
+}
+
 int ocfs2_write_end_nolock(struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct page *page, void *fsdata)
@@ -1650,6 +1813,11 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 	handle_t *handle = wc->w_handle;
 	struct page *tmppage;
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
+		goto out_write_size;
+	}
+
 	if (unlikely(copied < len)) {
 		if (!PageUptodate(wc->w_target_page))
 			copied = 0;
@@ -1687,6 +1855,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		block_commit_write(tmppage, from, to);
 	}
 
+out_write_size:
 	pos += copied;
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index b4fa37d40db..113560877db 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -61,6 +61,10 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     struct page **pagep, void **fsdata,
 			     struct buffer_head *di_bh, struct page *mmap_page);
 
+int ocfs2_read_inline_data(struct inode *inode, struct page *page,
+			   struct buffer_head *di_bh);
+int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
+
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 781ba6c4ef8..a62b14eb406 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -397,6 +397,15 @@ static int ocfs2_truncate_file(struct inode *inode,
 	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
 	truncate_inode_pages(inode->i_mapping, new_i_size);
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
+					       i_size_read(inode), 0);
+		if (status)
+			mlog_errno(status);
+
+		goto bail_unlock_data;
+	}
+
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
@@ -908,7 +917,8 @@ static int ocfs2_extend_file(struct inode *inode,
 			     struct buffer_head *di_bh,
 			     u64 new_i_size)
 {
-	int ret = 0;
+	int ret = 0, data_locked = 0;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	BUG_ON(!di_bh);
 
@@ -920,7 +930,17 @@ static int ocfs2_extend_file(struct inode *inode,
   		goto out;
 	BUG_ON(new_i_size < i_size_read(inode));
 
-	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+	/*
+	 * Fall through for converting inline data, even if the fs
+	 * supports sparse files.
+	 *
+	 * The check for inline data here is legal - nobody can add
+	 * the feature since we have i_mutex. We must check it again
+	 * after acquiring ip_alloc_sem though, as paths like mmap
+	 * might have raced us to converting the inode to extents.
+	 */
+	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
 		goto out_update_size;
 
 	/* 
@@ -935,6 +955,7 @@ static int ocfs2_extend_file(struct inode *inode,
 		mlog_errno(ret);
 		goto out;
 	}
+	data_locked = 1;
 
 	/*
 	 * The alloc sem blocks people in read/write from reading our
@@ -942,9 +963,31 @@ static int ocfs2_extend_file(struct inode *inode,
 	 * i_mutex to block other extend/truncate calls while we're
 	 * here.
 	 */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-	ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
-	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	down_write(&oi->ip_alloc_sem);
+
+	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		/*
+		 * We can optimize small extends by keeping the inodes
+		 * inline data.
+		 */
+		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
+			up_write(&oi->ip_alloc_sem);
+			goto out_update_size;
+		}
+
+		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+		if (ret) {
+			up_write(&oi->ip_alloc_sem);
+
+			mlog_errno(ret);
+			goto out_unlock;
+		}
+	}
+
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+
+	up_write(&oi->ip_alloc_sem);
 
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -957,7 +1000,7 @@ out_update_size:
 		mlog_errno(ret);
 
 out_unlock:
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+	if (data_locked)
 		ocfs2_data_unlock(inode, 1);
 
 out:
@@ -1231,6 +1274,31 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 {
 	int ret;
 	u32 cpos, phys_cpos, clusters, alloc_size;
+	u64 end = start + len;
+	struct buffer_head *di_bh = NULL;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       OCFS2_I(inode)->ip_blkno, &di_bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Nothing to do if the requested reservation range
+		 * fits within the inode.
+		 */
+		if (ocfs2_size_fits_inline_data(di_bh, end))
+			goto out;
+
+		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
 
 	/*
 	 * We consider both start and len to be inclusive.
@@ -1276,6 +1344,8 @@ next:
 
 	ret = 0;
 out:
+
+	brelse(di_bh);
 	return ret;
 }
 
@@ -1457,6 +1527,14 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	if (byte_len == 0)
 		return 0;
 
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
+					    byte_start + byte_len, 1);
+		if (ret)
+			mlog_errno(ret);
+		return ret;
+	}
+
 	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
 	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
 	if (trunc_len >= trunc_start)
@@ -1758,6 +1836,15 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 		if (!direct_io || !(*direct_io))
 			break;
 
+		/*
+		 * There's no sane way to do direct writes to an inode
+		 * with inline data.
+		 */
+		if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+			*direct_io = 0;
+			break;
+		}
+
 		/*
 		 * Allowing concurrent direct writes means
 		 * i_size changes wouldn't be synchronized, so
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c8923bab422..1d5e0cb0fda 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -514,6 +514,10 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
+	/*
+	 * This check will also skip truncate of inodes with inline
+	 * data and fast symlinks.
+	 */
 	if (fe->i_clusters) {
 		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 		if (IS_ERR(handle)) {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index ce60aab013a..4b32e096156 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -282,6 +282,9 @@ int                  ocfs2_journal_dirty_data(handle_t *handle,
  * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
 
+#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC		\
+					 + OCFS2_INODE_UPDATE_CREDITS)
+
 /* dinode + group descriptor update. We don't relink on free yet. */
 #define OCFS2_SUBALLOC_FREE  (2)
 
-- 
cgit v1.2.3


From 23193e513d1cd69411469f028d56fd175d4a6b07 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 12 Sep 2007 13:01:18 -0700
Subject: ocfs2: Read support for directories with inline data

This splits out extent based directory read support and implements
inline-data versions of those functions. All knowledge of inline-data versus
extent based directories is internalized. For lookups the code uses
ocfs2_find_entry_id(), full dir iterations make use of
ocfs2_dir_foreach_blk_id().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dir.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 168 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2e2ffbf6c9..8deed89330c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -81,6 +81,10 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
+/*
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
+ */
 static int ocfs2_check_dir_entry(struct inode * dir,
 				 struct ocfs2_dir_entry * de,
 				 struct buffer_head * bh,
@@ -125,6 +129,8 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
 					struct inode *dir,
 					const char *name, int namelen,
 					unsigned long offset,
+					char *first_de,
+					unsigned int bytes,
 					struct ocfs2_dir_entry **res_dir)
 {
 	struct ocfs2_dir_entry *de;
@@ -134,8 +140,8 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
 
 	mlog_entry_void();
 
-	de_buf = bh->b_data;
-	dlimit = de_buf + dir->i_sb->s_blocksize;
+	de_buf = first_de;
+	dlimit = de_buf + bytes;
 
 	while (de_buf < dlimit) {
 		/* this code is executed quadratically often */
@@ -171,9 +177,39 @@ bail:
 	return ret;
 }
 
-struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
-				     struct inode *dir,
-				     struct ocfs2_dir_entry **res_dir)
+static struct buffer_head *ocfs2_find_entry_id(const char *name,
+					       int namelen,
+					       struct inode *dir,
+					       struct ocfs2_dir_entry **res_dir)
+{
+	int ret, found;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_inline_data *data;
+
+	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, dir);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	data = &di->id2.i_data;
+
+	found = ocfs2_search_dirblock(di_bh, dir, name, namelen, 0,
+				      data->id_data, i_size_read(dir), res_dir);
+	if (found == 1)
+		return di_bh;
+
+	brelse(di_bh);
+out:
+	return NULL;
+}
+
+struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
+					struct inode *dir,
+					struct ocfs2_dir_entry **res_dir)
 {
 	struct super_block *sb;
 	struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -188,7 +224,6 @@ struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
 
 	mlog_entry_void();
 
-	*res_dir = NULL;
 	sb = dir->i_sb;
 
 	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
@@ -236,6 +271,7 @@ restart:
 		}
 		i = ocfs2_search_dirblock(bh, dir, name, namelen,
 					  block << sb->s_blocksize_bits,
+					  bh->b_data, sb->s_blocksize,
 					  res_dir);
 		if (i == 1) {
 			OCFS2_I(dir)->ip_dir_start_lookup = block;
@@ -271,6 +307,30 @@ cleanup_and_exit:
 	return ret;
 }
 
+/*
+ * Try to find an entry of the provided name within 'dir'.
+ *
+ * If nothing was found, NULL is returned. Otherwise, a buffer_head
+ * and pointer to the dir entry are passed back.
+ *
+ * Caller can NOT assume anything about the contents of the
+ * buffer_head - it is passed back only so that it can be passed into
+ * any one of the manipulation functions (add entry, delete entry,
+ * etc). As an example, bh in the extent directory case is a data
+ * block, in the inline-data case it actually points to an inode.
+ */
+struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
+				     struct inode *dir,
+				     struct ocfs2_dir_entry **res_dir)
+{
+	*res_dir = NULL;
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return ocfs2_find_entry_id(name, namelen, dir, res_dir);
+
+	return ocfs2_find_entry_el(name, namelen, dir, res_dir);
+}
+
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
 		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
 		       struct inode *new_entry_inode)
@@ -459,8 +519,98 @@ bail:
 	return retval;
 }
 
-static int ocfs2_dir_foreach_blk(struct inode *inode, unsigned long *f_version,
-				 loff_t *f_pos, void *priv, filldir_t filldir)
+static int ocfs2_dir_foreach_blk_id(struct inode *inode,
+				    unsigned long *f_version,
+				    loff_t *f_pos, void *priv,
+				    filldir_t filldir)
+{
+	int ret, i, filldir_ret;
+	unsigned long offset = *f_pos;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_inline_data *data;
+	struct ocfs2_dir_entry *de;
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
+	if (ret) {
+		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	data = &di->id2.i_data;
+
+	while (*f_pos < i_size_read(inode)) {
+revalidate:
+		/* If the dir block has changed since the last call to
+		 * readdir(2), then we might be pointing to an invalid
+		 * dirent right now.  Scan from the start of the block
+		 * to make sure. */
+		if (*f_version != inode->i_version) {
+			for (i = 0; i < i_size_read(inode) && i < offset; ) {
+				de = (struct ocfs2_dir_entry *)
+					(data->id_data + i);
+				/* It's too expensive to do a full
+				 * dirent test each time round this
+				 * loop, but we do have to test at
+				 * least that it is non-zero.  A
+				 * failure will be detected in the
+				 * dirent test below. */
+				if (le16_to_cpu(de->rec_len) <
+				    OCFS2_DIR_REC_LEN(1))
+					break;
+				i += le16_to_cpu(de->rec_len);
+			}
+			*f_pos = offset = i;
+			*f_version = inode->i_version;
+		}
+
+		de = (struct ocfs2_dir_entry *) (data->id_data + *f_pos);
+		if (!ocfs2_check_dir_entry(inode, de, di_bh, *f_pos)) {
+			/* On error, skip the f_pos to the end. */
+			*f_pos = i_size_read(inode);
+			goto out;
+		}
+		offset += le16_to_cpu(de->rec_len);
+		if (le64_to_cpu(de->inode)) {
+			/* We might block in the next section
+			 * if the data destination is
+			 * currently swapped out.  So, use a
+			 * version stamp to detect whether or
+			 * not the directory has been modified
+			 * during the copy operation.
+			 */
+			unsigned long version = *f_version;
+			unsigned char d_type = DT_UNKNOWN;
+
+			if (de->file_type < OCFS2_FT_MAX)
+				d_type = ocfs2_filetype_table[de->file_type];
+
+			filldir_ret = filldir(priv, de->name,
+					      de->name_len,
+					      *f_pos,
+					      le64_to_cpu(de->inode),
+					      d_type);
+			if (filldir_ret)
+				break;
+			if (version != *f_version)
+				goto revalidate;
+		}
+		*f_pos += le16_to_cpu(de->rec_len);
+	}
+
+out:
+	brelse(di_bh);
+
+	return 0;
+}
+
+static int ocfs2_dir_foreach_blk_el(struct inode *inode,
+				    unsigned long *f_version,
+				    loff_t *f_pos, void *priv,
+				    filldir_t filldir)
 {
 	int error = 0;
 	unsigned long offset, blk, last_ra_blk = 0;
@@ -576,6 +726,16 @@ out:
 	return stored;
 }
 
+static int ocfs2_dir_foreach_blk(struct inode *inode, unsigned long *f_version,
+				 loff_t *f_pos, void *priv, filldir_t filldir)
+{
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
+						filldir);
+
+	return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir);
+}
+
 /*
  * This is intended to be called from inside other kernel functions,
  * so we fake some arguments.
-- 
cgit v1.2.3


From 5b6a3a2b4a5f071d170f8122038dd647a84810a8 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 13 Sep 2007 16:33:54 -0700
Subject: ocfs2: Write support for directories with inline data

Create all new directories with OCFS2_INLINE_DATA_FL and the inline data
bytes formatted as an empty directory. Inode size field reflects the actual
amount of inline data available, which makes searching for dirent space
very similar to the regular directory search.

Inline-data directories are automatically pushed out to extents on any
insert request which is too large for the available space.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Reviewed-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c |  16 +-
 fs/ocfs2/alloc.h |   1 +
 fs/ocfs2/dir.c   | 618 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/ocfs2/namei.c |  57 +++--
 4 files changed, 594 insertions(+), 98 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 72cefe25382..4ba7f0bdc24 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5835,6 +5835,15 @@ static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
 	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
 }
 
+void ocfs2_dinode_new_extent_list(struct inode *inode,
+				  struct ocfs2_dinode *di)
+{
+	ocfs2_zero_dinode_id2(inode, di);
+	di->id2.i_list.l_tree_depth = 0;
+	di->id2.i_list.l_next_free_rec = 0;
+	di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+}
+
 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -5863,7 +5872,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_list *el = &di->id2.i_list;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct page **pages = NULL;
 	loff_t end = osb->s_clustersize;
@@ -5956,11 +5964,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
 
-	ocfs2_zero_dinode_id2(inode, di);
-
-	el->l_tree_depth = 0;
-	el->l_next_free_rec = 0;
-	el->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+	ocfs2_dinode_new_extent_list(inode, di);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 826e0a6cf5c..42ff94bd801 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -62,6 +62,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
 	return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
 }
 
+void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di);
 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 					 struct buffer_head *di_bh);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8deed89330c..e1535133841 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -72,6 +72,7 @@ static unsigned char ocfs2_filetype_table[] = {
 static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct inode *dir,
 			    struct buffer_head *parent_fe_bh,
+			    unsigned int blocks_wanted,
 			    struct buffer_head **new_de_bh);
 static int ocfs2_do_extend_dir(struct super_block *sb,
 			       handle_t *handle,
@@ -331,12 +332,20 @@ struct buffer_head *ocfs2_find_entry(const char *name, int namelen,
 	return ocfs2_find_entry_el(name, namelen, dir, res_dir);
 }
 
+/*
+ * Update inode number and type of a previously found directory entry.
+ */
 int ocfs2_update_entry(struct inode *dir, handle_t *handle,
 		       struct buffer_head *de_bh, struct ocfs2_dir_entry *de,
 		       struct inode *new_entry_inode)
 {
 	int ret;
 
+	/*
+	 * The same code works fine for both inline-data and extent
+	 * based directories, so no need to split this up.
+	 */
+
 	ret = ocfs2_journal_access(handle, dir, de_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
@@ -353,14 +362,10 @@ out:
 	return ret;
 }
 
-/*
- * ocfs2_delete_entry deletes a directory entry by merging it with the
- * previous entry
- */
-int ocfs2_delete_entry(handle_t *handle,
-		       struct inode *dir,
-		       struct ocfs2_dir_entry *de_del,
-		       struct buffer_head *bh)
+static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
+				struct ocfs2_dir_entry *de_del,
+				struct buffer_head *bh, char *first_de,
+				unsigned int bytes)
 {
 	struct ocfs2_dir_entry *de, *pde;
 	int i, status = -ENOENT;
@@ -369,8 +374,8 @@ int ocfs2_delete_entry(handle_t *handle,
 
 	i = 0;
 	pde = NULL;
-	de = (struct ocfs2_dir_entry *) bh->b_data;
-	while (i < bh->b_size) {
+	de = (struct ocfs2_dir_entry *) first_de;
+	while (i < bytes) {
 		if (!ocfs2_check_dir_entry(dir, de, bh, i)) {
 			status = -EIO;
 			mlog_errno(status);
@@ -403,6 +408,58 @@ bail:
 	return status;
 }
 
+static inline int ocfs2_delete_entry_id(handle_t *handle,
+					struct inode *dir,
+					struct ocfs2_dir_entry *de_del,
+					struct buffer_head *bh)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_inline_data *data;
+
+	ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, dir);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+	data = &di->id2.i_data;
+
+	ret = __ocfs2_delete_entry(handle, dir, de_del, bh, data->id_data,
+				   i_size_read(dir));
+
+	brelse(di_bh);
+out:
+	return ret;
+}
+
+static inline int ocfs2_delete_entry_el(handle_t *handle,
+					struct inode *dir,
+					struct ocfs2_dir_entry *de_del,
+					struct buffer_head *bh)
+{
+	return __ocfs2_delete_entry(handle, dir, de_del, bh, bh->b_data,
+				    bh->b_size);
+}
+
+/*
+ * ocfs2_delete_entry deletes a directory entry by merging it with the
+ * previous entry
+ */
+int ocfs2_delete_entry(handle_t *handle,
+		       struct inode *dir,
+		       struct ocfs2_dir_entry *de_del,
+		       struct buffer_head *bh)
+{
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return ocfs2_delete_entry_id(handle, dir, de_del, bh);
+
+	return ocfs2_delete_entry_el(handle, dir, de_del, bh);
+}
+
 /*
  * Check whether 'de' has enough room to hold an entry of
  * 'new_rec_len' bytes.
@@ -444,21 +501,30 @@ int __ocfs2_add_entry(handle_t *handle,
 	unsigned long offset;
 	unsigned short rec_len;
 	struct ocfs2_dir_entry *de, *de1;
-	struct super_block *sb;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data;
+	struct super_block *sb = dir->i_sb;
 	int retval, status;
+	unsigned int size = sb->s_blocksize;
+	char *data_start = insert_bh->b_data;
 
 	mlog_entry_void();
 
-	sb = dir->i_sb;
-
 	if (!namelen)
 		return -EINVAL;
 
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		data_start = di->id2.i_data.id_data;
+		size = i_size_read(dir);
+
+		BUG_ON(insert_bh != parent_fe_bh);
+	}
+
 	rec_len = OCFS2_DIR_REC_LEN(namelen);
 	offset = 0;
-	de = (struct ocfs2_dir_entry *) insert_bh->b_data;
+	de = (struct ocfs2_dir_entry *) data_start;
 	while (1) {
-		BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data);
+		BUG_ON((char *)de >= (size + data_start));
+
 		/* These checks should've already been passed by the
 		 * prepare function, but I guess we can leave them
 		 * here anyway. */
@@ -939,16 +1005,78 @@ int ocfs2_empty_dir(struct inode *inode)
 	return !priv.seen_other;
 }
 
-int ocfs2_fill_new_dir(struct ocfs2_super *osb,
-		       handle_t *handle,
-		       struct inode *parent,
-		       struct inode *inode,
-		       struct buffer_head *fe_bh,
-		       struct ocfs2_alloc_context *data_ac)
+static void ocfs2_fill_initial_dirents(struct inode *inode,
+				       struct inode *parent,
+				       char *start, unsigned int size)
+{
+	struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
+
+	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	de->name_len = 1;
+	de->rec_len =
+		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
+	strcpy(de->name, ".");
+	ocfs2_set_de_type(de, S_IFDIR);
+
+	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
+	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
+	de->rec_len = cpu_to_le16(size - OCFS2_DIR_REC_LEN(1));
+	de->name_len = 2;
+	strcpy(de->name, "..");
+	ocfs2_set_de_type(de, S_IFDIR);
+}
+
+/*
+ * This works together with code in ocfs2_mknod_locked() which sets
+ * the inline-data flag and initializes the inline-data section.
+ */
+static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *di_bh)
+{
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_inline_data *data = &di->id2.i_data;
+	unsigned int size = le16_to_cpu(data->id_count);
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_fill_initial_dirents(inode, parent, data->id_data, size);
+
+	ocfs2_journal_dirty(handle, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	i_size_write(inode, size);
+	inode->i_nlink = 2;
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+
+	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	return ret;
+}
+
+static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
+				 handle_t *handle,
+				 struct inode *parent,
+				 struct inode *inode,
+				 struct buffer_head *fe_bh,
+				 struct ocfs2_alloc_context *data_ac)
 {
 	int status;
 	struct buffer_head *new_bh = NULL;
-	struct ocfs2_dir_entry *de = NULL;
 
 	mlog_entry_void();
 
@@ -969,20 +1097,8 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 	}
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
-	de = (struct ocfs2_dir_entry *) new_bh->b_data;
-	de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
-	de->name_len = 1;
-	de->rec_len =
-		cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len));
-	strcpy(de->name, ".");
-	ocfs2_set_de_type(de, S_IFDIR);
-	de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len));
-	de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno);
-	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize -
-				  OCFS2_DIR_REC_LEN(1));
-	de->name_len = 2;
-	strcpy(de->name, "..");
-	ocfs2_set_de_type(de, S_IFDIR);
+	ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
+				   osb->sb->s_blocksize);
 
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
@@ -1008,6 +1124,230 @@ bail:
 	return status;
 }
 
+int ocfs2_fill_new_dir(struct ocfs2_super *osb,
+		       handle_t *handle,
+		       struct inode *parent,
+		       struct inode *inode,
+		       struct buffer_head *fe_bh,
+		       struct ocfs2_alloc_context *data_ac)
+{
+	BUG_ON(!ocfs2_supports_inline_data(osb) && data_ac == NULL);
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return ocfs2_fill_new_dir_id(osb, handle, parent, inode, fe_bh);
+
+	return ocfs2_fill_new_dir_el(osb, handle, parent, inode, fe_bh,
+				     data_ac);
+}
+
+static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
+				     unsigned int new_size)
+{
+	struct ocfs2_dir_entry *de;
+	struct ocfs2_dir_entry *prev_de;
+	char *de_buf, *limit;
+	unsigned int bytes = new_size - old_size;
+
+	limit = start + old_size;
+	de_buf = start;
+	de = (struct ocfs2_dir_entry *)de_buf;
+	do {
+		prev_de = de;
+		de_buf += le16_to_cpu(de->rec_len);
+		de = (struct ocfs2_dir_entry *)de_buf;
+	} while (de_buf < limit);
+
+	le16_add_cpu(&prev_de->rec_len, bytes);
+}
+
+/*
+ * We allocate enough clusters to fulfill "blocks_wanted", but set
+ * i_size to exactly one block. Ocfs2_extend_dir() will handle the
+ * rest automatically for us.
+ *
+ * *first_block_bh is a pointer to the 1st data block allocated to the
+ *  directory.
+ */
+static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
+				   unsigned int blocks_wanted,
+				   struct buffer_head **first_block_bh)
+{
+	int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
+	u32 alloc, bit_off, len;
+	struct super_block *sb = dir->i_sb;
+	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	struct ocfs2_inode_info *oi = OCFS2_I(dir);
+	struct ocfs2_alloc_context *data_ac;
+	struct buffer_head *dirdata_bh = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	handle_t *handle;
+
+	alloc = ocfs2_clusters_for_bytes(sb, bytes);
+
+	/*
+	 * We should never need more than 2 clusters for this -
+	 * maximum dirent size is far less than one block. In fact,
+	 * the only time we'd need more than one cluster is if
+	 * blocksize == clustersize and the dirent won't fit in the
+	 * extra space that the expansion to a single block gives. As
+	 * of today, that only happens on 4k/4k file systems.
+	 */
+	BUG_ON(alloc > 2);
+
+	ret = ocfs2_reserve_clusters(osb, alloc, &data_ac);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	down_write(&oi->ip_alloc_sem);
+
+	/*
+	 * Prepare for worst case allocation scenario of two seperate
+	 * extents.
+	 */
+	if (alloc == 2)
+		credits += OCFS2_SUBALLOC_ALLOC;
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_sem;
+	}
+
+	/*
+	 * Try to claim as many clusters as the bitmap can give though
+	 * if we only get one now, that's enough to continue. The rest
+	 * will be claimed after the conversion to extents.
+	 */
+	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Operations are carefully ordered so that we set up the new
+	 * data block first. The conversion from inline data to
+	 * extents follows.
+	 */
+	blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+	dirdata_bh = sb_getblk(sb, blkno);
+	if (!dirdata_bh) {
+		ret = -EIO;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
+
+	ret = ocfs2_journal_access(handle, dir, dirdata_bh,
+				   OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
+	memset(dirdata_bh->b_data + i_size_read(dir), 0,
+	       sb->s_blocksize - i_size_read(dir));
+	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
+				 sb->s_blocksize);
+
+	ret = ocfs2_journal_dirty(handle, dirdata_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * Set extent, i_size, etc on the directory. After this, the
+	 * inode should contain the same exact dirents as before and
+	 * be fully accessible from system calls.
+	 *
+	 * We let the later dirent insert modify c/mtime - to the user
+	 * the data hasn't changed.
+	 */
+	ret = ocfs2_journal_access(handle, dir, di_bh,
+				   OCFS2_JOURNAL_ACCESS_CREATE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	spin_lock(&oi->ip_lock);
+	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
+	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+	spin_unlock(&oi->ip_lock);
+
+	ocfs2_dinode_new_extent_list(dir, di);
+
+	i_size_write(dir, sb->s_blocksize);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+
+	di->i_size = cpu_to_le64(sb->s_blocksize);
+	di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+	dir->i_blocks = ocfs2_inode_sector_count(dir);
+
+	/*
+	 * This should never fail as our extent list is empty and all
+	 * related blocks have been journaled already.
+	 */
+	ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
+				  NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/*
+	 * We asked for two clusters, but only got one in the 1st
+	 * pass. Claim the 2nd cluster as a separate extent.
+	 */
+	if (alloc > len) {
+		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
+					   &len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
+		blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
+
+		ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
+					  len, 0, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	*first_block_bh = dirdata_bh;
+	dirdata_bh = NULL;
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_sem:
+	up_write(&oi->ip_alloc_sem);
+
+out:
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+
+	brelse(dirdata_bh);
+
+	return ret;
+}
+
 /* returns a bh of the 1st new block in the allocation. */
 static int ocfs2_do_extend_dir(struct super_block *sb,
 			       handle_t *handle,
@@ -1057,10 +1397,18 @@ bail:
 	return status;
 }
 
-/* assumes you already have a cluster lock on the directory. */
+/*
+ * Assumes you already have a cluster lock on the directory.
+ *
+ * 'blocks_wanted' is only used if we have an inline directory which
+ * is to be turned into an extent based one. The size of the dirent to
+ * insert might be larger than the space gained by growing to just one
+ * block, so we may have to grow the inode by two blocks in that case.
+ */
 static int ocfs2_extend_dir(struct ocfs2_super *osb,
 			    struct inode *dir,
 			    struct buffer_head *parent_fe_bh,
+			    unsigned int blocks_wanted,
 			    struct buffer_head **new_de_bh)
 {
 	int status = 0;
@@ -1076,6 +1424,38 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		status = ocfs2_expand_inline_dir(dir, parent_fe_bh,
+						 blocks_wanted, &new_bh);
+		if (status) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		if (blocks_wanted == 1) {
+			/*
+			 * If the new dirent will fit inside the space
+			 * created by pushing out to one block, then
+			 * we can complete the operation
+			 * here. Otherwise we have to expand i_size
+			 * and format the 2nd block below.
+			 */
+			BUG_ON(new_bh == NULL);
+			goto bail_bh;
+		}
+
+		/*
+		 * Get rid of 'new_bh' - we want to format the 2nd
+		 * data block and return that instead.
+		 */
+		brelse(new_bh);
+		new_bh = NULL;
+
+		dir_i_size = i_size_read(dir);
+		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
+		goto do_extend;
+	}
+
 	dir_i_size = i_size_read(dir);
 	mlog(0, "extending dir %llu (i_size = %lld)\n",
 	     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
@@ -1113,6 +1493,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
 	}
 
+do_extend:
 	down_write(&OCFS2_I(dir)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
@@ -1158,6 +1539,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+bail_bh:
 	*new_de_bh = new_bh;
 	get_bh(*new_de_bh);
 bail:
@@ -1178,41 +1560,71 @@ bail:
 	return status;
 }
 
-/*
- * Search the dir for a good spot, extending it if necessary. The
- * block containing an appropriate record is returned in ret_de_bh.
- */
-int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
-				 struct inode *dir,
-				 struct buffer_head *parent_fe_bh,
-				 const char *name,
-				 int namelen,
-				 struct buffer_head **ret_de_bh)
+static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
+				   const char *name, int namelen,
+				   struct buffer_head **ret_de_bh,
+				   unsigned int *blocks_wanted)
 {
-	unsigned long offset;
-	struct buffer_head * bh = NULL;
-	unsigned short rec_len;
-	struct ocfs2_dinode *fe;
-	struct ocfs2_dir_entry *de;
-	struct super_block *sb;
-	int status;
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_dir_entry *de, *last_de = NULL;
+	char *de_buf, *limit;
+	unsigned long offset = 0;
+	unsigned int rec_len, new_rec_len;
+
+	de_buf = di->id2.i_data.id_data;
+	limit = de_buf + i_size_read(dir);
+	rec_len = OCFS2_DIR_REC_LEN(namelen);
 
-	mlog_entry_void();
+	while (de_buf < limit) {
+		de = (struct ocfs2_dir_entry *)de_buf;
 
-	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
-	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
+		if (!ocfs2_check_dir_entry(dir, de, di_bh, offset)) {
+			ret = -ENOENT;
+			goto out;
+		}
+		if (ocfs2_match(namelen, name, de)) {
+			ret = -EEXIST;
+			goto out;
+		}
+		if (ocfs2_dirent_would_fit(de, rec_len)) {
+			/* Ok, we found a spot. Return this bh and let
+			 * the caller actually fill it in. */
+			*ret_de_bh = di_bh;
+			get_bh(*ret_de_bh);
+			ret = 0;
+			goto out;
+		}
 
-	BUG_ON(!S_ISDIR(dir->i_mode));
-	fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
-	BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir));
+		last_de = de;
+		de_buf += le16_to_cpu(de->rec_len);
+		offset += le16_to_cpu(de->rec_len);
+	}
 
-	sb = dir->i_sb;
+	/*
+	 * We're going to require expansion of the directory - figure
+	 * out how many blocks we'll need so that a place for the
+	 * dirent can be found.
+	 */
+	*blocks_wanted = 1;
+	new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+	if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
+		*blocks_wanted = 2;
 
-	if (!namelen) {
-		status = -EINVAL;
-		mlog_errno(status);
-		goto bail;
-	}
+	ret = -ENOSPC;
+out:
+	return ret;
+}
+
+static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
+				   int namelen, struct buffer_head **ret_de_bh)
+{
+	unsigned long offset;
+	struct buffer_head *bh = NULL;
+	unsigned short rec_len;
+	struct ocfs2_dir_entry *de;
+	struct super_block *sb = dir->i_sb;
+	int status;
 
 	bh = ocfs2_bread(dir, 0, &status, 0);
 	if (!bh) {
@@ -1229,17 +1641,11 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 			bh = NULL;
 
 			if (i_size_read(dir) <= offset) {
-				status = ocfs2_extend_dir(osb,
-							  dir,
-							  parent_fe_bh,
-							  &bh);
-				if (status < 0) {
-					mlog_errno(status);
-					goto bail;
-				}
-				BUG_ON(!bh);
-				*ret_de_bh = bh;
-				get_bh(*ret_de_bh);
+				/*
+				 * Caller will have to expand this
+				 * directory.
+				 */
+				status = -ENOSPC;
 				goto bail;
 			}
 			bh = ocfs2_bread(dir,
@@ -1281,3 +1687,61 @@ bail:
 	mlog_exit(status);
 	return status;
 }
+
+int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
+				 struct inode *dir,
+				 struct buffer_head *parent_fe_bh,
+				 const char *name,
+				 int namelen,
+				 struct buffer_head **ret_de_bh)
+{
+	int ret;
+	unsigned int blocks_wanted = 1;
+	struct buffer_head *bh = NULL;
+
+	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
+	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
+
+	*ret_de_bh = NULL;
+
+	if (!namelen) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		ret = ocfs2_find_dir_space_id(dir, parent_fe_bh, name,
+					      namelen, &bh, &blocks_wanted);
+	} else
+		ret = ocfs2_find_dir_space_el(dir, name, namelen, &bh);
+
+	if (ret && ret != -ENOSPC) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (ret == -ENOSPC) {
+		/*
+		 * We have to expand the directory to add this name.
+		 */
+		BUG_ON(bh);
+
+		ret = ocfs2_extend_dir(osb, dir, parent_fe_bh, blocks_wanted,
+				       &bh);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+		BUG_ON(!bh);
+	}
+
+	*ret_de_bh = bh;
+	bh = NULL;
+out:
+	if (bh)
+		brelse(bh);
+	return ret;
+}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d2f03355d12..729259016c1 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -250,9 +250,8 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	/* are we making a directory? If so, reserve a cluster for his
-	 * 1st extent. */
-	if (S_ISDIR(mode)) {
+	/* Reserve a cluster if creating an extent based directory. */
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
 		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
 		if (status < 0) {
 			if (status != -ENOSPC)
@@ -449,10 +448,21 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		cpu_to_le32(CURRENT_TIME.tv_nsec);
 	fe->i_dtime = 0;
 
-	fel = &fe->id2.i_list;
-	fel->l_tree_depth = 0;
-	fel->l_next_free_rec = 0;
-	fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+	/*
+	 * If supported, directories start with inline data.
+	 */
+	if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+		u16 feat = le16_to_cpu(fe->i_dyn_features);
+
+		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
+
+		fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb));
+	} else {
+		fel = &fe->id2.i_list;
+		fel->l_tree_depth = 0;
+		fel->l_next_free_rec = 0;
+		fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb));
+	}
 
 	status = ocfs2_journal_dirty(handle, *new_fe_bh);
 	if (status < 0) {
@@ -950,7 +960,7 @@ static int ocfs2_rename(struct inode *old_dir,
 	struct buffer_head *old_inode_bh = NULL;
 	struct buffer_head *insert_entry_bh = NULL;
 	struct ocfs2_super *osb = NULL;
-	u64 newfe_blkno;
+	u64 newfe_blkno, old_de_ino;
 	handle_t *handle = NULL;
 	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *new_dir_bh = NULL;
@@ -1061,12 +1071,13 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 	}
 
-	status = -ENOENT;
-	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
-				     old_dentry->d_name.len,
-				     old_dir, &old_de);
-	if (!old_de_bh)
+	status = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
+					    old_dentry->d_name.len,
+					    &old_de_ino);
+	if (status) {
+		status = -ENOENT;
 		goto bail;
+	}
 
 	/*
 	 *  Check for inode number is _not_ due to possible IO errors.
@@ -1074,8 +1085,10 @@ static int ocfs2_rename(struct inode *old_dir,
 	 *  and merrily kill the link to whatever was created under the
 	 *  same name. Goodbye sticky bit ;-<
 	 */
-	if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno)
+	if (old_de_ino != OCFS2_I(old_inode)->ip_blkno) {
+		status = -ENOENT;
 		goto bail;
+	}
 
 	/* check if the target already exists (in which case we need
 	 * to delete it */
@@ -1250,7 +1263,21 @@ static int ocfs2_rename(struct inode *old_dir,
 	} else
 		mlog_errno(status);
 
-	/* now that the name has been added to new_dir, remove the old name */
+	/*
+	 * Now that the name has been added to new_dir, remove the old name.
+	 *
+	 * We don't keep any directory entry context around until now
+	 * because the insert might have changed the type of directory
+	 * we're dealing with.
+	 */
+	old_de_bh = ocfs2_find_entry(old_dentry->d_name.name,
+				     old_dentry->d_name.len,
+				     old_dir, &old_de);
+	if (!old_de_bh) {
+		status = -EIO;
+		goto bail;
+	}
+
 	status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh);
 	if (status < 0) {
 		mlog_errno(status);
-- 
cgit v1.2.3


From e7b34019606ab1dd06196635e931b0c302799228 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 24 Sep 2007 14:25:27 -0700
Subject: ocfs2: Optionally return filldir errors

Modify ocfs2_dir_foreach_blk() to optionally return any error from the
filldir callback. This way ocfs2_dirforeach() can terminate early, as
opposed to always passing through the entire directory. This fixes a bug
introduced during a previous code refactor where ocfs2_empty_dir() would
loop infinitely.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/dir.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index e1535133841..7453b70c1a1 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -588,7 +588,7 @@ bail:
 static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 				    unsigned long *f_version,
 				    loff_t *f_pos, void *priv,
-				    filldir_t filldir)
+				    filldir_t filldir, int *filldir_err)
 {
 	int ret, i, filldir_ret;
 	unsigned long offset = *f_pos;
@@ -659,8 +659,11 @@ revalidate:
 					      *f_pos,
 					      le64_to_cpu(de->inode),
 					      d_type);
-			if (filldir_ret)
+			if (filldir_ret) {
+				if (filldir_err)
+					*filldir_err = filldir_ret;
 				break;
+			}
 			if (version != *f_version)
 				goto revalidate;
 		}
@@ -676,7 +679,7 @@ out:
 static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 				    unsigned long *f_version,
 				    loff_t *f_pos, void *priv,
-				    filldir_t filldir)
+				    filldir_t filldir, int *filldir_err)
 {
 	int error = 0;
 	unsigned long offset, blk, last_ra_blk = 0;
@@ -775,8 +778,11 @@ revalidate:
 						*f_pos,
 						le64_to_cpu(de->inode),
 						d_type);
-				if (error)
+				if (error) {
+					if (filldir_err)
+						*filldir_err = error;
 					break;
+				}
 				if (version != *f_version)
 					goto revalidate;
 				stored ++;
@@ -793,13 +799,15 @@ out:
 }
 
 static int ocfs2_dir_foreach_blk(struct inode *inode, unsigned long *f_version,
-				 loff_t *f_pos, void *priv, filldir_t filldir)
+				 loff_t *f_pos, void *priv, filldir_t filldir,
+				 int *filldir_err)
 {
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return ocfs2_dir_foreach_blk_id(inode, f_version, f_pos, priv,
-						filldir);
+						filldir, filldir_err);
 
-	return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir);
+	return ocfs2_dir_foreach_blk_el(inode, f_version, f_pos, priv, filldir,
+					filldir_err);
 }
 
 /*
@@ -809,16 +817,19 @@ static int ocfs2_dir_foreach_blk(struct inode *inode, unsigned long *f_version,
 int ocfs2_dir_foreach(struct inode *inode, loff_t *f_pos, void *priv,
 		      filldir_t filldir)
 {
-	int ret = 0;
+	int ret = 0, filldir_err = 0;
 	unsigned long version = inode->i_version;
 
 	while (*f_pos < i_size_read(inode)) {
 		ret = ocfs2_dir_foreach_blk(inode, &version, f_pos, priv,
-					    filldir);
-		if (ret)
+					    filldir, &filldir_err);
+		if (ret || filldir_err)
 			break;
 	}
 
+	if (ret > 0)
+		ret = -EIO;
+
 	return 0;
 }
 
@@ -852,7 +863,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	}
 
 	error = ocfs2_dir_foreach_blk(inode, &filp->f_version, &filp->f_pos,
-				      dirent, filldir);
+				      dirent, filldir, NULL);
 
 	ocfs2_meta_unlock(inode, lock_level);
 
-- 
cgit v1.2.3


From 19c38de88a80913351fcacefdb461cc0b585fa87 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 12 Sep 2007 15:06:57 -0700
Subject: kobjects: fix up improper use of the kobject name field

A number of different drivers incorrect access the kobject name field
directly.  This is not correct as the name might not be in the array.
Use the proper accessor function instead.
---
 fs/partitions/check.c | 12 +++++++-----
 fs/sysfs/dir.c        |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 783c57ec07d..722e12e5acc 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -381,10 +381,12 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len,
 	p->partno = part;
 	p->policy = disk->policy;
 
-	if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1]))
-		snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part);
+	if (isdigit(disk->kobj.k_name[strlen(disk->kobj.k_name)-1]))
+		kobject_set_name(&p->kobj, "%sp%d",
+				 kobject_name(&disk->kobj), part);
 	else
-		snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part);
+		kobject_set_name(&p->kobj, "%s%d",
+				 kobject_name(&disk->kobj),part);
 	p->kobj.parent = &disk->kobj;
 	p->kobj.ktype = &ktype_part;
 	kobject_init(&p->kobj);
@@ -477,9 +479,9 @@ void register_disk(struct gendisk *disk)
 	struct hd_struct *p;
 	int err;
 
-	strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN);
+	kobject_set_name(&disk->kobj, "%s", disk->disk_name);
 	/* ewww... some of these buggers have / in name... */
-	s = strchr(disk->kobj.name, '/');
+	s = strchr(disk->kobj.k_name, '/');
 	if (s)
 		*s = '!';
 	if ((err = kobject_add(&disk->kobj)))
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 83e76b3813c..ea33b660ae8 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -1013,7 +1013,7 @@ again:
 		goto again;
 	}
 
-	new_dentry = lookup_one_len(kobj->name, new_parent, strlen(kobj->name));
+	new_dentry = lookup_one_len(kobject_name(kobj), new_parent, strlen(kobject_name(kobj)));
 	if (IS_ERR(new_dentry)) {
 		error = PTR_ERR(new_dentry);
 		goto out_unlock;
-- 
cgit v1.2.3


From 34980ca8faebfcce31094eba6ffbb0113950361f Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 12 Sep 2007 15:06:57 -0700
Subject: Drivers: clean up direct setting of the name of a kset

A kset should not have its name set directly, so dynamically set the
name at runtime.

This is needed to remove the static array in the kobject structure which
will be changed in a future patch.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/dlm/lockspace.c          | 2 +-
 fs/gfs2/locking/dlm/sysfs.c | 2 +-
 fs/gfs2/sys.c               | 2 +-
 fs/ocfs2/cluster/masklog.c  | 3 ++-
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 1dc72105ab1..f88f88fdedf 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -167,7 +167,6 @@ static struct kobj_type dlm_ktype = {
 };
 
 static struct kset dlm_kset = {
-	.kobj   = {.name = "dlm",},
 	.ktype  = &dlm_ktype,
 };
 
@@ -228,6 +227,7 @@ int dlm_lockspace_init(void)
 	INIT_LIST_HEAD(&lslist);
 	spin_lock_init(&lslist_lock);
 
+	kobject_set_name(&dlm_kset.kobj, "dlm");
 	kobj_set_kset_s(&dlm_kset, kernel_subsys);
 	error = kset_register(&dlm_kset);
 	if (error)
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index d9fe3ca40e1..ae9e6a25fe2 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -190,7 +190,6 @@ static struct kobj_type gdlm_ktype = {
 };
 
 static struct kset gdlm_kset = {
-	.kobj   = {.name = "lock_dlm",},
 	.ktype  = &gdlm_ktype,
 };
 
@@ -224,6 +223,7 @@ int gdlm_sysfs_init(void)
 {
 	int error;
 
+	kobject_set_name(&gdlm_kset.kobj, "lock_dlm");
 	kobj_set_kset_s(&gdlm_kset, kernel_subsys);
 	error = kset_register(&gdlm_kset);
 	if (error)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index c26c21b53c1..640cb6a6fc4 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -222,7 +222,6 @@ static struct kobj_type gfs2_ktype = {
 };
 
 static struct kset gfs2_kset = {
-	.kobj   = {.name = "gfs2"},
 	.ktype  = &gfs2_ktype,
 };
 
@@ -553,6 +552,7 @@ int gfs2_sys_init(void)
 {
 	gfs2_sys_margs = NULL;
 	spin_lock_init(&gfs2_sys_margs_lock);
+	kobject_set_name(&gfs2_kset.kobj, "gfs2");
 	kobj_set_kset_s(&gfs2_kset, fs_subsys);
 	return kset_register(&gfs2_kset);
 }
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index e9e042b93db..a4882c8df94 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -143,7 +143,7 @@ static struct kobj_type mlog_ktype = {
 };
 
 static struct kset mlog_kset = {
-	.kobj   = {.name = "logmask", .ktype = &mlog_ktype},
+	.kobj   = {.ktype = &mlog_ktype},
 };
 
 int mlog_sys_init(struct kset *o2cb_subsys)
@@ -156,6 +156,7 @@ int mlog_sys_init(struct kset *o2cb_subsys)
 	}
 	mlog_attr_ptrs[i] = NULL;
 
+	kobject_set_name(&mlog_kset.kobj, "logmask");
 	kobj_set_kset_s(&mlog_kset, *o2cb_subsys);
 	return kset_register(&mlog_kset);
 }
-- 
cgit v1.2.3


From 2ebefc50161a0a1cdebccd62be749e72abdbec37 Mon Sep 17 00:00:00 2001
From: Robin Getz <rgetz@blackfin.uclinux.org>
Date: Thu, 2 Aug 2007 18:23:50 -0400
Subject: debugfs: helper for decimal challenged

Allows debugfs helper functions to have a hex output, rather than just decimal

Signed-off-by: Robin Getz <rgetz@blackfin.uclinux.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/file.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 2e124e0075c..a9b99c0dc2e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -221,6 +221,42 @@ struct dentry *debugfs_create_u64(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_u64);
 
+DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n");
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n");
+
+/**
+ * debugfs_create_x8 - create a debugfs file that is used to read and write an unsigned 8-bit value
+ * debugfs_create_x16 - create a debugfs file that is used to read and write an unsigned 16-bit value
+ * debugfs_create_x32 - create a debugfs file that is used to read and write an unsigned 32-bit value
+ *
+ * These functions are exactly the same as the above functions, (but use a hex
+ * output for the decimal challenged) for details look at the above unsigned
+ * decimal functions.
+ */
+struct dentry *debugfs_create_x8(const char *name, mode_t mode,
+				 struct dentry *parent, u8 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_x8);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x8);
+
+struct dentry *debugfs_create_x16(const char *name, mode_t mode,
+				 struct dentry *parent, u16 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_x16);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x16);
+
+struct dentry *debugfs_create_x32(const char *name, mode_t mode,
+				 struct dentry *parent, u32 *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_x32);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x32);
+
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
 {
-- 
cgit v1.2.3


From 52e8c209d6d2bae6766b9940a107c73e943583f1 Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Thu, 26 Jul 2007 11:03:54 +0000
Subject: sysfs/file.c - use mutex instead of semaphore

Use mutex instead of semaphore in sysfs/file.c : sys_buffer.

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 3e1cc062a74..b21d11b4675 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -8,8 +8,8 @@
 #include <linux/namei.h>
 #include <linux/poll.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 
 #include "sysfs.h"
 
@@ -55,7 +55,7 @@ struct sysfs_buffer {
 	loff_t			pos;
 	char			* page;
 	struct sysfs_ops	* ops;
-	struct semaphore	sem;
+	struct mutex		mutex;
 	int			needs_read_fill;
 	int			event;
 };
@@ -128,7 +128,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	struct sysfs_buffer * buffer = file->private_data;
 	ssize_t retval = 0;
 
-	down(&buffer->sem);
+	mutex_lock(&buffer->mutex);
 	if (buffer->needs_read_fill) {
 		retval = fill_read_buffer(file->f_path.dentry,buffer);
 		if (retval)
@@ -139,7 +139,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
 					 buffer->count);
 out:
-	up(&buffer->sem);
+	mutex_unlock(&buffer->mutex);
 	return retval;
 }
 
@@ -228,13 +228,13 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
 	struct sysfs_buffer * buffer = file->private_data;
 	ssize_t len;
 
-	down(&buffer->sem);
+	mutex_lock(&buffer->mutex);
 	len = fill_write_buffer(buffer, buf, count);
 	if (len > 0)
 		len = flush_write_buffer(file->f_path.dentry, buffer, len);
 	if (len > 0)
 		*ppos += len;
-	up(&buffer->sem);
+	mutex_unlock(&buffer->mutex);
 	return len;
 }
 
@@ -294,7 +294,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	if (!buffer)
 		goto err_out;
 
-	init_MUTEX(&buffer->sem);
+	mutex_init(&buffer->mutex);
 	buffer->needs_read_fill = 1;
 	buffer->ops = ops;
 	file->private_data = buffer;
-- 
cgit v1.2.3


From 869512ab5ab93e5e82ad7d4aaf4ed098d23bfc3f Mon Sep 17 00:00:00 2001
From: Dave Young <hidave.darkstar@gmail.com>
Date: Thu, 26 Jul 2007 14:53:53 +0000
Subject: sysfs: cleanup semaphore.h

Cleanup semaphore.h

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c     | 2 +-
 fs/sysfs/dir.c     | 2 +-
 fs/sysfs/group.c   | 1 -
 fs/sysfs/inode.c   | 1 -
 fs/sysfs/mount.c   | 1 -
 fs/sysfs/symlink.c | 2 +-
 6 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 5afe2a26f5d..a819a7e8d74 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -14,9 +14,9 @@
 #include <linux/kobject.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
-#include <asm/semaphore.h>
 
 #include "sysfs.h"
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ea33b660ae8..86d75e08de6 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -11,7 +11,7 @@
 #include <linux/namei.h>
 #include <linux/idr.h>
 #include <linux/completion.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index f318b73c790..e6b904d7163 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -14,7 +14,6 @@
 #include <linux/namei.h>
 #include <linux/err.h>
 #include <linux/fs.h>
-#include <asm/semaphore.h>
 #include "sysfs.h"
 
 
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 10d1b52899f..aecb6e771e2 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -14,7 +14,6 @@
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
-#include <asm/semaphore.h>
 #include "sysfs.h"
 
 extern struct super_block * sysfs_sb;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index fbc7b65fe26..69a73ae307f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -8,7 +8,6 @@
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/init.h>
-#include <asm/semaphore.h>
 
 #include "sysfs.h"
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 4ce687f0b5d..90484533801 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -7,7 +7,7 @@
 #include <linux/module.h>
 #include <linux/kobject.h>
 #include <linux/namei.h>
-#include <asm/semaphore.h>
+#include <linux/mutex.h>
 
 #include "sysfs.h"
 
-- 
cgit v1.2.3


From 90bc61359de0148f8627073d68a22edc7ed9893d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Jul 2007 19:15:08 +0900
Subject: sysfs: Remove first pass at shadow directory support

While shadow directories appear to be a good idea, the current scheme
of controlling their creation and destruction outside of sysfs appears
to be a locking and maintenance nightmare in the face of sysfs
directories dynamically coming and going.  Which can now occur for
directories containing network devices when CONFIG_SYSFS_DEPRECATED is
not set.

This patch removes everything from the initial shadow directory support
that allowed the shadow directory creation to be controlled at a higher
level.  So except for a few bits of sysfs_rename_dir everything from
commit b592fcfe7f06c15ec11774b5be7ce0de3aa86e73 is now gone.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   | 167 ++++++++-----------------------------------------------
 fs/sysfs/group.c |   1 -
 fs/sysfs/inode.c |  10 ----
 fs/sysfs/mount.c |   2 +-
 fs/sysfs/sysfs.h |   6 --
 5 files changed, 23 insertions(+), 163 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 86d75e08de6..837073dbadf 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -569,9 +569,6 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd)
 	spin_unlock(&dcache_lock);
 	spin_unlock(&sysfs_assoc_lock);
 
-	/* dentries for shadowed inodes are pinned, unpin */
-	if (dentry && sysfs_is_shadowed_inode(dentry->d_inode))
-		dput(dentry);
 	dput(dentry);
 
 	/* adjust nlink and update timestamp */
@@ -723,19 +720,15 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
 /**
  *	sysfs_create_dir - create a directory for an object.
  *	@kobj:		object we're creating directory for. 
- *	@shadow_parent:	parent object.
  */
-int sysfs_create_dir(struct kobject *kobj,
-		     struct sysfs_dirent *shadow_parent_sd)
+int sysfs_create_dir(struct kobject * kobj)
 {
 	struct sysfs_dirent *parent_sd, *sd;
 	int error = 0;
 
 	BUG_ON(!kobj);
 
-	if (shadow_parent_sd)
-		parent_sd = shadow_parent_sd;
-	else if (kobj->parent)
+	if (kobj->parent)
 		parent_sd = kobj->parent->sd;
 	else if (sysfs_mount && sysfs_mount->mnt_sb)
 		parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
@@ -890,45 +883,44 @@ void sysfs_remove_dir(struct kobject * kobj)
 	__sysfs_remove_dir(sd);
 }
 
-int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
-		     const char *new_name)
+int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 {
-	struct sysfs_dirent *sd = kobj->sd;
-	struct dentry *new_parent = NULL;
+	struct sysfs_dirent *sd;
+	struct dentry *parent = NULL;
 	struct dentry *old_dentry = NULL, *new_dentry = NULL;
+	struct sysfs_dirent *parent_sd;
 	const char *dup_name = NULL;
 	int error;
 
+	if (!kobj->parent)
+		return -EINVAL;
+
 	/* get dentries */
+	sd = kobj->sd;
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
 		error = PTR_ERR(old_dentry);
 		goto out_dput;
 	}
 
-	new_parent = sysfs_get_dentry(new_parent_sd);
-	if (IS_ERR(new_parent)) {
-		error = PTR_ERR(new_parent);
+	parent_sd = kobj->parent->sd;
+	parent = sysfs_get_dentry(parent_sd);
+	if (IS_ERR(parent)) {
+		error = PTR_ERR(parent);
 		goto out_dput;
 	}
 
-	/* lock new_parent and get dentry for new name */
-	mutex_lock(&new_parent->d_inode->i_mutex);
+	/* lock parent and get dentry for new name */
+	mutex_lock(&parent->d_inode->i_mutex);
 
-	new_dentry = lookup_one_len(new_name, new_parent, strlen(new_name));
+	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
 	if (IS_ERR(new_dentry)) {
 		error = PTR_ERR(new_dentry);
 		goto out_unlock;
 	}
 
-	/* By allowing two different directories with the same
-	 * d_parent we allow this routine to move between different
-	 * shadows of the same directory
-	 */
 	error = -EINVAL;
-	if (old_dentry->d_parent->d_inode != new_parent->d_inode ||
-	    new_dentry->d_parent->d_inode != new_parent->d_inode ||
-	    old_dentry == new_dentry)
+	if (old_dentry == new_dentry)
 		goto out_unlock;
 
 	error = -EEXIST;
@@ -955,9 +947,9 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
 	d_move(sd->s_dentry, new_dentry);
 
 	sysfs_unlink_sibling(sd);
-	sysfs_get(new_parent_sd);
+	sysfs_get(parent_sd);
 	sysfs_put(sd->s_parent);
-	sd->s_parent = new_parent_sd;
+	sd->s_parent = parent_sd;
 	sysfs_link_sibling(sd);
 
 	mutex_unlock(&sysfs_mutex);
@@ -968,10 +960,10 @@ int sysfs_rename_dir(struct kobject *kobj, struct sysfs_dirent *new_parent_sd,
  out_drop:
 	d_drop(new_dentry);
  out_unlock:
-	mutex_unlock(&new_parent->d_inode->i_mutex);
+	mutex_unlock(&parent->d_inode->i_mutex);
  out_dput:
 	kfree(dup_name);
-	dput(new_parent);
+	dput(parent);
 	dput(old_dentry);
 	dput(new_dentry);
 	return error;
@@ -1192,121 +1184,6 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
 	return offset;
 }
 
-
-/**
- *	sysfs_make_shadowed_dir - Setup so a directory can be shadowed
- *	@kobj:	object we're creating shadow of.
- */
-
-int sysfs_make_shadowed_dir(struct kobject *kobj,
-	void * (*follow_link)(struct dentry *, struct nameidata *))
-{
-	struct dentry *dentry;
-	struct inode *inode;
-	struct inode_operations *i_op;
-
-	/* get dentry for @kobj->sd, dentry of a shadowed dir is pinned */
-	dentry = sysfs_get_dentry(kobj->sd);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-
-	inode = dentry->d_inode;
-	if (inode->i_op != &sysfs_dir_inode_operations) {
-		dput(dentry);
-		return -EINVAL;
-	}
-
-	i_op = kmalloc(sizeof(*i_op), GFP_KERNEL);
-	if (!i_op)
-		return -ENOMEM;
-
-	memcpy(i_op, &sysfs_dir_inode_operations, sizeof(*i_op));
-	i_op->follow_link = follow_link;
-
-	/* Locking of inode->i_op?
-	 * Since setting i_op is a single word write and they
-	 * are atomic we should be ok here.
-	 */
-	inode->i_op = i_op;
-	return 0;
-}
-
-/**
- *	sysfs_create_shadow_dir - create a shadow directory for an object.
- *	@kobj:	object we're creating directory for.
- *
- *	sysfs_make_shadowed_dir must already have been called on this
- *	directory.
- */
-
-struct sysfs_dirent *sysfs_create_shadow_dir(struct kobject *kobj)
-{
-	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
-	struct dentry *dir, *parent, *shadow;
-	struct inode *inode;
-	struct sysfs_dirent *sd;
-	struct sysfs_addrm_cxt acxt;
-
-	dir = sysfs_get_dentry(kobj->sd);
-	if (IS_ERR(dir)) {
-		sd = (void *)dir;
-		goto out;
-	}
-	parent = dir->d_parent;
-
-	inode = dir->d_inode;
-	sd = ERR_PTR(-EINVAL);
-	if (!sysfs_is_shadowed_inode(inode))
-		goto out_dput;
-
-	shadow = d_alloc(parent, &dir->d_name);
-	if (!shadow)
-		goto nomem;
-
-	sd = sysfs_new_dirent("_SHADOW_", inode->i_mode, SYSFS_DIR);
-	if (!sd)
-		goto nomem;
-	sd->s_elem.dir.kobj = kobj;
-
-	sysfs_addrm_start(&acxt, parent_sd);
-
-	/* add but don't link into children list */
-	sysfs_add_one(&acxt, sd);
-
-	/* attach and instantiate dentry */
-	sysfs_attach_dentry(sd, shadow);
-	d_instantiate(shadow, igrab(inode));
-	inc_nlink(inode);	/* tj: synchronization? */
-
-	sysfs_addrm_finish(&acxt);
-
-	dget(shadow);		/* Extra count - pin the dentry in core */
-
-	goto out_dput;
-
- nomem:
-	dput(shadow);
-	sd = ERR_PTR(-ENOMEM);
- out_dput:
-	dput(dir);
- out:
-	return sd;
-}
-
-/**
- *	sysfs_remove_shadow_dir - remove an object's directory.
- *	@shadow_sd: sysfs_dirent of shadow directory
- *
- *	The only thing special about this is that we remove any files in
- *	the directory before we remove the directory, and we've inlined
- *	what used to be sysfs_rmdir() below, instead of calling separately.
- */
-
-void sysfs_remove_shadow_dir(struct sysfs_dirent *shadow_sd)
-{
-	__sysfs_remove_dir(shadow_sd);
-}
-
 const struct file_operations sysfs_dir_operations = {
 	.open		= sysfs_dir_open,
 	.release	= sysfs_dir_close,
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index e6b904d7163..d1972374655 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -13,7 +13,6 @@
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/err.h>
-#include <linux/fs.h>
 #include "sysfs.h"
 
 
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index aecb6e771e2..45128b79bc6 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -33,16 +33,6 @@ static const struct inode_operations sysfs_inode_operations ={
 	.setattr	= sysfs_setattr,
 };
 
-void sysfs_delete_inode(struct inode *inode)
-{
-	/* Free the shadowed directory inode operations */
-	if (sysfs_is_shadowed_inode(inode)) {
-		kfree(inode->i_op);
-		inode->i_op = NULL;
-	}
-	return generic_delete_inode(inode);
-}
-
 int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 {
 	struct inode * inode = dentry->d_inode;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 69a73ae307f..119f39da1ae 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -20,7 +20,7 @@ struct kmem_cache *sysfs_dir_cachep;
 
 static const struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
-	.drop_inode	= sysfs_delete_inode,
+	.drop_inode	= generic_delete_inode,
 };
 
 struct sysfs_dirent sysfs_root = {
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6b8c8d76d30..b55e510ea23 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -70,7 +70,6 @@ extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
 			     struct sysfs_dirent *sd);
 extern int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
-extern void sysfs_delete_inode(struct inode *inode);
 extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd);
 extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode);
 
@@ -121,8 +120,3 @@ static inline void sysfs_put(struct sysfs_dirent * sd)
 	if (sd && atomic_dec_and_test(&sd->s_count))
 		release_sysfs_dirent(sd);
 }
-
-static inline int sysfs_is_shadowed_inode(struct inode *inode)
-{
-	return S_ISDIR(inode->i_mode) && inode->i_op->follow_link;
-}
-- 
cgit v1.2.3


From a7a0475497f9018e2e28cd421ee467d2ad68643e Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Aug 2007 21:38:02 +0900
Subject: sysfs: cosmetic changes in sysfs_lookup()

* remove space between * and symbol name in variable declaration.

* kill unnecessary new line.

* kill 'found' and test 'sd' instead.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 837073dbadf..5da8da80666 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -756,24 +756,19 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
-	struct sysfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
-	struct sysfs_dirent * sd;
+	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
+	struct sysfs_dirent *sd;
 	struct bin_attribute *bin_attr;
 	struct inode *inode;
-	int found = 0;
 
 	mutex_lock(&sysfs_mutex);
 
-	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling) {
-		if (sysfs_type(sd) &&
-		    !strcmp(sd->s_name, dentry->d_name.name)) {
-			found = 1;
+	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling)
+		if (sysfs_type(sd) && !strcmp(sd->s_name, dentry->d_name.name))
 			break;
-		}
-	}
 
 	/* no such entry */
-	if (!found)
+	if (!sd)
 		goto out_unlock;
 
 	/* attach dentry and inode */
-- 
cgit v1.2.3


From ff869de7bf5e76adffebd3a176c1c73bca7eddb7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Aug 2007 21:38:02 +0900
Subject: sysfs: simplify sysfs_rename_dir()

With the shadow directories gone, sysfs_rename_dir() can be simplified.

* parent doesn't need to be grabbed separately.  Just access
  old_dentry->d_parent.

* parent sd can never change.  Remove code to move under the new
  parent.

* Massage comments a bit.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5da8da80666..9504d4cb63e 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -883,14 +883,10 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	struct sysfs_dirent *sd;
 	struct dentry *parent = NULL;
 	struct dentry *old_dentry = NULL, *new_dentry = NULL;
-	struct sysfs_dirent *parent_sd;
 	const char *dup_name = NULL;
 	int error;
 
-	if (!kobj->parent)
-		return -EINVAL;
-
-	/* get dentries */
+	/* get the original dentry */
 	sd = kobj->sd;
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
@@ -898,12 +894,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 		goto out_dput;
 	}
 
-	parent_sd = kobj->parent->sd;
-	parent = sysfs_get_dentry(parent_sd);
-	if (IS_ERR(parent)) {
-		error = PTR_ERR(parent);
-		goto out_dput;
-	}
+	parent = old_dentry->d_parent;
 
 	/* lock parent and get dentry for new name */
 	mutex_lock(&parent->d_inode->i_mutex);
@@ -933,22 +924,14 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 		goto out_drop;
 
 	mutex_lock(&sysfs_mutex);
-
 	dup_name = sd->s_name;
 	sd->s_name = new_name;
+	mutex_unlock(&sysfs_mutex);
 
-	/* move under the new parent */
+	/* rename */
 	d_add(new_dentry, NULL);
 	d_move(sd->s_dentry, new_dentry);
 
-	sysfs_unlink_sibling(sd);
-	sysfs_get(parent_sd);
-	sysfs_put(sd->s_parent);
-	sd->s_parent = parent_sd;
-	sysfs_link_sibling(sd);
-
-	mutex_unlock(&sysfs_mutex);
-
 	error = 0;
 	goto out_unlock;
 
@@ -958,7 +941,6 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	mutex_unlock(&parent->d_inode->i_mutex);
  out_dput:
 	kfree(dup_name);
-	dput(parent);
 	dput(old_dentry);
 	dput(new_dentry);
 	return error;
-- 
cgit v1.2.3


From 41fc1c27452e041a18e5141b8203ee0ea72bc483 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Aug 2007 21:38:03 +0900
Subject: sysfs: make sysfs_add/remove_one() call link/unlink_sibling()
 implictly

When adding or removing a sysfs_dirent, the user used to be required
to call link/unlink separately.  It was for two reasons - code looked
like that before sysfs_addrm_cxt conversion and to avoid looping
through parent_sd->children list twice during removal.

Performance optimization during removal just isn't worth it.  Make
sysfs_add/remove_one() call sysfs_link/unlink_sibing() implicitly.
This makes code simpler albeit slightly less efficient.  This change
doesn't introduce any noticeable behavior change.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c     | 21 ++++++++++-----------
 fs/sysfs/file.c    |  4 +---
 fs/sysfs/inode.c   | 17 ++++-------------
 fs/sysfs/symlink.c |  4 +---
 fs/sysfs/sysfs.h   |  2 --
 5 files changed, 16 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 9504d4cb63e..354675ad096 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -30,7 +30,7 @@ static DEFINE_IDA(sysfs_ino_ida);
  *	Locking:
  *	mutex_lock(sysfs_mutex)
  */
-void sysfs_link_sibling(struct sysfs_dirent *sd)
+static void sysfs_link_sibling(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *parent_sd = sd->s_parent;
 
@@ -49,7 +49,7 @@ void sysfs_link_sibling(struct sysfs_dirent *sd)
  *	Locking:
  *	mutex_lock(sysfs_mutex)
  */
-void sysfs_unlink_sibling(struct sysfs_dirent *sd)
+static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent **pos;
 
@@ -500,6 +500,8 @@ void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 		inc_nlink(acxt->parent_inode);
 
 	acxt->cnt++;
+
+	sysfs_link_sibling(sd);
 }
 
 /**
@@ -521,7 +523,9 @@ void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
  */
 void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
-	BUG_ON(sd->s_sibling || (sd->s_flags & SYSFS_FLAG_REMOVED));
+	BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
+
+	sysfs_unlink_sibling(sd);
 
 	sd->s_flags |= SYSFS_FLAG_REMOVED;
 	sd->s_sibling = acxt->removed;
@@ -697,10 +701,8 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	/* link in */
 	sysfs_addrm_start(&acxt, parent_sd);
 
-	if (!sysfs_find_dirent(parent_sd, name)) {
+	if (!sysfs_find_dirent(parent_sd, name))
 		sysfs_add_one(&acxt, sd);
-		sysfs_link_sibling(sd);
-	}
 
 	if (!sysfs_addrm_finish(&acxt)) {
 		sysfs_put(sd);
@@ -821,7 +823,6 @@ static void remove_dir(struct sysfs_dirent *sd)
 	struct sysfs_addrm_cxt acxt;
 
 	sysfs_addrm_start(&acxt, sd->s_parent);
-	sysfs_unlink_sibling(sd);
 	sysfs_remove_one(&acxt, sd);
 	sysfs_addrm_finish(&acxt);
 }
@@ -846,11 +847,9 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 	while (*pos) {
 		struct sysfs_dirent *sd = *pos;
 
-		if (sysfs_type(sd) && sysfs_type(sd) != SYSFS_DIR) {
-			*pos = sd->s_sibling;
-			sd->s_sibling = NULL;
+		if (sysfs_type(sd) && sysfs_type(sd) != SYSFS_DIR)
 			sysfs_remove_one(&acxt, sd);
-		} else
+		else
 			pos = &(*pos)->s_sibling;
 	}
 	sysfs_addrm_finish(&acxt);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b21d11b4675..ea0e494d7d5 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -405,10 +405,8 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
 
 	sysfs_addrm_start(&acxt, dir_sd);
 
-	if (!sysfs_find_dirent(dir_sd, attr->name)) {
+	if (!sysfs_find_dirent(dir_sd, attr->name))
 		sysfs_add_one(&acxt, sd);
-		sysfs_link_sibling(sd);
-	}
 
 	if (!sysfs_addrm_finish(&acxt)) {
 		sysfs_put(sd);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 45128b79bc6..efb4062fe09 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -189,25 +189,16 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode)
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 {
 	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent **pos, *sd;
+	struct sysfs_dirent *sd;
 
 	if (!dir_sd)
 		return -ENOENT;
 
 	sysfs_addrm_start(&acxt, dir_sd);
 
-	for (pos = &dir_sd->s_children; *pos; pos = &(*pos)->s_sibling) {
-		sd = *pos;
-
-		if (!sysfs_type(sd))
-			continue;
-		if (!strcmp(sd->s_name, name)) {
-			*pos = sd->s_sibling;
-			sd->s_sibling = NULL;
-			sysfs_remove_one(&acxt, sd);
-			break;
-		}
-	}
+	sd = sysfs_find_dirent(dir_sd, name);
+	if (sd)
+		sysfs_remove_one(&acxt, sd);
 
 	if (sysfs_addrm_finish(&acxt))
 		return 0;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 90484533801..c129f307936 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -92,10 +92,8 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 
 	sysfs_addrm_start(&acxt, parent_sd);
 
-	if (!sysfs_find_dirent(parent_sd, name)) {
+	if (!sysfs_find_dirent(parent_sd, name))
 		sysfs_add_one(&acxt, sd);
-		sysfs_link_sibling(sd);
-	}
 
 	if (!sysfs_addrm_finish(&acxt)) {
 		error = -EEXIST;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index b55e510ea23..32b8b64e5cf 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -56,8 +56,6 @@ extern struct sysfs_dirent sysfs_root;
 extern struct kmem_cache *sysfs_dir_cachep;
 
 extern struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
-extern void sysfs_link_sibling(struct sysfs_dirent *sd);
-extern void sysfs_unlink_sibling(struct sysfs_dirent *sd);
 extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
 extern void sysfs_put_active(struct sysfs_dirent *sd);
 extern struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
-- 
cgit v1.2.3


From 23dc279950a056c33a14d09cf759f5173d41abd9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Aug 2007 21:38:03 +0900
Subject: sysfs: make sysfs_add_one() automatically check for duplicate entry

Make sysfs_add_one() check for duplicate entry and return -EEXIST if
such entry exists.  This simplifies node addition code a bit.

This patch doesn't introduce any noticeable behavior change.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c     | 26 +++++++++++++++++---------
 fs/sysfs/file.c    | 12 +++++-------
 fs/sysfs/symlink.c |  9 +++------
 fs/sysfs/sysfs.h   |  2 +-
 4 files changed, 26 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 354675ad096..620603296c6 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -491,9 +491,16 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  *
  *	LOCKING:
  *	Determined by sysfs_addrm_start().
+ *
+ *	RETURNS:
+ *	0 on success, -EEXIST if entry with the given name already
+ *	exists.
  */
-void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
+	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+		return -EEXIST;
+
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
 	if (sysfs_type(sd) == SYSFS_DIR && acxt->parent_inode)
@@ -502,6 +509,8 @@ void sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 	acxt->cnt++;
 
 	sysfs_link_sibling(sd);
+
+	return 0;
 }
 
 /**
@@ -691,6 +700,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
+	int rc;
 
 	/* allocate */
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
@@ -700,17 +710,15 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 
 	/* link in */
 	sysfs_addrm_start(&acxt, parent_sd);
+	rc = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_finish(&acxt);
 
-	if (!sysfs_find_dirent(parent_sd, name))
-		sysfs_add_one(&acxt, sd);
-
-	if (!sysfs_addrm_finish(&acxt)) {
+	if (rc == 0)
+		*p_sd = sd;
+	else
 		sysfs_put(sd);
-		return -EEXIST;
-	}
 
-	*p_sd = sd;
-	return 0;
+	return rc;
 }
 
 int sysfs_create_subdir(struct kobject *kobj, const char *name,
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index ea0e494d7d5..33bb3406dc4 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -397,6 +397,7 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
 	umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG;
 	struct sysfs_addrm_cxt acxt;
 	struct sysfs_dirent *sd;
+	int rc;
 
 	sd = sysfs_new_dirent(attr->name, mode, type);
 	if (!sd)
@@ -404,16 +405,13 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
 	sd->s_elem.attr.attr = (void *)attr;
 
 	sysfs_addrm_start(&acxt, dir_sd);
+	rc = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_finish(&acxt);
 
-	if (!sysfs_find_dirent(dir_sd, attr->name))
-		sysfs_add_one(&acxt, sd);
-
-	if (!sysfs_addrm_finish(&acxt)) {
+	if (rc)
 		sysfs_put(sd);
-		return -EEXIST;
-	}
 
-	return 0;
+	return rc;
 }
 
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index c129f307936..a6b13f12b0e 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -91,14 +91,11 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
 	sysfs_addrm_start(&acxt, parent_sd);
+	error = sysfs_add_one(&acxt, sd);
+	sysfs_addrm_finish(&acxt);
 
-	if (!sysfs_find_dirent(parent_sd, name))
-		sysfs_add_one(&acxt, sd);
-
-	if (!sysfs_addrm_finish(&acxt)) {
-		error = -EEXIST;
+	if (error)
 		goto out_put;
-	}
 
 	return 0;
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 32b8b64e5cf..bb3f0c999b1 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -62,7 +62,7 @@ extern struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
 extern void sysfs_put_active_two(struct sysfs_dirent *sd);
 extern void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
 			      struct sysfs_dirent *parent_sd);
-extern void sysfs_add_one(struct sysfs_addrm_cxt *acxt,
+extern int sysfs_add_one(struct sysfs_addrm_cxt *acxt,
 			  struct sysfs_dirent *sd);
 extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
 			     struct sysfs_dirent *sd);
-- 
cgit v1.2.3


From 990e53f880be9ff93072b4cce590ec2826cee0b6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 2 Aug 2007 21:38:03 +0900
Subject: sysfs: make sysfs_addrm_finish() return void

With the previous sysfs_add_one() update, there is only one user of
the return value of sysfs_addrm_finish() and the user can switch to
testing @sd easily.  Make sysfs_addrm_finish() return void for cleaner
semantics as suggested by Satyam Sharma.

This patch doesn't introduce any noticeable behavior change.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Satyam Sharma <satyam.sharma@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   | 7 +------
 fs/sysfs/inode.c | 7 +++++--
 fs/sysfs/sysfs.h | 2 +-
 3 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 620603296c6..a0da2b05a75 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -609,11 +609,8 @@ static void sysfs_drop_dentry(struct sysfs_dirent *sd)
  *
  *	LOCKING:
  *	All mutexes acquired by sysfs_addrm_start() are released.
- *
- *	RETURNS:
- *	Number of added/removed sysfs_dirents since sysfs_addrm_start().
  */
-int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
+void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 {
 	/* release resources acquired by sysfs_addrm_start() */
 	mutex_unlock(&sysfs_mutex);
@@ -639,8 +636,6 @@ int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 		sysfs_deactivate(sd);
 		sysfs_put(sd);
 	}
-
-	return acxt->cnt;
 }
 
 /**
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index efb4062fe09..e22db6cd4df 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -200,7 +200,10 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 	if (sd)
 		sysfs_remove_one(&acxt, sd);
 
-	if (sysfs_addrm_finish(&acxt))
+	sysfs_addrm_finish(&acxt);
+
+	if (sd)
 		return 0;
-	return -ENOENT;
+	else
+		return -ENOENT;
 }
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index bb3f0c999b1..04367547103 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -66,7 +66,7 @@ extern int sysfs_add_one(struct sysfs_addrm_cxt *acxt,
 			  struct sysfs_dirent *sd);
 extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
 			     struct sysfs_dirent *sd);
-extern int sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
+extern void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
 extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd);
 extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode);
-- 
cgit v1.2.3


From a93720eeb4b3bedc1fe15e4b6ca364e6be577d20 Mon Sep 17 00:00:00 2001
From: Rolf Eike Beer <eike-kernel@sf-tec.de>
Date: Fri, 10 Aug 2007 13:51:07 -0700
Subject: sysfs: Fix typos in fs/sysfs/file.c

Signed-off-by: Rolf Eike Beer <eike-kernel@sf-tec.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 33bb3406dc4..16f39c30b09 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -335,7 +335,7 @@ static int sysfs_release(struct inode * inode, struct file * filp)
  * again will not get new data, or reset the state of 'poll'.
  * Reminder: this only works for attributes which actively support
  * it, and it is not possible to test an attribute from userspace
- * to see if it supports poll (Nether 'poll' or 'select' return
+ * to see if it supports poll (Neither 'poll' nor 'select' return
  * an appropriate error code).  When in doubt, set a suitable timeout value.
  */
 static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
-- 
cgit v1.2.3


From 253280267a7f1ced0c434fb24b7bef92d7d22628 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 20 Aug 2007 21:36:29 +0900
Subject: sysfs: fix i_mutex locking in sysfs_get_dentry()

lookup_one_len_kern() should be called with the parent's i_mutex
locked.  Fix it.

Spotted by Eric W. Biederman.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index a0da2b05a75..54ca4bc02dc 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -130,8 +130,10 @@ struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
 
 		/* look it up */
 		parent_dentry = dentry;
+		mutex_lock(&parent_dentry->d_inode->i_mutex);
 		dentry = lookup_one_len_kern(cur->s_name, parent_dentry,
 					     strlen(cur->s_name));
+		mutex_unlock(&parent_dentry->d_inode->i_mutex);
 		dput(parent_dentry);
 
 		if (IS_ERR(dentry)) {
-- 
cgit v1.2.3


From 372e88bd1922228e0a55228f6dc8e311d1696fa0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:29 +0900
Subject: sysfs: Move all of inode initialization into sysfs_init_inode

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   | 37 -------------------------------------
 fs/sysfs/inode.c | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 fs/sysfs/mount.c |  5 -----
 3 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 54ca4bc02dc..5a70a93fc2f 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -748,24 +748,12 @@ int sysfs_create_dir(struct kobject * kobj)
 	return error;
 }
 
-static int sysfs_count_nlink(struct sysfs_dirent *sd)
-{
-	struct sysfs_dirent *child;
-	int nr = 0;
-
-	for (child = sd->s_children; child; child = child->s_sibling)
-		if (sysfs_type(child) == SYSFS_DIR)
-			nr++;
-	return nr + 2;
-}
-
 static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 				struct nameidata *nd)
 {
 	struct dentry *ret = NULL;
 	struct sysfs_dirent *parent_sd = dentry->d_parent->d_fsdata;
 	struct sysfs_dirent *sd;
-	struct bin_attribute *bin_attr;
 	struct inode *inode;
 
 	mutex_lock(&sysfs_mutex);
@@ -785,31 +773,6 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	if (inode->i_state & I_NEW) {
-		/* initialize inode according to type */
-		switch (sysfs_type(sd)) {
-		case SYSFS_DIR:
-			inode->i_op = &sysfs_dir_inode_operations;
-			inode->i_fop = &sysfs_dir_operations;
-			inode->i_nlink = sysfs_count_nlink(sd);
-			break;
-		case SYSFS_KOBJ_ATTR:
-			inode->i_size = PAGE_SIZE;
-			inode->i_fop = &sysfs_file_operations;
-			break;
-		case SYSFS_KOBJ_BIN_ATTR:
-			bin_attr = sd->s_elem.bin_attr.bin_attr;
-			inode->i_size = bin_attr->size;
-			inode->i_fop = &bin_fops;
-			break;
-		case SYSFS_KOBJ_LINK:
-			inode->i_op = &sysfs_symlink_inode_operations;
-			break;
-		default:
-			BUG();
-		}
-	}
-
 	sysfs_instantiate(dentry, inode);
 	sysfs_attach_dentry(sd, dentry);
 
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e22db6cd4df..200e1bf6f93 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -122,8 +122,22 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
  */
 static struct lock_class_key sysfs_inode_imutex_key;
 
+static int sysfs_count_nlink(struct sysfs_dirent *sd)
+{
+	struct sysfs_dirent *child;
+	int nr = 0;
+
+	for (child = sd->s_children; child; child = child->s_sibling)
+		if (sysfs_type(child) == SYSFS_DIR)
+			nr++;
+
+	return nr + 2;
+}
+
 static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
+	struct bin_attribute *bin_attr;
+
 	inode->i_blocks = 0;
 	inode->i_mapping->a_ops = &sysfs_aops;
 	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
@@ -139,6 +153,37 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 		set_inode_attr(inode, sd->s_iattr);
 	} else
 		set_default_inode_attr(inode, sd->s_mode);
+
+
+	/* initialize inode according to type */
+	switch (sysfs_type(sd)) {
+	case SYSFS_ROOT:
+		inode->i_op = &sysfs_dir_inode_operations;
+		inode->i_fop = &sysfs_dir_operations;
+		inc_nlink(inode); /* directory, account for "." */
+		break;
+	case SYSFS_DIR:
+		inode->i_op = &sysfs_dir_inode_operations;
+		inode->i_fop = &sysfs_dir_operations;
+		inode->i_nlink = sysfs_count_nlink(sd);
+		break;
+	case SYSFS_KOBJ_ATTR:
+		inode->i_size = PAGE_SIZE;
+		inode->i_fop = &sysfs_file_operations;
+		break;
+	case SYSFS_KOBJ_BIN_ATTR:
+		bin_attr = sd->s_elem.bin_attr.bin_attr;
+		inode->i_size = bin_attr->size;
+		inode->i_fop = &bin_fops;
+		break;
+	case SYSFS_KOBJ_LINK:
+		inode->i_op = &sysfs_symlink_inode_operations;
+		break;
+	default:
+		BUG();
+	}
+
+	unlock_new_inode(inode);
 }
 
 /**
@@ -180,9 +225,6 @@ void sysfs_instantiate(struct dentry *dentry, struct inode *inode)
 {
 	BUG_ON(!dentry || dentry->d_inode);
 
-	if (inode->i_state & I_NEW)
-		unlock_new_inode(inode);
-
 	d_instantiate(dentry, inode);
 }
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 119f39da1ae..92f407fb126 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -49,11 +49,6 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 	}
 
-	inode->i_op = &sysfs_dir_inode_operations;
-	inode->i_fop = &sysfs_dir_operations;
-	inc_nlink(inode); /* directory, account for "." */
-	unlock_new_inode(inode);
-
 	/* instantiate and link root dentry */
 	root = d_alloc_root(inode);
 	if (!root) {
-- 
cgit v1.2.3


From 119dd52be33dfe6285f586ab7354897fdefc7e23 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:29 +0900
Subject: sysfs: Remove sysfs_instantiate

Now that sysfs_get_inode is dropping the inode lock
we no longer have a need from sysfs_instantiate.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   |  2 +-
 fs/sysfs/inode.c | 17 -----------------
 fs/sysfs/sysfs.h |  1 -
 3 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5a70a93fc2f..739dda176b4 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -773,7 +773,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	sysfs_instantiate(dentry, inode);
+	d_instantiate(dentry, inode);
 	sysfs_attach_dentry(sd, dentry);
 
  out_unlock:
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 200e1bf6f93..0d706a86d2c 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -211,23 +211,6 @@ struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
 	return inode;
 }
 
-/**
- *	sysfs_instantiate - instantiate dentry
- *	@dentry: dentry to be instantiated
- *	@inode: inode associated with @sd
- *
- *	Unlock @inode if locked and instantiate @dentry with @inode.
- *
- *	LOCKING:
- *	None.
- */
-void sysfs_instantiate(struct dentry *dentry, struct inode *inode)
-{
-	BUG_ON(!dentry || dentry->d_inode);
-
-	d_instantiate(dentry, inode);
-}
-
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 {
 	struct sysfs_addrm_cxt acxt;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 04367547103..8a0aea1ab86 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -69,7 +69,6 @@ extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
 extern void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
 extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd);
-extern void sysfs_instantiate(struct dentry *dentry, struct inode *inode);
 
 extern void release_sysfs_dirent(struct sysfs_dirent * sd);
 extern struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-- 
cgit v1.2.3


From 0333cd8a3f4249fde2c50929a6eac35245fc685b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:29 +0900
Subject: sysfs: Use kill_anon_super

Since sysfs no longer stores fs directory information in the dcache
on a permanent basis kill_litter_super it is inappropriate and actively
wrong.  It will decrement the count on all dentries left in the
dcache before trying to free them.

At the moment this is not biting us only because we never unmount sysfs.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/mount.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 92f407fb126..ac7625631fc 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -71,7 +71,7 @@ static int sysfs_get_sb(struct file_system_type *fs_type,
 static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.get_sb		= sysfs_get_sb,
-	.kill_sb	= kill_litter_super,
+	.kill_sb	= kill_anon_super,
 };
 
 int __init sysfs_init(void)
-- 
cgit v1.2.3


From 7d0c7d676cc066413e1583b5af9fba8011972d41 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:30 +0900
Subject: sysfs: Make sysfs_mount static

This patch modifies the users of sysfs_mount to use sysfs_root
instead (which is what they are looking for).  It then
makes sysfs_mount static to keep people from using it
by accident.

The net result is slightly faster and cleaner code.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c     | 4 +---
 fs/sysfs/mount.c   | 2 +-
 fs/sysfs/symlink.c | 7 +++----
 fs/sysfs/sysfs.h   | 1 -
 4 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 739dda176b4..7f4abe17670 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -737,10 +737,8 @@ int sysfs_create_dir(struct kobject * kobj)
 
 	if (kobj->parent)
 		parent_sd = kobj->parent->sd;
-	else if (sysfs_mount && sysfs_mount->mnt_sb)
-		parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
 	else
-		return -EFAULT;
+		parent_sd = &sysfs_root;
 
 	error = create_dir(kobj, parent_sd, kobject_name(kobj), &sd);
 	if (!error)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index ac7625631fc..8989cbb51a3 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -14,7 +14,7 @@
 /* Random magic number */
 #define SYSFS_MAGIC 0x62656572
 
-struct vfsmount *sysfs_mount;
+static struct vfsmount *sysfs_mount;
 struct super_block * sysfs_sb = NULL;
 struct kmem_cache *sysfs_dir_cachep;
 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index a6b13f12b0e..8ad38bccc0e 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -60,10 +60,9 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 
 	BUG_ON(!name);
 
-	if (!kobj) {
-		if (sysfs_mount && sysfs_mount->mnt_sb)
-			parent_sd = sysfs_mount->mnt_sb->s_root->d_fsdata;
-	} else
+	if (!kobj)
+		parent_sd = &sysfs_root;
+	else
 		parent_sd = kobj->sd;
 
 	error = -EFAULT;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 8a0aea1ab86..77253aabc4a 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -51,7 +51,6 @@ struct sysfs_addrm_cxt {
 	int			cnt;
 };
 
-extern struct vfsmount * sysfs_mount;
 extern struct sysfs_dirent sysfs_root;
 extern struct kmem_cache *sysfs_dir_cachep;
 
-- 
cgit v1.2.3


From 94777e09180b6249d455baa2dbe34cf630e0c033 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:30 +0900
Subject: sysfs: In sysfs_lookup don't open code sysfs_find_dirent

This is a small cleanup patch that makes the code just
a little bit cleaner.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 7f4abe17670..b72b42ed80d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -756,9 +756,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	mutex_lock(&sysfs_mutex);
 
-	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling)
-		if (sysfs_type(sd) && !strcmp(sd->s_name, dentry->d_name.name))
-			break;
+	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name);
 
 	/* no such entry */
 	if (!sd)
-- 
cgit v1.2.3


From 3efa65b92d832873ece62b42a4268c2515943977 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:30 +0900
Subject: sysfs: Simplify readdir.

At some point someone wrote sysfs_readdir to insert a cursor
into the list of sysfs_dirents to ensure that sysfs_readdir would
restart properly.  That works but it is complex code and tends
to be expensive.

The same effect can be achieved by keeping the sysfs_dirents in
inode order and using the inode number as the f_pos.  Then
when we restart we just have to find the first dirent whose inode
number is equal or greater then the last sysfs_dirent we attempted
to return.

Removing the sysfs directory cursor also allows the remove of
all of the mysterious checks for sysfs_type(sd) != 0.   Which
were nonbovious checks to see if a cursor was in a directory list.

tj: offset marker for EOF is changed from UINT_MAX to INT_MAX to avoid
    overflow in case offset is 32bit.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 175 +++++++++++++++------------------------------------------
 1 file changed, 44 insertions(+), 131 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b72b42ed80d..953e8432b0a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -33,10 +33,20 @@ static DEFINE_IDA(sysfs_ino_ida);
 static void sysfs_link_sibling(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent *parent_sd = sd->s_parent;
+	struct sysfs_dirent **pos;
 
 	BUG_ON(sd->s_sibling);
-	sd->s_sibling = parent_sd->s_children;
-	parent_sd->s_children = sd;
+
+	/* Store directory entries in order by ino.  This allows
+	 * readdir to properly restart without having to add a
+	 * cursor into the s_children list.
+	 */
+	for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) {
+		if (sd->s_ino < (*pos)->s_ino)
+			break;
+	}
+	sd->s_sibling = *pos;
+	*pos = sd;
 }
 
 /**
@@ -659,7 +669,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 	struct sysfs_dirent *sd;
 
 	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling)
-		if (sysfs_type(sd) && !strcmp(sd->s_name, name))
+		if (!strcmp(sd->s_name, name))
 			return sd;
 	return NULL;
 }
@@ -811,7 +821,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 	while (*pos) {
 		struct sysfs_dirent *sd = *pos;
 
-		if (sysfs_type(sd) && sysfs_type(sd) != SYSFS_DIR)
+		if (sysfs_type(sd) != SYSFS_DIR)
 			sysfs_remove_one(&acxt, sd);
 		else
 			pos = &(*pos)->s_sibling;
@@ -976,37 +986,6 @@ again:
 	return error;
 }
 
-static int sysfs_dir_open(struct inode *inode, struct file *file)
-{
-	struct dentry * dentry = file->f_path.dentry;
-	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent * sd;
-
-	sd = sysfs_new_dirent("_DIR_", 0, 0);
-	if (sd) {
-		mutex_lock(&sysfs_mutex);
-		sd->s_parent = sysfs_get(parent_sd);
-		sysfs_link_sibling(sd);
-		mutex_unlock(&sysfs_mutex);
-	}
-
-	file->private_data = sd;
-	return sd ? 0 : -ENOMEM;
-}
-
-static int sysfs_dir_close(struct inode *inode, struct file *file)
-{
-	struct sysfs_dirent * cursor = file->private_data;
-
-	mutex_lock(&sysfs_mutex);
-	sysfs_unlink_sibling(cursor);
-	mutex_unlock(&sysfs_mutex);
-
-	release_sysfs_dirent(cursor);
-
-	return 0;
-}
-
 /* Relationship between s_mode and the DT_xxx types */
 static inline unsigned char dt_type(struct sysfs_dirent *sd)
 {
@@ -1017,117 +996,51 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct sysfs_dirent * parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *cursor = filp->private_data;
-	struct sysfs_dirent **pos;
+	struct sysfs_dirent *pos;
 	ino_t ino;
-	int i = filp->f_pos;
 
-	switch (i) {
-		case 0:
-			ino = parent_sd->s_ino;
-			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
-				break;
+	if (filp->f_pos == 0) {
+		ino = parent_sd->s_ino;
+		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0)
 			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		case 1:
-			if (parent_sd->s_parent)
-				ino = parent_sd->s_parent->s_ino;
-			else
-				ino = parent_sd->s_ino;
-			if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
-				break;
+	}
+	if (filp->f_pos == 1) {
+		if (parent_sd->s_parent)
+			ino = parent_sd->s_parent->s_ino;
+		else
+			ino = parent_sd->s_ino;
+		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0)
 			filp->f_pos++;
-			i++;
-			/* fallthrough */
-		default:
-			mutex_lock(&sysfs_mutex);
-
-			pos = &parent_sd->s_children;
-			while (*pos != cursor)
-				pos = &(*pos)->s_sibling;
-
-			/* unlink cursor */
-			*pos = cursor->s_sibling;
-
-			if (filp->f_pos == 2)
-				pos = &parent_sd->s_children;
-
-			for ( ; *pos; pos = &(*pos)->s_sibling) {
-				struct sysfs_dirent *next = *pos;
-				const char * name;
-				int len;
-
-				if (!sysfs_type(next))
-					continue;
-
-				name = next->s_name;
-				len = strlen(name);
-				ino = next->s_ino;
-
-				if (filldir(dirent, name, len, filp->f_pos, ino,
-						 dt_type(next)) < 0)
-					break;
-
-				filp->f_pos++;
-			}
+	}
+	if ((filp->f_pos > 1) && (filp->f_pos < INT_MAX)) {
+		mutex_lock(&sysfs_mutex);
 
-			/* put cursor back in */
-			cursor->s_sibling = *pos;
-			*pos = cursor;
+		/* Skip the dentries we have already reported */
+		pos = parent_sd->s_children;
+		while (pos && (filp->f_pos > pos->s_ino))
+			pos = pos->s_sibling;
 
-			mutex_unlock(&sysfs_mutex);
-	}
-	return 0;
-}
+		for ( ; pos; pos = pos->s_sibling) {
+			const char * name;
+			int len;
 
-static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin)
-{
-	struct dentry * dentry = file->f_path.dentry;
+			name = pos->s_name;
+			len = strlen(name);
+			filp->f_pos = ino = pos->s_ino;
 
-	switch (origin) {
-		case 1:
-			offset += file->f_pos;
-		case 0:
-			if (offset >= 0)
+			if (filldir(dirent, name, len, filp->f_pos, ino,
+					 dt_type(pos)) < 0)
 				break;
-		default:
-			return -EINVAL;
-	}
-	if (offset != file->f_pos) {
-		mutex_lock(&sysfs_mutex);
-
-		file->f_pos = offset;
-		if (file->f_pos >= 2) {
-			struct sysfs_dirent *sd = dentry->d_fsdata;
-			struct sysfs_dirent *cursor = file->private_data;
-			struct sysfs_dirent **pos;
-			loff_t n = file->f_pos - 2;
-
-			sysfs_unlink_sibling(cursor);
-
-			pos = &sd->s_children;
-			while (n && *pos) {
-				struct sysfs_dirent *next = *pos;
-				if (sysfs_type(next))
-					n--;
-				pos = &(*pos)->s_sibling;
-			}
-
-			cursor->s_sibling = *pos;
-			*pos = cursor;
 		}
-
+		if (!pos)
+			filp->f_pos = INT_MAX;
 		mutex_unlock(&sysfs_mutex);
 	}
-
-	return offset;
+	return 0;
 }
 
+
 const struct file_operations sysfs_dir_operations = {
-	.open		= sysfs_dir_open,
-	.release	= sysfs_dir_close,
-	.llseek		= sysfs_dir_lseek,
 	.read		= generic_read_dir,
 	.readdir	= sysfs_readdir,
 };
-- 
cgit v1.2.3


From 89bec09705d2033b8b765f3c3ac5093f80bd5bc4 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:30 +0900
Subject: sysfs: Rewrite sysfs_drop_dentry.

Currently we find the dentry to drop by looking at sd->s_dentry.
We can just as easily accomplish the same task by looking up the
sysfs inode and finding all of the dentries from there, with the
added bonus that we don't need to play with the sysfs_assoc_lock.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 53 ++++++++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 953e8432b0a..1af963e66e3 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -565,50 +565,49 @@ void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
  *	Drop dentry for @sd.  @sd must have been unlinked from its
  *	parent on entry to this function such that it can't be looked
  *	up anymore.
- *
- *	@sd->s_dentry which is protected with sysfs_assoc_lock points
- *	to the currently associated dentry but we're not holding a
- *	reference to it and racing with dput().  Grab dcache_lock and
- *	verify dentry before dropping it.  If @sd->s_dentry is NULL or
- *	dput() beats us, no need to bother.
  */
 static void sysfs_drop_dentry(struct sysfs_dirent *sd)
 {
-	struct dentry *dentry = NULL;
 	struct inode *inode;
+	struct dentry *dentry;
+
+	inode = ilookup(sysfs_sb, sd->s_ino);
+	if (!inode)
+		return;
 
-	/* We're not holding a reference to ->s_dentry dentry but the
-	 * field will stay valid as long as sysfs_assoc_lock is held.
+	/* Drop any existing dentries associated with sd.
+	 *
+	 * For the dentry to be properly freed we need to grab a
+	 * reference to the dentry under the dcache lock,  unhash it,
+	 * and then put it.  The playing with the dentry count allows
+	 * dput to immediately free the dentry  if it is not in use.
 	 */
-	spin_lock(&sysfs_assoc_lock);
+repeat:
 	spin_lock(&dcache_lock);
-
-	/* drop dentry if it's there and dput() didn't kill it yet */
-	if (sd->s_dentry && sd->s_dentry->d_inode) {
-		dentry = dget_locked(sd->s_dentry);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+		if (d_unhashed(dentry))
+			continue;
+		dget_locked(dentry);
 		spin_lock(&dentry->d_lock);
 		__d_drop(dentry);
 		spin_unlock(&dentry->d_lock);
+		spin_unlock(&dcache_lock);
+		dput(dentry);
+		goto repeat;
 	}
-
 	spin_unlock(&dcache_lock);
-	spin_unlock(&sysfs_assoc_lock);
-
-	dput(dentry);
 
 	/* adjust nlink and update timestamp */
-	inode = ilookup(sysfs_sb, sd->s_ino);
-	if (inode) {
-		mutex_lock(&inode->i_mutex);
+	mutex_lock(&inode->i_mutex);
 
-		inode->i_ctime = CURRENT_TIME;
+	inode->i_ctime = CURRENT_TIME;
+	drop_nlink(inode);
+	if (sysfs_type(sd) == SYSFS_DIR)
 		drop_nlink(inode);
-		if (sysfs_type(sd) == SYSFS_DIR)
-			drop_nlink(inode);
 
-		mutex_unlock(&inode->i_mutex);
-		iput(inode);
-	}
+	mutex_unlock(&inode->i_mutex);
+
+	iput(inode);
 }
 
 /**
-- 
cgit v1.2.3


From 932ea2e374dd1ca26676297a5eccd1cdab86f7cd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:30 +0900
Subject: sysfs: Introduce sysfs_rename_mutex

Looking carefully at the rename code we have a subtle dependency
that the structure of sysfs not change while we are performing
a rename.  If the parent directory of the object we are renaming
changes while the rename is being performed nasty things could
happen when we go to release our locks.

So introduce a sysfs_rename_mutex to prevent this highly
unlikely theoretical issue.

In addition hold sysfs_rename_mutex over all calls to
sysfs_get_dentry. Allowing sysfs_get_dentry to be simplified
in the future.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   | 8 +++++++-
 fs/sysfs/file.c  | 4 ++++
 fs/sysfs/sysfs.h | 1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 1af963e66e3..9fe83d23dc2 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -15,6 +15,7 @@
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
+DEFINE_MUTEX(sysfs_rename_mutex);
 spinlock_t sysfs_assoc_lock = SPIN_LOCK_UNLOCKED;
 
 static spinlock_t sysfs_ino_lock = SPIN_LOCK_UNLOCKED;
@@ -82,7 +83,7 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
  *	down from there looking up dentry for each step.
  *
  *	LOCKING:
- *	Kernel thread context (may sleep)
+ *	mutex_lock(sysfs_rename_mutex)
  *
  *	RETURNS:
  *	Pointer to found dentry on success, ERR_PTR() value on error.
@@ -858,6 +859,8 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	const char *dup_name = NULL;
 	int error;
 
+	mutex_lock(&sysfs_rename_mutex);
+
 	/* get the original dentry */
 	sd = kobj->sd;
 	old_dentry = sysfs_get_dentry(sd);
@@ -915,6 +918,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 	kfree(dup_name);
 	dput(old_dentry);
 	dput(new_dentry);
+	mutex_unlock(&sysfs_rename_mutex);
 	return error;
 }
 
@@ -926,6 +930,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 	struct dentry *old_dentry = NULL, *new_dentry = NULL;
 	int error;
 
+	mutex_lock(&sysfs_rename_mutex);
 	BUG_ON(!sd->s_parent);
 	new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
 
@@ -982,6 +987,7 @@ again:
 	dput(new_parent);
 	dput(old_dentry);
 	dput(new_dentry);
+	mutex_unlock(&sysfs_rename_mutex);
 	return error;
 }
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 16f39c30b09..ff93c92164b 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -470,7 +470,9 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 	if (!victim_sd)
 		goto out;
 
+	mutex_lock(&sysfs_rename_mutex);
 	victim = sysfs_get_dentry(victim_sd);
+	mutex_unlock(&sysfs_rename_mutex);
 	if (IS_ERR(victim)) {
 		rc = PTR_ERR(victim);
 		victim = NULL;
@@ -509,7 +511,9 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 	if (!victim_sd)
 		goto out;
 
+	mutex_lock(&sysfs_rename_mutex);
 	victim = sysfs_get_dentry(victim_sd);
+	mutex_unlock(&sysfs_rename_mutex);
 	if (IS_ERR(victim)) {
 		rc = PTR_ERR(victim);
 		victim = NULL;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 77253aabc4a..179e6a26ece 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -90,6 +90,7 @@ extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 
 extern spinlock_t sysfs_assoc_lock;
 extern struct mutex sysfs_mutex;
+extern struct mutex sysfs_rename_mutex;
 extern struct super_block * sysfs_sb;
 extern const struct file_operations sysfs_dir_operations;
 extern const struct file_operations sysfs_file_operations;
-- 
cgit v1.2.3


From e0712bbfd9cb617fc3a822781c2466fb6b7ede50 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Mon, 20 Aug 2007 21:36:30 +0900
Subject: sysfs: simply sysfs_get_dentry

Now that we know the sysfs tree structure cannot change under us and
sysfs shadow support is dropped, sysfs_get_dentry() can be simplified
greatly.  It can just look up from the root and there's no need to
retry on failure.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 91 +++++++++++-----------------------------------------------
 1 file changed, 16 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 9fe83d23dc2..1c3dc5d01cc 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -78,9 +78,8 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
  *	@sd: sysfs_dirent of interest
  *
  *	Get dentry for @sd.  Dentry is looked up if currently not
- *	present.  This function climbs sysfs_dirent tree till it
- *	reaches a sysfs_dirent with valid dentry attached and descends
- *	down from there looking up dentry for each step.
+ *	present.  This function descends from the root looking up
+ *	dentry for each step.
  *
  *	LOCKING:
  *	mutex_lock(sysfs_rename_mutex)
@@ -90,86 +89,28 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
  */
 struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd)
 {
-	struct sysfs_dirent *cur;
-	struct dentry *parent_dentry, *dentry;
-	int i, depth;
+	struct dentry *dentry = dget(sysfs_sb->s_root);
 
-	/* Find the first parent which has valid s_dentry and get the
-	 * dentry.
-	 */
-	mutex_lock(&sysfs_mutex);
- restart0:
-	spin_lock(&sysfs_assoc_lock);
- restart1:
-	spin_lock(&dcache_lock);
-
-	dentry = NULL;
-	depth = 0;
-	cur = sd;
-	while (!cur->s_dentry || !cur->s_dentry->d_inode) {
-		if (cur->s_flags & SYSFS_FLAG_REMOVED) {
-			dentry = ERR_PTR(-ENOENT);
-			depth = 0;
-			break;
-		}
-		cur = cur->s_parent;
-		depth++;
-	}
-	if (!IS_ERR(dentry))
-		dentry = dget_locked(cur->s_dentry);
+	while (dentry->d_fsdata != sd) {
+		struct sysfs_dirent *cur;
+		struct dentry *parent;
 
-	spin_unlock(&dcache_lock);
-	spin_unlock(&sysfs_assoc_lock);
-
-	/* from the found dentry, look up depth times */
-	while (depth--) {
-		/* find and get depth'th ancestor */
-		for (cur = sd, i = 0; cur && i < depth; i++)
+		/* find the first ancestor which hasn't been looked up */
+		cur = sd;
+		while (cur->s_parent != dentry->d_fsdata)
 			cur = cur->s_parent;
 
-		/* This can happen if tree structure was modified due
-		 * to move/rename.  Restart.
-		 */
-		if (i != depth) {
-			dput(dentry);
-			goto restart0;
-		}
-
-		sysfs_get(cur);
-
-		mutex_unlock(&sysfs_mutex);
-
 		/* look it up */
-		parent_dentry = dentry;
-		mutex_lock(&parent_dentry->d_inode->i_mutex);
-		dentry = lookup_one_len_kern(cur->s_name, parent_dentry,
+		parent = dentry;
+		mutex_lock(&parent->d_inode->i_mutex);
+		dentry = lookup_one_len_kern(cur->s_name, parent,
 					     strlen(cur->s_name));
-		mutex_unlock(&parent_dentry->d_inode->i_mutex);
-		dput(parent_dentry);
-
-		if (IS_ERR(dentry)) {
-			sysfs_put(cur);
-			return dentry;
-		}
+		mutex_unlock(&parent->d_inode->i_mutex);
+		dput(parent);
 
-		mutex_lock(&sysfs_mutex);
-		spin_lock(&sysfs_assoc_lock);
-
-		/* This, again, can happen if tree structure has
-		 * changed and we looked up the wrong thing.  Restart.
-		 */
-		if (cur->s_dentry != dentry) {
-			dput(dentry);
-			sysfs_put(cur);
-			goto restart1;
-		}
-
-		spin_unlock(&sysfs_assoc_lock);
-
-		sysfs_put(cur);
+		if (IS_ERR(dentry))
+			break;
 	}
-
-	mutex_unlock(&sysfs_mutex);
 	return dentry;
 }
 
-- 
cgit v1.2.3


From 5a26b79c426f8e55ebf7204cb138eb6b1645d4d3 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:30 +0900
Subject: sysfs: Remove s_dentry

The only uses of s_dentry left are the code that maintains
s_dentry and trivial users that don't actually need it.
So this patch removes the s_dentry maintenance code and
restructures the trivial uses to use something else.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   | 32 ++++----------------------------
 fs/sysfs/mount.c |  1 -
 fs/sysfs/sysfs.h |  1 -
 3 files changed, 4 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 1c3dc5d01cc..36b6c796d4d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -289,22 +289,7 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
 {
 	struct sysfs_dirent * sd = dentry->d_fsdata;
 
-	if (sd) {
-		/* sd->s_dentry is protected with sysfs_assoc_lock.
-		 * This allows sysfs_drop_dentry() to dereference it.
-		 */
-		spin_lock(&sysfs_assoc_lock);
-
-		/* The dentry might have been deleted or another
-		 * lookup could have happened updating sd->s_dentry to
-		 * point the new dentry.  Ignore if it isn't pointing
-		 * to this dentry.
-		 */
-		if (sd->s_dentry == dentry)
-			sd->s_dentry = NULL;
-		spin_unlock(&sysfs_assoc_lock);
-		sysfs_put(sd);
-	}
+	sysfs_put(sd);
 	iput(inode);
 }
 
@@ -352,9 +337,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
  *	@sd: target sysfs_dirent
  *	@dentry: dentry to associate
  *
- *	Associate @sd with @dentry.  This is protected by
- *	sysfs_assoc_lock to avoid race with sysfs_d_iput().
- *
  *	LOCKING:
  *	mutex_lock(sysfs_mutex)
  */
@@ -362,12 +344,6 @@ static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry)
 {
 	dentry->d_op = &sysfs_dentry_ops;
 	dentry->d_fsdata = sysfs_get(sd);
-
-	/* protect sd->s_dentry against sysfs_d_iput */
-	spin_lock(&sysfs_assoc_lock);
-	sd->s_dentry = dentry;
-	spin_unlock(&sysfs_assoc_lock);
-
 	d_rehash(dentry);
 }
 
@@ -846,7 +822,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 
 	/* rename */
 	d_add(new_dentry, NULL);
-	d_move(sd->s_dentry, new_dentry);
+	d_move(old_dentry, new_dentry);
 
 	error = 0;
 	goto out_unlock;
@@ -881,7 +857,7 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 		error = PTR_ERR(old_dentry);
 		goto out_dput;
 	}
-	old_parent = sd->s_parent->s_dentry;
+	old_parent = old_dentry->d_parent;
 
 	new_parent = sysfs_get_dentry(new_parent_sd);
 	if (IS_ERR(new_parent)) {
@@ -907,7 +883,7 @@ again:
 	} else
 		error = 0;
 	d_add(new_dentry, NULL);
-	d_move(sd->s_dentry, new_dentry);
+	d_move(old_dentry, new_dentry);
 	dput(new_dentry);
 
 	/* Remove from old parent's list and insert into new parent's list. */
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8989cbb51a3..28bf359981f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -56,7 +56,6 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 		iput(inode);
 		return -ENOMEM;
 	}
-	sysfs_root.s_dentry = root;
 	root->d_fsdata = &sysfs_root;
 	sb->s_root = root;
 	return 0;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 179e6a26ece..791b3ed91a9 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -37,7 +37,6 @@ struct sysfs_dirent {
 	unsigned int		s_flags;
 	umode_t			s_mode;
 	ino_t			s_ino;
-	struct dentry		* s_dentry;
 	struct iattr		* s_iattr;
 	atomic_t		s_event;
 };
-- 
cgit v1.2.3


From 9918f9a4817cb6241c727b434d5f8ec5564198de Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:31 +0900
Subject: sysfs: Rewrite rename in terms of sysfs dirents

This patch rewrites sysfs_rename_dir to perform it's checks
as much as possible on the underlying sysfs_dirents instead
of the contents of the dcache.  It turns out that this version
is a little simpler, and a little more like the rest of
the sysfs directory modification code.

tj: fixed double locking of sysfs_mutex

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 36b6c796d4d..463c5e3ccf1 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -770,7 +770,7 @@ void sysfs_remove_dir(struct kobject * kobj)
 
 int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 {
-	struct sysfs_dirent *sd;
+	struct sysfs_dirent *sd = kobj->sd;
 	struct dentry *parent = NULL;
 	struct dentry *old_dentry = NULL, *new_dentry = NULL;
 	const char *dup_name = NULL;
@@ -778,63 +778,57 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
 
 	mutex_lock(&sysfs_rename_mutex);
 
+	error = 0;
+	if (strcmp(sd->s_name, new_name) == 0)
+		goto out;	/* nothing to rename */
+
 	/* get the original dentry */
-	sd = kobj->sd;
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
 		error = PTR_ERR(old_dentry);
-		goto out_dput;
+		goto out;
 	}
 
 	parent = old_dentry->d_parent;
 
 	/* lock parent and get dentry for new name */
 	mutex_lock(&parent->d_inode->i_mutex);
+	mutex_lock(&sysfs_mutex);
 
-	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
-	if (IS_ERR(new_dentry)) {
-		error = PTR_ERR(new_dentry);
-		goto out_unlock;
-	}
-
-	error = -EINVAL;
-	if (old_dentry == new_dentry)
+	error = -EEXIST;
+	if (sysfs_find_dirent(sd->s_parent, new_name))
 		goto out_unlock;
 
-	error = -EEXIST;
-	if (new_dentry->d_inode)
+	error = -ENOMEM;
+	new_dentry = d_alloc_name(parent, new_name);
+	if (!new_dentry)
 		goto out_unlock;
 
 	/* rename kobject and sysfs_dirent */
 	error = -ENOMEM;
 	new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
 	if (!new_name)
-		goto out_drop;
+		goto out_unlock;
 
 	error = kobject_set_name(kobj, "%s", new_name);
 	if (error)
-		goto out_drop;
+		goto out_unlock;
 
-	mutex_lock(&sysfs_mutex);
 	dup_name = sd->s_name;
 	sd->s_name = new_name;
-	mutex_unlock(&sysfs_mutex);
 
 	/* rename */
 	d_add(new_dentry, NULL);
 	d_move(old_dentry, new_dentry);
 
 	error = 0;
-	goto out_unlock;
-
- out_drop:
-	d_drop(new_dentry);
  out_unlock:
+	mutex_unlock(&sysfs_mutex);
 	mutex_unlock(&parent->d_inode->i_mutex);
- out_dput:
 	kfree(dup_name);
 	dput(old_dentry);
 	dput(new_dentry);
+ out:
 	mutex_unlock(&sysfs_rename_mutex);
 	return error;
 }
-- 
cgit v1.2.3


From 45aaae9c51d768d5a8fd53fb372b1eb714f37691 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 20 Aug 2007 21:36:31 +0900
Subject: sysfs: Rewrite sysfs_move_dir in terms of sysfs dirents

This patch rewrites sysfs_move_dir to perform it's checks
as much as possible on the underlying sysfs_dirents instead
of the contents of the dcache, making sysfs_move_dir
more like the rest of the sysfs directory modification
code.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Tejun Heo <htejun@gmail.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 463c5e3ccf1..da4bb66a610 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -845,56 +845,58 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 	BUG_ON(!sd->s_parent);
 	new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
 
+	error = 0;
+	if (sd->s_parent == new_parent_sd)
+		goto out;	/* nothing to move */
+
 	/* get dentries */
 	old_dentry = sysfs_get_dentry(sd);
 	if (IS_ERR(old_dentry)) {
 		error = PTR_ERR(old_dentry);
-		goto out_dput;
+		goto out;
 	}
 	old_parent = old_dentry->d_parent;
 
 	new_parent = sysfs_get_dentry(new_parent_sd);
 	if (IS_ERR(new_parent)) {
 		error = PTR_ERR(new_parent);
-		goto out_dput;
+		goto out;
 	}
 
-	if (old_parent->d_inode == new_parent->d_inode) {
-		error = 0;
-		goto out_dput;	/* nothing to move */
-	}
 again:
 	mutex_lock(&old_parent->d_inode->i_mutex);
 	if (!mutex_trylock(&new_parent->d_inode->i_mutex)) {
 		mutex_unlock(&old_parent->d_inode->i_mutex);
 		goto again;
 	}
+	mutex_lock(&sysfs_mutex);
 
-	new_dentry = lookup_one_len(kobject_name(kobj), new_parent, strlen(kobject_name(kobj)));
-	if (IS_ERR(new_dentry)) {
-		error = PTR_ERR(new_dentry);
+	error = -EEXIST;
+	if (sysfs_find_dirent(new_parent_sd, sd->s_name))
 		goto out_unlock;
-	} else
-		error = 0;
+
+	error = -ENOMEM;
+	new_dentry = d_alloc_name(new_parent, sd->s_name);
+	if (!new_dentry)
+		goto out_unlock;
+
+	error = 0;
 	d_add(new_dentry, NULL);
 	d_move(old_dentry, new_dentry);
 	dput(new_dentry);
 
 	/* Remove from old parent's list and insert into new parent's list. */
-	mutex_lock(&sysfs_mutex);
-
 	sysfs_unlink_sibling(sd);
 	sysfs_get(new_parent_sd);
 	sysfs_put(sd->s_parent);
 	sd->s_parent = new_parent_sd;
 	sysfs_link_sibling(sd);
 
-	mutex_unlock(&sysfs_mutex);
-
  out_unlock:
+	mutex_unlock(&sysfs_mutex);
 	mutex_unlock(&new_parent->d_inode->i_mutex);
 	mutex_unlock(&old_parent->d_inode->i_mutex);
- out_dput:
+ out:
 	dput(new_parent);
 	dput(old_dentry);
 	dput(new_dentry);
-- 
cgit v1.2.3


From 5c3e8964ce87477a12e3e9edc3742156a3929a74 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 13 Sep 2007 02:53:13 -0700
Subject: sysfs: spit a warning to users when they try to create a duplicate
 sysfs file

We want to let people know when we create a duplicate sysfs file, as
they need to fix up their code.


Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index da4bb66a610..5d95aa8e23c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -428,8 +428,12 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  */
 int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
-	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
+	if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) {
+		printk(KERN_WARNING "sysfs: duplicate filename '%s' "
+		       "can not be created\n", sd->s_name);
+		WARN_ON(1);
 		return -EEXIST;
+	}
 
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
-- 
cgit v1.2.3


From 181b2e4be1603ce3ccace8322047a548f29f4b20 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:09 +0900
Subject: sysfs: fix comments of sysfs_add/remove_one()

sysfs_add/remove_one() now link and unlink the target dirent into and
from the children list.  Update comments accordingly.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5d95aa8e23c..fc615eed67d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -410,10 +410,8 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
  *	@sd: sysfs_dirent to be added
  *
  *	Get @acxt->parent_sd and set sd->s_parent to it and increment
- *	nlink of parent inode if @sd is a directory.  @sd is NOT
- *	linked into the children list of the parent.  The caller
- *	should invoke sysfs_link_sibling() after this function
- *	completes if @sd needs to be on the children list.
+ *	nlink of parent inode if @sd is a directory and link into the
+ *	children list of the parent.
  *
  *	This function should be called between calls to
  *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
@@ -453,9 +451,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
  *	@sd: sysfs_dirent to be added
  *
  *	Mark @sd removed and drop nlink of parent inode if @sd is a
- *	directory.  @sd is NOT unlinked from the children list of the
- *	parent.  The caller is repsonsible for removing @sd from the
- *	children list before calling this function.
+ *	directory.  @sd is unlinked from the children list.
  *
  *	This function should be called between calls to
  *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
-- 
cgit v1.2.3


From f88123eaf953f13a0c597dde54745d28f81236de Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:10 +0900
Subject: sysfs: fix sysfs_chmod_file() such that it updates sd->s_mode too

sysfs_chmod_file() looked and updated only inode of the target file.
Dentry and inode are reclaimable and the update mode data will go away
when the inode is reclaimed.  This patch makes sysfs_chmod_file()
update sd->s_mode too such that the change is permanent.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index ff93c92164b..9fdf8dae0dc 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -521,10 +521,19 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
 	}
 
 	inode = victim->d_inode;
+
 	mutex_lock(&inode->i_mutex);
+
 	newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	rc = notify_change(victim, &newattrs);
+
+	if (rc == 0) {
+		mutex_lock(&sysfs_mutex);
+		victim_sd->s_mode = newattrs.ia_mode;
+		mutex_unlock(&sysfs_mutex);
+	}
+
 	mutex_unlock(&inode->i_mutex);
  out:
 	dput(victim);
-- 
cgit v1.2.3


From 59f69015684b3de7b9472be9a81b1a978f93a496 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:10 +0900
Subject: sysfs: clean up header files

sysfs is about to go through major overhaul making this a pretty good
opportunity to clean up (out-of-tree changes and pending patches will
need regeneration anyway).  Clean up headers.

* Kill space between * and symbolname.

* Move SYSFS_* type constants and flags into fs/sysfs/sysfs.h.
  They're internal to sysfs.

* Reformat function prototypes and add argument symbol names.

* Make dummy function definition order match that of function
  prototypes.

* Add some comments.

* Reorganize fs/sysfs/sysfs.h according to which file the declared
  variable or feature lives in.

This patch does not introduce any behavior change.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/sysfs.h | 148 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 92 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 791b3ed91a9..63adbecc9ba 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,20 +1,24 @@
+/* type-specific structures for sysfs_dirent->s_* union members */
 struct sysfs_elem_dir {
-	struct kobject		* kobj;
+	struct kobject		*kobj;
 };
 
 struct sysfs_elem_symlink {
-	struct sysfs_dirent	* target_sd;
+	struct sysfs_dirent	*target_sd;
 };
 
 struct sysfs_elem_attr {
-	struct attribute	* attr;
+	struct attribute	*attr;
 };
 
 struct sysfs_elem_bin_attr {
-	struct bin_attribute	* bin_attr;
+	struct bin_attribute	*bin_attr;
 };
 
 /*
+ * sysfs_dirent - the building block of sysfs hierarchy.  Each and
+ * every sysfs node is represented by single sysfs_dirent.
+ *
  * As long as s_count reference is held, the sysfs_dirent itself is
  * accessible.  Dereferencing s_elem or any other outer entity
  * requires s_active reference.
@@ -22,10 +26,10 @@ struct sysfs_elem_bin_attr {
 struct sysfs_dirent {
 	atomic_t		s_count;
 	atomic_t		s_active;
-	struct sysfs_dirent	* s_parent;
-	struct sysfs_dirent	* s_sibling;
-	struct sysfs_dirent	* s_children;
-	const char		* s_name;
+	struct sysfs_dirent	*s_parent;
+	struct sysfs_dirent	*s_sibling;
+	struct sysfs_dirent	*s_children;
+	const char		*s_name;
 
 	union {
 		struct sysfs_elem_dir		dir;
@@ -37,12 +41,31 @@ struct sysfs_dirent {
 	unsigned int		s_flags;
 	umode_t			s_mode;
 	ino_t			s_ino;
-	struct iattr		* s_iattr;
+	struct iattr		*s_iattr;
 	atomic_t		s_event;
 };
 
-#define SD_DEACTIVATED_BIAS	INT_MIN
+#define SD_DEACTIVATED_BIAS		INT_MIN
 
+#define SYSFS_TYPE_MASK			0x00ff
+#define SYSFS_ROOT			0x0001
+#define SYSFS_DIR			0x0002
+#define SYSFS_KOBJ_ATTR			0x0004
+#define SYSFS_KOBJ_BIN_ATTR		0x0008
+#define SYSFS_KOBJ_LINK			0x0020
+#define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
+
+#define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
+#define SYSFS_FLAG_REMOVED		0x0200
+
+static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
+{
+	return sd->s_flags & SYSFS_TYPE_MASK;
+}
+
+/*
+ * Context structure to be used while adding/removing nodes.
+ */
 struct sysfs_addrm_cxt {
 	struct sysfs_dirent	*parent_sd;
 	struct inode		*parent_inode;
@@ -50,59 +73,47 @@ struct sysfs_addrm_cxt {
 	int			cnt;
 };
 
+/*
+ * mount.c
+ */
 extern struct sysfs_dirent sysfs_root;
+extern struct super_block *sysfs_sb;
 extern struct kmem_cache *sysfs_dir_cachep;
 
-extern struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
-extern struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-extern void sysfs_put_active(struct sysfs_dirent *sd);
-extern struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
-extern void sysfs_put_active_two(struct sysfs_dirent *sd);
-extern void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
-			      struct sysfs_dirent *parent_sd);
-extern int sysfs_add_one(struct sysfs_addrm_cxt *acxt,
-			  struct sysfs_dirent *sd);
-extern void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
-			     struct sysfs_dirent *sd);
-extern void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
-
-extern struct inode * sysfs_get_inode(struct sysfs_dirent *sd);
-
-extern void release_sysfs_dirent(struct sysfs_dirent * sd);
-extern struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-					      const unsigned char *name);
-extern struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
-					     const unsigned char *name);
-extern struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode,
-					     int type);
-
-extern int sysfs_add_file(struct sysfs_dirent *dir_sd,
-			  const struct attribute *attr, int type);
-extern int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
-extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
-
-extern int sysfs_create_subdir(struct kobject *kobj, const char *name,
-			       struct sysfs_dirent **p_sd);
-extern void sysfs_remove_subdir(struct sysfs_dirent *sd);
-
-extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-
-extern spinlock_t sysfs_assoc_lock;
+/*
+ * dir.c
+ */
 extern struct mutex sysfs_mutex;
 extern struct mutex sysfs_rename_mutex;
-extern struct super_block * sysfs_sb;
+extern spinlock_t sysfs_assoc_lock;
+
 extern const struct file_operations sysfs_dir_operations;
-extern const struct file_operations sysfs_file_operations;
-extern const struct file_operations bin_fops;
 extern const struct inode_operations sysfs_dir_inode_operations;
-extern const struct inode_operations sysfs_symlink_inode_operations;
 
-static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
-{
-	return sd->s_flags & SYSFS_TYPE_MASK;
-}
-
-static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd)
+struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
+struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
+void sysfs_put_active(struct sysfs_dirent *sd);
+struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
+void sysfs_put_active_two(struct sysfs_dirent *sd);
+void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
+		       struct sysfs_dirent *parent_sd);
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
+void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
+void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
+
+struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
+				       const unsigned char *name);
+struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
+				      const unsigned char *name);
+struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
+
+void release_sysfs_dirent(struct sysfs_dirent *sd);
+
+int sysfs_create_subdir(struct kobject *kobj, const char *name,
+			struct sysfs_dirent **p_sd);
+void sysfs_remove_subdir(struct sysfs_dirent *sd);
+
+static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
 {
 	if (sd) {
 		WARN_ON(!atomic_read(&sd->s_count));
@@ -111,8 +122,33 @@ static inline struct sysfs_dirent * sysfs_get(struct sysfs_dirent * sd)
 	return sd;
 }
 
-static inline void sysfs_put(struct sysfs_dirent * sd)
+static inline void sysfs_put(struct sysfs_dirent *sd)
 {
 	if (sd && atomic_dec_and_test(&sd->s_count))
 		release_sysfs_dirent(sd);
 }
+
+/*
+ * inode.c
+ */
+struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
+int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
+int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
+
+/*
+ * file.c
+ */
+extern const struct file_operations sysfs_file_operations;
+
+int sysfs_add_file(struct sysfs_dirent *dir_sd,
+		   const struct attribute *attr, int type);
+
+/*
+ * bin.c
+ */
+extern const struct file_operations bin_fops;
+
+/*
+ * symlink.c
+ */
+extern const struct inode_operations sysfs_symlink_inode_operations;
-- 
cgit v1.2.3


From 5a7ad7f044941316dc98eda2a087a12a7a50649d Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:10 +0900
Subject: sysfs: kill sysfs_update_file()

sysfs_update_file() depends on inode->i_mtime but sysfs iondes are now
reclaimable making the reported modification time unreliable.  There's
only one user (pci hotplug) of this notification mechanism and it
reportedly isn't utilized from userland.

Kill sysfs_update_file().

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 9fdf8dae0dc..61a8c19df7c 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -3,7 +3,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/fsnotify.h>
 #include <linux/kobject.h>
 #include <linux/namei.h>
 #include <linux/poll.h>
@@ -453,44 +452,6 @@ int sysfs_add_file_to_group(struct kobject *kobj,
 }
 EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
 
-
-/**
- * sysfs_update_file - update the modified timestamp on an object attribute.
- * @kobj: object we're acting for.
- * @attr: attribute descriptor.
- */
-int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
-{
-	struct sysfs_dirent *victim_sd = NULL;
-	struct dentry *victim = NULL;
-	int rc;
-
-	rc = -ENOENT;
-	victim_sd = sysfs_get_dirent(kobj->sd, attr->name);
-	if (!victim_sd)
-		goto out;
-
-	mutex_lock(&sysfs_rename_mutex);
-	victim = sysfs_get_dentry(victim_sd);
-	mutex_unlock(&sysfs_rename_mutex);
-	if (IS_ERR(victim)) {
-		rc = PTR_ERR(victim);
-		victim = NULL;
-		goto out;
-	}
-
-	mutex_lock(&victim->d_inode->i_mutex);
-	victim->d_inode->i_mtime = CURRENT_TIME;
-	fsnotify_modify(victim);
-	mutex_unlock(&victim->d_inode->i_mutex);
-	rc = 0;
- out:
-	dput(victim);
-	sysfs_put(victim_sd);
-	return rc;
-}
-
-
 /**
  * sysfs_chmod_file - update the modified mode value on an object attribute.
  * @kobj: object we're acting for.
@@ -641,4 +602,3 @@ EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
 
 EXPORT_SYMBOL_GPL(sysfs_create_file);
 EXPORT_SYMBOL_GPL(sysfs_remove_file);
-EXPORT_SYMBOL_GPL(sysfs_update_file);
-- 
cgit v1.2.3


From b13dc89c5a5bd5e34aadb44c0fb7e870959dcd06 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:10 +0900
Subject: sysfs: reposition sysfs_dirent->s_mode.

Move s_mode downward such that it's side-by-side with s_iattr which is
used for the same thing.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/sysfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 63adbecc9ba..6cf61c882ba 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -39,8 +39,8 @@ struct sysfs_dirent {
 	}			s_elem;
 
 	unsigned int		s_flags;
-	umode_t			s_mode;
 	ino_t			s_ino;
+	umode_t			s_mode;
 	struct iattr		*s_iattr;
 	atomic_t		s_event;
 };
-- 
cgit v1.2.3


From b05f0548dabd20433f8c201a0307103721d6a18b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:10 +0900
Subject: sysfs: kill unnecessary sysfs_get() in open paths

There's no reason to get an extra reference to sysfs_dirent for an
open file.  Open file has a reference to the dentry which in turn has
a reference to sysfs_dirent.  This is fairly obvious as otherwise open
itself won't be able to access the sysfs_dirent.  Kill the extra
sysfs_get() and matching sysfs_put().

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c  | 4 +---
 fs/sysfs/file.c | 6 +-----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a819a7e8d74..e93fe5e2fa4 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -193,9 +193,8 @@ static int open(struct inode * inode, struct file * file)
 	mutex_init(&bb->mutex);
 	file->private_data = bb;
 
-	/* open succeeded, put active reference and pin attr_sd */
+	/* open succeeded, put active reference */
 	sysfs_put_active(attr_sd);
-	sysfs_get(attr_sd);
 	return 0;
 
  err_out:
@@ -211,7 +210,6 @@ static int release(struct inode * inode, struct file * file)
 
 	if (bb->mmapped)
 		sysfs_put_active_two(attr_sd);
-	sysfs_put(attr_sd);
 	kfree(bb->buffer);
 	kfree(bb);
 	return 0;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 61a8c19df7c..73333dc6854 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -298,9 +298,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	buffer->ops = ops;
 	file->private_data = buffer;
 
-	/* open succeeded, put active references and pin attr_sd */
+	/* open succeeded, put active references */
 	sysfs_put_active_two(attr_sd);
-	sysfs_get(attr_sd);
 	return 0;
 
  err_out:
@@ -310,11 +309,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 
 static int sysfs_release(struct inode * inode, struct file * filp)
 {
-	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
 	struct sysfs_buffer *buffer = filp->private_data;
 
-	sysfs_put(attr_sd);
-
 	if (buffer) {
 		if (buffer->page)
 			free_page((unsigned long)buffer->page);
-- 
cgit v1.2.3


From 50ab1a72863b1ad4b117862bc52610f8d4535609 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:10 +0900
Subject: sysfs: kill unnecessary NULL pointer check in sysfs_release()

In sysfs_release(), sysfs_buffer pointed to by filp->private_data is
guaranteed to exist.  Kill the unnecessary NULL check.  This also
makes the code more consistent with the counterpart in fs/sysfs/bin.c.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 73333dc6854..8f1ebd88b9c 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -311,11 +311,10 @@ static int sysfs_release(struct inode * inode, struct file * filp)
 {
 	struct sysfs_buffer *buffer = filp->private_data;
 
-	if (buffer) {
-		if (buffer->page)
-			free_page((unsigned long)buffer->page);
-		kfree(buffer);
-	}
+	if (buffer->page)
+		free_page((unsigned long)buffer->page);
+	kfree(buffer);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 078ce6409ca54d5fc6eb7d2147cd6efc3eb09078 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:10 +0900
Subject: sysfs: make bin attr open get active reference of parent too

All bin attr operations require active references of itself and its
parent.  There's no reason to allow open when its parent has been
deactivated and allowing it is inconsistent with regular sysfs file.
Use sysfs_get_active_two() in bin attribute open function.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index e93fe5e2fa4..9c8f8824e66 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -171,8 +171,8 @@ static int open(struct inode * inode, struct file * file)
 	struct bin_buffer *bb = NULL;
 	int error;
 
-	/* need attr_sd for attr */
-	if (!sysfs_get_active(attr_sd))
+	/* binary file operations requires both @sd and its parent */
+	if (!sysfs_get_active_two(attr_sd))
 		return -ENODEV;
 
 	error = -EACCES;
@@ -193,12 +193,12 @@ static int open(struct inode * inode, struct file * file)
 	mutex_init(&bb->mutex);
 	file->private_data = bb;
 
-	/* open succeeded, put active reference */
-	sysfs_put_active(attr_sd);
+	/* open succeeded, put active references */
+	sysfs_put_active_two(attr_sd);
 	return 0;
 
  err_out:
-	sysfs_put_active(attr_sd);
+	sysfs_put_active_two(attr_sd);
 	kfree(bb);
 	return error;
 }
-- 
cgit v1.2.3


From b1fc3d6144d56360d1373b01c7881826f558b6cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:11 +0900
Subject: sysfs: make s_elem an anonymous union

Make s_elem an anonymous union.  Prefixing with s_elem makes things
needlessly longer without any advantage.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c     | 14 +++++++-------
 fs/sysfs/dir.c     |  4 ++--
 fs/sysfs/file.c    | 14 +++++++-------
 fs/sysfs/inode.c   |  2 +-
 fs/sysfs/symlink.c |  4 ++--
 fs/sysfs/sysfs.h   | 10 +++++-----
 6 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 9c8f8824e66..247ea19721c 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -30,8 +30,8 @@ static int
 fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
+	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
+	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
@@ -87,8 +87,8 @@ static int
 flush_write(struct dentry *dentry, char *buffer, loff_t offset, size_t count)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
+	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
+	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
 
 	/* need attr_sd for attr, its parent for kobj */
@@ -140,8 +140,8 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct bin_buffer *bb = file->private_data;
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
-	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
+	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
+	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	int rc;
 
 	mutex_lock(&bb->mutex);
@@ -167,7 +167,7 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 static int open(struct inode * inode, struct file * file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct bin_attribute *attr = attr_sd->s_elem.bin_attr.bin_attr;
+	struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
 	struct bin_buffer *bb = NULL;
 	int error;
 
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index fc615eed67d..6ee76a82eb3 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -273,7 +273,7 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
 	parent_sd = sd->s_parent;
 
 	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
-		sysfs_put(sd->s_elem.symlink.target_sd);
+		sysfs_put(sd->s_symlink.target_sd);
 	if (sysfs_type(sd) & SYSFS_COPY_NAME)
 		kfree(sd->s_name);
 	kfree(sd->s_iattr);
@@ -630,7 +630,7 @@ static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
 	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
 	if (!sd)
 		return -ENOMEM;
-	sd->s_elem.dir.kobj = kobj;
+	sd->s_dir.kobj = kobj;
 
 	/* link in */
 	sysfs_addrm_start(&acxt, parent_sd);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 8f1ebd88b9c..3c91a57a1ed 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -73,7 +73,7 @@ struct sysfs_buffer {
 static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
+	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	struct sysfs_ops * ops = buffer->ops;
 	int ret = 0;
 	ssize_t count;
@@ -88,7 +88,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 		return -ENODEV;
 
 	buffer->event = atomic_read(&attr_sd->s_event);
-	count = ops->show(kobj, attr_sd->s_elem.attr.attr, buffer->page);
+	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
 
 	sysfs_put_active_two(attr_sd);
 
@@ -188,7 +188,7 @@ static int
 flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t count)
 {
 	struct sysfs_dirent *attr_sd = dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
+	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	struct sysfs_ops * ops = buffer->ops;
 	int rc;
 
@@ -196,7 +196,7 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t
 	if (!sysfs_get_active_two(attr_sd))
 		return -ENODEV;
 
-	rc = ops->store(kobj, attr_sd->s_elem.attr.attr, buffer->page, count);
+	rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
 
 	sysfs_put_active_two(attr_sd);
 
@@ -240,7 +240,7 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
 static int sysfs_open_file(struct inode *inode, struct file *file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
+	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 	struct sysfs_buffer * buffer;
 	struct sysfs_ops * ops = NULL;
 	int error;
@@ -336,7 +336,7 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 {
 	struct sysfs_buffer * buffer = filp->private_data;
 	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_elem.dir.kobj;
+	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
 
 	/* need parent for the kobj, grab both */
 	if (!sysfs_get_active_two(attr_sd))
@@ -396,7 +396,7 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
 	sd = sysfs_new_dirent(attr->name, mode, type);
 	if (!sd)
 		return -ENOMEM;
-	sd->s_elem.attr.attr = (void *)attr;
+	sd->s_attr.attr = (void *)attr;
 
 	sysfs_addrm_start(&acxt, dir_sd);
 	rc = sysfs_add_one(&acxt, sd);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 0d706a86d2c..b6ac4e6d6e7 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -172,7 +172,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 		inode->i_fop = &sysfs_file_operations;
 		break;
 	case SYSFS_KOBJ_BIN_ATTR:
-		bin_attr = sd->s_elem.bin_attr.bin_attr;
+		bin_attr = sd->s_bin_attr.bin_attr;
 		inode->i_size = bin_attr->size;
 		inode->i_fop = &bin_fops;
 		break;
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 8ad38bccc0e..ffa82e9802a 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -86,7 +86,7 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
 	if (!sd)
 		goto out_put;
 
-	sd->s_elem.symlink.target_sd = target_sd;
+	sd->s_symlink.target_sd = target_sd;
 	target_sd = NULL;	/* reference is now owned by the symlink */
 
 	sysfs_addrm_start(&acxt, parent_sd);
@@ -142,7 +142,7 @@ static int sysfs_getlink(struct dentry *dentry, char * path)
 {
 	struct sysfs_dirent *sd = dentry->d_fsdata;
 	struct sysfs_dirent *parent_sd = sd->s_parent;
-	struct sysfs_dirent *target_sd = sd->s_elem.symlink.target_sd;
+	struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
 	int error;
 
 	mutex_lock(&sysfs_mutex);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 6cf61c882ba..2a68bfa46e4 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -32,11 +32,11 @@ struct sysfs_dirent {
 	const char		*s_name;
 
 	union {
-		struct sysfs_elem_dir		dir;
-		struct sysfs_elem_symlink	symlink;
-		struct sysfs_elem_attr		attr;
-		struct sysfs_elem_bin_attr	bin_attr;
-	}			s_elem;
+		struct sysfs_elem_dir		s_dir;
+		struct sysfs_elem_symlink	s_symlink;
+		struct sysfs_elem_attr		s_attr;
+		struct sysfs_elem_bin_attr	s_bin_attr;
+	};
 
 	unsigned int		s_flags;
 	ino_t			s_ino;
-- 
cgit v1.2.3


From d6b4fd2faeb9ddf55ce09cf90b88981e579ee010 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:11 +0900
Subject: sysfs: open code sysfs_attach_dentry()

sysfs_attach_dentry() now has only one caller and isn't doing much
other than obfuscating the code.  Open code and kill it.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 6ee76a82eb3..48a3ed4f453 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -332,21 +332,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 	return NULL;
 }
 
-/**
- *	sysfs_attach_dentry - associate sysfs_dirent with dentry
- *	@sd: target sysfs_dirent
- *	@dentry: dentry to associate
- *
- *	LOCKING:
- *	mutex_lock(sysfs_mutex)
- */
-static void sysfs_attach_dentry(struct sysfs_dirent *sd, struct dentry *dentry)
-{
-	dentry->d_op = &sysfs_dentry_ops;
-	dentry->d_fsdata = sysfs_get(sd);
-	d_rehash(dentry);
-}
-
 static int sysfs_ilookup_test(struct inode *inode, void *arg)
 {
 	struct sysfs_dirent *sd = arg;
@@ -696,8 +681,11 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
+	/* instantiate and hash dentry */
+	dentry->d_op = &sysfs_dentry_ops;
+	dentry->d_fsdata = sysfs_get(sd);
 	d_instantiate(dentry, inode);
-	sysfs_attach_dentry(sd, dentry);
+	d_rehash(dentry);
 
  out_unlock:
 	mutex_unlock(&sysfs_mutex);
-- 
cgit v1.2.3


From dc2f75f0e0cac645c22c85bcf429683b3fa0d2d9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:12 +0900
Subject: sysfs: make sysfs_root a regular directory dirent

sysfs_root is different from a regular directory dirent in that it's
of type SYSFS_ROOT and doesn't have a name.  These differences aren't
used by anybody and only adds to complexity.  Make sysfs_root a
regular directory dirent.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/inode.c | 5 -----
 fs/sysfs/mount.c | 3 ++-
 fs/sysfs/sysfs.h | 9 ++++-----
 3 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index b6ac4e6d6e7..c40fb9fdd04 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -157,11 +157,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 
 	/* initialize inode according to type */
 	switch (sysfs_type(sd)) {
-	case SYSFS_ROOT:
-		inode->i_op = &sysfs_dir_inode_operations;
-		inode->i_fop = &sysfs_dir_operations;
-		inc_nlink(inode); /* directory, account for "." */
-		break;
 	case SYSFS_DIR:
 		inode->i_op = &sysfs_dir_inode_operations;
 		inode->i_fop = &sysfs_dir_operations;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 28bf359981f..465902c18d3 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -24,8 +24,9 @@ static const struct super_operations sysfs_ops = {
 };
 
 struct sysfs_dirent sysfs_root = {
+	.s_name		= "",
 	.s_count	= ATOMIC_INIT(1),
-	.s_flags	= SYSFS_ROOT,
+	.s_flags	= SYSFS_DIR,
 	.s_mode		= S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
 	.s_ino		= 1,
 };
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 2a68bfa46e4..60405a6e4c2 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -48,11 +48,10 @@ struct sysfs_dirent {
 #define SD_DEACTIVATED_BIAS		INT_MIN
 
 #define SYSFS_TYPE_MASK			0x00ff
-#define SYSFS_ROOT			0x0001
-#define SYSFS_DIR			0x0002
-#define SYSFS_KOBJ_ATTR			0x0004
-#define SYSFS_KOBJ_BIN_ATTR		0x0008
-#define SYSFS_KOBJ_LINK			0x0020
+#define SYSFS_DIR			0x0001
+#define SYSFS_KOBJ_ATTR			0x0002
+#define SYSFS_KOBJ_BIN_ATTR		0x0004
+#define SYSFS_KOBJ_LINK			0x0008
 #define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
 
 #define SYSFS_FLAG_MASK			~SYSFS_TYPE_MASK
-- 
cgit v1.2.3


From bc747f37a0f089b9366f7385ff870e12911f2383 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:12 +0900
Subject: sysfs: move sysfs_dirent->s_children into sysfs_dirent->s_dir

Children list head is only meaninful for directory nodes.  Move it
into s_dir.  This doesn't save any space currently but it will with
further changes.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   | 17 +++++++++--------
 fs/sysfs/inode.c |  2 +-
 fs/sysfs/sysfs.h |  3 ++-
 3 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 48a3ed4f453..4ad9422566a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -26,7 +26,7 @@ static DEFINE_IDA(sysfs_ino_ida);
  *	@sd: sysfs_dirent of interest
  *
  *	Link @sd into its sibling list which starts from
- *	sd->s_parent->s_children.
+ *	sd->s_parent->s_dir.children.
  *
  *	Locking:
  *	mutex_lock(sysfs_mutex)
@@ -40,9 +40,9 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
 
 	/* Store directory entries in order by ino.  This allows
 	 * readdir to properly restart without having to add a
-	 * cursor into the s_children list.
+	 * cursor into the s_dir.children list.
 	 */
-	for (pos = &parent_sd->s_children; *pos; pos = &(*pos)->s_sibling) {
+	for (pos = &parent_sd->s_dir.children; *pos; pos = &(*pos)->s_sibling) {
 		if (sd->s_ino < (*pos)->s_ino)
 			break;
 	}
@@ -55,7 +55,7 @@ static void sysfs_link_sibling(struct sysfs_dirent *sd)
  *	@sd: sysfs_dirent of interest
  *
  *	Unlink @sd from its sibling list which starts from
- *	sd->s_parent->s_children.
+ *	sd->s_parent->s_dir.children.
  *
  *	Locking:
  *	mutex_lock(sysfs_mutex)
@@ -64,7 +64,8 @@ static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
 {
 	struct sysfs_dirent **pos;
 
-	for (pos = &sd->s_parent->s_children; *pos; pos = &(*pos)->s_sibling) {
+	for (pos = &sd->s_parent->s_dir.children; *pos;
+	     pos = &(*pos)->s_sibling) {
 		if (*pos == sd) {
 			*pos = sd->s_sibling;
 			sd->s_sibling = NULL;
@@ -570,7 +571,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 {
 	struct sysfs_dirent *sd;
 
-	for (sd = parent_sd->s_children; sd; sd = sd->s_sibling)
+	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling)
 		if (!strcmp(sd->s_name, name))
 			return sd;
 	return NULL;
@@ -722,7 +723,7 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
 
 	pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
 	sysfs_addrm_start(&acxt, dir_sd);
-	pos = &dir_sd->s_children;
+	pos = &dir_sd->s_dir.children;
 	while (*pos) {
 		struct sysfs_dirent *sd = *pos;
 
@@ -922,7 +923,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		mutex_lock(&sysfs_mutex);
 
 		/* Skip the dentries we have already reported */
-		pos = parent_sd->s_children;
+		pos = parent_sd->s_dir.children;
 		while (pos && (filp->f_pos > pos->s_ino))
 			pos = pos->s_sibling;
 
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index c40fb9fdd04..2210cf0005f 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -127,7 +127,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
 	struct sysfs_dirent *child;
 	int nr = 0;
 
-	for (child = sd->s_children; child; child = child->s_sibling)
+	for (child = sd->s_dir.children; child; child = child->s_sibling)
 		if (sysfs_type(child) == SYSFS_DIR)
 			nr++;
 
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 60405a6e4c2..42b0327d162 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,6 +1,8 @@
 /* type-specific structures for sysfs_dirent->s_* union members */
 struct sysfs_elem_dir {
 	struct kobject		*kobj;
+	/* children list starts here and goes through sd->s_sibling */
+	struct sysfs_dirent	*children;
 };
 
 struct sysfs_elem_symlink {
@@ -28,7 +30,6 @@ struct sysfs_dirent {
 	atomic_t		s_active;
 	struct sysfs_dirent	*s_parent;
 	struct sysfs_dirent	*s_sibling;
-	struct sysfs_dirent	*s_children;
 	const char		*s_name;
 
 	union {
-- 
cgit v1.2.3


From 85a4ffad3de77177591f7c2c18c26c3c8dd28bff Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:12 +0900
Subject: sysfs: implement sysfs_open_dirent

Implement sysfs_open_dirent which represents an open file (attribute)
sysfs_dirent.  A file sysfs_dirent with one or more open files have
one sysfs_dirent and all sysfs_buffers (one for each open instance)
are linked to it.

sysfs_open_dirent doesn't actually do anything yet but will be used to
off-load things which are specific for open file sysfs_dirent from it.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/sysfs/sysfs.h |   3 ++
 2 files changed, 111 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 3c91a57a1ed..b13ba94cf8a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -49,6 +49,22 @@ static struct sysfs_ops subsys_sysfs_ops = {
 	.store	= subsys_attr_store,
 };
 
+/*
+ * There's one sysfs_buffer for each open file and one
+ * sysfs_open_dirent for each sysfs_dirent with one or more open
+ * files.
+ *
+ * filp->private_data points to sysfs_buffer and
+ * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open
+ * is protected by sysfs_open_dirent_lock.
+ */
+static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED;
+
+struct sysfs_open_dirent {
+	atomic_t		refcnt;
+	struct list_head	buffers; /* goes through sysfs_buffer.list */
+};
+
 struct sysfs_buffer {
 	size_t			count;
 	loff_t			pos;
@@ -57,6 +73,7 @@ struct sysfs_buffer {
 	struct mutex		mutex;
 	int			needs_read_fill;
 	int			event;
+	struct list_head	list;
 };
 
 /**
@@ -237,6 +254,86 @@ sysfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t
 	return len;
 }
 
+/**
+ *	sysfs_get_open_dirent - get or create sysfs_open_dirent
+ *	@sd: target sysfs_dirent
+ *	@buffer: sysfs_buffer for this instance of open
+ *
+ *	If @sd->s_attr.open exists, increment its reference count;
+ *	otherwise, create one.  @buffer is chained to the buffers
+ *	list.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
+				 struct sysfs_buffer *buffer)
+{
+	struct sysfs_open_dirent *od, *new_od = NULL;
+
+ retry:
+	spin_lock(&sysfs_open_dirent_lock);
+
+	if (!sd->s_attr.open && new_od) {
+		sd->s_attr.open = new_od;
+		new_od = NULL;
+	}
+
+	od = sd->s_attr.open;
+	if (od) {
+		atomic_inc(&od->refcnt);
+		list_add_tail(&buffer->list, &od->buffers);
+	}
+
+	spin_unlock(&sysfs_open_dirent_lock);
+
+	if (od) {
+		kfree(new_od);
+		return 0;
+	}
+
+	/* not there, initialize a new one and retry */
+	new_od = kmalloc(sizeof(*new_od), GFP_KERNEL);
+	if (!new_od)
+		return -ENOMEM;
+
+	atomic_set(&new_od->refcnt, 0);
+	INIT_LIST_HEAD(&new_od->buffers);
+	goto retry;
+}
+
+/**
+ *	sysfs_put_open_dirent - put sysfs_open_dirent
+ *	@sd: target sysfs_dirent
+ *	@buffer: associated sysfs_buffer
+ *
+ *	Put @sd->s_attr.open and unlink @buffer from the buffers list.
+ *	If reference count reaches zero, disassociate and free it.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
+				  struct sysfs_buffer *buffer)
+{
+	struct sysfs_open_dirent *od = sd->s_attr.open;
+
+	spin_lock(&sysfs_open_dirent_lock);
+
+	list_del(&buffer->list);
+	if (atomic_dec_and_test(&od->refcnt))
+		sd->s_attr.open = NULL;
+	else
+		od = NULL;
+
+	spin_unlock(&sysfs_open_dirent_lock);
+
+	kfree(od);
+}
+
 static int sysfs_open_file(struct inode *inode, struct file *file)
 {
 	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
@@ -298,19 +395,29 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	buffer->ops = ops;
 	file->private_data = buffer;
 
+	/* make sure we have open dirent struct */
+	error = sysfs_get_open_dirent(attr_sd, buffer);
+	if (error)
+		goto err_free;
+
 	/* open succeeded, put active references */
 	sysfs_put_active_two(attr_sd);
 	return 0;
 
+ err_free:
+	kfree(buffer);
  err_out:
 	sysfs_put_active_two(attr_sd);
 	return error;
 }
 
-static int sysfs_release(struct inode * inode, struct file * filp)
+static int sysfs_release(struct inode *inode, struct file *filp)
 {
+	struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
 	struct sysfs_buffer *buffer = filp->private_data;
 
+	sysfs_put_open_dirent(sd, buffer);
+
 	if (buffer->page)
 		free_page((unsigned long)buffer->page);
 	kfree(buffer);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 42b0327d162..3adce7d5e4f 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,3 +1,5 @@
+struct sysfs_open_dirent;
+
 /* type-specific structures for sysfs_dirent->s_* union members */
 struct sysfs_elem_dir {
 	struct kobject		*kobj;
@@ -11,6 +13,7 @@ struct sysfs_elem_symlink {
 
 struct sysfs_elem_attr {
 	struct attribute	*attr;
+	struct sysfs_open_dirent *open;
 };
 
 struct sysfs_elem_bin_attr {
-- 
cgit v1.2.3


From a4e8b912541d5372ae049a3b7c1979968e52c40b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 16:05:12 +0900
Subject: sysfs: move sysfs file poll implementation to sysfs_open_dirent

Sysfs file poll implementation is scattered over sysfs and kobject.
Event numbering is done in sysfs_dirent but wait itself is done on
kobject.  This not only unecessarily bloats both kobject and
sysfs_dirent but is also buggy - if a sysfs_dirent is removed while
there still are pollers, the associaton betwen the kobject and
sysfs_dirent breaks and kobject may be freed with the pollers still
sleeping on it.

This patch moves whole poll implementation into sysfs_open_dirent.
Each time a sysfs_open_dirent is created, event number restarts from 1
and pollers sleep on sysfs_open_dirent.  As event sequence number is
meaningless without any open file and pollers should have open file
and thus sysfs_open_dirent, this ephemeral event counting works and is
a saner implementation.

This patch fixes the dnagling sleepers bug and reduces the sizes of
kobject and sysfs_dirent by one pointer.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c   |  1 -
 fs/sysfs/file.c  | 25 +++++++++++++++++++------
 fs/sysfs/sysfs.h |  1 -
 3 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4ad9422566a..e301a1207b6 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -318,7 +318,6 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
 
 	atomic_set(&sd->s_count, 1);
 	atomic_set(&sd->s_active, 0);
-	atomic_set(&sd->s_event, 1);
 
 	sd->s_name = name;
 	sd->s_mode = mode;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index b13ba94cf8a..c05f9618b2d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -62,6 +62,8 @@ static spinlock_t sysfs_open_dirent_lock = SPIN_LOCK_UNLOCKED;
 
 struct sysfs_open_dirent {
 	atomic_t		refcnt;
+	atomic_t		event;
+	wait_queue_head_t	poll;
 	struct list_head	buffers; /* goes through sysfs_buffer.list */
 };
 
@@ -104,7 +106,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
 	if (!sysfs_get_active_two(attr_sd))
 		return -ENODEV;
 
-	buffer->event = atomic_read(&attr_sd->s_event);
+	buffer->event = atomic_read(&attr_sd->s_attr.open->event);
 	count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
 
 	sysfs_put_active_two(attr_sd);
@@ -301,6 +303,8 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 		return -ENOMEM;
 
 	atomic_set(&new_od->refcnt, 0);
+	atomic_set(&new_od->event, 1);
+	init_waitqueue_head(&new_od->poll);
 	INIT_LIST_HEAD(&new_od->buffers);
 	goto retry;
 }
@@ -443,17 +447,17 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 {
 	struct sysfs_buffer * buffer = filp->private_data;
 	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
+	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
 
 	/* need parent for the kobj, grab both */
 	if (!sysfs_get_active_two(attr_sd))
 		goto trigger;
 
-	poll_wait(filp, &kobj->poll, wait);
+	poll_wait(filp, &od->poll, wait);
 
 	sysfs_put_active_two(attr_sd);
 
-	if (buffer->event != atomic_read(&attr_sd->s_event))
+	if (buffer->event != atomic_read(&od->event))
 		goto trigger;
 
 	return 0;
@@ -474,8 +478,17 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr)
 	if (sd && attr)
 		sd = sysfs_find_dirent(sd, attr);
 	if (sd) {
-		atomic_inc(&sd->s_event);
-		wake_up_interruptible(&k->poll);
+		struct sysfs_open_dirent *od;
+
+		spin_lock(&sysfs_open_dirent_lock);
+
+		od = sd->s_attr.open;
+		if (od) {
+			atomic_inc(&od->event);
+			wake_up_interruptible(&od->poll);
+		}
+
+		spin_unlock(&sysfs_open_dirent_lock);
 	}
 
 	mutex_unlock(&sysfs_mutex);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3adce7d5e4f..269c845c590 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -46,7 +46,6 @@ struct sysfs_dirent {
 	ino_t			s_ino;
 	umode_t			s_mode;
 	struct iattr		*s_iattr;
-	atomic_t		s_event;
 };
 
 #define SD_DEACTIVATED_BIAS		INT_MIN
-- 
cgit v1.2.3


From 6d66f5cd26e4c482e986130b7572f2735a0f7e8b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 20 Sep 2007 17:31:38 +0900
Subject: sysfs: add copyrights

Sysfs has gone through considerable amount of reimplementation.  Add
copyrights.  Any objections?  :-)

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c     |  8 +++++++-
 fs/sysfs/dir.c     | 10 +++++++++-
 fs/sysfs/file.c    | 10 +++++++++-
 fs/sysfs/inode.c   |  8 ++++++--
 fs/sysfs/mount.c   | 10 +++++++++-
 fs/sysfs/symlink.c | 10 +++++++++-
 fs/sysfs/sysfs.h   | 10 ++++++++++
 7 files changed, 59 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 247ea19721c..006fc64227d 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -1,9 +1,15 @@
 /*
- * bin.c - binary file operations for sysfs.
+ * fs/sysfs/bin.c - sysfs binary file implementation
  *
  * Copyright (c) 2003 Patrick Mochel
  * Copyright (c) 2003 Matthew Wilcox
  * Copyright (c) 2004 Silicon Graphics, Inc.
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 
 #undef DEBUG
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index e301a1207b6..9161db4d6b5 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -1,5 +1,13 @@
 /*
- * dir.c - Operations for sysfs directories.
+ * fs/sysfs/dir.c - sysfs core and dir operation implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 
 #undef DEBUG
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c05f9618b2d..d3be1e7fb48 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -1,5 +1,13 @@
 /*
- * file.c - operations for regular (text) files.
+ * fs/sysfs/file.c - sysfs regular (text) file implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 
 #include <linux/module.h>
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 2210cf0005f..9236635111f 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -1,7 +1,11 @@
 /*
- * inode.c - basic inode and dentry operations.
+ * fs/sysfs/inode.c - basic sysfs inode and dentry operations
  *
- * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
  *
  * Please see Documentation/filesystems/sysfs.txt for more information.
  */
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 465902c18d3..c76c540be3c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -1,5 +1,13 @@
 /*
- * mount.c - operations for initializing and mounting sysfs.
+ * fs/sysfs/symlink.c - operations for initializing and mounting sysfs
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 
 #define DEBUG 
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index ffa82e9802a..3eac20c63c4 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -1,5 +1,13 @@
 /*
- * symlink.c - operations for sysfs symlinks.
+ * fs/sysfs/symlink.c - sysfs symlink implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 
 #include <linux/fs.h>
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 269c845c590..f0326f281d1 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -1,3 +1,13 @@
+/*
+ * fs/sysfs/sysfs.h - sysfs internal header file
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ */
+
 struct sysfs_open_dirent;
 
 /* type-specific structures for sysfs_dirent->s_* union members */
-- 
cgit v1.2.3