123 files changed, 23328 insertions, 1544 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 86b203fc3c5..9f7270f36b2 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -175,9 +175,34 @@ source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
-
 source "fs/exofs/Kconfig"
 
+config NILFS2_FS
+	tristate "NILFS2 file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or on a per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs,
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 70b2aed8713..af6d04700d9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -114,6 +114,7 @@ obj-$(CONFIG_JFS_FS)		+= jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
 obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
+obj-$(CONFIG_NILFS2_FS)		+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 49f18942306..7ad36506c25 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
 	struct net_device *dev;
 	int ret = -ENODEV;
 
-	if (maclen != ETH_ALEN)
-		BUG();
+	BUG_ON(maclen != ETH_ALEN);
 
 	rtnl_lock();
 	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661..622e73775c8 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 41f2b4d0093..ca40f828f64 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/fs.h>
+#include <asm/page.h> /* for PAGE_SIZE */
 
 #include "befs.h"
 #include "super.h"
diff --git a/fs/buffer.c b/fs/buffer.c
index 5d55a896ff7..13edf7ad3ff 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping;
+	struct address_space *mapping, *prev_mapping = NULL;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * contents - it is a noop if I/O is still in
 				 * flight on potentially older contents.
 				 */
-				ll_rw_block(SWRITE_SYNC, 1, &bh);
+				ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+
+				/*
+				 * Kick off IO for the previous mapping. Note
+				 * that we will not run the very last mapping,
+				 * wait_on_buffer() will do that for us
+				 * through sync_buffer().
+				 */
+				if (prev_mapping && prev_mapping != mapping)
+					blk_run_address_space(prev_mapping);
+				prev_mapping = mapping;
+
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -1585,6 +1596,16 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * locked buffer.   This only can happen if someone has written the buffer
  * directly, with submit_bh().  At the address_space level PageWriteback
  * prevents this contention from occurring.
+ *
+ * If block_write_full_page() is called with wbc->sync_mode ==
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
+ * causes the writes to be flagged as synchronous writes, but the
+ * block device queue will NOT be unplugged, since usually many pages
+ * will be pushed out before the higher-level caller actually
+ * waits for the writes to be completed.  The various wait functions,
+ * such as wait_on_writeback_range() will ultimately call sync_page()
+ * which will ultimately call blk_run_backing_dev(), which will end up
+ * unplugging the device queue.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
 			get_block_t *get_block, struct writeback_control *wbc)
@@ -1595,7 +1616,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
-	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC_PLUG : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
@@ -2957,12 +2979,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (rw == SWRITE || rw == SWRITE_SYNC)
+		if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
 			lock_buffer(bh);
 		else if (!trylock_buffer(bh))
 			continue;
 
-		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
+		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
+		    rw == SWRITE_SYNC_PLUG) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
@@ -2998,7 +3021,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 		wait_on_buffer(bh);
 		if (buffer_eopnotsupp(bh)) {
 			clear_buffer_eopnotsupp(bh);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7..da258e7249c 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_SYNC;
+		rw = WRITE_ODIRECT;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index b43b9556366..acf67883110 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode,
 
 	if (depth == 0)
 		return (err);
-reread:
-	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
 
+	partial = ext2_get_branch(inode, depth, offsets, chain, &err);
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
@@ -602,15 +601,16 @@ reread:
 		while (count < maxblocks && count <= blocks_to_boundary) {
 			ext2_fsblk_t blk;
 
-			if (!verify_chain(chain, partial)) {
+			if (!verify_chain(chain, chain + depth - 1)) {
 				/*
 				 * Indirect block might be removed by
 				 * truncate while we were reading it.
 				 * Handling of that case: forget what we've
 				 * got now, go to reread.
 				 */
+				err = -EAGAIN;
 				count = 0;
-				goto changed;
+				break;
 			}
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
 			if (blk == first_block + count)
@@ -618,7 +618,8 @@ reread:
 			else
 				break;
 		}
-		goto got_it;
+		if (err != -EAGAIN)
+			goto got_it;
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
@@ -626,6 +627,33 @@ reread:
 		goto cleanup;
 
 	mutex_lock(&ei->truncate_mutex);
+	/*
+	 * If the indirect block is missing while we are reading
+	 * the chain (ext2_get_branch() returns -EAGAIN err), or
+	 * if the chain has been changed after we grab truncate_mutex,
+	 * (either because another process truncated this branch, or
+	 * another get_block allocated this branch) re-grab the chain to see if
+	 * the request block has been allocated or not.
+	 *
+	 * Since we already block the truncate/other get_block
+	 * at this point, we will have the current copy of the chain when we
+	 * splice the branch into the tree.
+	 */
+	if (err == -EAGAIN || !verify_chain(chain, partial)) {
+		while (partial > chain) {
+			brelse(partial->bh);
+			partial--;
+		}
+		partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+		if (!partial) {
+			count++;
+			mutex_unlock(&ei->truncate_mutex);
+			if (err)
+				goto cleanup;
+			clear_buffer_new(bh_result);
+			goto got_it;
+		}
+	}
 
 	/*
 	 * Okay, we need to do block allocation.  Lazily initialize the block
@@ -683,12 +711,6 @@ cleanup:
 		partial--;
 	}
 	return err;
-changed:
-	while (partial > chain) {
-		brelse(partial->bh);
-		partial--;
-	}
-	goto reread;
 }
 
 int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0f..fb3c1a21b13 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem does not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 used to
+	  historically default to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 466a332e0bd..fcfa2436185 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1521,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page,
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, inode->i_sb->s_blocksize,
 				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
-		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
-		return block_write_full_page(page, NULL, wbc);
+		page_bufs = page_buffers(page);
+	} else {
+		page_bufs = page_buffers(page);
+		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+				       NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
 	}
-	page_bufs = page_buffers(page);
-
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1581,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (page_has_buffers(page)) {
+		if (!walk_page_buffers(NULL, page_buffers(page), 0,
+				      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
+	}
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1..599dbfe504c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
                    cope, else JOURNAL_DATA */
 		if (journal_check_available_features
 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-			set_opt(sbi->s_mount_opt, ORDERED_DATA);
+			set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 		else
 			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 		break;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ac77d8b8251..6132353dcf6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -342,7 +342,7 @@ static int ext4_valid_extent_idx(struct inode *inode,
 	ext4_fsblk_t block = idx_pblock(ext_idx);
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
-			(block > ext4_blocks_count(es))))
+			(block >= ext4_blocks_count(es))))
 		return 0;
 	else
 		return 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a2e7952bc5f..c6bd6ced3bb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode,
 }
 
 static int __ext4_check_blockref(const char *function, struct inode *inode,
-				 unsigned int *p, unsigned int max) {
+				 __le32 *p, unsigned int max) {
 
 	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
-	unsigned int *bref = p;
+	__le32 *bref = p;
 	while (bref < p+max) {
-		if (unlikely(*bref >= maxblocks)) {
+		if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
 			ext4_error(inode->i_sb, function,
 				   "block reference %u >= max (%u) "
 				   "in inode #%lu, offset=%d",
-				   *bref, maxblocks,
+				   le32_to_cpu(*bref), maxblocks,
 				   inode->i_ino, (int)(bref-p));
  			return -EIO;
  		}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9987bba99db..2958f4e6f22 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
 
+	/* check blocks count against device size */
+	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
+		       "exceeds size of device (%llu blocks)\n",
+		       ext4_blocks_count(es), blocks_count);
+		goto failed_mount;
+	}
+
         /*
          * It makes no sense for the first data block to be beyond the end
          * of the filesystem.
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2b25133524a..06f30e96567 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -938,9 +938,9 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 }
 
 static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
-			       unsigned *nbytesp, int write)
+			       size_t *nbytesp, int write)
 {
-	unsigned nbytes = *nbytesp;
+	size_t nbytes = *nbytesp;
 	unsigned long user_addr = (unsigned long) buf;
 	unsigned offset = user_addr & ~PAGE_MASK;
 	int npages;
@@ -955,7 +955,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
 		return 0;
 	}
 
-	nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
 	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
 	down_read(&current->mm->mmap_sem);
@@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_flags & VM_MAYSHARE)
 		return -ENODEV;
 
+	invalidate_inode_pages2(file->f_mapping);
+
 	return generic_file_mmap(file, vma);
 }
 
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9435dda8f1e..a1cbff2b4d9 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 		BUG();
 		return 0;
 	}
+
+	if (!tree)
+		return 0;
+
 	if (tree->node_size >= PAGE_CACHE_SIZE) {
 		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
 		spin_lock(&tree->hash_lock);
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 36ca2e1a4fa..7b6165f25fb 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb)
 	if (HFS_SB(sb)->nls_disk)
 		unload_nls(HFS_SB(sb)->nls_disk);
 
+	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
 	kfree(HFS_SB(sb));
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8077b9c898..a8e8513a78a 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index c7bd649bbbd..3e9afc2a91d 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -55,6 +55,25 @@
  *			need do nothing.
  * RevokeValid set, Revoked set:
  *			buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, the replay code also uses the hash tables, but at that point no one
+ * else can touch them (the filesystem isn't mounted yet) and hence no locking
+ * is needed.
  */
 
 #ifndef __KERNEL__
@@ -402,8 +421,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
  * the second time we would still have a pending revoke to cancel.  So,
  * do not trust the Revoked bit on buffers unless RevokeValid is also
  * set.
- *
- * The caller must have the journal locked.
  */
 int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -481,10 +498,7 @@ void journal_switch_revoke_table(journal_t *journal)
 /*
  * Write revoke records to the journal for all entries in the current
  * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
  */
-
 void journal_write_revoke_records(journal_t *journal,
 				  transaction_t *transaction)
 {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a..073c8c3df7c 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -190,7 +190,7 @@ retry:
 		set_buffer_uptodate(bh);
 		bh->b_end_io = journal_end_buffer_io_sync;
 
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 		if (ret) {
 			unlock_buffer(bh);
 			return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 77ccf8cb082..043740dde20 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
 	size_t s;
 
 	size -= sizeof(struct jffs2_acl_header);
-	s = size - 4 * sizeof(struct jffs2_acl_entry_short);
-	if (s < 0) {
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
 		if (size % sizeof(struct jffs2_acl_entry_short))
 			return -1;
 		return size / sizeof(struct jffs2_acl_entry_short);
 	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
 		if (s % sizeof(struct jffs2_acl_entry))
 			return -1;
 		return s / sizeof(struct jffs2_acl_entry) + 4;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f..9eff2bdae8a 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 {
 	struct jffs2_xattr_datum *xd;
-	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", xd);
 
-	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
 	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
 struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 {
 	struct jffs2_xattr_ref *ref;
-	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", ref);
 
-	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
 	ref->node = (void *)ref;
 	return ref;
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516..cd223190c4e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
  * possibly a read which collects the result - which is stored in a
  * file-local buffer.
  */
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+	/*
+	 * The barrier ensures that ar->size will really remain zero until
+	 * ar->data is ready for reading.
+	 */
+	smp_mb();
+	ar->size = n;
+}
+
 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
 {
 	struct simple_transaction_argresp *ar;
@@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
+EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
 EXPORT_SYMBOL(simple_transaction_read);
 EXPORT_SYMBOL(simple_transaction_release);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9d..83ee34203bd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			ret = nlm_granted;
 			goto out;
 		case -EAGAIN:
+			/*
+			 * If this is a blocking request for an
+			 * already pending lock request then we need
+			 * to put it back on lockd's block list
+			 */
+			if (wait)
+				break;
 			ret = nlm_lck_denied;
-			break;
+			goto out;
 		case FILE_LOCK_DEFERRED:
 			if (wait)
 				break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 	}
 
-	ret = nlm_lck_denied;
-	if (!wait)
-		goto out;
-
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3523b895eb4..5a97bcfe03e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -516,8 +516,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out_unlock;
 
 	ret = nfs_updatepage(filp, page, 0, pagelen);
-	if (ret == 0)
-		ret = pagelen;
 out_unlock:
 	unlock_page(page);
 	if (ret)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 82eaadbff40..6717200923f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1228,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
-			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
@@ -1258,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
+			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab9..503b9da159a 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
 config NFSD
 	tristate "NFS server support"
 	depends on INET
+	depends on FILE_LOCKING
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb9128..7c9fe838f03 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
 #include <linux/unistd.h>
 #include <linux/slab.h>
 #include <linux/major.h>
+#include <linux/magic.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 					 struct nfsd3_writeres  *resp)
 {
 	__be32	nfserr;
+	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE(3)    %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
 				   rqstp->rq_vec, argp->vlen,
-				   argp->len,
+				   &cnt,
 				   &resp->committed);
-	resp->count = argp->count;
+	resp->count = cnt;
 	RETURN_STATUS(nfserr);
 }
 
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 		struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
 
 		/* Note that we don't care for remote fs's here */
-		if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) {
+		if (sb->s_magic == MSDOS_SUPER_MAGIC) {
 			resp->f_properties = NFS3_FSF_BILLYBOY;
 		}
 		resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
 			resp->p_link_max = EXT2_LINK_MAX;
 			resp->p_name_max = EXT2_NAME_LEN;
 			break;
-		case 0x4d44:	/* MSDOS_SUPER_MAGIC */
+		case MSDOS_SUPER_MAGIC:
 			resp->p_case_insensitive = 1;
 			resp->p_case_preserving  = 0;
 			break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b599..290289bd44f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
 encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 {
 	__be32 *p;
-	int len = cb_rec->cbr_fhlen;
+	int len = cb_rec->cbr_fh.fh_size;
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
 	WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
-	WRITEMEM(cb_rec->cbr_fhval, len);
+	WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
 	return 0;
 }
 
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cb_set an atomic? */
 
-static int do_probe_callback(void *data)
+static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
 {
-	struct nfs4_client *clp = data;
 	struct sockaddr_in	addr;
 	struct nfs4_callback    *cb = &clp->cl_callback;
 	struct rpc_timeout	timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name    = clp->cl_principal,
 	};
-	struct rpc_message msg = {
-		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp       = clp,
-	};
 	struct rpc_clnt *client;
-	int status;
 
-	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
-		status = nfserr_cb_path_down;
-		goto out_err;
-	}
+	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+		return ERR_PTR(-EINVAL);
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
 
 	/* Create RPC client */
 	client = rpc_create(&args);
+	if (IS_ERR(client))
+		dprintk("NFSD: couldn't create callback client: %ld\n",
+			PTR_ERR(client));
+	return client;
+
+}
+
+static int do_probe_callback(void *data)
+{
+	struct nfs4_client *clp = data;
+	struct nfs4_callback    *cb = &clp->cl_callback;
+	struct rpc_message msg = {
+		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+		.rpc_argp       = clp,
+	};
+	struct rpc_clnt *client;
+	int status;
+
+	client = setup_callback_client(clp);
 	if (IS_ERR(client)) {
-		dprintk("NFSD: couldn't create callback client\n");
 		status = PTR_ERR(client);
+		dprintk("NFSD: couldn't create callback client: %d\n",
+								status);
 		goto out_err;
 	}
 
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
 out_release_client:
 	rpc_shutdown_client(client);
 out_err:
-	dprintk("NFSD: warning: no callback path to client %.*s\n",
-		(int)clp->cl_name.len, clp->cl_name.data);
+	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
+		(int)clp->cl_name.len, clp->cl_name.data, status);
 	put_nfs4_client(clp);
-	return status;
+	return 0;
 }
 
 /*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 
 /*
  * called with dp->dl_count inc'ed.
- * nfs4_lock_state() may or may not have been called.
  */
 void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48..b2883e9c638 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	open->op_truncate = 0;
 
 	if (open->op_create) {
+		/* FIXME: check session persistence and pnfs flags.
+		 * The nfsv4.1 spec requires the following semantics:
+		 *
+		 * Persistent   | pNFS   | Server REQUIRED | Client Allowed
+		 * Reply Cache  | server |                 |
+		 * -------------+--------+-----------------+--------------------
+		 * no           | no     | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 *              |        |                 | (SHOULD)
+		 *              |        | and EXCLUSIVE4  | or EXCLUSIVE4
+		 *              |        |                 | (SHOULD NOT)
+		 * no           | yes    | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 * yes          | no     | GUARDED4        | GUARDED4
+		 * yes          | yes    | GUARDED4        | GUARDED4
+		 */
+
 		/*
 		 * Note: create modes (UNCHECKED,GUARDED...) are the same
 		 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 					(u32 *)open->op_verf.data,
 					&open->op_truncate, &created);
 
-		/* If we ever decide to use different attrs to store the
-		 * verifier in nfsd_create_v3, then we'll need to change this
+		/*
+		 * Following rfc 3530 14.2.16, use the returned bitmask
+		 * to indicate which attributes we used to store the
+		 * verifier:
 		 */
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
-			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
 						FATTR4_WORD1_TIME_MODIFY);
 	} else {
 		status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 		goto out;
 
 	set_change_info(&open->op_cinfo, current_fh);
-
-	/* set reply cache */
 	fh_dup2(current_fh, &resfh);
-	open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-			&resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
 
+	/* set reply cache */
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&resfh.fh_handle);
 	if (!created)
 		status = do_open_permission(rqstp, current_fh, open,
 					    NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
 	/* set replay cache */
-	open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-		&current_fh->fh_handle.fh_base,
-		current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&current_fh->fh_handle);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	return status;
 }
 
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+	struct nfsd4_sessionid *sid;
+
+	sid = (struct nfsd4_sessionid *)session->se_sessionid.data;
+	clid->cl_id = sid->clientid.cl_id;	/* clientid is embedded in the session id */
+	clid->cl_boot = sid->clientid.cl_boot;
+}
 
 static __be32
 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	   struct nfsd4_open *open)
 {
 	__be32 status;
+	struct nfsd4_compoundres *resp;
+
 	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
 		open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
 		return nfserr_inval;
 
+	if (nfsd4_has_session(cstate))
+		copy_clientid(&open->op_clientid, cstate->session);
+
 	nfs4_lock_state();
 
 	/* check seqid for replay. set nfs4_owner */
-	status = nfsd4_process_open1(open);
+	resp = rqstp->rq_resp;
+	status = nfsd4_process_open1(&resp->cstate, open);
 	if (status == nfserr_replay_me) {
 		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
 		fh_put(&cstate->current_fh);
-		cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
-		memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
-				rp->rp_openfh_len);
+		fh_copy_shallow(&cstate->current_fh.fh_handle,
+				&rp->rp_openfh);
 		status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 		if (status)
 			dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-			status = nfserr_inval;
-			if (open->op_create)
-				goto out;
-			/* fall through */
 		case NFS4_OPEN_CLAIM_NULL:
 			/*
 			 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	getattr->ga_fhp = &cstate->current_fh;
 	return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check stateid */
-	if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-				&read->rd_stateid,
-				CHECK_FH | RD_STATE, &read->rd_filp))) {
+	if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
+						 RD_STATE, &read->rd_filp))) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
 	}
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
 	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		nfs4_lock_state();
-		status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-			&setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
+		status = nfs4_preprocess_stateid_op(cstate,
+			&setattr->sa_stateid, WR_STATE, NULL);
 		nfs4_unlock_state();
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file *filp = NULL;
 	u32 *p;
 	__be32 status = nfs_ok;
+	unsigned long cnt;
 
 	/* no need to check permission - this will be done in nfsd_write() */
 
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid,
-					CHECK_FH | WR_STATE, &filp);
+	status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
 	if (filp)
 		get_file(filp);
 	nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 	}
 
-	write->wr_bytes_written = write->wr_buflen;
+	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;
 	p = (u32 *)write->wr_verifier.data;
 	*p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status =  nfsd_write(rqstp, &cstate->current_fh, filp,
 			     write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-			     write->wr_buflen, &write->wr_how_written);
+			     &cnt, &write->wr_how_written);
 	if (filp)
 		fput(filp);
 
+	write->wr_bytes_written = cnt;
+
 	if (status == nfserr_symlink)
 		status = nfserr_inval;
 	return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
-	if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0)
-	    || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
+	    || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
+	    || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
 		return nfserr_attrnotsupp;
 	if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
 	    || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out_kfree;
 
-	p = buf + 3;
+	/* skip bitmap */
+	p = buf + 1 + ntohl(buf[0]);
 	status = nfserr_not_same;
 	if (ntohl(*p++) != verify->ve_attrlen)
 		goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 		nfsdstats.nfs4_opcount[opnum]++;
 }
 
-static void cstate_free(struct nfsd4_compound_state *cstate)
-{
-	if (cstate == NULL)
-		return;
-	fh_put(&cstate->current_fh);
-	fh_put(&cstate->save_fh);
-	BUG_ON(cstate->replay_owner);
-	kfree(cstate);
-}
-
-static struct nfsd4_compound_state *cstate_alloc(void)
-{
-	struct nfsd4_compound_state *cstate;
-
-	cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
-	if (cstate == NULL)
-		return NULL;
-	fh_init(&cstate->current_fh, NFS4_FHSIZE);
-	fh_init(&cstate->save_fh, NFS4_FHSIZE);
-	cstate->replay_owner = NULL;
-	return cstate;
-}
-
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
+enum nfsd4_op_flags {
+	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
+	ALLOWED_ON_ABSENT_FS = 1 << 1,	/* ops processed on absent fs */
+	ALLOWED_AS_FIRST_OP = 1 << 2,	/* ops required first in compound */
+};
 
 struct nfsd4_operation {
 	nfsd4op_func op_func;
 	u32 op_flags;
-/* Most ops require a valid current filehandle; a few don't: */
-#define ALLOWED_WITHOUT_FH 1
-/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
-#define ALLOWED_ON_ABSENT_FS 2
 	char *op_name;
 };
 
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
+ * This is a replay of a compound for which no cache entry pages
+ * were used. Encode the sequence operation, and if cachethis is FALSE
+ * encode the uncache rep error on the next operation.
+ */
+static __be32
+nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
+			 struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_op *op;
+
+	dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
+		resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
+
+	/* Encode the replayed sequence operation (must be the only op so far) */
+	BUG_ON(resp->opcnt != 1);
+	op = &args->ops[resp->opcnt - 1];
+	nfsd4_encode_operation(resp, op);
+
+	/* Return nfserr_retry_uncached_rep in the next operation. */
+	if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
+		op = &args->ops[resp->opcnt++];	/* NOTE(review): assumes a 2nd op exists */
+		op->status = nfserr_retry_uncached_rep;
+		nfsd4_encode_operation(resp, op);
+	}
+	return op->status;	/* status of the last op encoded above */
+}
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules.
+ *
+ * TODO:
+ * - enforce NFS4ERR_NOT_ONLY_OP,
+ * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ */
+static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+{
+	struct nfsd4_op *first = args->ops;
+
+	if (!args->minorversion || args->opcnt <= 0)
+		return true;	/* v4.0 or empty compound: nothing to enforce */
+	return first->status == nfserr_op_illegal ||
+	       (nfsd4_ops[first->opnum].op_flags & ALLOWED_AS_FIRST_OP);
+}
+
+/*
  * COMPOUND call.
  */
 static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 {
 	struct nfsd4_op	*op;
 	struct nfsd4_operation *opdesc;
-	struct nfsd4_compound_state *cstate = NULL;
+	struct nfsd4_compound_state *cstate = &resp->cstate;
 	int		slack_bytes;
 	__be32		status;
 
 	resp->xbuf = &rqstp->rq_res;
-	resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+	resp->p = rqstp->rq_res.head[0].iov_base +
+						rqstp->rq_res.head[0].iov_len;
 	resp->tagp = resp->p;
 	/* reserve space for: taglen, tag, and opcnt */
 	resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->tag = args->tag;
 	resp->opcnt = 0;
 	resp->rqstp = rqstp;
+	resp->cstate.minorversion = args->minorversion;
+	resp->cstate.replay_owner = NULL;
+	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
+	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+	/* Use the deferral mechanism only for NFSv4.0 compounds */
+	rqstp->rq_usedeferral = (args->minorversion == 0);
 
 	/*
 	 * According to RFC3010, this takes precedence over all other errors.
 	 */
 	status = nfserr_minor_vers_mismatch;
-	if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	status = nfserr_resource;
-	cstate = cstate_alloc();
-	if (cstate == NULL)
-		goto out;
+	if (!nfs41_op_ordering_ok(args)) {
+		op = &args->ops[0];
+		op->status = nfserr_sequence_pos;
+		goto encode_op;
+	}
 
 	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
 			resp->opcnt, args->opcnt, op->opnum,
 			nfsd4_op_name(op->opnum));
-
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 			BUG_ON(op->status == nfs_ok);
 
 encode_op:
+		/* Only from SEQUENCE or CREATE_SESSION */
+		if (resp->cstate.status == nfserr_replay_cache) {
+			dprintk("%s NFS4.1 replay from cache\n", __func__);
+			if (nfsd4_not_cached(resp))
+				status = nfsd4_enc_uncached_replay(args, resp);
+			else
+				status = op->status;
+			goto out;
+		}
 		if (op->status == nfserr_replay_me) {
 			op->replay = &cstate->replay_owner->so_replay;
 			nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
 
 		nfsd4_increment_op_stats(op->opnum);
 	}
+	if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
+		dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
+		status = nfserr_jukebox;
+	}
 
-	cstate_free(cstate);
+	resp->cstate.status = status;
+	fh_put(&resp->cstate.current_fh);
+	fh_put(&resp->cstate.save_fh);
+	BUG_ON(resp->cstate.replay_owner);
 out:
 	nfsd4_release_compoundargs(args);
+	/* Reset deferral mechanism for RPC deferrals */
+	rqstp->rq_usedeferral = 1;
 	dprintk("nfsv4 compound returned %d\n", ntohl(status));
 	return status;
 }
 
-static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
+static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
 		.op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_name = "OP_PUTFH",
 	},
 	[OP_PUTPUBFH] = {
-		/* unsupported, just for future reference: */
+		.op_func = (nfsd4op_func)nfsd4_putrootfh,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_PUTPUBFH",
 	},
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
 		.op_name = "OP_RELEASE_LOCKOWNER",
 	},
+
+	/* NFSv4.1 operations */
+	[OP_EXCHANGE_ID] = {
+		.op_func = (nfsd4op_func)nfsd4_exchange_id,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_EXCHANGE_ID",
+	},
+	[OP_CREATE_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_create_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_CREATE_SESSION",
+	},
+	[OP_DESTROY_SESSION] = {
+		.op_func = (nfsd4op_func)nfsd4_destroy_session,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_DESTROY_SESSION",
+	},
+	[OP_SEQUENCE] = {
+		.op_func = (nfsd4op_func)nfsd4_sequence,
+		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
+		.op_name = "OP_SEQUENCE",
+	},
 };
 
 static const char *nfsd4_op_name(unsigned opnum)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567f..3444c0052a8 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
 
 typedef int (recdir_func)(struct dentry *, struct dentry *);
 
-struct dentry_list {
-	struct dentry *dentry;
+struct name_list {
+	char name[HEXDIR_LEN];
 	struct list_head list;
 };
 
-struct dentry_list_arg {
-	struct list_head dentries;
-	struct dentry *parent;
-};
-
 static int
-nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
+nfsd4_build_namelist(void *arg, const char *name, int namlen,
 		loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct dentry_list_arg *dla = arg;
-	struct list_head *dentries = &dla->dentries;
-	struct dentry *parent = dla->parent;
-	struct dentry *dentry;
-	struct dentry_list *child;
+	struct list_head *names = arg;
+	struct name_list *entry;
 
-	if (name && isdotent(name, namlen))
+	if (namlen != HEXDIR_LEN - 1)
 		return 0;
-	dentry = lookup_one_len(name, parent, namlen);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	child = kmalloc(sizeof(*child), GFP_KERNEL);
-	if (child == NULL)
+	entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
+	if (entry == NULL)
 		return -ENOMEM;
-	child->dentry = dentry;
-	list_add(&child->list, dentries);
+	memcpy(entry->name, name, HEXDIR_LEN - 1);
+	entry->name[HEXDIR_LEN - 1] = '\0';
+	list_add(&entry->list, names);
 	return 0;
 }
 
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 {
 	const struct cred *original_cred;
 	struct file *filp;
-	struct dentry_list_arg dla = {
-		.parent = dir,
-	};
-	struct list_head *dentries = &dla.dentries;
-	struct dentry_list *child;
+	LIST_HEAD(names);
+	struct name_list *entry;
+	struct dentry *dentry;
 	int status;
 
 	if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
 	status = nfs4_save_creds(&original_cred);
 	if (status < 0)
 		return status;
-	INIT_LIST_HEAD(dentries);
 
 	filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
 			   current_cred());
 	status = PTR_ERR(filp);
 	if (IS_ERR(filp))
 		goto out;
-	INIT_LIST_HEAD(dentries);
-	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
+	status = vfs_readdir(filp, nfsd4_build_namelist, &names);
 	fput(filp);
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		status = f(dir, child->dentry);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+
+		dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+		if (IS_ERR(dentry)) {
+			status = PTR_ERR(dentry);
+			goto out;
+		}
+		status = f(dir, dentry);
+		dput(dentry);
 		if (status)
 			goto out;
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 out:
-	while (!list_empty(dentries)) {
-		child = list_entry(dentries->next, struct dentry_list, list);
-		list_del(&child->list);
-		dput(child->dentry);
-		kfree(child);
+	while (!list_empty(&names)) {
+		entry = list_entry(names.next, struct name_list, list);
+		list_del(&entry->list);
+		kfree(entry);
 	}
 	nfs4_reset_creds(original_cred);
 	return status;
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
 {
 	int status;
 
-	if (nfs4_has_reclaimed_state(child->d_name.name))
+	/* note: we currently use this path only for minorversion 0 */
+	if (nfs4_has_reclaimed_state(child->d_name.name, false))
 		return 0;
 
 	status = nfsd4_clear_clid_dir(parent, child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94..c65a27b76a9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
 static u32 nfs4_init;
 static stateid_t zerostateid;             /* bits all 0 */
 static stateid_t onestateid;              /* bits all 1 */
+static u64 current_sessionid = 1;
 
 #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
 #define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid;              /* bits all 1 */
 /* forward declarations */
 static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
-static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 static void nfs4_set_recdir(char *recdir);
 
-/* Locking:
- *
- * client_mutex:
- * 	protects clientid_hashtbl[], clientstr_hashtbl[],
- * 	unconfstr_hashtbl[], uncofid_hashtbl[].
- */
+/* Locking: */
+
+/* Currently used for almost all code touching nfsv4 state: */
 static DEFINE_MUTEX(client_mutex);
 
+/*
+ * Currently used for the del_recall_lru and file hash table.  In an
+ * effort to decrease the scope of the client_mutex, this spinlock may
+ * eventually cover more:
+ */
+static DEFINE_SPINLOCK(recall_lock);
+
 static struct kmem_cache *stateowner_slab = NULL;
 static struct kmem_cache *file_slab = NULL;
 static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
 	return x;
 }
 
-/* forward declarations */
-static void release_stateowner(struct nfs4_stateowner *sop);
-static void release_stateid(struct nfs4_stateid *stp, int flags);
-
-/*
- * Delegation state
- */
-
-/* recall_lock protects the del_recall_lru */
-static DEFINE_SPINLOCK(recall_lock);
 static struct list_head del_recall_lru;
 
-static void
-free_nfs4_file(struct kref *kref)
-{
-	struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
-	list_del(&fp->fi_hash);
-	iput(fp->fi_inode);
-	kmem_cache_free(file_slab, fp);
-}
-
 static inline void
 put_nfs4_file(struct nfs4_file *fi)
 {
-	kref_put(&fi->fi_ref, free_nfs4_file);
+	if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+		list_del(&fi->fi_hash);
+		spin_unlock(&recall_lock);
+		iput(fi->fi_inode);
+		kmem_cache_free(file_slab, fi);
+	}
 }
 
 static inline void
 get_nfs4_file(struct nfs4_file *fi)
 {
-	kref_get(&fi->fi_ref);
+	atomic_inc(&fi->fi_ref);
 }
 
 static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp->dl_stateid.si_stateownerid = current_delegid++;
 	dp->dl_stateid.si_fileid = 0;
 	dp->dl_stateid.si_generation = 0;
-	dp->dl_fhlen = current_fh->fh_handle.fh_size;
-	memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
-		        current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head	unconf_id_hashtbl[CLIENT_HASH_SIZE];
 static struct list_head client_lru;
 static struct list_head close_lru;
 
+static void unhash_generic_stateid(struct nfs4_stateid *stp)
+{
+	list_del(&stp->st_hash);		/* unlink only; stp is not freed here */
+	list_del(&stp->st_perfile);
+	list_del(&stp->st_perstateowner);
+}
+
+static void free_generic_stateid(struct nfs4_stateid *stp)
+{
+	put_nfs4_file(stp->st_file);	/* drop this stateid's ref on its nfs4_file */
+	kmem_cache_free(stateid_slab, stp);
+}
+
+static void release_lock_stateid(struct nfs4_stateid *stp)
+{
+	unhash_generic_stateid(stp);	/* unlink, drop posix locks, then free */
+	locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
+	free_generic_stateid(stp);
+}
+
+static void unhash_lockowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_stateid *stp;
+
+	list_del(&sop->so_idhash);
+	list_del(&sop->so_strhash);
+	list_del(&sop->so_perstateid);
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_first_entry(&sop->so_stateids,
+				struct nfs4_stateid, st_perstateowner);
+		release_lock_stateid(stp);	/* unlinks stp, so the list shrinks */
+	}
+}
+
+static void release_lockowner(struct nfs4_stateowner *sop)
+{
+	unhash_lockowner(sop);		/* unlink sop and release its stateids */
+	nfs4_put_stateowner(sop);	/* presumably drops a reference -- confirm */
+}
+
+static void
+release_stateid_lockowners(struct nfs4_stateid *open_stp)
+{
+	struct list_head *owners = &open_stp->st_lockowners;
+	struct nfs4_stateowner *lock_sop;
+
+	while (!list_empty(owners)) {	/* release_lockowner() unlinks each */
+		lock_sop = list_first_entry(owners, struct nfs4_stateowner,
+					    so_perstateid);
+		BUG_ON(lock_sop->so_is_open_owner);
+		release_lockowner(lock_sop);
+	}
+}
+
+static void release_open_stateid(struct nfs4_stateid *stp)
+{
+	unhash_generic_stateid(stp);
+	release_stateid_lockowners(stp);	/* lock owners hang off st_lockowners */
+	nfsd_close(stp->st_vfs_file);
+	free_generic_stateid(stp);
+}
+
+static void unhash_openowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_stateid *stp;
+
+	list_del(&sop->so_idhash);
+	list_del(&sop->so_strhash);
+	list_del(&sop->so_perclient);
+	list_del(&sop->so_perstateid); /* XXX: necessary? */
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_first_entry(&sop->so_stateids,
+				struct nfs4_stateid, st_perstateowner);
+		release_open_stateid(stp);	/* unlinks stp, so the list shrinks */
+	}
+}
+
+static void release_openowner(struct nfs4_stateowner *sop)
+{
+	unhash_openowner(sop);		/* unlink sop and release its stateids */
+	list_del(&sop->so_close_lru);	/* unlink from so_close_lru as well */
+	nfs4_put_stateowner(sop);	/* presumably drops a reference -- confirm */
+}
+
+static DEFINE_SPINLOCK(sessionid_lock);
+#define SESSION_HASH_SIZE	512
+static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
+
+static inline int
+hash_sessionid(struct nfs4_sessionid *sessionid)
+{
+	/* the hash bucket is picked from the id's sequence field alone */
+	return ((struct nfsd4_sessionid *)sessionid)->sequence
+			% SESSION_HASH_SIZE;
+}
+
+static inline void
+dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
+{
+	u32 *w = (u32 *)sessionid->data;	/* view the id as 32-bit words */
+	dprintk("%s: %u:%u:%u:%u\n", fn, w[0], w[1], w[2], w[3]);
+}
+
+static void
+gen_sessionid(struct nfsd4_session *ses)
+{
+	struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)ses->se_sessionid.data;
+
+	/* id = owning client's clientid + next global sequence number */
+	sid->clientid = ses->se_client->cl_clientid;
+	sid->sequence = current_sessionid++;
+	sid->reserved = 0;
+}
+
+/*
+ * Give the client the number of slots it requests bound by
+ * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
+ *
+ * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
+ * should (up to a point) re-negotiate active sessions and reduce their
+ * slot usage to make room for new connections. For now we just fail the
+ * create session.
+ */
+static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
+{
+	int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
+	/* np: DRC pages wanted; clamp to what is left and reserve them */
+	spin_lock(&nfsd_serv->sv_lock);
+	if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
+		np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
+	nfsd_serv->sv_drc_pages_used += np;
+	spin_unlock(&nfsd_serv->sv_lock);
+	/* nothing reserved fails the request; otherwise convert pages to slots */
+	if (np <= 0) {
+		status = nfserr_resource;
+		fchan->maxreqs = 0;
+	} else
+		fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
+
+	return status;
+}
+
+/*
+ * fchan holds the client values on input, and the server values on output
+ */
+static int init_forechannel_attrs(struct svc_rqst *rqstp,
+				    struct nfsd4_session *session,
+				    struct nfsd4_channel_attrs *fchan)
+{
+	int status = 0;
+	__u32   maxcount = svc_max_payload(rqstp);
+
+	/* headerpadsz set to zero in encode routine */
+
+	/* Use the client's max request and max response size if possible */
+	if (fchan->maxreq_sz > maxcount)
+		fchan->maxreq_sz = maxcount;
+	session->se_fmaxreq_sz = fchan->maxreq_sz;
+
+	if (fchan->maxresp_sz > maxcount)
+		fchan->maxresp_sz = maxcount;
+	session->se_fmaxresp_sz = fchan->maxresp_sz;
+
+	/* Set the max cached response size to our default, which is
+	 * small and a multiple of PAGE_SIZE */
+	session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
+	fchan->maxresp_cached = session->se_fmaxresp_cached;
+
+	/* Use the client's maxops if possible */
+	if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
+		fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
+	session->se_fmaxops = fchan->maxops;
+
+	/* try to use the client requested number of slots */
+	if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
+		fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
+
+	/* FIXME: Error means no more DRC pages so the server should
+	 * recover pages from existing sessions. For now fail session
+	 * creation.
+	 */
+	status = set_forechannel_maxreqs(fchan);
+
+	session->se_fnumslots = fchan->maxreqs;	/* 0 when the reservation failed */
+	return status;
+}
+
+static int
+alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
+		   struct nfsd4_create_session *cses)
+{
+	struct nfsd4_session *new, tmp;
+	int idx, status, slotsize;
+
+	memset(&tmp, 0, sizeof(tmp));
+
+	/* FIXME: For now, we just accept the client back channel attributes. */
+	status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
+	if (status)
+		goto out;
+
+	/* allocate struct nfsd4_session and slot table in one piece */
+	slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
+	new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
+	status = nfserr_resource;	/* was 0 after init_forechannel_attrs() */
+	if (!new)
+		goto out;
+
+	memcpy(new, &tmp, sizeof(*new));
+	new->se_client = clp;
+	gen_sessionid(new);
+	idx = hash_sessionid(&new->se_sessionid);
+	memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+
+	new->se_flags = cses->flags;
+	kref_init(&new->se_ref);
+	spin_lock(&sessionid_lock);
+	list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+	list_add(&new->se_perclnt, &clp->cl_sessions);
+	spin_unlock(&sessionid_lock);
+
+	status = nfs_ok;
+out:
+	return status;
+}
+
+/* caller must hold sessionid_lock */
+static struct nfsd4_session *
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+{
+	struct nfsd4_session *ses;
+	int bucket;
+
+	dump_sessionid(__func__, sessionid);
+	bucket = hash_sessionid(sessionid);
+	dprintk("%s: idx is %d\n", __func__, bucket);
+	/* Walk the one hash chain this id can live on */
+	list_for_each_entry(ses, &sessionid_hashtbl[bucket], se_hash) {
+		dump_sessionid("list traversal", &ses->se_sessionid);
+		if (memcmp(ses->se_sessionid.data, sessionid->data,
+			   NFS4_MAX_SESSIONID_LEN) != 0)
+			continue;
+		return ses;	/* all id bytes match */
+	}
+
+	dprintk("%s: session not found\n", __func__);
+	return NULL;
+}
+
+/* caller must hold sessionid_lock */
+static void
+unhash_session(struct nfsd4_session *ses)
+{
+	list_del(&ses->se_perclnt);	/* off the owning client's cl_sessions */
+	list_del(&ses->se_hash);	/* off its sessionid_hashtbl chain */
+}
+
+static void
+release_session(struct nfsd4_session *ses)
+{
+	spin_lock(&sessionid_lock);	/* unhash_session() requires this lock */
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+	nfsd4_put_session(ses);	/* presumably drops se_ref -- confirm */
+}
+
+static void nfsd4_release_respages(struct page **respages, short resused);
+
+void
+free_session(struct kref *kref)
+{
+	struct nfsd4_session *ses;
+	int i;
+
+	ses = container_of(kref, struct nfsd4_session, se_ref);
+	for (i = 0; i < ses->se_fnumslots; i++) {
+		struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
+		nfsd4_release_respages(e->ce_respages, e->ce_resused);
+	}
+	/* se_slots shares the single kzalloc() with ses: free it only once */
+	kfree(ses);
+}
+
 static inline void
 renew_client(struct nfs4_client *clp)
 {
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
 {
 	if (clid->cl_boot == boot_time)
 		return 0;
-	dprintk("NFSD stale clientid (%08x/%08x)\n", 
-			clid->cl_boot, clid->cl_id);
+	dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
+		clid->cl_boot, clid->cl_id, boot_time);
 	return 1;
 }
 
@@ -376,6 +649,8 @@ static inline void
 free_client(struct nfs4_client *clp)
 {
 	shutdown_callback_client(clp);
+	nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
+			     clp->cl_slot.sl_cache_entry.ce_resused);
 	if (clp->cl_cred.cr_group_info)
 		put_group_info(clp->cl_cred.cr_group_info);
 	kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
 	list_del(&clp->cl_lru);
 	while (!list_empty(&clp->cl_openowners)) {
 		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
-		release_stateowner(sop);
+		release_openowner(sop);
+	}
+	while (!list_empty(&clp->cl_sessions)) {
+		struct nfsd4_session  *ses;
+		ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
+				 se_perclnt);
+		release_session(ses);
 	}
 	put_nfs4_client(clp);
 }
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
 	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_sessions);
 	INIT_LIST_HEAD(&clp->cl_lru);
 	return clp;
 }
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
 	return NULL;
 }
 
+/*
+ * Return 1 iff clp's clientid establishment method matches the use_exchange_id
+ * parameter. Matching is based on the fact that at least one of the
+ * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1,
+ * so a non-zero cl_exchange_flags is taken to mean EXCHANGE_ID was used.
+ * FIXME: we need to unify the clientid namespaces for nfsv4.x
+ * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
+ * and SET_CLIENTID{,_CONFIRM}
+ */
+static inline int
+match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
+{
+	bool has_exchange_flags = (clp->cl_exchange_flags != 0);
+	return use_exchange_id == has_exchange_flags;
+}
+
 static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_str(const char *dname, unsigned int hashval,
+			     bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
 }
 
 static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
+			       bool use_exchange_id)
 {
 	struct nfs4_client *clp;
 
 	list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
-		if (same_name(clp->cl_recdir, dname))
+		if (same_name(clp->cl_recdir, dname) &&
+		    match_clientid_establishment(clp, use_exchange_id))
 			return clp;
 	}
 	return NULL;
@@ -685,6 +987,534 @@ out_err:
 	return;
 }
 
+void
+nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+
+	resp->cstate.statp = statp;
+}
+
+/*
+ * Drop the reference on each non-NULL result page and clear its array entry.
+ */
+static void
+nfsd4_release_respages(struct page **respages, short resused)
+{
+	int i;
+
+	dprintk("--> %s\n", __func__);
+	for (i = 0; i < resused; i++) {
+		if (!respages[i])
+			continue;
+		put_page(respages[i]);
+		respages[i] = NULL;
+	}
+}
+
+static void
+nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
+{
+	int i;
+
+	for (i = 0; i < count; i++) {
+		topages[i] = frompages[i];
+		if (!topages[i])
+			continue;
+		get_page(topages[i]);
+	}
+}
+
+/*
+ * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
+ * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
+ * length of the XDR response is less than se_fmaxresp_cached
+ * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used for a
+ * part of the reply (e.g. readdir).
+ *
+ * Store the base and length of the rq_res.head[0] page
+ * of the NFSv4.1 data, just past the rpc header.
+ */
+void
+nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct nfsd4_compoundargs *args = rqstp->rq_argp;
+	struct nfsd4_op *op = &args->ops[resp->opcnt];
+	struct kvec *resv = &rqstp->rq_res.head[0];
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/* Don't cache a failed OP_SEQUENCE. */
+	if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
+		return;
+
+	nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
+	entry->ce_opcnt = resp->opcnt;
+	entry->ce_status = resp->cstate.status;
+
+	/*
+	 * Don't need a page to cache just the sequence operation - the slot
+	 * does this for us!
+	 */
+
+	if (nfsd4_not_cached(resp)) {
+		entry->ce_resused = 0;
+		entry->ce_rpchdrlen = 0;
+		dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
+			resp->cstate.slot->sl_cache_entry.ce_cachethis);
+		return;
+	}
+	entry->ce_resused = rqstp->rq_resused;
+	if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
+		entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
+	nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
+			 entry->ce_resused);
+	entry->ce_datav.iov_base = resp->cstate.statp;
+	entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]));
+	/* Current request rpc header length*/
+	entry->ce_rpchdrlen = (char *)resp->cstate.statp -
+				(char *)page_address(rqstp->rq_respages[0]);
+}
+
+/*
+ * Keep our rpc header, append the cached nfs reply. Returns 0 if too large.
+ */
+static int
+nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
+			struct nfsd4_cache_entry *entry)
+{
+	struct svc_rqst *rqstp = resp->rqstp;
+	struct kvec *resv = &resp->rqstp->rq_res.head[0];
+	int len;
+
+	/* Current request rpc header length*/
+	len = (char *)resp->cstate.statp -
+			(char *)page_address(rqstp->rq_respages[0]);
+	if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
+		dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
+			entry->ce_datav.iov_len);
+		return 0;
+	}
+	/* copy the cached reply nfsd data past the current rpc header */
+	memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
+		entry->ce_datav.iov_len);
+	resv->iov_len = len + entry->ce_datav.iov_len;
+	return 1;
+}
+
+/*
+ * Keep the first page of the replay. Copy the NFSv4.1 data from the first
+ * cached page.  Replace any further replay pages from the cache.
+ */
+__be32
+nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+			 struct nfsd4_sequence *seq)
+{
+	struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
+	__be32 status;
+
+	dprintk("--> %s entry %p\n", __func__, entry);
+
+	/*
+	 * If this is just the sequence operation, we did not keep
+	 * a page in the cache entry because we can just use the
+	 * slot info stored in struct nfsd4_sequence that was checked
+	 * against the slot in nfsd4_sequence().
+	 *
+	 * This occurs when seq->cachethis is FALSE, or when the client
+	 * session inactivity timer fires and a solo sequence operation
+	 * is sent (lease renewal).
+	 */
+	if (seq && nfsd4_not_cached(resp)) {
+		seq->maxslots = resp->cstate.session->se_fnumslots;
+		return nfs_ok;
+	}
+
+	if (!nfsd41_copy_replay_data(resp, entry)) {
+		/*
+		 * Not enough room to use the replay rpc header, send the
+		 * cached header. Release all the allocated result pages.
+		 */
+		svc_free_res_pages(resp->rqstp);
+		nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
+			entry->ce_resused);
+	} else {
+		/* Release all but the first allocated result page */
+
+		resp->rqstp->rq_resused--;
+		svc_free_res_pages(resp->rqstp);
+
+		nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
+				 &entry->ce_respages[1],
+				 entry->ce_resused - 1);
+	}
+
+	resp->rqstp->rq_resused = entry->ce_resused;
+	resp->opcnt = entry->ce_opcnt;
+	resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
+	status = entry->ce_status;
+
+	return status;
+}
+
+/*
+ * Set the exchange_id flags returned by the server.
+ */
+static void
+nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
+{
+	/* pNFS is not supported */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+
+	/* Referrals are supported, Migration is not. */
+	new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
+
+	/* set the wire flags to return to client. */
+	clid->flags = new->cl_exchange_flags;
+}
+
+/* EXCHANGE_ID: find or create the client record for this owner string. */
+__be32
+nfsd4_exchange_id(struct svc_rqst *rqstp,
+		  struct nfsd4_compound_state *cstate,
+		  struct nfsd4_exchange_id *exid)
+{
+	struct nfs4_client *unconf, *conf, *new;
+	__be32 status;		/* nfserr_* codes are __be32, not int */
+	unsigned int		strhashval;
+	char			dname[HEXDIR_LEN];
+	nfs4_verifier		verf = exid->verifier;
+	u32			ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+
+	dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+		" ip_addr=%u flags %x, spa_how %d\n",
+		__func__, rqstp, exid, exid->clname.len, exid->clname.data,
+		ip_addr, exid->flags, exid->spa_how);
+
+	if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
+		return nfserr_inval;
+
+	/* Currently only support SP4_NONE */
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_SSV:
+		return nfserr_encr_alg_unsupp;
+	default:
+		BUG();				/* checked by xdr code */
+	case SP4_MACH_CRED:
+		return nfserr_serverfault;	/* no excuse :-/ */
+	}
+
+	status = nfs4_make_rec_clidname(dname, &exid->clname);
+	if (status)
+		goto error;
+
+	strhashval = clientstr_hashval(dname);
+
+	nfs4_lock_state();
+	status = nfs_ok;
+
+	conf = find_confirmed_client_by_str(dname, strhashval, true);
+	if (conf) {
+		if (!same_verf(&verf, &conf->cl_verifier)) {
+			/* 18.35.4 case 8 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_not_same;
+				goto out;
+			}
+			/* Client reboot: destroy old state */
+			expire_client(conf);
+			goto out_new;
+		}
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			/* 18.35.4 case 9 */
+			if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+				status = nfserr_perm;
+				goto out;
+			}
+			expire_client(conf);
+			goto out_new;
+		}
+		if (ip_addr != conf->cl_addr &&
+		    !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
+			/* Client collision. 18.35.4 case 3 */
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+		/*
+		 * Set bit when the owner id and verifier map to an already
+		 * confirmed client id (18.35.3).
+		 */
+		exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+
+		/*
+		 * Falling into 18.35.4 case 2, possible router replay.
+		 * Leave confirmed record intact and return same result.
+		 */
+		copy_verf(conf, &verf);
+		new = conf;
+		goto out_copy;
+	} else {
+		/* 18.35.4 case 7 */
+		if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+			status = nfserr_noent;
+			goto out;
+		}
+	}
+
+	unconf  = find_unconfirmed_client_by_str(dname, strhashval, true);
+	if (unconf) {
+		/*
+		 * Possible retry or client restart.  Per 18.35.4 case 4,
+		 * a new unconfirmed record should be generated regardless
+		 * of whether any properties have changed.
+		 */
+		expire_client(unconf);
+	}
+
+out_new:
+	/* Normal case */
+	new = create_client(exid->clname, dname);
+	if (new == NULL) {
+		status = nfserr_resource;
+		goto out;
+	}
+
+	copy_verf(new, &verf);
+	copy_cred(&new->cl_cred, &rqstp->rq_cred);
+	new->cl_addr = ip_addr;
+	gen_clid(new);
+	gen_confirm(new);
+	add_to_unconfirmed(new, strhashval);
+out_copy:
+	exid->clientid.cl_boot = new->cl_clientid.cl_boot;
+	exid->clientid.cl_id = new->cl_clientid.cl_id;
+
+	new->cl_slot.sl_seqid = 0;
+	exid->seqid = 1;
+	nfsd4_set_ex_flags(new, exid);
+
+	dprintk("nfsd4_exchange_id seqid %d flags %x\n",
+		new->cl_slot.sl_seqid, new->cl_exchange_flags);
+	status = nfs_ok;
+
+out:
+	nfs4_unlock_state();
+error:
+	dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
+	return status;
+}
+
+/* Classify seqid against the slot: new request, replay, or misordered. */
+static __be32
+check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
+{
+	dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
+		slot->sl_seqid);
+	/* The slot is in use, and no response has been sent. */
+	if (slot->sl_inuse) {
+		if (seqid == slot->sl_seqid)
+			return nfserr_jukebox;
+		else
+			return nfserr_seq_misordered;
+	}
+	/* Normal */
+	if (likely(seqid == slot->sl_seqid + 1))
+		return nfs_ok;
+	/* Replay */
+	if (seqid == slot->sl_seqid)
+		return nfserr_replay_cache;
+	/* Wraparound */
+	if (seqid == 1 && (slot->sl_seqid + 1) == 0)
+		return nfs_ok;
+	/* Misordered replay or misordered new request */
+	return nfserr_seq_misordered;
+}
+
+/* CREATE_SESSION: confirm the clientid if needed and set up a new session. */
+__be32
+nfsd4_create_session(struct svc_rqst *rqstp,
+		     struct nfsd4_compound_state *cstate,
+		     struct nfsd4_create_session *cr_ses)
+{
+	u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfs4_client *conf, *unconf;
+	struct nfsd4_slot *slot = NULL;
+	__be32 status = 0;	/* nfserr_* codes are __be32, not int */
+
+	nfs4_lock_state();
+	unconf = find_unconfirmed_client(&cr_ses->clientid);
+	conf = find_confirmed_client(&cr_ses->clientid);
+
+	if (conf) {
+		slot = &conf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status == nfserr_replay_cache) {
+			dprintk("Got a create_session replay! seqid= %d\n",
+				slot->sl_seqid);
+			cstate->slot = slot;
+			cstate->status = status;
+			/* Return the cached reply status */
+			status = nfsd4_replay_cache_entry(resp, NULL);
+			goto out;
+		} else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
+			status = nfserr_seq_misordered;
+			dprintk("Sequence misordered!\n");
+			dprintk("Expected seqid= %d but got seqid= %d\n",
+				slot->sl_seqid + 1, cr_ses->seqid);
+			goto out;
+		}
+		conf->cl_slot.sl_seqid++;
+	} else if (unconf) {
+		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+		    (ip_addr != unconf->cl_addr)) {
+			status = nfserr_clid_inuse;
+			goto out;
+		}
+
+		slot = &unconf->cl_slot;
+		status = check_slot_seqid(cr_ses->seqid, slot);
+		if (status) {
+			/* an unconfirmed replay returns misordered */
+			status = nfserr_seq_misordered;
+			goto out;
+		}
+
+		slot->sl_seqid++; /* from 0 to 1 */
+		move_to_confirmed(unconf);
+
+		/*
+		 * We do not support RDMA or persistent sessions
+		 */
+		cr_ses->flags &= ~SESSION4_PERSIST;
+		cr_ses->flags &= ~SESSION4_RDMA;
+		conf = unconf;
+	} else {
+		status = nfserr_stale_clientid;
+		goto out;
+	}
+
+	status = alloc_init_session(rqstp, conf, cr_ses);
+	if (status)
+		goto out;
+
+	memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
+	       NFS4_MAX_SESSIONID_LEN);
+	cr_ses->seqid = slot->sl_seqid;
+
+	slot->sl_inuse = true;
+	cstate->slot = slot;
+	/* Ensure a page is used for the cache */
+	slot->sl_cache_entry.ce_cachethis = 1;
+out:
+	nfs4_unlock_state();
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/* DESTROY_SESSION: unhash the named session; nfserr_badsession if unknown. */
+__be32
+nfsd4_destroy_session(struct svc_rqst *r,
+		      struct nfsd4_compound_state *cstate,
+		      struct nfsd4_destroy_session *sessionid)
+{
+	struct nfsd4_session *ses;
+	__be32 status = nfserr_badsession;	/* nfserr_* are __be32 */
+
+	/* Notes:
+	 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessionid
+	 * - Should we return nfserr_back_chan_busy if waiting for
+	 *   callbacks on to-be-destroyed session?
+	 * - Do we need to clear any callback info from previous session?
+	 */
+	dump_sessionid(__func__, &sessionid->sessionid);
+	spin_lock(&sessionid_lock);
+	ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+	if (!ses) {
+		spin_unlock(&sessionid_lock);
+		goto out;
+	}
+
+	unhash_session(ses);
+	spin_unlock(&sessionid_lock);
+
+	/* wait for callbacks */
+	shutdown_callback_client(ses->se_client);
+	nfsd4_put_session(ses);
+	status = nfs_ok;
+out:
+	dprintk("%s returns %d\n", __func__, ntohl(status));
+	return status;
+}
+
+/* SEQUENCE: check session/slot/seqid; replays come from the slot's cache. */
+__be32
+nfsd4_sequence(struct svc_rqst *rqstp,
+	       struct nfsd4_compound_state *cstate,
+	       struct nfsd4_sequence *seq)
+{
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	struct nfsd4_session *session;
+	struct nfsd4_slot *slot;
+	__be32 status;		/* nfserr_* codes are __be32, not int */
+
+	if (resp->opcnt != 1)
+		return nfserr_sequence_pos;
+
+	spin_lock(&sessionid_lock);
+	status = nfserr_badsession;
+	session = find_in_sessionid_hashtbl(&seq->sessionid);
+	if (!session)
+		goto out;
+
+	status = nfserr_badslot;
+	if (seq->slotid >= session->se_fnumslots)
+		goto out;
+
+	slot = &session->se_slots[seq->slotid];
+	dprintk("%s: slotid %d\n", __func__, seq->slotid);
+	status = check_slot_seqid(seq->seqid, slot);
+	if (status == nfserr_replay_cache) {
+		cstate->slot = slot;
+		cstate->session = session;
+		/* Return the cached reply status and set cstate->status
+		 * for nfsd4_svc_encode_compoundres processing */
+		status = nfsd4_replay_cache_entry(resp, seq);
+		cstate->status = nfserr_replay_cache;
+		goto replay_cache;
+	}
+	if (status)
+		goto out;
+
+	/* Success! bump slot seqid */
+	slot->sl_inuse = true;
+	slot->sl_seqid = seq->seqid;
+	slot->sl_cache_entry.ce_cachethis = seq->cachethis;
+	/* Always set the cache entry cachethis for solo sequence */
+	if (nfsd4_is_solo_sequence(resp))
+		slot->sl_cache_entry.ce_cachethis = 1;
+
+	cstate->slot = slot;
+	cstate->session = session;
+
+replay_cache:
+	/* Renew the clientid on success and on replay.
+	 * Hold a session reference until done processing the compound:
+	 * nfsd4_put_session called only if the cstate slot is set.
+	 */
+	renew_client(session->se_client);
+	nfsd4_get_session(session);
+out:
+	spin_unlock(&sessionid_lock);
+	dprintk("%s: return %d\n", __func__, ntohl(status));
+	return status;
+}
+
 __be32
 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	strhashval = clientstr_hashval(dname);
 
 	nfs4_lock_state();
-	conf = find_confirmed_client_by_str(dname, strhashval);
+	conf = find_confirmed_client_by_str(dname, strhashval, false);
 	if (conf) {
 		/* RFC 3530 14.2.33 CASE 0: */
 		status = nfserr_clid_inuse;
-		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)
-				|| conf->cl_addr != sin->sin_addr.s_addr) {
-			dprintk("NFSD: setclientid: string in use by clientat %pI4\n",
-				&conf->cl_addr);
+		if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+			dprintk("NFSD: setclientid: string in use by client"
+				" at %pI4\n", &conf->cl_addr);
 			goto out;
 		}
 	}
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * has a description of SETCLIENTID request processing consisting
 	 * of 5 bullet points, labeled as CASE0 - CASE4 below.
 	 */
-	unconf = find_unconfirmed_client_by_str(dname, strhashval);
+	unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
 	status = nfserr_resource;
 	if (!conf) {
 		/*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 			unsigned int hash =
 				clientstr_hashval(unconf->cl_recdir);
 			conf = find_confirmed_client_by_str(unconf->cl_recdir,
-									hash);
+							    hash, false);
 			if (conf) {
 				nfsd4_remove_clid_dir(conf);
 				expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
 
 	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
 	if (fp) {
-		kref_init(&fp->fi_ref);
+		atomic_set(&fp->fi_ref, 1);
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
+		spin_lock(&recall_lock);
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
+		spin_unlock(&recall_lock);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
 		fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	return sop;
 }
 
-static void
-release_stateid_lockowners(struct nfs4_stateid *open_stp)
-{
-	struct nfs4_stateowner *lock_sop;
-
-	while (!list_empty(&open_stp->st_lockowners)) {
-		lock_sop = list_entry(open_stp->st_lockowners.next,
-				struct nfs4_stateowner, so_perstateid);
-		/* list_del(&open_stp->st_lockowners);  */
-		BUG_ON(lock_sop->so_is_open_owner);
-		release_stateowner(lock_sop);
-	}
-}
-
-static void
-unhash_stateowner(struct nfs4_stateowner *sop)
-{
-	struct nfs4_stateid *stp;
-
-	list_del(&sop->so_idhash);
-	list_del(&sop->so_strhash);
-	if (sop->so_is_open_owner)
-		list_del(&sop->so_perclient);
-	list_del(&sop->so_perstateid);
-	while (!list_empty(&sop->so_stateids)) {
-		stp = list_entry(sop->so_stateids.next,
-			struct nfs4_stateid, st_perstateowner);
-		if (sop->so_is_open_owner)
-			release_stateid(stp, OPEN_STATE);
-		else
-			release_stateid(stp, LOCK_STATE);
-	}
-}
-
-static void
-release_stateowner(struct nfs4_stateowner *sop)
-{
-	unhash_stateowner(sop);
-	list_del(&sop->so_close_lru);
-	nfs4_put_stateowner(sop);
-}
-
 static inline void
 init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
 	struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_stateid.si_generation = 0;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = 0;
-	__set_bit(open->op_share_access, &stp->st_access_bmap);
+	__set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
+		  &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 	stp->st_openstp = NULL;
 }
 
 static void
-release_stateid(struct nfs4_stateid *stp, int flags)
-{
-	struct file *filp = stp->st_vfs_file;
-
-	list_del(&stp->st_hash);
-	list_del(&stp->st_perfile);
-	list_del(&stp->st_perstateowner);
-	if (flags & OPEN_STATE) {
-		release_stateid_lockowners(stp);
-		stp->st_vfs_file = NULL;
-		nfsd_close(filp);
-	} else if (flags & LOCK_STATE)
-		locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
-	put_nfs4_file(stp->st_file);
-	kmem_cache_free(stateid_slab, stp);
-}
-
-static void
 move_to_close_lru(struct nfs4_stateowner *sop)
 {
 	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
 	unsigned int hashval = file_hashval(ino);
 	struct nfs4_file *fp;
 
+	spin_lock(&recall_lock);
 	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
 		if (fp->fi_inode == ino) {
 			get_nfs4_file(fp);
+			spin_unlock(&recall_lock);
 			return fp;
 		}
 	}
+	spin_unlock(&recall_lock);
 	return NULL;
 }
 
-static inline int access_valid(u32 x)
+static inline int access_valid(u32 x, u32 minorversion)
 {
-	if (x < NFS4_SHARE_ACCESS_READ)
+	if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
 		return 0;
-	if (x > NFS4_SHARE_ACCESS_BOTH)
+	if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
+		return 0;
+	x &= ~NFS4_SHARE_ACCESS_MASK;
+	if (minorversion && x) {
+		if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
+			return 0;
+		if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
+			return 0;
+		x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
+	}
+	if (x)
 		return 0;
 	return 1;
 }
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
 
 
 __be32
-nfsd4_process_open1(struct nfsd4_open *open)
+nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+		    struct nfsd4_open *open)
 {
 	clientid_t *clientid = &open->op_clientid;
 	struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
 			return nfserr_expired;
 		goto renew;
 	}
+	/* When sessions are used, skip open sequenceid processing */
+	if (nfsd4_has_session(cstate))
+		goto renew;
 	if (!sop->so_confirmed) {
 		/* Replace unconfirmed owners without checking for replay. */
 		clp = sop->so_client;
-		release_stateowner(sop);
+		release_openowner(sop);
 		open->op_stateowner = NULL;
 		goto renew;
 	}
@@ -1709,6 +2498,7 @@ out:
 __be32
 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
 {
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	struct nfs4_file *fp = NULL;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	__be32 status;
 
 	status = nfserr_inval;
-	if (!access_valid(open->op_share_access)
+	if (!access_valid(open->op_share_access, resp->cstate.minorversion)
 			|| !deny_valid(open->op_share_deny))
 		goto out;
 	/*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 		init_stateid(stp, fp, open);
 		status = nfsd4_truncate(rqstp, current_fh, open);
 		if (status) {
-			release_stateid(stp, OPEN_STATE);
+			release_open_stateid(stp);
 			goto out;
 		}
+		if (nfsd4_has_session(&resp->cstate))
+			update_stateid(&stp->st_stateid);
 	}
 	memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
 
+	if (nfsd4_has_session(&resp->cstate))
+		open->op_stateowner->so_confirmed = 1;
+
 	/*
 	* Attempt to hand out a delegation. No error return, because the
 	* OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
 	* To finish the open response, we just need to set the rflags.
 	*/
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!open->op_stateowner->so_confirmed)
+	if (!open->op_stateowner->so_confirmed &&
+	    !nfsd4_has_session(&resp->cstate))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
 
 	return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
 		}
 		dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
 			sop->so_id);
-		release_stateowner(sop);
+		release_openowner(sop);
 	}
 	if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
 		clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
 static inline __be32
 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
 {
-	/* Trying to call delegreturn with a special stateid? Yuch: */
-	if (!(flags & (RD_STATE | WR_STATE)))
-		return nfserr_bad_stateid;
-	else if (ONE_STATEID(stateid) && (flags & RD_STATE))
+	if (ONE_STATEID(stateid) && (flags & RD_STATE))
 		return nfs_ok;
 	else if (locks_in_grace()) {
 		/* Answer in remaining cases depends on existance of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
  * that are not able to provide mandatory locking.
  */
 static inline int
-io_during_grace_disallowed(struct inode *inode, int flags)
+grace_disallows_io(struct inode *inode)
 {
-	return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
-		&& mandatory_lock(inode);
+	return locks_in_grace() && mandatory_lock(inode);
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref)
+static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
 {
+	/*
+	 * When sessions are used the stateid generation number is ignored
+	 * when it is zero.
+	 */
+	if ((flags & HAS_SESSION) && in->si_generation == 0)
+		goto out;
+
 	/* If the client sends us a stateid from the future, it's buggy: */
 	if (in->si_generation > ref->si_generation)
 		return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
 	 */
 	if (in->si_generation < ref->si_generation)
 		return nfserr_old_stateid;
+out:
 	return nfs_ok;
 }
 
+static int is_delegation_stateid(stateid_t *stateid)
+{
+	return stateid->si_fileid == 0;
+}
+
 /*
 * Checks for stateid operations
 */
 __be32
-nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp)
+nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+			   stateid_t *stateid, int flags, struct file **filpp)
 {
 	struct nfs4_stateid *stp = NULL;
 	struct nfs4_delegation *dp = NULL;
-	stateid_t *stidp;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	__be32 status;
 
-	dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
-		stateid->si_boot, stateid->si_stateownerid, 
-		stateid->si_fileid, stateid->si_generation); 
 	if (filpp)
 		*filpp = NULL;
 
-	if (io_during_grace_disallowed(ino, flags))
+	if (grace_disallows_io(ino))
 		return nfserr_grace;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
 		return check_special_stateids(current_fh, stateid, flags);
 
-	/* STALE STATEID */
 	status = nfserr_stale_stateid;
 	if (STALE_STATEID(stateid)) 
 		goto out;
 
-	/* BAD STATEID */
 	status = nfserr_bad_stateid;
-	if (!stateid->si_fileid) { /* delegation stateid */
-		if(!(dp = find_delegation_stateid(ino, stateid))) {
-			dprintk("NFSD: delegation stateid not found\n");
+	if (is_delegation_stateid(stateid)) {
+		dp = find_delegation_stateid(ino, stateid);
+		if (!dp)
 			goto out;
-		}
-		stidp = &dp->dl_stateid;
+		status = check_stateid_generation(stateid, &dp->dl_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_delegmode(dp, flags);
+		if (status)
+			goto out;
+		renew_client(dp->dl_client);
+		if (filpp)
+			*filpp = dp->dl_vfs_file;
 	} else { /* open or lock stateid */
-		if (!(stp = find_stateid(stateid, flags))) {
-			dprintk("NFSD: open or lock stateid not found\n");
+		stp = find_stateid(stateid, flags);
+		if (!stp)
 			goto out;
-		}
-		if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
+		if (nfs4_check_fh(current_fh, stp))
 			goto out;
 		if (!stp->st_stateowner->so_confirmed)
 			goto out;
-		stidp = &stp->st_stateid;
-	}
-	status = check_stateid_generation(stateid, stidp);
-	if (status)
-		goto out;
-	if (stp) {
-		if ((status = nfs4_check_openmode(stp,flags)))
+		status = check_stateid_generation(stateid, &stp->st_stateid,
+						  flags);
+		if (status)
+			goto out;
+		status = nfs4_check_openmode(stp, flags);
+		if (status)
 			goto out;
 		renew_client(stp->st_stateowner->so_client);
 		if (filpp)
 			*filpp = stp->st_vfs_file;
-	} else {
-		if ((status = nfs4_check_delegmode(dp, flags)))
-			goto out;
-		renew_client(dp->dl_client);
-		if (flags & DELEG_RET)
-			unhash_delegation(dp);
-		if (filpp)
-			*filpp = dp->dl_vfs_file;
 	}
 	status = nfs_ok;
 out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
  * Checks for sequence id mutating operations. 
  */
 static __be32
-nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
+nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+			 stateid_t *stateid, int flags,
+			 struct nfs4_stateowner **sopp,
+			 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
 {
 	struct nfs4_stateid *stp;
 	struct nfs4_stateowner *sop;
+	struct svc_fh *current_fh = &cstate->current_fh;
 	__be32 status;
 
 	dprintk("NFSD: preprocess_seqid_op: seqid=%d " 
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 
 	if (STALE_STATEID(stateid))
 		return nfserr_stale_stateid;
+
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
+
 	/*
 	* We return BAD_STATEID if filehandle doesn't match stateid, 
 	* the confirmed flag is incorrecly set, or the generation 
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		if (lock->lk_is_new) {
 			if (!sop->so_is_open_owner)
 				return nfserr_bad_stateid;
-			if (!same_clid(&clp->cl_clientid, lockclid))
-			       return nfserr_bad_stateid;
+			if (!(flags & HAS_SESSION) &&
+			    !same_clid(&clp->cl_clientid, lockclid))
+				return nfserr_bad_stateid;
 			/* stp is the open stateid */
 			status = nfs4_check_openmode(stp, lkflg);
 			if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 	*  For the moment, we ignore the possibility of 
 	*  generation number wraparound.
 	*/
-	if (seqid != sop->so_seqid)
+	if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
 		goto check_replay;
 
 	if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 				" confirmed yet!\n");
 		return nfserr_bad_stateid;
 	}
-	status = check_stateid_generation(stateid, &stp->st_stateid);
+	status = check_stateid_generation(stateid, &stp->st_stateid, flags);
 	if (status)
 		return status;
 	renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					oc->oc_seqid, &oc->oc_req_stateid,
 					CONFIRM | OPEN_STATE,
 					&oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 			(int)cstate->current_fh.fh_dentry->d_name.len,
 			cstate->current_fh.fh_dentry->d_name.name);
 
-	if (!access_valid(od->od_share_access)
+	if (!access_valid(od->od_share_access, cstate->minorversion)
 			|| !deny_valid(od->od_share_deny))
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					od->od_seqid,
 					&od->od_stateid, 
 					OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check close_lru for replay */
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					close->cl_seqid,
 					&close->cl_stateid, 
 					OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
 
 	/* release_stateid() calls nfsd_close() if needed */
-	release_stateid(stp, OPEN_STATE);
+	release_open_stateid(stp);
 
 	/* place unused nfs4_stateowners on so_close_lru list to be
 	 * released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		  struct nfsd4_delegreturn *dr)
 {
+	struct nfs4_delegation *dp;
+	stateid_t *stateid = &dr->dr_stateid;
+	struct inode *inode;
 	__be32 status;
+	int flags = 0;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
-		goto out;
+		return status;
+	inode = cstate->current_fh.fh_dentry->d_inode;
 
+	if (nfsd4_has_session(cstate))
+		flags |= HAS_SESSION;
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-					    &dr->dr_stateid, DELEG_RET, NULL);
-	nfs4_unlock_state();
+	status = nfserr_bad_stateid;
+	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
+		goto out;
+	status = nfserr_stale_stateid;
+	if (STALE_STATEID(stateid))
+		goto out;
+	status = nfserr_bad_stateid;
+	if (!is_delegation_stateid(stateid))
+		goto out;
+	dp = find_delegation_stateid(inode, stateid);
+	if (!dp)
+		goto out;
+	status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
+	if (status)
+		goto out;
+	renew_client(dp->dl_client);
+
+	unhash_delegation(dp);
 out:
+	nfs4_unlock_state();
+
 	return status;
 }
 
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfs4_file *fp;
 		
 		status = nfserr_stale_clientid;
-		if (STALE_CLIENTID(&lock->lk_new_clientid))
+		if (!nfsd4_has_session(cstate) &&
+		    STALE_CLIENTID(&lock->lk_new_clientid))
 			goto out;
 
 		/* validate and update open stateid and open seqid */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				        lock->lk_new_open_seqid,
 		                        &lock->lk_new_open_stateid,
 					OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			goto out;
 	} else {
 		/* lock (lock owner + lock stateid) already exists */
-		status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid, 
 				       &lock->lk_old_lock_stateid, 
 				       LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 out:
 	if (status && lock->lk_is_new && lock_sop)
-		release_stateowner(lock_sop);
+		release_lockowner(lock_sop);
 	if (lock->lk_replay_owner) {
 		nfs4_get_stateowner(lock->lk_replay_owner);
 		cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfs4_lock_state();
 
 	status = nfserr_stale_clientid;
-	if (STALE_CLIENTID(&lockt->lt_clientid))
+	if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
 		goto out;
 
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 									        
-	if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh,
+	if ((status = nfs4_preprocess_seqid_op(cstate,
 					locku->lu_seqid, 
 					&locku->lu_stateid, 
 					LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		/* unhash_stateowner deletes so_perclient only
 		 * for openowners. */
 		list_del(&sop->so_perclient);
-		release_stateowner(sop);
+		release_lockowner(sop);
 	}
 out:
 	nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
 }
 
 int
-nfs4_has_reclaimed_state(const char *name)
+nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
 {
 	unsigned int strhashval = clientstr_hashval(name);
 	struct nfs4_client *clp;
 
-	clp = find_confirmed_client_by_str(name, strhashval);
+	clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
 	return clp ? 1 : 0;
 }
 
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
 		INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
 		INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
 	}
+	for (i = 0; i < SESSION_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&sessionid_hashtbl[i]);
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d..b820c311931 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/vfs.h>
+#include <linux/utsname.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
 	return p;
 }
 
+static int zero_clientid(clientid_t *clid)
+{
+	return !clid->cl_boot && !clid->cl_id;	/* 1 iff both fields are zero */
+}
+
 static int
 defer_free(struct nfsd4_compoundargs *argp,
 		void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 
 	bmval[0] = 0;
 	bmval[1] = 0;
+	bmval[2] = 0;
 
 	READ_BUF(4);
 	READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
 		READ32(bmval[0]);
 	if (bmlen > 1)
 		READ32(bmval[1]);
+	if (bmlen > 2)
+		READ32(bmval[2]);
 
 	DECODE_TAIL;
 }
 
+static u32 nfsd_attrmask[] = {	/* 'writable' mask for nfsd4_decode_fattr() */
+	NFSD_WRITEABLE_ATTRS_WORD0,
+	NFSD_WRITEABLE_ATTRS_WORD1,
+	NFSD_WRITEABLE_ATTRS_WORD2
+};
+
+static u32 nfsd41_ex_attrmask[] = {	/* 'writable' mask for EXCLUSIVE4_1 opens */
+	NFSD_SUPPATTR_EXCLCREAT_WORD0,
+	NFSD_SUPPATTR_EXCLCREAT_WORD1,
+	NFSD_SUPPATTR_EXCLCREAT_WORD2
+};
+
 static __be32
-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr,
-    struct nfs4_acl **acl)
+nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
+		   struct iattr *iattr, struct nfs4_acl **acl)
 {
 	int expected_len, len = 0;
 	u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 	 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
 	 * read-only attributes return ERR_INVAL.
 	 */
-	if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
+	    (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
+	    (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
 		return nfserr_attrnotsupp;
-	if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1))
+	if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
+	    (bmval[2] & ~writable[2]))
 		return nfserr_inval;
 
 	READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
 			goto xdr_error;
 		}
 	}
+	BUG_ON(bmval[2]);	/* no such writeable attr supported yet */
 	if (len != expected_len)
 		goto xdr_error;
 
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 	if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
 		return status;
 
-	if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl)))
+	status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
+				    &create->cr_iattr, &create->cr_acl);
+	if (status)
 		goto out;
 
 	DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
 	READ_BUF(lockt->lt_owner.len);
 	READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
 
+	if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
+		return nfserr_inval;
 	DECODE_TAIL;
 }
 
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 		switch (open->op_createmode) {
 		case NFS4_CREATE_UNCHECKED:
 		case NFS4_CREATE_GUARDED:
-			if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl)))
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				nfsd_attrmask, &open->op_iattr, &open->op_acl);
+			if (status)
 				goto out;
 			break;
 		case NFS4_CREATE_EXCLUSIVE:
 			READ_BUF(8);
 			COPYMEM(open->op_verf.data, 8);
 			break;
+		case NFS4_CREATE_EXCLUSIVE4_1:
+			if (argp->minorversion < 1)
+				goto xdr_error;
+			READ_BUF(8);
+			COPYMEM(open->op_verf.data, 8);
+			status = nfsd4_decode_fattr(argp, open->op_bmval,
+				nfsd41_ex_attrmask, &open->op_iattr,
+				&open->op_acl);
+			if (status)
+				goto out;
+			break;
 		default:
 			goto xdr_error;
 		}
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
 	status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
 	if (status)
 		return status;
-	return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+	return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
 				  &setattr->sa_iattr, &setattr->sa_acl);
 }
 
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
 	READ_BUF(rlockowner->rl_owner.len);
 	READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
 
+	if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
+		return nfserr_inval;
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+			 struct nfsd4_exchange_id *exid)
+{
+	u32 dummy;	/* unsigned, as on the wire; only used to skip fields */
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_VERIFIER_SIZE);
+	COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
+
+	READ_BUF(4);
+	READ32(exid->clname.len);
+
+	READ_BUF(exid->clname.len);
+	SAVEMEM(exid->clname.data, exid->clname.len);
+
+	READ_BUF(4);
+	READ32(exid->flags);
+
+	/* state_protect4_a: remember spa_how, skip over everything else */
+	READ_BUF(4);
+	READ32(exid->spa_how);
+	switch (exid->spa_how) {
+	case SP4_NONE:
+		break;
+	case SP4_MACH_CRED:
+		/* spo_must_enforce */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* spo_must_allow */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+		break;
+	case SP4_SSV:
+		/* ssp_ops */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		READ_BUF(4);	/* second counted word array of ssp_ops */
+		READ32(dummy);
+		READ_BUF(dummy * 4);
+		p += dummy;
+
+		/* ssp_hash_algs<> - TODO confirm: skipped as one opaque */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* ssp_encr_algs<> - TODO confirm: skipped as one opaque */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* ssp_window and ssp_num_gss_handles */
+		READ_BUF(8);
+		READ32(dummy);
+		READ32(dummy);
+		break;
+	default:
+		goto xdr_error;
+	}
+
+	/* Ignore Implementation ID */
+	READ_BUF(4);    /* nfs_impl_id4 array length */
+	READ32(dummy);
+
+	if (dummy > 1)	/* unsigned compare: huge counts are rejected too */
+		goto xdr_error;
+
+	if (dummy == 1) {
+		/* nii_domain */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_name */
+		READ_BUF(4);
+		READ32(dummy);
+		READ_BUF(dummy);
+		p += XDR_QUADLEN(dummy);
+
+		/* nii_date */
+		READ_BUF(12);
+		p += 3;
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+			    struct nfsd4_create_session *sess)
+{
+	DECODE_HEAD;
+
+	u32 dummy;		/* scratch for skipped words and lengths */
+	char *machine_name;	/* set by SAVEMEM below, otherwise unused */
+	int i;
+	int nr_secflavs;
+
+	READ_BUF(16);		/* clientid (8) + seqid (4) + flags (4) */
+	COPYMEM(&sess->clientid, 8);
+	READ32(sess->seqid);
+	READ32(sess->flags);
+
+	/* Fore channel attrs */
+	READ_BUF(28);		/* seven 32-bit words */
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->fore_channel.maxreq_sz);
+	READ32(sess->fore_channel.maxresp_sz);
+	READ32(sess->fore_channel.maxresp_cached);
+	READ32(sess->fore_channel.maxops);
+	READ32(sess->fore_channel.maxreqs);
+	READ32(sess->fore_channel.nr_rdma_attrs);
+	if (sess->fore_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->fore_channel.rdma_attrs);
+	} else if (sess->fore_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many fore channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	/* Back channel attrs */
+	READ_BUF(28);		/* seven 32-bit words */
+	READ32(dummy); /* headerpadsz is always 0 */
+	READ32(sess->back_channel.maxreq_sz);
+	READ32(sess->back_channel.maxresp_sz);
+	READ32(sess->back_channel.maxresp_cached);
+	READ32(sess->back_channel.maxops);
+	READ32(sess->back_channel.maxreqs);
+	READ32(sess->back_channel.nr_rdma_attrs);
+	if (sess->back_channel.nr_rdma_attrs == 1) {
+		READ_BUF(4);
+		READ32(sess->back_channel.rdma_attrs);
+	} else if (sess->back_channel.nr_rdma_attrs > 1) {
+		dprintk("Too many back channel attr bitmaps!\n");
+		goto xdr_error;
+	}
+
+	READ_BUF(8);		/* callback_prog + number of sec flavors */
+	READ32(sess->callback_prog);
+
+	/* callback_sec_params4 */
+	READ32(nr_secflavs);
+	for (i = 0; i < nr_secflavs; ++i) {
+		READ_BUF(4);
+		READ32(dummy);
+		switch (dummy) {
+		case RPC_AUTH_NULL:
+			/* Nothing to read */
+			break;
+		case RPC_AUTH_UNIX:
+			READ_BUF(8);
+			/* stamp */
+			READ32(dummy);
+
+			/* machine name */
+			READ32(dummy);
+			READ_BUF(dummy);
+			SAVEMEM(machine_name, dummy);
+
+			/* uid, gid */
+			READ_BUF(8);
+			READ32(sess->uid);
+			READ32(sess->gid);
+
+			/* more gids */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy * 4);
+			/* skip them; i and dummy must survive for the outer loop */
+			p += dummy;
+			break;
+		case RPC_AUTH_GSS:
+			dprintk("RPC_AUTH_GSS callback secflavor "
+				"not supported!\n");
+			READ_BUF(8);
+			/* gcbp_service */
+			READ32(dummy);
+			/* gcbp_handle_from_server */
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			/* gcbp_handle_from_client */
+			READ_BUF(4);
+			READ32(dummy);
+			READ_BUF(dummy);
+			p += XDR_QUADLEN(dummy);
+			break;
+		default:
+			dprintk("Illegal callback secflavor\n");
+			return nfserr_inval;
+		}
+	}
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
+			     struct nfsd4_destroy_session *destroy_session)
+{
+	DECODE_HEAD;
+	READ_BUF(NFS4_MAX_SESSIONID_LEN);	/* session id is the only field */
+	COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+
+	DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
+		      struct nfsd4_sequence *seq)
+{
+	DECODE_HEAD;
+
+	READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);	/* sessionid + four 32-bit words */
+	COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	READ32(seq->seqid);
+	READ32(seq->slotid);
+	READ32(seq->maxslots);
+	READ32(seq->cachethis);
+
 	DECODE_TAIL;
 }
 
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
 static __be32
 nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
 {
-	return nfserr_opnotsupp;
+	return nfserr_notsupp;
 }
 
 typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_OPEN_CONFIRM]	= (nfsd4_dec)nfsd4_decode_open_confirm,
 	[OP_OPEN_DOWNGRADE]	= (nfsd4_dec)nfsd4_decode_open_downgrade,
 	[OP_PUTFH]		= (nfsd4_dec)nfsd4_decode_putfh,
-	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_noop,
 	[OP_PUTROOTFH]		= (nfsd4_dec)nfsd4_decode_noop,
 	[OP_READ]		= (nfsd4_dec)nfsd4_decode_read,
 	[OP_READDIR]		= (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_release_lockowner,
 };
 
+static nfsd4_dec nfsd41_dec_ops[] = {	/* decoders for minorversion 1 */
+	[OP_ACCESS]		= (nfsd4_dec)nfsd4_decode_access,
+	[OP_CLOSE]		= (nfsd4_dec)nfsd4_decode_close,
+	[OP_COMMIT]		= (nfsd4_dec)nfsd4_decode_commit,
+	[OP_CREATE]		= (nfsd4_dec)nfsd4_decode_create,
+	[OP_DELEGPURGE]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DELEGRETURN]	= (nfsd4_dec)nfsd4_decode_delegreturn,
+	[OP_GETATTR]		= (nfsd4_dec)nfsd4_decode_getattr,
+	[OP_GETFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_LINK]		= (nfsd4_dec)nfsd4_decode_link,
+	[OP_LOCK]		= (nfsd4_dec)nfsd4_decode_lock,
+	[OP_LOCKT]		= (nfsd4_dec)nfsd4_decode_lockt,
+	[OP_LOCKU]		= (nfsd4_dec)nfsd4_decode_locku,
+	[OP_LOOKUP]		= (nfsd4_dec)nfsd4_decode_lookup,
+	[OP_LOOKUPP]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_NVERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
+	[OP_OPEN]		= (nfsd4_dec)nfsd4_decode_open,
+	[OP_OPENATTR]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_CONFIRM]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_OPEN_DOWNGRADE]	= (nfsd4_dec)nfsd4_decode_open_downgrade,
+	[OP_PUTFH]		= (nfsd4_dec)nfsd4_decode_putfh,
+	[OP_PUTPUBFH]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_PUTROOTFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_READ]		= (nfsd4_dec)nfsd4_decode_read,
+	[OP_READDIR]		= (nfsd4_dec)nfsd4_decode_readdir,
+	[OP_READLINK]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_REMOVE]		= (nfsd4_dec)nfsd4_decode_remove,
+	[OP_RENAME]		= (nfsd4_dec)nfsd4_decode_rename,
+	[OP_RENEW]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RESTOREFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SAVEFH]		= (nfsd4_dec)nfsd4_decode_noop,
+	[OP_SECINFO]		= (nfsd4_dec)nfsd4_decode_secinfo,
+	[OP_SETATTR]		= (nfsd4_dec)nfsd4_decode_setattr,
+	[OP_SETCLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_VERIFY]		= (nfsd4_dec)nfsd4_decode_verify,
+	[OP_WRITE]		= (nfsd4_dec)nfsd4_decode_write,
+	[OP_RELEASE_LOCKOWNER]	= (nfsd4_dec)nfsd4_decode_notsupp,
+
+	/* new operations for NFSv4.1 */
+	[OP_BACKCHANNEL_CTL]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_BIND_CONN_TO_SESSION] = (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_EXCHANGE_ID]	= (nfsd4_dec)nfsd4_decode_exchange_id,
+	[OP_CREATE_SESSION]	= (nfsd4_dec)nfsd4_decode_create_session,
+	[OP_DESTROY_SESSION]	= (nfsd4_dec)nfsd4_decode_destroy_session,
+	[OP_FREE_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GET_DIR_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GETDEVICEINFO]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_GETDEVICELIST]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTGET]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_LAYOUTRETURN]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SECINFO_NO_NAME]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_SEQUENCE]		= (nfsd4_dec)nfsd4_decode_sequence,
+	[OP_SET_SSV]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_TEST_STATEID]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_WANT_DELEGATION]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_DESTROY_CLIENTID]	= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_dec)nfsd4_decode_notsupp,
+};
+
 struct nfsd4_minorversion_ops {
 	nfsd4_dec *decoders;
 	int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
 
 static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
 	[0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
+	[1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
 };
 
 static __be32
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 {
 	u32 bmval0 = bmval[0];
 	u32 bmval1 = bmval[1];
+	u32 bmval2 = bmval[2];
 	struct kstat stat;
 	struct svc_fh tempfh;
 	struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	int err;
 	int aclsupport = 0;
 	struct nfs4_acl *acl = NULL;
+	struct nfsd4_compoundres *resp = rqstp->rq_resp;
+	u32 minorversion = resp->cstate.minorversion;
 
 	BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
-	BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0);
-	BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1);
+	BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
+	BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
+	BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
 
 	if (exp->ex_fslocs.migrated) {
+		BUG_ON(bmval[2]);
 		status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
 		if (status)
 			goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if ((buflen -= 16) < 0)
 		goto out_resource;
 
-	WRITE32(2);
-	WRITE32(bmval0);
-	WRITE32(bmval1);
+	if (unlikely(bmval2)) {
+		WRITE32(3);
+		WRITE32(bmval0);
+		WRITE32(bmval1);
+		WRITE32(bmval2);
+	} else if (likely(bmval1)) {
+		WRITE32(2);
+		WRITE32(bmval0);
+		WRITE32(bmval1);
+	} else {
+		WRITE32(1);
+		WRITE32(bmval0);
+	}
 	attrlenp = p++;                /* to be backfilled later */
 
 	if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
-		u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0;
+		u32 word0 = nfsd_suppattrs0(minorversion);
+		u32 word1 = nfsd_suppattrs1(minorversion);
+		u32 word2 = nfsd_suppattrs2(minorversion);
+
 		if ((buflen -= 12) < 0)
 			goto out_resource;
 		if (!aclsupport)
 			word0 &= ~FATTR4_WORD0_ACL;
 		if (!exp->ex_fslocs.locations)
 			word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
-		WRITE32(2);
-		WRITE32(word0);
-		WRITE32(NFSD_SUPPORTED_ATTRS_WORD1);
+		if (!word2) {
+			WRITE32(2);
+			WRITE32(word0);
+			WRITE32(word1);
+		} else {
+			WRITE32(3);
+			WRITE32(word0);
+			WRITE32(word1);
+			WRITE32(word2);
+		}
 	}
 	if (bmval0 & FATTR4_WORD0_TYPE) {
 		if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
 		}
 		WRITE64(stat.ino);
 	}
+	if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
+		WRITE32(3);
+		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
+		WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
+	}
+
 	*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
 	*countp = p - buffer;
 	status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
 }
 
 static __be32
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+			 struct nfsd4_exchange_id *exid)
+{
+	ENCODE_HEAD;
+	char *major_id;
+	char *server_scope;
+	int major_id_sz;
+	int server_scope_sz;
+	uint64_t minor_id = 0;
+
+	if (nfserr)	/* on failure only the status is returned */
+		return nfserr;
+
+	major_id = utsname()->nodename;	/* nodename serves as both ids */
+	major_id_sz = strlen(major_id);
+	server_scope = utsname()->nodename;
+	server_scope_sz = strlen(server_scope);
+
+	RESERVE_SPACE(
+		8 /* eir_clientid */ +
+		4 /* eir_sequenceid */ +
+		4 /* eir_flags */ +
+		4 /* spr_how (SP4_NONE) */ +
+		8 /* so_minor_id */ +
+		4 /* so_major_id.len */ +
+		(XDR_QUADLEN(major_id_sz) * 4) +
+		4 /* eir_server_scope.len */ +
+		(XDR_QUADLEN(server_scope_sz) * 4) +
+		4 /* eir_server_impl_id.count (0) */);
+
+	WRITEMEM(&exid->clientid, 8);
+	WRITE32(exid->seqid);
+	WRITE32(exid->flags);
+
+	/* state_protect4_r. Currently only support SP4_NONE */
+	BUG_ON(exid->spa_how != SP4_NONE);
+	WRITE32(exid->spa_how);
+
+	/* The server_owner struct */
+	WRITE64(minor_id);      /* Minor id */
+	/* major id */
+	WRITE32(major_id_sz);
+	WRITEMEM(major_id, major_id_sz);
+
+	/* Server scope */
+	WRITE32(server_scope_sz);
+	WRITEMEM(server_scope, server_scope_sz);
+
+	/* Implementation id */
+	WRITE32(0);	/* zero length nfs_impl_id4 array */
+	ADJUST_ARGS();
+	return 0;
+}
+
+static __be32
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+			    struct nfsd4_create_session *sess)
+{
+	ENCODE_HEAD;
+
+	if (nfserr)	/* on failure only the status is returned */
+		return nfserr;
+
+	RESERVE_SPACE(24);	/* sessionid + seqid + flags */
+	WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(sess->seqid);
+	WRITE32(sess->flags);
+	ADJUST_ARGS();
+
+	RESERVE_SPACE(28);	/* fore channel: seven 32-bit words */
+	WRITE32(0); /* headerpadsz */
+	WRITE32(sess->fore_channel.maxreq_sz);
+	WRITE32(sess->fore_channel.maxresp_sz);
+	WRITE32(sess->fore_channel.maxresp_cached);
+	WRITE32(sess->fore_channel.maxops);
+	WRITE32(sess->fore_channel.maxreqs);
+	WRITE32(sess->fore_channel.nr_rdma_attrs);
+	ADJUST_ARGS();
+
+	if (sess->fore_channel.nr_rdma_attrs) {
+		RESERVE_SPACE(4);
+		WRITE32(sess->fore_channel.rdma_attrs);
+		ADJUST_ARGS();
+	}
+
+	RESERVE_SPACE(28);	/* back channel: seven 32-bit words */
+	WRITE32(0); /* headerpadsz */
+	WRITE32(sess->back_channel.maxreq_sz);
+	WRITE32(sess->back_channel.maxresp_sz);
+	WRITE32(sess->back_channel.maxresp_cached);
+	WRITE32(sess->back_channel.maxops);
+	WRITE32(sess->back_channel.maxreqs);
+	WRITE32(sess->back_channel.nr_rdma_attrs);
+	ADJUST_ARGS();
+
+	if (sess->back_channel.nr_rdma_attrs) {
+		RESERVE_SPACE(4);
+		WRITE32(sess->back_channel.rdma_attrs);
+		ADJUST_ARGS();
+	}
+	return 0;
+}
+
+static __be32
+nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+			     struct nfsd4_destroy_session *destroy_session)
+{
+	return nfserr;	/* no result body; the status is passed through */
+}
+
+__be32
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
+		      struct nfsd4_sequence *seq)
+{
+	ENCODE_HEAD;
+
+	if (nfserr)	/* on failure only the status is returned */
+		return nfserr;
+
+	RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);	/* id + five 32-bit words */
+	WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+	WRITE32(seq->seqid);
+	WRITE32(seq->slotid);
+	WRITE32(seq->maxslots);
+	/*
+	 * FIXME: for now:
+	 *   target_maxslots = maxslots
+	 *   status_flags = 0
+	 */
+	WRITE32(seq->maxslots);
+	WRITE32(0);
+
+	ADJUST_ARGS();
+	return 0;
+}
+
+static __be32
 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
 {
 	return nfserr;
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
 
 typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
 
+/*
+ * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
+ * since we don't need to filter out obsolete ops as this is
+ * done in the decoding phase.
+ */
 static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_ACCESS]		= (nfsd4_enc)nfsd4_encode_access,
 	[OP_CLOSE]		= (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_VERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_WRITE]		= (nfsd4_enc)nfsd4_encode_write,
 	[OP_RELEASE_LOCKOWNER]	= (nfsd4_enc)nfsd4_encode_noop,
+
+	/* NFSv4.1 operations */
+	[OP_BACKCHANNEL_CTL]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+	[OP_EXCHANGE_ID]	= (nfsd4_enc)nfsd4_encode_exchange_id,
+	[OP_CREATE_SESSION]	= (nfsd4_enc)nfsd4_encode_create_session,
+	[OP_DESTROY_SESSION]	= (nfsd4_enc)nfsd4_encode_destroy_session,
+	[OP_FREE_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GET_DIR_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GETDEVICEINFO]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_GETDEVICELIST]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTCOMMIT]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTGET]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_LAYOUTRETURN]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SECINFO_NO_NAME]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_SEQUENCE]		= (nfsd4_enc)nfsd4_encode_sequence,
+	[OP_SET_SSV]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_TEST_STATEID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_WANT_DELEGATION]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_DESTROY_CLIENTID]	= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_RECLAIM_COMPLETE]	= (nfsd4_enc)nfsd4_encode_noop,
 };
 
+/*
+ * Calculate the total amount of memory that the compound response has taken
+ * after encoding the current operation and compare it to the session's
+ * se_fmaxresp_cached.  Returns nfs_ok when the reply fits, or when there is
+ * no session or ce_cachethis is clear; else nfserr_rep_too_big_to_cache.
+ *
+ * pad: add on 8 bytes for the next operation's op_code and status so that
+ * there is room to cache a failure on the next operation.
+ *
+ * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
+ * will be at least a page and will therefore hold the xdr_buf head.
+ */
+static __be32 nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
+{
+	__be32 status = nfs_ok;
+	struct xdr_buf *xb = &resp->rqstp->rq_res;
+	struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
+	struct nfsd4_session *session = NULL;
+	struct nfsd4_slot *slot = resp->cstate.slot;
+	u32 length, tlen = 0, pad = 8;
+
+	if (!nfsd4_has_session(&resp->cstate))
+		return status;
+
+	session = resp->cstate.session;
+	if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
+		return status;
+
+	if (resp->opcnt >= args->opcnt)
+		pad = 0; /* this is the last operation */
+
+	if (xb->page_len == 0) {
+		length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
+	} else {
+		if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
+			tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
+
+		length = xb->head[0].iov_len + xb->page_len + tlen + pad;
+	}
+	dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
+		length, xb->page_len, tlen, pad);
+
+	if (length <= session->se_fmaxresp_cached)
+		return status;
+	else
+		return nfserr_rep_too_big_to_cache;
+}
+
 void
 nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 {
@@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
 	BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
 	       !nfsd4_enc_ops[op->opnum]);
 	op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
+	/* nfsd4_check_drc_limit guarantees enough room for error status */
+	if (!op->status && nfsd4_check_drc_limit(resp))
+		op->status = nfserr_rep_too_big_to_cache;
 status:
 	/*
 	 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
 		iov = &rqstp->rq_res.head[0];
 	iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
 	BUG_ON(iov->iov_len > PAGE_SIZE);
+	if (nfsd4_has_session(&resp->cstate)) {
+		if (resp->cstate.status == nfserr_replay_cache &&
+				!nfsd4_not_cached(resp)) {
+			iov->iov_len = resp->cstate.iovlen;
+		} else {
+			nfsd4_store_cache_entry(resp);
+			dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
+			resp->cstate.slot->sl_inuse = 0;
+		}
+		if (resp->cstate.session)
+			nfsd4_put_session(resp->cstate.session);
+	}
 	return 1;
 }
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69..af16849d243 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
 	NFSD_FO_UnlockFS,
 	NFSD_Threads,
 	NFSD_Pool_Threads,
+	NFSD_Pool_Stats,
 	NFSD_Versions,
 	NFSD_Ports,
 	NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
 	.owner		= THIS_MODULE,
 };
 
+extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
+
+static const struct file_operations pool_stats_operations = {
+	.open		= nfsd_pool_stats_open,	/* declared extern just above */
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+	.owner		= THIS_MODULE,
+};
+
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
@@ -781,8 +792,9 @@ out_free:
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
-	char *vers, sign;
+	char *vers, *minorp, sign;
 	int len, num;
+	unsigned minor;
 	ssize_t tlen = 0;
 	char *sep;
 
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 		do {
 			sign = *vers;
 			if (sign == '+' || sign == '-')
-				num = simple_strtol((vers+1), NULL, 0);
+				num = simple_strtol((vers+1), &minorp, 0);
 			else
-				num = simple_strtol(vers, NULL, 0);
+				num = simple_strtol(vers, &minorp, 0);
+			if (*minorp == '.') {
+				if (num < 4)
+					return -EINVAL;
+				minor = simple_strtoul(minorp+1, NULL, 0);
+				if (minor == 0)
+					return -EINVAL;
+				if (nfsd_minorversion(minor, sign == '-' ?
+						     NFSD_CLEAR : NFSD_SET) < 0)
+					return -EINVAL;
+				goto next;
+			}
 			switch(num) {
 			case 2:
 			case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 			default:
 				return -EINVAL;
 			}
+		next:
 			vers += len + 1;
 			tlen += len;
 		} while ((len = qword_get(&mesg, vers, size)) > 0);
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 				       num);
 			sep = " ";
 		}
+	if (nfsd_vers(4, NFSD_AVAIL))
+		for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
+			len += sprintf(buf+len, " %c4.%u",
+					(nfsd_vers(4, NFSD_TEST) &&
+					 nfsd_minorversion(minor, NFSD_TEST)) ?
+						'+' : '-',
+					minor);
 	len += sprintf(buf+len, "\n");
 	return len;
 }
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f2635122..e298e260b5f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 {
 	__be32	nfserr;
 	int	stable = 1;
+	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE    %s %d bytes at %d\n",
 		SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
 	nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
 				   argp->offset,
 				   rqstp->rq_vec, argp->vlen,
-				   argp->len,
+			           &cnt,
 				   &stable);
 	return nfsd_return_attrs(nfserr, resp);
 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7c09852be71..cbba4a93578 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
 #include <linux/freezer.h>
 #include <linux/fs_struct.h>
 #include <linux/kthread.h>
+#include <linux/swap.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
 extern struct svc_program	nfsd_program;
 static int			nfsd(void *vrqstp);
 struct timeval			nfssvc_boot;
-static atomic_t			nfsd_busy;
-static unsigned long		nfsd_last_call;
-static DEFINE_SPINLOCK(nfsd_call_lock);
 
 /*
  * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program		nfsd_program = {
 
 };
 
+u32 nfsd_supported_minorversion;
+
 int nfsd_vers(int vers, enum vers_op change)
 {
 	if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
 	}
 	return 0;
 }
+
+int nfsd_minorversion(u32 minorversion, enum vers_op change)
+{
+	/* Minor versions beyond the compiled-in maximum are always rejected. */
+	if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+		return -1;
+	if (change == NFSD_TEST)
+		return minorversion <= nfsd_supported_minorversion;
+	if (change == NFSD_AVAIL)
+		return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
+	if (change == NFSD_SET) {
+		/* Enabling 4.N enables every minor version up to N. */
+		nfsd_supported_minorversion = minorversion;
+	} else if (change == NFSD_CLEAR) {
+		/* Disabling 4.N caps support at N-1; 4.0 cannot be cleared. */
+		if (minorversion == 0)
+			return -1;
+		nfsd_supported_minorversion = minorversion - 1;
+	}
+	return 0;
+}
+
 /*
  * Maximum number of nfsd processes
  */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
 	}
 }
 
+/*
+ * Each session guarantees a negotiated per slot memory cache for replies
+ * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated
+ * NFSv4.1 server might want to use more memory for a DRC than a machine
+ * with multiple services.
+ *
+ * Impose a hard limit on the number of pages for the DRC which varies
+ * according to the machine's free pages. This is of course only a default.
+ *
+ * For now this is a #defined shift which could be under admin control
+ * in the future.
+ */
+static void set_max_drc(void)
+{
+	/* DRC page limit is nr_free_buffer_pages >> this shift, i.e. 1/128 */
+	#define NFSD_DRC_SIZE_SHIFT	7
+	nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
+						>> NFSD_DRC_SIZE_SHIFT;
+	nfsd_serv->sv_drc_pages_used = 0;
+	dprintk("%s svc_drc_max_pages %u\n", __func__,
+		nfsd_serv->sv_drc_max_pages);
+}
 
 int nfsd_create_serv(void)
 {
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
 			nfsd_max_blksize /= 2;
 	}
 
-	atomic_set(&nfsd_busy, 0);
 	nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
 				      nfsd_last_thread, nfsd, THIS_MODULE);
 	if (nfsd_serv == NULL)
 		err = -ENOMEM;
+	else
+		set_max_drc();
 
 	do_gettimeofday(&nfssvc_boot);		/* record boot time */
 	return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
 	return error;
 }
 
-static inline void
-update_thread_usage(int busy_threads)
-{
-	unsigned long prev_call;
-	unsigned long diff;
-	int decile;
-
-	spin_lock(&nfsd_call_lock);
-	prev_call = nfsd_last_call;
-	nfsd_last_call = jiffies;
-	decile = busy_threads*10/nfsdstats.th_cnt;
-	if (decile>0 && decile <= 10) {
-		diff = nfsd_last_call - prev_call;
-		if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
-			nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
-		if (decile == 10)
-			nfsdstats.th_fullcnt++;
-	}
-	spin_unlock(&nfsd_call_lock);
-}
 
 /*
  * This is the NFS server kernel thread
@@ -460,8 +485,6 @@ nfsd(void *vrqstp)
 			continue;
 		}
 
-		update_thread_usage(atomic_read(&nfsd_busy));
-		atomic_inc(&nfsd_busy);
 
 		/* Lock the export hash tables for reading. */
 		exp_readlock();
@@ -470,8 +493,6 @@ nfsd(void *vrqstp)
 
 		/* Unlock export hash tables */
 		exp_readunlock();
-		update_thread_usage(atomic_read(&nfsd_busy));
-		atomic_dec(&nfsd_busy);
 	}
 
 	/* Clear signals before calling svc_exit_thread() */
@@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 		+ rqstp->rq_res.head[0].iov_len;
 	rqstp->rq_res.head[0].iov_len += sizeof(__be32);
 
+	/* NFSv4.1 DRC requires statp */
+	if (rqstp->rq_vers == 4)
+		nfsd4_set_statp(rqstp, statp);
+
 	/* Now call the procedure handler, and encode NFS status. */
 	nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
 	nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 	nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
 	return 1;
 }
+
+int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+{
+	if (nfsd_serv != NULL)	/* stats exist only while a service does */
+		return svc_pool_stats_open(nfsd_serv, file);
+	return -ENODEV;
+}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c023..ab93fcfef25 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	}
 
 	/* Revoke setuid/setgid on chown */
-	if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
-	    ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) {
+	if (!S_ISDIR(inode->i_mode) &&
+	    (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
+	     ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
 		iap->ia_valid |= ATTR_KILL_PRIV;
 		if (iap->ia_valid & ATTR_MODE) {
 			/* we're setting mode too, just clear the s*id bits */
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
 static __be32
 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 				loff_t offset, struct kvec *vec, int vlen,
-	   			unsigned long cnt, int *stablep)
+				unsigned long *cnt, int *stablep)
 {
 	struct svc_export	*exp;
 	struct dentry		*dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	err = nfserr_perm;
 
 	if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
-		(!lock_may_write(file->f_path.dentry->d_inode, offset, cnt)))
+		(!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
 		goto out;
 #endif
 
@@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
 	set_fs(oldfs);
 	if (host_err >= 0) {
-		nfsdstats.io_write += cnt;
+		nfsdstats.io_write += host_err;
 		fsnotify_modify(file->f_path.dentry);
 	}
 
@@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	}
 
 	dprintk("nfsd: write complete host_err=%d\n", host_err);
-	if (host_err >= 0)
+	if (host_err >= 0) {
 		err = 0;
-	else 
+		*cnt = host_err;
+	} else
 		err = nfserrno(host_err);
 out:
 	return err;
@@ -1098,7 +1100,7 @@ out:
  */
 __be32
 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
-		loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
+		loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
 		int *stablep)
 {
 	__be32			err = 0;
@@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
 	return 0;
 }
 
+/* HP-UX clients sometimes create a file with mode 000 and set its size to 0.
+ * On some file systems that truncation fails the permission check, which
+ * requires WRITE permission although the mode is 000.
+ * A newly created file already has size 0, so the resize-to-zero request
+ * is simply dropped instead of being attempted.
+ *
+ * Call this only after vfs_create() has been called.
+ */
+static void nfsd_check_ignore_resizing(struct iattr *iap)
+{
+	if (!(iap->ia_valid & ATTR_SIZE) || iap->ia_size != 0)
+		return;
+	iap->ia_valid &= ~ATTR_SIZE;
+}
+
 /*
  * Create a file (regular, directory, device, fifo); UNIX sockets 
  * not yet implemented.
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	switch (type) {
 	case S_IFREG:
 		host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
+		if (!host_err)
+			nfsd_check_ignore_resizing(iap);
 		break;
 	case S_IFDIR:
 		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		/* setattr will sync the child (or not) */
 	}
 
+	nfsd_check_ignore_resizing(iap);
+
 	if (createmode == NFS3_CREATE_EXCLUSIVE) {
 		/* Cram the verifier into atime/mtime */
 		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
new file mode 100644
index 00000000000..df3e62c1ddc
--- /dev/null
+++ b/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_NILFS2_FS) += nilfs2.o
+nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
+	btnode.o bmap.o btree.o direct.o dat.o recovery.o \
+	the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
+	ifile.o alloc.o gcinode.o ioctl.o gcdat.o
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
new file mode 100644
index 00000000000..d69e6ae5925
--- /dev/null
+++ b/fs/nilfs2/alloc.c
@@ -0,0 +1,504 @@
+/*
+ * alloc.c - NILFS dat/inode allocator
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Original code was written by Koji Sato <koji@osrg.net>.
+ * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
+ *                                Amagai Yoshiji <amagai@osrg.net>.
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/bitops.h>
+#include "mdt.h"
+#include "alloc.h"
+
+/* Number of group descriptors that fit in one block of @inode. */
+static inline unsigned long
+nilfs_palloc_groups_per_desc_block(const struct inode *inode)
+{
+	const unsigned long blocksize = 1UL << inode->i_blkbits;
+	return blocksize / sizeof(struct nilfs_palloc_group_desc);
+}
+
+static inline unsigned long
+nilfs_palloc_groups_count(const struct inode *inode)
+{
+	return 1UL << (BITS_PER_LONG - inode->i_blkbits - 3 /* = log2(8) */);
+}
+
+int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
+{
+	struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+
+	/* Allocate and initialise the block-group lock table. */
+	mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
+	if (mi->mi_bgl == NULL)
+		return -ENOMEM;
+	bgl_lock_init(mi->mi_bgl);
+
+	nilfs_mdt_set_entry_size(inode, entry_size, 0);
+
+	/* Blocks in one group: the entry blocks plus one bitmap block. */
+	mi->mi_blocks_per_group =
+		DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
+			     mi->mi_entries_per_block) + 1;
+
+	/* Blocks covered by one descriptor block, itself included. */
+	mi->mi_blocks_per_desc_block =
+		nilfs_palloc_groups_per_desc_block(inode) *
+		mi->mi_blocks_per_group + 1;
+
+	return 0;
+}
+
+static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
+					unsigned long *offset)
+{
+	__u64 quotient = nr;
+	/* do_div() leaves the quotient in place and returns the remainder */
+	*offset = do_div(quotient, nilfs_palloc_entries_per_group(inode));
+	return quotient;
+}
+
+static unsigned long
+nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
+{
+	/* descriptor block index for @group, scaled to a block offset */
+	unsigned long idx = group / nilfs_palloc_groups_per_desc_block(inode);
+	return idx * NILFS_MDT(inode)->mi_blocks_per_desc_block;
+}
+
+static unsigned long
+nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
+{
+	/* skip the descriptor block (+1) and the groups before @group */
+	unsigned long nth = group % nilfs_palloc_groups_per_desc_block(inode);
+	return nilfs_palloc_desc_blkoff(inode, group) + 1 +
+		nth * NILFS_MDT(inode)->mi_blocks_per_group;
+}
+
+static unsigned long
+nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
+			       const struct nilfs_palloc_group_desc *desc)
+{
+	unsigned long nfree;
+
+	spin_lock(nilfs_mdt_bgl_lock(inode, group));
+	nfree = le32_to_cpu(desc->pg_nfrees);	/* stored little-endian */
+	spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+	return nfree;
+}
+
+static void
+nilfs_palloc_group_desc_add_entries(struct inode *inode, unsigned long group,
+				    struct nilfs_palloc_group_desc *desc, u32 n)
+{
+	spinlock_t *lock = nilfs_mdt_bgl_lock(inode, group);
+
+	spin_lock(lock);
+	le32_add_cpu(&desc->pg_nfrees, n);	/* n == (u32)-1 decrements */
+	spin_unlock(lock);
+}
+
+static unsigned long
+nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
+{
+	unsigned long idx;	/* index of @nr inside its group */
+	unsigned long grp = nilfs_palloc_group(inode, nr, &idx);
+
+	/* Entry blocks start right after the group's bitmap block. */
+	return nilfs_palloc_bitmap_blkoff(inode, grp) + 1 +
+		idx / NILFS_MDT(inode)->mi_entries_per_block;
+}
+
+static void nilfs_palloc_desc_block_init(struct inode *inode,
+					 struct buffer_head *bh, void *kaddr)
+{
+	struct nilfs_palloc_group_desc *descs = kaddr + bh_offset(bh);
+	unsigned long ngroups = nilfs_palloc_groups_per_desc_block(inode);
+	__le32 all_free = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
+	unsigned long i;
+
+	/* A new descriptor block starts out with every entry of every
+	   group counted as free. */
+	for (i = 0; i < ngroups; i++)
+		descs[i].pg_nfrees = all_free;
+}
+
+static int nilfs_palloc_get_desc_block(struct inode *inode,
+				       unsigned long group,
+				       int create, struct buffer_head **bhp)
+{
+	unsigned long blkoff = nilfs_palloc_desc_blkoff(inode, group);
+	return nilfs_mdt_get_block(inode, blkoff, create,
+				   nilfs_palloc_desc_block_init, bhp);
+}
+
+static int nilfs_palloc_get_bitmap_block(struct inode *inode,
+					 unsigned long group,
+					 int create, struct buffer_head **bhp)
+{
+	unsigned long blkoff = nilfs_palloc_bitmap_blkoff(inode, group);
+	/* NULL: bitmap blocks get no init callback */
+	return nilfs_mdt_get_block(inode, blkoff, create, NULL, bhp);
+}
+
+int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
+				 int create, struct buffer_head **bhp)
+{
+	unsigned long blkoff = nilfs_palloc_entry_blkoff(inode, nr);
+	return nilfs_mdt_get_block(inode, blkoff, create, NULL, bhp);
+}
+
+static struct nilfs_palloc_group_desc *
+nilfs_palloc_block_get_group_desc(const struct inode *inode,
+				  unsigned long group,
+				  const struct buffer_head *bh, void *kaddr)
+{
+	struct nilfs_palloc_group_desc *first = kaddr + bh_offset(bh);
+	return &first[group % nilfs_palloc_groups_per_desc_block(inode)];
+}
+
+static unsigned char *
+nilfs_palloc_block_get_bitmap(const struct inode *inode,
+			      const struct buffer_head *bh, void *kaddr)
+{
+	return (unsigned char *)kaddr + bh_offset(bh);	/* @inode is unused */
+}
+
+void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
+				   const struct buffer_head *bh, void *kaddr)
+{
+	const struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+	unsigned long in_group, in_block;
+
+	nilfs_palloc_group(inode, nr, &in_group);	/* offset only */
+	in_block = in_group % mi->mi_entries_per_block;
+
+	return kaddr + bh_offset(bh) + in_block * mi->mi_entry_size;
+}
+
+static int nilfs_palloc_find_available_slot(struct inode *inode,
+					    unsigned long group,
+					    unsigned long target,
+					    unsigned char *bitmap,
+					    int bsize)  /* size in bits */
+{
+	int curr, pos, end, i;
+	/* First look from @target up to the next long-word boundary. */
+	if (target > 0) {
+		end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
+		if (end > bsize)
+			end = bsize;
+		pos = nilfs_find_next_zero_bit(bitmap, end, target);
+		if (pos < end &&
+		    !nilfs_set_bit_atomic(
+			    nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
+			return pos;
+	} else
+		end = 0;
+	/* Then sweep the bitmap one long word at a time, wrapping at @bsize. */
+	for (i = 0, curr = end;
+	     i < bsize;
+	     i += BITS_PER_LONG, curr += BITS_PER_LONG) {
+		/* wrap around to the start of the bitmap */
+		if (curr >= bsize)
+			curr = 0;
+		while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
+		       != ~0UL) {	/* word still has a clear bit */
+			end = curr + BITS_PER_LONG;
+			if (end > bsize)
+				end = bsize;
+			pos = nilfs_find_next_zero_bit(bitmap, end, curr);
+			if ((pos < end) &&
+			    !nilfs_set_bit_atomic(
+				    nilfs_mdt_bgl_lock(inode, group), pos,
+				    bitmap))
+				return pos;
+		}
+	}
+	return -ENOSPC;
+}
+
+static unsigned long
+nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
+				       unsigned long curr, unsigned long max)
+{
+	unsigned long per_block = nilfs_palloc_groups_per_desc_block(inode);
+	unsigned long to_block_end = per_block - curr % per_block;
+	unsigned long to_max = max - curr + 1;	/* @max is inclusive */
+
+	return min_t(unsigned long, to_block_end, to_max);
+}
+
+/* Find and claim a free entry, starting the search at req->pr_entry_nr. */
+int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
+				     struct nilfs_palloc_req *req)
+{
+	struct buffer_head *desc_bh, *bitmap_bh;
+	struct nilfs_palloc_group_desc *desc;
+	unsigned char *bitmap;
+	void *desc_kaddr, *bitmap_kaddr;
+	unsigned long group, maxgroup, ngroups;
+	unsigned long group_offset, maxgroup_offset;
+	unsigned long n, entries_per_group;
+	unsigned long i, j;
+	int pos, ret;
+
+	ngroups = nilfs_palloc_groups_count(inode);
+	maxgroup = ngroups - 1;
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
+	entries_per_group = nilfs_palloc_entries_per_group(inode);
+
+	for (i = 0; i < ngroups; i += n) {
+		if (group >= ngroups) {
+			/* wrap around to the groups below the hint */
+			group = 0;
+			maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
+						      &maxgroup_offset) - 1;
+		}
+		ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
+		if (ret < 0)
+			return ret;
+		desc_kaddr = kmap(desc_bh->b_page);
+		desc = nilfs_palloc_block_get_group_desc(
+			inode, group, desc_bh, desc_kaddr);
+		n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
+							   maxgroup);
+		for (j = 0; j < n; j++, desc++, group++) {
+			if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
+			    > 0) {
+				ret = nilfs_palloc_get_bitmap_block(
+					inode, group, 1, &bitmap_bh);
+				if (ret < 0)
+					goto out_desc;
+				bitmap_kaddr = kmap(bitmap_bh->b_page);
+				bitmap = nilfs_palloc_block_get_bitmap(
+					inode, bitmap_bh, bitmap_kaddr);
+				pos = nilfs_palloc_find_available_slot(
+					inode, group, group_offset, bitmap,
+					entries_per_group);
+				if (pos >= 0) {
+					/* found a free entry; keep both buffers */
+					nilfs_palloc_group_desc_add_entries(
+						inode, group, desc, -1);
+					req->pr_entry_nr =
+						entries_per_group * group + pos;
+					kunmap(desc_bh->b_page);
+					kunmap(bitmap_bh->b_page);
+
+					req->pr_desc_bh = desc_bh;
+					req->pr_bitmap_bh = bitmap_bh;
+					return 0;
+				}
+				kunmap(bitmap_bh->b_page);
+				brelse(bitmap_bh);
+			}
+			/* the offset hint applies only to the first group */
+			group_offset = 0;
+		}
+
+		kunmap(desc_bh->b_page);
+		brelse(desc_bh);
+	}
+
+	/* no entries left */
+	return -ENOSPC;
+
+ out_desc:
+	kunmap(desc_bh->b_page);
+	brelse(desc_bh);
+	return ret;
+}
+
+void nilfs_palloc_commit_alloc_entry(struct inode *inode,
+				     struct nilfs_palloc_req *req)
+{
+	nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
+	nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
+	nilfs_mdt_mark_dirty(inode);
+	/* drop the references nilfs_palloc_prepare_alloc_entry() kept */
+	brelse(req->pr_bitmap_bh);
+	brelse(req->pr_desc_bh);
+}
+
+void nilfs_palloc_commit_free_entry(struct inode *inode,
+				    struct nilfs_palloc_req *req)
+{
+	struct buffer_head *desc_bh = req->pr_desc_bh;
+	struct buffer_head *bitmap_bh = req->pr_bitmap_bh;
+	struct nilfs_palloc_group_desc *desc;
+	unsigned long group, offset;
+	unsigned char *bitmap;
+
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &offset);
+	desc = nilfs_palloc_block_get_group_desc(inode, group, desc_bh,
+						 kmap(desc_bh->b_page));
+	bitmap = nilfs_palloc_block_get_bitmap(inode, bitmap_bh,
+					       kmap(bitmap_bh->b_page));
+
+	/* Release the entry's bit; a bit that was already clear is reported. */
+	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
+				    offset, bitmap))
+		printk(KERN_WARNING "%s: entry number %llu already freed\n",
+		       __func__, (unsigned long long)req->pr_entry_nr);
+
+	nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+
+	kunmap(bitmap_bh->b_page);
+	kunmap(desc_bh->b_page);
+
+	nilfs_mdt_mark_buffer_dirty(desc_bh);
+	nilfs_mdt_mark_buffer_dirty(bitmap_bh);
+	nilfs_mdt_mark_dirty(inode);
+
+	brelse(bitmap_bh);
+	brelse(desc_bh);
+}
+
+/* Undo a prepared allocation: release the entry again and reset @req. */
+void nilfs_palloc_abort_alloc_entry(struct inode *inode,
+				    struct nilfs_palloc_req *req)
+{
+	struct nilfs_palloc_group_desc *desc;
+	unsigned char *bitmap;
+	unsigned long group, group_offset;
+
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
+	desc = nilfs_palloc_block_get_group_desc(inode, group, req->pr_desc_bh,
+						 kmap(req->pr_desc_bh->b_page));
+	bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
+					       kmap(req->pr_bitmap_bh->b_page));
+
+	/* Clear the bit claimed by prepare; it is expected to still be set. */
+	if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
+				    group_offset, bitmap))
+		printk(KERN_WARNING "%s: entry number %llu already freed\n",
+		       __func__, (unsigned long long)req->pr_entry_nr);
+
+	nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+
+	kunmap(req->pr_bitmap_bh->b_page);
+	kunmap(req->pr_desc_bh->b_page);
+
+	brelse(req->pr_bitmap_bh);
+	brelse(req->pr_desc_bh);
+
+	req->pr_entry_nr = 0;
+	req->pr_bitmap_bh = NULL;
+	req->pr_desc_bh = NULL;
+}
+
+int nilfs_palloc_prepare_free_entry(struct inode *inode,
+				    struct nilfs_palloc_req *req)
+{
+	struct buffer_head *desc_bh, *bitmap_bh;
+	unsigned long group, unused_offset;
+	int err;
+
+	group = nilfs_palloc_group(inode, req->pr_entry_nr, &unused_offset);
+	err = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
+	if (err < 0)
+		return err;
+	err = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
+	if (err >= 0) {
+		/* both buffers stay referenced until commit or abort */
+		req->pr_desc_bh = desc_bh;
+		req->pr_bitmap_bh = bitmap_bh;
+		return 0;
+	}
+	brelse(desc_bh);
+	return err;
+}
+
+void nilfs_palloc_abort_free_entry(struct inode *inode,
+				   struct nilfs_palloc_req *req)
+{
+	/* Drop both buffer references and leave @req empty. */
+	brelse(req->pr_bitmap_bh);
+	req->pr_bitmap_bh = NULL;
+	brelse(req->pr_desc_bh);
+	req->pr_desc_bh = NULL;
+	req->pr_entry_nr = 0;
+}
+
+static int
+nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
+{
+	unsigned long per_group = nilfs_palloc_entries_per_group(inode);
+	__u64 first = group * per_group;	/* lowest entry number in @group */
+	__u64 last = first + per_group - 1;	/* highest, inclusive */
+
+	return first <= nr && nr <= last;
+}
+
+int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
+{
+	struct buffer_head *desc_bh, *bitmap_bh;
+	struct nilfs_palloc_group_desc *desc;
+	unsigned char *bitmap;
+	void *desc_kaddr, *bitmap_kaddr;
+	unsigned long group, group_offset;
+	int i, j, n, ret;
+	/* Free @nitems entries; adjacent entries of one group are batched. */
+	for (i = 0; i < nitems; i += n) {
+		group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
+		ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
+		if (ret < 0)
+			return ret;
+		ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
+						    &bitmap_bh);
+		if (ret < 0) {
+			brelse(desc_bh);
+			return ret;
+		}
+		desc_kaddr = kmap(desc_bh->b_page);
+		desc = nilfs_palloc_block_get_group_desc(
+			inode, group, desc_bh, desc_kaddr);
+		bitmap_kaddr = kmap(bitmap_bh->b_page);
+		bitmap = nilfs_palloc_block_get_bitmap(
+			inode, bitmap_bh, bitmap_kaddr);
+		for (j = i, n = 0;	/* n counts the run within @group */
+		     (j < nitems) && nilfs_palloc_group_is_in(inode, group,
+							      entry_nrs[j]);
+		     j++, n++) {
+			nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
+			if (!nilfs_clear_bit_atomic(
+				    nilfs_mdt_bgl_lock(inode, group),
+				    group_offset, bitmap)) {
+				printk(KERN_WARNING
+				       "%s: entry number %llu already freed\n",
+				       __func__,
+				       (unsigned long long)entry_nrs[j]);
+			}
+		}
+		nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+		kunmap(bitmap_bh->b_page);
+		kunmap(desc_bh->b_page);
+
+		nilfs_mdt_mark_buffer_dirty(desc_bh);
+		nilfs_mdt_mark_buffer_dirty(bitmap_bh);
+		nilfs_mdt_mark_dirty(inode);
+
+		brelse(bitmap_bh);
+		brelse(desc_bh);
+	}
+	return 0;
+}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
new file mode 100644
index 00000000000..4ace5475c2c
--- /dev/null
+++ b/fs/nilfs2/alloc.h
@@ -0,0 +1,72 @@
+/*
+ * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Original code was written by Koji Sato <koji@osrg.net>.
+ * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
+ *                                Amagai Yoshiji <amagai@osrg.net>.
+ */
+
+#ifndef _NILFS_ALLOC_H
+#define _NILFS_ALLOC_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+
+static inline unsigned long
+nilfs_palloc_entries_per_group(const struct inode *inode)
+{
+	return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BIT) */);
+}
+
+int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
+int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
+				 struct buffer_head **);
+void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
+				   const struct buffer_head *, void *);
+
+/**
+ * struct nilfs_palloc_req - persistent allocator request and reply
+ * @pr_entry_nr: entry number (vblocknr or inode number)
+ * @pr_desc_bh: buffer head of the buffer containing block group descriptors
+ * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
+ * @pr_entry_bh: buffer head of the buffer containing translation entries
+ */
+struct nilfs_palloc_req {
+	__u64 pr_entry_nr;
+	struct buffer_head *pr_desc_bh;
+	struct buffer_head *pr_bitmap_bh;
+	struct buffer_head *pr_entry_bh;
+};
+
+int nilfs_palloc_prepare_alloc_entry(struct inode *,
+				     struct nilfs_palloc_req *);
+void nilfs_palloc_commit_alloc_entry(struct inode *,
+				     struct nilfs_palloc_req *);
+void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
+void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
+int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
+void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
+int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
+
+#define nilfs_set_bit_atomic		ext2_set_bit_atomic
+#define nilfs_clear_bit_atomic		ext2_clear_bit_atomic
+#define nilfs_find_next_zero_bit	ext2_find_next_zero_bit
+
+#endif	/* _NILFS_ALLOC_H */
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
new file mode 100644
index 00000000000..064279e33bb
--- /dev/null
+++ b/fs/nilfs2/bmap.c
@@ -0,0 +1,788 @@
+/*
+ * bmap.c - NILFS block mapping.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include "nilfs.h"
+#include "bmap.h"
+#include "sb.h"
+#include "btnode.h"
+#include "mdt.h"
+#include "dat.h"
+#include "alloc.h"
+
+int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
+			       __u64 *ptrp)
+{
+	__u64 ptr;
+	int ret;
+
+	down_read(&bmap->b_sem);
+	ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
+	if (ret < 0)
+		goto out;
+	if (bmap->b_pops->bpop_translate != NULL) {
+		ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
+		if (ret < 0)
+			goto out;
+		*ptrp = ptr;
+	}
+
+ out:
+	up_read(&bmap->b_sem);
+	return ret;
+}
+
+
+/**
+ * nilfs_bmap_lookup - find a record
+ * @bmap: bmap
+ * @key: key
+ * @recp: place to store the record (may be NULL; left untouched on error)
+ *
+ * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
+ * @bmap, using a level-1 lookup through nilfs_bmap_lookup_at_level().
+ *
+ * Return Value: On success, 0 is returned and the record associated with @key
+ * is stored in the place pointed by @recp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
+int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
+		      unsigned long key,
+		      unsigned long *recp)
+{
+	__u64 ptr;
+	int ret;
+
+	/* XXX: use macro for level 1 */
+	ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
+	if (ret >= 0 && recp != NULL)	/* ptr is not valid on failure */
+		*recp = ptr;
+	return ret;
+}
+
+static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+{
+	__u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
+	__u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
+	int ret, n;
+
+	if (bmap->b_ops->bop_check_insert != NULL) {
+		ret = bmap->b_ops->bop_check_insert(bmap, key);
+		if (ret > 0) {
+			n = bmap->b_ops->bop_gather_data(
+				bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
+			if (n < 0)
+				return n;
+			ret = nilfs_btree_convert_and_insert(
+				bmap, key, ptr, keys, ptrs, n,
+				NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
+			if (ret == 0)
+				bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
+
+			return ret;
+		} else if (ret < 0)
+			return ret;
+	}
+
+	return bmap->b_ops->bop_insert(bmap, key, ptr);
+}
+
+/**
+ * nilfs_bmap_insert - insert a new key-record pair into a bmap
+ * @bmap: bmap
+ * @key: key
+ * @rec: record
+ *
+ * Description: nilfs_bmap_insert() inserts the new key-record pair specified
+ * by @key and @rec into @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EEXIST - A record associated with @key already exists.
+ */
+int nilfs_bmap_insert(struct nilfs_bmap *bmap,
+		      unsigned long key,
+		      unsigned long rec)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_do_insert(bmap, key, rec);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
+{
+	__u64 keys[NILFS_BMAP_LARGE_LOW + 1];
+	__u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
+	int ret, n;
+
+	if (bmap->b_ops->bop_check_delete != NULL) {
+		ret = bmap->b_ops->bop_check_delete(bmap, key);
+		if (ret > 0) {
+			n = bmap->b_ops->bop_gather_data(
+				bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
+			if (n < 0)
+				return n;
+			ret = nilfs_direct_delete_and_convert(
+				bmap, key, keys, ptrs, n,
+				NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
+			if (ret == 0)
+				bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
+
+			return ret;
+		} else if (ret < 0)
+			return ret;
+	}
+
+	return bmap->b_ops->bop_delete(bmap, key);
+}
+
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
+{
+	__u64 lastkey;
+	int ret;
+
+	down_read(&bmap->b_sem);
+	ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+	if (!ret)
+		*key = lastkey;
+	up_read(&bmap->b_sem);
+	return ret;
+}
+
+/**
+ * nilfs_bmap_delete - delete a key-record pair from a bmap
+ * @bmap: bmap
+ * @key: key
+ *
+ * Description: nilfs_bmap_delete() deletes the key-record pair specified by
+ * @key from @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A record associated with @key does not exist.
+ */
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_do_delete(bmap, key);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
+{
+	__u64 lastkey;
+	int ret;
+
+	ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		return ret;
+	}
+
+	while (key <= lastkey) {
+		ret = nilfs_bmap_do_delete(bmap, lastkey);
+		if (ret < 0)
+			return ret;
+		ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			return ret;
+		}
+	}
+	return 0;
+}
+
+/**
+ * nilfs_bmap_truncate - truncate a bmap to a specified key
+ * @bmap: bmap
+ * @key: key
+ *
+ * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
+ * greater than or equal to @key from @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_do_truncate(bmap, key);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+/**
+ * nilfs_bmap_clear - free resources a bmap holds
+ * @bmap: bmap
+ *
+ * Description: nilfs_bmap_clear() frees resources associated with @bmap.
+ */
+void nilfs_bmap_clear(struct nilfs_bmap *bmap)
+{
+	down_write(&bmap->b_sem);
+	if (bmap->b_ops->bop_clear != NULL)
+		bmap->b_ops->bop_clear(bmap);
+	up_write(&bmap->b_sem);
+}
+
+/**
+ * nilfs_bmap_propagate - propagate dirty state
+ * @bmap: bmap
+ * @bh: buffer head
+ *
+ * Description: nilfs_bmap_propagate() marks the buffers that directly or
+ * indirectly refer to the block specified by @bh dirty.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = bmap->b_ops->bop_propagate(bmap, bh);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+/**
+ * nilfs_bmap_lookup_dirty_buffers - look up dirty buffers of a bmap
+ * @bmap: bmap
+ * @listp: pointer to buffer head list
+ */
+void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
+				     struct list_head *listp)
+{
+	if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
+		bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
+}
+
+/**
+ * nilfs_bmap_assign - assign a new block number to a block
+ * @bmap: bmap
+ * @bh: pointer to buffer head
+ * @blocknr: block number
+ * @binfo: block information
+ *
+ * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
+ * buffer specified by @bh.
+ *
+ * Return Value: On success, 0 is returned and the buffer head of a newly
+ * created buffer and the block information associated with the buffer are
+ * stored in the place pointed by @bh and @binfo, respectively. On error, one
+ * of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_assign(struct nilfs_bmap *bmap,
+		      struct buffer_head **bh,
+		      unsigned long blocknr,
+		      union nilfs_binfo *binfo)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+/**
+ * nilfs_bmap_mark - mark block dirty
+ * @bmap: bmap
+ * @key: key
+ * @level: level
+ *
+ * Description: nilfs_bmap_mark() marks the block specified by @key and @level
+ * as dirty.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
+{
+	int ret;
+
+	if (bmap->b_ops->bop_mark == NULL)
+		return 0;
+
+	down_write(&bmap->b_sem);
+	ret = bmap->b_ops->bop_mark(bmap, key, level);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+/**
+ * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
+ * @bmap: bmap
+ *
+ * Description: nilfs_bmap_test_and_clear_dirty() atomically tests and
+ * clear the dirty state of @bmap.
+ *
+ * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
+ */
+int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
+{
+	int ret;
+
+	down_write(&bmap->b_sem);
+	ret = nilfs_bmap_dirty(bmap);
+	nilfs_bmap_clear_dirty(bmap);
+	up_write(&bmap->b_sem);
+	return ret;
+}
+
+
+/*
+ * Internal use only
+ */
+
+void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
+{
+	inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
+	if (NILFS_MDT(bmap->b_inode))
+		nilfs_mdt_mark_dirty(bmap->b_inode);
+	else
+		mark_inode_dirty(bmap->b_inode);
+}
+
+void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
+{
+	inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
+	if (NILFS_MDT(bmap->b_inode))
+		nilfs_mdt_mark_dirty(bmap->b_inode);
+	else
+		mark_inode_dirty(bmap->b_inode);
+}
+
+int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
+			 struct buffer_head **bhp)
+{
+	return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
+				ptr, 0, bhp, 0);
+}
+
+void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
+			  struct buffer_head *bh)
+{
+	brelse(bh);
+}
+
+int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
+			     struct buffer_head **bhp)
+{
+	int ret;
+
+	ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
+			       ptr, 0, bhp, 1);
+	if (ret < 0)
+		return ret;
+	set_buffer_nilfs_volatile(*bhp);
+	return 0;
+}
+
+void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
+			     struct buffer_head *bh)
+{
+	nilfs_btnode_delete(bh);
+}
+
+__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
+			      const struct buffer_head *bh)
+{
+	struct buffer_head *pbh;
+	__u64 key;
+
+	key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
+					 bmap->b_inode->i_blkbits);
+	for (pbh = page_buffers(bh->b_page); pbh != bh;
+	     pbh = pbh->b_this_page, key++);
+
+	return key;
+}
+
+__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
+{
+	__s64 diff;
+
+	diff = key - bmap->b_last_allocated_key;
+	if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
+	    (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
+	    (bmap->b_last_allocated_ptr + diff > 0))
+		return bmap->b_last_allocated_ptr + diff;
+	else
+		return NILFS_BMAP_INVALID_PTR;
+}
+
+static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
+{
+	return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
+}
+
+#define NILFS_BMAP_GROUP_DIV	8
+__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
+{
+	struct inode *dat = nilfs_bmap_get_dat(bmap);
+	unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
+	unsigned long group = bmap->b_inode->i_ino / entries_per_group;
+
+	return group * entries_per_group +
+		(bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
+		(entries_per_group / NILFS_BMAP_GROUP_DIV);
+}
+
+static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
+				      union nilfs_bmap_ptr_req *req)
+{
+	return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+}
+
+static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
+				      union nilfs_bmap_ptr_req *req)
+{
+	nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+}
+
+static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
+				     union nilfs_bmap_ptr_req *req)
+{
+	nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+}
+
+static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
+				      union nilfs_bmap_ptr_req *req)
+{
+	return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+}
+
+static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
+				      union nilfs_bmap_ptr_req *req,
+				      sector_t blocknr)
+{
+	nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
+			       blocknr);
+}
+
+static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
+				     union nilfs_bmap_ptr_req *req)
+{
+	nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+}
+
+static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
+				    union nilfs_bmap_ptr_req *req)
+{
+	return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+}
+
+static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
+				    union nilfs_bmap_ptr_req *req)
+{
+	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
+}
+
+static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
+				       union nilfs_bmap_ptr_req *req)
+{
+	nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
+}
+
+static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
+				   union nilfs_bmap_ptr_req *req)
+{
+	nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
+}
+
+int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
+		      sector_t blocknr)
+{
+	return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
+}
+
+int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
+{
+	return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
+}
+
+int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
+			      union nilfs_bmap_ptr_req *oldreq,
+			      union nilfs_bmap_ptr_req *newreq)
+{
+	int ret;
+
+	ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
+	if (ret < 0)
+		return ret;
+	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
+	if (ret < 0)
+		bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
+
+	return ret;
+}
+
+void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
+			      union nilfs_bmap_ptr_req *oldreq,
+			      union nilfs_bmap_ptr_req *newreq)
+{
+	bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
+	bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
+}
+
+void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
+			     union nilfs_bmap_ptr_req *oldreq,
+			     union nilfs_bmap_ptr_req *newreq)
+{
+	bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
+	bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
+}
+
+static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
+				  __u64 *ptrp)
+{
+	sector_t blocknr;
+	int ret;
+
+	ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
+	if (ret < 0)
+		return ret;
+	if (ptrp != NULL)
+		*ptrp = blocknr;
+	return 0;
+}
+
+static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
+				      union nilfs_bmap_ptr_req *req)
+{
+	/* ignore target ptr */
+	req->bpr_ptr = bmap->b_last_allocated_ptr++;
+	return 0;
+}
+
+static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
+				      union nilfs_bmap_ptr_req *req)
+{
+	/* do nothing */
+}
+
+static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
+				     union nilfs_bmap_ptr_req *req)
+{
+	bmap->b_last_allocated_ptr--;
+}
+
+static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
+	.bpop_prepare_alloc_ptr	=	nilfs_bmap_prepare_alloc_v,
+	.bpop_commit_alloc_ptr	=	nilfs_bmap_commit_alloc_v,
+	.bpop_abort_alloc_ptr	=	nilfs_bmap_abort_alloc_v,
+	.bpop_prepare_start_ptr	=	nilfs_bmap_prepare_start_v,
+	.bpop_commit_start_ptr	=	nilfs_bmap_commit_start_v,
+	.bpop_abort_start_ptr	=	nilfs_bmap_abort_start_v,
+	.bpop_prepare_end_ptr	=	nilfs_bmap_prepare_end_v,
+	.bpop_commit_end_ptr	=	nilfs_bmap_commit_end_v,
+	.bpop_abort_end_ptr	=	nilfs_bmap_abort_end_v,
+
+	.bpop_translate		=	nilfs_bmap_translate_v,
+};
+
+static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
+	.bpop_prepare_alloc_ptr	=	nilfs_bmap_prepare_alloc_v,
+	.bpop_commit_alloc_ptr	=	nilfs_bmap_commit_alloc_v,
+	.bpop_abort_alloc_ptr	=	nilfs_bmap_abort_alloc_v,
+	.bpop_prepare_start_ptr	=	nilfs_bmap_prepare_start_v,
+	.bpop_commit_start_ptr	=	nilfs_bmap_commit_start_v,
+	.bpop_abort_start_ptr	=	nilfs_bmap_abort_start_v,
+	.bpop_prepare_end_ptr	=	nilfs_bmap_prepare_end_v,
+	.bpop_commit_end_ptr	=	nilfs_bmap_commit_end_vmdt,
+	.bpop_abort_end_ptr	=	nilfs_bmap_abort_end_v,
+
+	.bpop_translate		=	nilfs_bmap_translate_v,
+};
+
+static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
+	.bpop_prepare_alloc_ptr	=	nilfs_bmap_prepare_alloc_p,
+	.bpop_commit_alloc_ptr	=	nilfs_bmap_commit_alloc_p,
+	.bpop_abort_alloc_ptr	=	nilfs_bmap_abort_alloc_p,
+	.bpop_prepare_start_ptr	=	NULL,
+	.bpop_commit_start_ptr	=	NULL,
+	.bpop_abort_start_ptr	=	NULL,
+	.bpop_prepare_end_ptr	=	NULL,
+	.bpop_commit_end_ptr	=	NULL,
+	.bpop_abort_end_ptr	=	NULL,
+
+	.bpop_translate		=	NULL,
+};
+
+static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
+	.bpop_prepare_alloc_ptr	=	NULL,
+	.bpop_commit_alloc_ptr	=	NULL,
+	.bpop_abort_alloc_ptr	=	NULL,
+	.bpop_prepare_start_ptr	=	NULL,
+	.bpop_commit_start_ptr	=	NULL,
+	.bpop_abort_start_ptr	=	NULL,
+	.bpop_prepare_end_ptr	=	NULL,
+	.bpop_commit_end_ptr	=	NULL,
+	.bpop_abort_end_ptr	=	NULL,
+
+	.bpop_translate		=	NULL,
+};
+
+static struct lock_class_key nilfs_bmap_dat_lock_key;
+
+/**
+ * nilfs_bmap_read - read a bmap from an inode
+ * @bmap: bmap
+ * @raw_inode: on-disk inode
+ *
+ * Description: nilfs_bmap_read() initializes the bmap @bmap.
+ *
+ * Return Value: On success, 0 is returned. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
+{
+	if (raw_inode == NULL)
+		memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
+	else
+		memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
+
+	init_rwsem(&bmap->b_sem);
+	bmap->b_state = 0;
+	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+	switch (bmap->b_inode->i_ino) {
+	case NILFS_DAT_INO:
+		bmap->b_pops = &nilfs_bmap_ptr_ops_p;
+		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
+		lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
+		break;
+	case NILFS_CPFILE_INO:
+	case NILFS_SUFILE_INO:
+		bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
+		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+		break;
+	default:
+		bmap->b_pops = &nilfs_bmap_ptr_ops_v;
+		bmap->b_last_allocated_key = 0;	/* XXX: use macro */
+		bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+		break;
+	}
+
+	return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
+		nilfs_btree_init(bmap,
+				 NILFS_BMAP_LARGE_LOW,
+				 NILFS_BMAP_LARGE_HIGH) :
+		nilfs_direct_init(bmap,
+				  NILFS_BMAP_SMALL_LOW,
+				  NILFS_BMAP_SMALL_HIGH);
+}
+
+/**
+ * nilfs_bmap_write - write back a bmap to an inode
+ * @bmap: bmap
+ * @raw_inode: on-disk inode
+ *
+ * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
+ */
+void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
+{
+	down_write(&bmap->b_sem);
+	memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
+	       NILFS_INODE_BMAP_SIZE * sizeof(__le64));
+	if (bmap->b_inode->i_ino == NILFS_DAT_INO)
+		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
+
+	up_write(&bmap->b_sem);
+}
+
+void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
+{
+	memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
+	init_rwsem(&bmap->b_sem);
+	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+	bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
+	bmap->b_last_allocated_key = 0;
+	bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
+	bmap->b_state = 0;
+	nilfs_btree_init_gc(bmap);
+}
+
+void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+{
+	memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
+	init_rwsem(&gcbmap->b_sem);	/* do not reuse the copied sem state */
+	lockdep_set_class(&gcbmap->b_sem, &nilfs_bmap_dat_lock_key);
+	gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; /* own inode */
+}
+
+void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
+{
+	memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
+	init_rwsem(&bmap->b_sem);
+	lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
+	bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
+}
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 00000000000..4f2708abb1b
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
+/*
+ * bmap.h - NILFS block mapping.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_BMAP_H
+#define _NILFS_BMAP_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "alloc.h"
+
+#define NILFS_BMAP_INVALID_PTR	0
+
+#define nilfs_bmap_dkey_to_key(dkey)	le64_to_cpu(dkey)
+#define nilfs_bmap_key_to_dkey(key)	cpu_to_le64(key)
+#define nilfs_bmap_dptr_to_ptr(dptr)	le64_to_cpu(dptr)
+#define nilfs_bmap_ptr_to_dptr(ptr)	cpu_to_le64(ptr)
+
+#define nilfs_bmap_keydiff_abs(diff)	((diff) < 0 ? -(diff) : (diff))
+
+
+struct nilfs_bmap;
+
+/**
+ * union nilfs_bmap_ptr_req - request for bmap ptr
+ * @bpr_ptr: bmap pointer
+ * @bpr_req: request for persistent allocator
+ */
+union nilfs_bmap_ptr_req {
+	__u64 bpr_ptr;
+	struct nilfs_palloc_req bpr_req;
+};
+
+/**
+ * struct nilfs_bmap_stats - bmap statistics
+ * @bs_nblocks: number of blocks created or deleted
+ */
+struct nilfs_bmap_stats {
+	unsigned int bs_nblocks;
+};
+
+/**
+ * struct nilfs_bmap_operations - bmap operation table
+ */
+struct nilfs_bmap_operations {
+	int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
+	int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
+	int (*bop_delete)(struct nilfs_bmap *, __u64);
+	void (*bop_clear)(struct nilfs_bmap *);
+
+	int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
+	void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
+					 struct list_head *);
+
+	int (*bop_assign)(struct nilfs_bmap *,
+			  struct buffer_head **,
+			  sector_t,
+			  union nilfs_binfo *);
+	int (*bop_mark)(struct nilfs_bmap *, __u64, int);
+
+	/* The following functions are internal use only. */
+	int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
+	int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
+	int (*bop_check_delete)(struct nilfs_bmap *, __u64);
+	int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
+};
+
+
+/**
+ * struct nilfs_bmap_ptr_operations - bmap ptr operation table
+ */
+struct nilfs_bmap_ptr_operations {
+	int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
+				      union nilfs_bmap_ptr_req *);
+	void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
+				      union nilfs_bmap_ptr_req *);
+	void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
+				     union nilfs_bmap_ptr_req *);
+	int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
+				      union nilfs_bmap_ptr_req *);
+	void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
+				      union nilfs_bmap_ptr_req *,
+				      sector_t);
+	void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
+				     union nilfs_bmap_ptr_req *);
+	int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
+				    union nilfs_bmap_ptr_req *);
+	void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
+				    union nilfs_bmap_ptr_req *);
+	void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
+				   union nilfs_bmap_ptr_req *);
+
+	int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
+};
+
+
+#define NILFS_BMAP_SIZE		(NILFS_INODE_BMAP_SIZE * sizeof(__le64))
+#define NILFS_BMAP_KEY_BIT	(sizeof(unsigned long) * 8 /* CHAR_BIT */)
+#define NILFS_BMAP_NEW_PTR_INIT	\
+	(1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
+
+static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
+{
+	return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
+}
+
+
+/**
+ * struct nilfs_bmap - bmap structure
+ * @b_u: raw data
+ * @b_sem: semaphore
+ * @b_inode: owner of bmap
+ * @b_ops: bmap operation table
+ * @b_pops: bmap ptr operation table
+ * @b_low: low watermark of conversion
+ * @b_high: high watermark of conversion
+ * @b_last_allocated_key: last allocated key for data block
+ * @b_last_allocated_ptr: last allocated ptr for data block
+ * @b_state: state
+ */
+struct nilfs_bmap {
+	union {
+		__u8 u_flags;
+		__le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
+	} b_u;
+	struct rw_semaphore b_sem;
+	struct inode *b_inode;
+	const struct nilfs_bmap_operations *b_ops;
+	const struct nilfs_bmap_ptr_operations *b_pops;
+	__u64 b_low;
+	__u64 b_high;
+	__u64 b_last_allocated_key;
+	__u64 b_last_allocated_ptr;
+	int b_state;
+};
+
+/* state */
+#define NILFS_BMAP_DIRTY	0x00000001
+
+
+int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
+int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
+void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
+int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
+int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
+int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
+int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
+int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
+void nilfs_bmap_clear(struct nilfs_bmap *);
+int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
+void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
+int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
+		      unsigned long, union nilfs_binfo *);
+int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
+int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
+
+void nilfs_bmap_init_gc(struct nilfs_bmap *);
+void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
+void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
+
+
+/*
+ * Internal use only
+ */
+
+int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
+int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
+
+
+__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
+			      const struct buffer_head *);
+
+__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
+__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
+
+int nilfs_bmap_prepare_update(struct nilfs_bmap *,
+			      union nilfs_bmap_ptr_req *,
+			      union nilfs_bmap_ptr_req *);
+void nilfs_bmap_commit_update(struct nilfs_bmap *,
+			      union nilfs_bmap_ptr_req *,
+			      union nilfs_bmap_ptr_req *);
+void nilfs_bmap_abort_update(struct nilfs_bmap *,
+			     union nilfs_bmap_ptr_req *,
+			     union nilfs_bmap_ptr_req *);
+
+void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
+void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
+
+
+int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
+			 struct buffer_head **);
+void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
+int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
+			     struct buffer_head **);
+void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
+
+
+/* Assume that bmap semaphore is locked. */
+static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
+{
+	return !!(bmap->b_state & NILFS_BMAP_DIRTY);
+}
+
+/* Assume that bmap semaphore is locked. */
+static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
+{
+	bmap->b_state |= NILFS_BMAP_DIRTY;
+}
+
+/* Assume that bmap semaphore is locked. */
+static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
+{
+	bmap->b_state &= ~NILFS_BMAP_DIRTY;
+}
+
+
+#define NILFS_BMAP_LARGE	0x1
+
+#define NILFS_BMAP_SMALL_LOW	NILFS_DIRECT_KEY_MIN
+#define NILFS_BMAP_SMALL_HIGH	NILFS_DIRECT_KEY_MAX
+#define NILFS_BMAP_LARGE_LOW	NILFS_BTREE_ROOT_NCHILDREN_MAX
+#define NILFS_BMAP_LARGE_HIGH	NILFS_BTREE_KEY_MAX
+
+#endif	/* _NILFS_BMAP_H */
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
new file mode 100644
index 00000000000..d41509bff47
--- /dev/null
+++ b/fs/nilfs2/bmap_union.h
@@ -0,0 +1,42 @@
+/*
+ * bmap_union.h - NILFS block mapping.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_BMAP_UNION_H
+#define _NILFS_BMAP_UNION_H
+
+#include "bmap.h"
+#include "direct.h"
+#include "btree.h"
+
+/**
+ * union nilfs_bmap_union - storage shared by the bmap variants
+ * @bi_bmap: bmap structure
+ * @bi_direct: direct map structure
+ * @bi_btree: B-tree structure
+ */
+union nilfs_bmap_union {
+	struct nilfs_bmap bi_bmap;
+	struct nilfs_direct bi_direct;
+	struct nilfs_btree bi_btree;
+};
+
+#endif	/* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 00000000000..4cc07b2c30e
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
+/*
+ * btnode.c - NILFS B-tree node cache
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * This file was originally written by Seiji Kihara <kihara@osrg.net>
+ * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
+ * stabilization and simplification.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "dat.h"
+#include "page.h"
+#include "btnode.h"
+
+
+void nilfs_btnode_cache_init_once(struct address_space *btnc)
+{
+	/* one-time setup: the three spinlocks, then the tree/list heads */
+	spin_lock_init(&btnc->tree_lock);
+	spin_lock_init(&btnc->private_lock);
+	spin_lock_init(&btnc->i_mmap_lock);
+	INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
+	INIT_LIST_HEAD(&btnc->private_list);
+	INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
+	INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
+}
+
+static struct address_space_operations def_btnode_aops; /* all ops NULL */
+
+void nilfs_btnode_cache_init(struct address_space *btnc)
+{
+	btnc->a_ops = &def_btnode_aops;
+	btnc->backing_dev_info = &default_backing_dev_info;
+	btnc->assoc_mapping = NULL;
+	btnc->host = NULL;  /* can safely set to host inode ? */
+	btnc->flags = 0;	/* must precede the gfp mask update below */
+	mapping_set_gfp_mask(btnc, GFP_NOFS);
+}
+
+void nilfs_btnode_cache_clear(struct address_space *btnc)
+{
+	invalidate_mapping_pages(btnc, 0, -1);	/* whole index range */
+	truncate_inode_pages(btnc, 0);		/* then truncate from 0 */
+}
+
+int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
+			      sector_t pblocknr, struct buffer_head **pbh,
+			      int newblk)
+{
+	struct buffer_head *bh;
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	int err;
+
+	bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+	if (unlikely(!bh))
+		return -ENOMEM;
+
+	err = -EEXIST; /* internal code: returned when no read is submitted */
+	if (newblk) {
+		if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
+			     buffer_dirty(bh))) {
+			brelse(bh);
+			BUG();	/* a new block must not be in use already */
+		}
+		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+		bh->b_blocknr = blocknr;
+		set_buffer_mapped(bh);
+		set_buffer_uptodate(bh);	/* no read needed */
+		goto found;
+	}
+
+	if (buffer_uptodate(bh) || buffer_dirty(bh))
+		goto found;	/* already usable; err is still -EEXIST */
+
+	if (pblocknr == 0) {	/* no read address given: derive it */
+		pblocknr = blocknr;
+		if (inode->i_ino != NILFS_DAT_INO) {
+			struct inode *dat =
+				nilfs_dat_inode(NILFS_I_NILFS(inode));
+
+			/* blocknr is a virtual block number */
+			err = nilfs_dat_translate(dat, blocknr, &pblocknr);
+			if (unlikely(err)) {
+				brelse(bh);
+				goto out_locked;	/* *pbh is not set */
+			}
+		}
+	}
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {	/* recheck under the buffer lock */
+		unlock_buffer(bh);
+		err = -EEXIST; /* internal code */
+		goto found;
+	}
+	set_buffer_mapped(bh);
+	bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+	bh->b_blocknr = pblocknr; /* set block address for read */
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	bh->b_blocknr = blocknr; /* set back to the given block address */
+	err = 0;	/* read submitted, not yet complete */
+found:
+	*pbh = bh;
+
+out_locked:
+	unlock_page(bh->b_page);	/* NOTE(review): page presumably */
+	page_cache_release(bh->b_page);	/* locked+held by grab; confirm */
+	return err;
+}
+
+int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
+		     sector_t pblocknr, struct buffer_head **pbh, int newblk)
+{
+	int ret;
+
+	ret = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
+	if (ret == -EEXIST)
+		return 0;	/* internal code: no read was needed */
+	if (unlikely(ret != 0))
+		return ret;
+
+	/* a read was submitted; block until it completes */
+	wait_on_buffer(*pbh);
+	if (buffer_uptodate(*pbh))
+		return 0;
+
+	/* read failed: drop the reference (*pbh itself is left as it was) */
+	brelse(*pbh);
+	return -EIO;
+}
+
+/**
+ * nilfs_btnode_delete - delete B-tree node buffer
+ * @bh: buffer to be deleted
+ *
+ * nilfs_btnode_delete() invalidates the specified buffer and tries to delete
+ * the page including the buffer if the page is no longer dirty afterwards.
+ */
+void nilfs_btnode_delete(struct buffer_head *bh)
+{
+	struct address_space *mapping;
+	struct page *page = bh->b_page;
+	pgoff_t index = page_index(page);
+	int still_dirty;
+
+	page_cache_get(page);	/* keep the page across the forget below */
+	lock_page(page);
+	wait_on_page_writeback(page);
+
+	nilfs_forget_buffer(bh);
+	still_dirty = PageDirty(page);	/* sampled while page is locked */
+	mapping = page->mapping;
+	unlock_page(page);
+	page_cache_release(page);
+
+	if (!still_dirty && mapping)
+		invalidate_inode_pages2_range(mapping, index, index);
+}
+
+/**
+ * nilfs_btnode_prepare_change_key
+ *  prepare to move contents of the block for old key to one of new key.
+ *  the old buffer will not be removed, but might be reused for new buffer;
+ *  in that case the old page stays locked until commit or abort.
+ *  it might return -ENOMEM (allocation failure) or -EIO (disk read error).
+ */
+int nilfs_btnode_prepare_change_key(struct address_space *btnc,
+				    struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+	struct buffer_head *obh, *nbh;
+	struct inode *inode = NILFS_BTNC_I(btnc);
+	__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
+	int err;
+
+	if (oldkey == newkey)
+		return 0;
+
+	obh = ctxt->bh;
+	ctxt->newbh = NULL;	/* NULL tells commit/abort: page was rekeyed */
+
+	if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
+		lock_page(obh->b_page);
+retry:
+		/*
+		 * Preload on every attempt: each radix_tree_preload() must
+		 * be paired with exactly one radix_tree_preload_end().
+		 */
+		err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+		if (err)
+			goto failed_unlock;
+		/* BUG_ON(oldkey != obh->b_page->index); */
+		if (unlikely(oldkey != obh->b_page->index))
+			NILFS_PAGE_BUG(obh->b_page,
+				       "invalid oldkey %lld (newkey=%lld)",
+				       (unsigned long long)oldkey,
+				       (unsigned long long)newkey);
+
+		spin_lock_irq(&btnc->tree_lock);
+		err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
+		spin_unlock_irq(&btnc->tree_lock);
+		/*
+		 * Note: page->index will not change to newkey until
+		 * nilfs_btnode_commit_change_key() will be called.
+		 * To protect the page in intermediate state, the page lock
+		 * is held.
+		 */
+		radix_tree_preload_end();
+		if (!err)
+			return 0;
+		else if (err != -EEXIST)
+			goto failed_unlock;
+
+		err = invalidate_inode_pages2_range(btnc, newkey, newkey);
+		if (!err)
+			goto retry;
+		/* fallback to copy mode */
+		unlock_page(obh->b_page);
+	}
+
+	err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
+	if (likely(!err)) {
+		BUG_ON(nbh == obh);
+		ctxt->newbh = nbh;
+	}
+	return err;
+
+ failed_unlock:
+	unlock_page(obh->b_page);
+	return err;
+}
+
+/**
+ * nilfs_btnode_commit_change_key
+ *  commit the change_key operation prepared by prepare_change_key().
+ */
+void nilfs_btnode_commit_change_key(struct address_space *btnc,
+				    struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+	struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
+	__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
+	struct page *opage;
+
+	if (oldkey == newkey)
+		return;	/* prepare did nothing in this case */
+
+	if (nbh == NULL) {	/* blocksize == pagesize */
+		opage = obh->b_page;
+		if (unlikely(oldkey != opage->index))
+			NILFS_PAGE_BUG(opage,
+				       "invalid oldkey %lld (newkey=%lld)",
+				       (unsigned long long)oldkey,
+				       (unsigned long long)newkey);
+		if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
+			BUG();
+
+		spin_lock_irq(&btnc->tree_lock);
+		radix_tree_delete(&btnc->page_tree, oldkey); /* old slot */
+		radix_tree_tag_set(&btnc->page_tree, newkey,
+				   PAGECACHE_TAG_DIRTY);
+		spin_unlock_irq(&btnc->tree_lock);
+
+		opage->index = obh->b_blocknr = newkey;
+		unlock_page(opage);	/* locked since prepare */
+	} else {
+		nilfs_copy_buffer(nbh, obh);
+		nilfs_btnode_mark_dirty(nbh);
+
+		nbh->b_blocknr = newkey;
+		ctxt->bh = nbh;	/* context now refers to the copy */
+		nilfs_btnode_delete(obh); /* will decrement bh->b_count */
+	}
+}
+
+/**
+ * nilfs_btnode_abort_change_key
+ *  undo what prepare_change_key() set up, without committing the new key.
+ */
+void nilfs_btnode_abort_change_key(struct address_space *btnc,
+				   struct nilfs_btnode_chkey_ctxt *ctxt)
+{
+	if (ctxt->oldkey == ctxt->newkey)
+		return;
+
+	if (ctxt->newbh != NULL) {
+		/* copy mode: only the new buffer reference to drop */
+		brelse(ctxt->newbh);
+		return;
+	}
+	/* blocksize == pagesize: remove the slot inserted for the new key */
+	spin_lock_irq(&btnc->tree_lock);
+	radix_tree_delete(&btnc->page_tree, ctxt->newkey);
+	spin_unlock_irq(&btnc->tree_lock);
+	unlock_page(ctxt->bh->b_page);
+}
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
new file mode 100644
index 00000000000..35faa86444a
--- /dev/null
+++ b/fs/nilfs2/btnode.h
@@ -0,0 +1,58 @@
+/*
+ * btnode.h - NILFS B-tree node cache
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Seiji Kihara <kihara@osrg.net>
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#ifndef _NILFS_BTNODE_H
+#define _NILFS_BTNODE_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+
+
+struct nilfs_btnode_chkey_ctxt {
+	__u64 oldkey;			/* key the buffer has now */
+	__u64 newkey;			/* key to change it to */
+	struct buffer_head *bh;		/* buffer being rekeyed */
+	struct buffer_head *newbh;	/* buffer for newkey, or NULL */
+};
+
+void nilfs_btnode_cache_init_once(struct address_space *);
+void nilfs_btnode_cache_init(struct address_space *);
+void nilfs_btnode_cache_clear(struct address_space *);
+int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
+			      struct buffer_head **, int);
+int nilfs_btnode_get(struct address_space *, __u64, sector_t,
+		     struct buffer_head **, int);
+void nilfs_btnode_delete(struct buffer_head *);
+int nilfs_btnode_prepare_change_key(struct address_space *,
+				    struct nilfs_btnode_chkey_ctxt *);
+void nilfs_btnode_commit_change_key(struct address_space *,
+				    struct nilfs_btnode_chkey_ctxt *);
+void nilfs_btnode_abort_change_key(struct address_space *,
+				   struct nilfs_btnode_chkey_ctxt *);
+
+#define nilfs_btnode_mark_dirty(bh)	nilfs_mark_buffer_dirty(bh)
+
+
+#endif	/* _NILFS_BTNODE_H */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
new file mode 100644
index 00000000000..6b37a276729
--- /dev/null
+++ b/fs/nilfs2/btree.c
@@ -0,0 +1,2269 @@
+/*
+ * btree.c - NILFS B-tree.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/pagevec.h>
+#include "nilfs.h"
+#include "page.h"
+#include "btnode.h"
+#include "btree.h"
+#include "alloc.h"
+
+/**
+ * struct nilfs_btree_path - A path on which B-tree operations are executed
+ * @bp_bh: buffer head of node block
+ * @bp_sib_bh: buffer head of sibling node block
+ * @bp_index: index of child node
+ * @bp_oldreq: ptr end request for old ptr
+ * @bp_newreq: ptr alloc request for new ptr
+ * @bp_op: rebalance operation
+ */
+struct nilfs_btree_path {
+	struct buffer_head *bp_bh;
+	struct buffer_head *bp_sib_bh;
+	int bp_index;
+	union nilfs_bmap_ptr_req bp_oldreq;
+	union nilfs_bmap_ptr_req bp_newreq;
+	struct nilfs_btnode_chkey_ctxt bp_ctxt;	/* btnode change-key ctxt */
+	void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
+		      int, __u64 *, __u64 *);
+};
+
+/*
+ * B-tree path operations
+ */
+
+static struct kmem_cache *nilfs_btree_path_cache;
+
+int __init nilfs_btree_path_cache_init(void)
+{
+	nilfs_btree_path_cache = kmem_cache_create(
+		"nilfs2_btree_path_cache",
+		sizeof(struct nilfs_btree_path) * NILFS_BTREE_LEVEL_MAX,
+		0, 0, NULL);	/* one object = a full array of levels */
+	return nilfs_btree_path_cache ? 0 : -ENOMEM;
+}
+
+void nilfs_btree_path_cache_destroy(void)
+{
+	kmem_cache_destroy(nilfs_btree_path_cache);	/* undoes _init() */
+}
+
+static inline struct nilfs_btree_path *
+nilfs_btree_alloc_path(const struct nilfs_btree *btree)
+{
+	/* may return NULL; @btree is not consulted */
+	return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
+}
+
+static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
+					 struct nilfs_btree_path *path)
+{
+	kmem_cache_free(nilfs_btree_path_cache, path);	/* btree unused */
+}
+
+static void nilfs_btree_init_path(const struct nilfs_btree *btree,
+				  struct nilfs_btree_path *path)
+{
+	struct nilfs_btree_path *p = &path[NILFS_BTREE_LEVEL_DATA];
+	struct nilfs_btree_path *end = &path[NILFS_BTREE_LEVEL_MAX];
+
+	/* reset every level slot to the "nothing held" state */
+	for (; p < end; p++) {
+		p->bp_bh = NULL;
+		p->bp_sib_bh = NULL;
+		p->bp_index = 0;
+		p->bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+		p->bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+		p->bp_op = NULL;
+	}
+}
+
+static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
+				   struct nilfs_btree_path *path)
+{
+	struct nilfs_btree_path *p;
+	int lv = NILFS_BTREE_LEVEL_DATA;
+
+	for (; lv < NILFS_BTREE_LEVEL_MAX; lv++) {
+		p = &path[lv];
+		/* give back the node block still held at this level */
+		if (p->bp_bh != NULL) {
+			nilfs_bmap_put_block(&btree->bt_bmap, p->bp_bh);
+			p->bp_bh = NULL;
+		}
+		/* sib_bh is released or deleted by prepare or commit
+		 * operations. */
+		p->bp_sib_bh = NULL;
+		p->bp_index = 0;
+		p->bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+		p->bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
+		p->bp_op = NULL;
+	}
+}
+
+
+/*
+ * B-tree node operations
+ */
+
+static inline int
+nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
+			   const struct nilfs_btree_node *node)
+{
+	return node->bn_flags;	/* used as stored, no byte swap */
+}
+
+static inline void
+nilfs_btree_node_set_flags(struct nilfs_btree *btree,
+			   struct nilfs_btree_node *node,
+			   int flags)
+{
+	node->bn_flags = flags;	/* stored as given, no byte swap */
+}
+
+static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
+					const struct nilfs_btree_node *node)
+{
+	return node->bn_flags & NILFS_BTREE_NODE_ROOT;	/* nonzero if root */
+}
+
+static inline int
+nilfs_btree_node_get_level(const struct nilfs_btree *btree,
+			   const struct nilfs_btree_node *node)
+{
+	return node->bn_level;	/* used as stored, no byte swap */
+}
+
+static inline void
+nilfs_btree_node_set_level(struct nilfs_btree *btree,
+			   struct nilfs_btree_node *node,
+			   int level)
+{
+	node->bn_level = level;	/* stored as given, no byte swap */
+}
+
+static inline int
+nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
+			       const struct nilfs_btree_node *node)
+{
+	return le16_to_cpu(node->bn_nchildren);	/* kept as LE16 in node */
+}
+
+static inline void
+nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
+			       struct nilfs_btree_node *node,
+			       int nchildren)
+{
+	node->bn_nchildren = cpu_to_le16(nchildren);	/* kept as LE16 */
+}
+
+static inline int
+nilfs_btree_node_size(const struct nilfs_btree *btree)
+{
+	return 1 << btree->bt_bmap.b_inode->i_blkbits;	/* 2^i_blkbits */
+}
+
+static inline int
+nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
+			       const struct nilfs_btree_node *node)
+{
+	if (nilfs_btree_node_root(btree, node))
+		return NILFS_BTREE_ROOT_NCHILDREN_MIN;
+	return NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
+}
+
+static inline int
+nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
+			       const struct nilfs_btree_node *node)
+{
+	if (nilfs_btree_node_root(btree, node))
+		return NILFS_BTREE_ROOT_NCHILDREN_MAX;
+	return NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
+}
+
+static inline __le64 *
+nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
+		       const struct nilfs_btree_node *node)
+{
+	return (__le64 *)((char *)(node + 1) +	/* header, pad if !root */
+			  (nilfs_btree_node_root(btree, node) ?
+			   0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
+}
+
+static inline __le64 *
+nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
+		       const struct nilfs_btree_node *node)
+{	/* the pointer array starts after nchildren_max key slots */
+	return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
+			  nilfs_btree_node_nchildren_max(btree, node));
+}
+
+static inline __u64
+nilfs_btree_node_get_key(const struct nilfs_btree *btree,
+			 const struct nilfs_btree_node *node, int index)
+{
+	return nilfs_bmap_dkey_to_key(
+		nilfs_btree_node_dkeys(btree, node)[index]); /* unchecked */
+}
+
+static inline void
+nilfs_btree_node_set_key(struct nilfs_btree *btree,
+			 struct nilfs_btree_node *node, int index, __u64 key)
+{
+	nilfs_btree_node_dkeys(btree, node)[index] =
+		nilfs_bmap_key_to_dkey(key);	/* index is not checked */
+}
+
+static inline __u64
+nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
+			 const struct nilfs_btree_node *node,
+			 int index)
+{
+	return nilfs_bmap_dptr_to_ptr(
+		nilfs_btree_node_dptrs(btree, node)[index]); /* unchecked */
+}
+
+static inline void
+nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
+			 struct nilfs_btree_node *node,
+			 int index,
+			 __u64 ptr)
+{
+	nilfs_btree_node_dptrs(btree, node)[index] =
+		nilfs_bmap_ptr_to_dptr(ptr);	/* index is not checked */
+}
+
+static void nilfs_btree_node_init(struct nilfs_btree *btree,
+				  struct nilfs_btree_node *node,
+				  int flags, int level, int nchildren,
+				  const __u64 *keys, const __u64 *ptrs)
+{
+	__le64 *kslot, *pslot;
+	int n;
+
+	/* header first: the slot addresses below depend on the flags */
+	nilfs_btree_node_set_flags(btree, node, flags);
+	nilfs_btree_node_set_level(btree, node, level);
+	nilfs_btree_node_set_nchildren(btree, node, nchildren);
+
+	kslot = nilfs_btree_node_dkeys(btree, node);
+	pslot = nilfs_btree_node_dptrs(btree, node);
+	for (n = 0; n < nchildren; n++, kslot++, pslot++) {
+		*kslot = nilfs_bmap_key_to_dkey(keys[n]);
+		*pslot = nilfs_bmap_ptr_to_dptr(ptrs[n]);
+	}
+}
+
+/*
+ * Shift the first n entries of right onto the tail of left.
+ * Assume the buffer heads corresponding to left and right are locked.
+ */
+static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
+				       struct nilfs_btree_node *left,
+				       struct nilfs_btree_node *right,
+				       int n)
+{
+	int nleft = nilfs_btree_node_get_nchildren(btree, left);
+	int nright = nilfs_btree_node_get_nchildren(btree, right);
+	__le64 *lkeys = nilfs_btree_node_dkeys(btree, left);
+	__le64 *lptrs = nilfs_btree_node_dptrs(btree, left);
+	__le64 *rkeys = nilfs_btree_node_dkeys(btree, right);
+	__le64 *rptrs = nilfs_btree_node_dptrs(btree, right);
+
+	/* append the first n entries of right to the tail of left */
+	memcpy(&lkeys[nleft], rkeys, n * sizeof(*rkeys));
+	memcpy(&lptrs[nleft], rptrs, n * sizeof(*rptrs));
+
+	/* close the gap at the head of right */
+	memmove(rkeys, &rkeys[n], (nright - n) * sizeof(*rkeys));
+	memmove(rptrs, &rptrs[n], (nright - n) * sizeof(*rptrs));
+
+	/* publish the new child counts */
+	nilfs_btree_node_set_nchildren(btree, left, nleft + n);
+	nilfs_btree_node_set_nchildren(btree, right, nright - n);
+}
+
+/*
+ * Shift the last n entries of left onto the head of right.
+ * Assume that the buffer heads corresponding to left and right are locked.
+ */
+static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
+					struct nilfs_btree_node *left,
+					struct nilfs_btree_node *right,
+					int n)
+{
+	int nleft = nilfs_btree_node_get_nchildren(btree, left);
+	int nright = nilfs_btree_node_get_nchildren(btree, right);
+	__le64 *lkeys = nilfs_btree_node_dkeys(btree, left);
+	__le64 *lptrs = nilfs_btree_node_dptrs(btree, left);
+	__le64 *rkeys = nilfs_btree_node_dkeys(btree, right);
+	__le64 *rptrs = nilfs_btree_node_dptrs(btree, right);
+
+	/* make room for n entries at the head of right */
+	memmove(&rkeys[n], rkeys, nright * sizeof(*rkeys));
+	memmove(&rptrs[n], rptrs, nright * sizeof(*rptrs));
+
+	/* fill it with the last n entries of left */
+	memcpy(rkeys, &lkeys[nleft - n], n * sizeof(*rkeys));
+	memcpy(rptrs, &lptrs[nleft - n], n * sizeof(*rptrs));
+
+	/* publish the new child counts */
+	nilfs_btree_node_set_nchildren(btree, left, nleft - n);
+	nilfs_btree_node_set_nchildren(btree, right, nright + n);
+}
+
+/*
+ * Insert (key, ptr) at position index, shifting later entries up by one.
+ * Assume that the buffer head corresponding to node is locked.
+ */
+static void nilfs_btree_node_insert(struct nilfs_btree *btree,
+				    struct nilfs_btree_node *node,
+				    __u64 key, __u64 ptr, int index)
+{
+	__le64 *kslot = nilfs_btree_node_dkeys(btree, node) + index;
+	__le64 *pslot = nilfs_btree_node_dptrs(btree, node) + index;
+	int count = nilfs_btree_node_get_nchildren(btree, node);
+	int tail = count - index;	/* entries at or after index */
+
+	if (tail > 0) {
+		/* open a one-entry hole at index */
+		memmove(kslot + 1, kslot, tail * sizeof(*kslot));
+		memmove(pslot + 1, pslot, tail * sizeof(*pslot));
+	}
+
+	*kslot = nilfs_bmap_key_to_dkey(key);
+	*pslot = nilfs_bmap_ptr_to_dptr(ptr);
+	nilfs_btree_node_set_nchildren(btree, node, count + 1);
+}
+
+/*
+ * Remove the entry at index, optionally handing its key/ptr back.
+ * Assume that the buffer head corresponding to node is locked.
+ */
+static void nilfs_btree_node_delete(struct nilfs_btree *btree,
+				    struct nilfs_btree_node *node,
+				    __u64 *keyp, __u64 *ptrp, int index)
+{
+	__le64 *kslot = nilfs_btree_node_dkeys(btree, node) + index;
+	__le64 *pslot = nilfs_btree_node_dptrs(btree, node) + index;
+	__u64 key = nilfs_bmap_dkey_to_key(*kslot);
+	__u64 ptr = nilfs_bmap_dptr_to_ptr(*pslot);
+	int count = nilfs_btree_node_get_nchildren(btree, node);
+	int tail = count - index - 1;	/* entries after index */
+
+	/* hand the removed pair back if the caller asked for it */
+	if (keyp != NULL)
+		*keyp = key;
+	if (ptrp != NULL)
+		*ptrp = ptr;
+
+	if (tail > 0) {
+		/* close the hole left at index */
+		memmove(kslot, kslot + 1, tail * sizeof(*kslot));
+		memmove(pslot, pslot + 1, tail * sizeof(*pslot));
+	}
+
+	/* one entry fewer from here on */
+	nilfs_btree_node_set_nchildren(btree, node, count - 1);
+}
+
+static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
+				   const struct nilfs_btree_node *node,
+				   __u64 key, int *indexp)
+{
+	__u64 nkey;
+	int index, low, high, s;
+
+	/* binary search; s records how the last probed key compared */
+	low = 0;
+	high = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	index = 0;
+	s = 0;
+	while (low <= high) {
+		index = (low + high) / 2;
+		nkey = nilfs_btree_node_get_key(btree, node, index);
+		if (nkey == key) {
+			s = 0;
+			goto out;	/* exact match: index is final */
+		} else if (nkey < key) {
+			low = index + 1;
+			s = -1;	/* probed key < key */
+		} else {
+			high = index - 1;
+			s = 1;	/* probed key > key */
+		}
+	}
+
+	/* no match: pick child to descend into, or insert position */
+	if (nilfs_btree_node_get_level(btree, node) >
+	    NILFS_BTREE_LEVEL_NODE_MIN) {
+		if ((s > 0) && (index > 0))
+			index--;	/* step back to the preceding child */
+	} else if (s < 0)
+		index++;	/* slot just after the smaller key */
+
+ out:
+	*indexp = index;
+
+	return s == 0;	/* 1 on exact match (also when node is empty) */
+}
+
+static inline struct nilfs_btree_node *
+nilfs_btree_get_root(const struct nilfs_btree *btree)
+{
+	return (void *)btree->bt_bmap.b_u.u_data; /* root is in the bmap */
+}
+
+static inline struct nilfs_btree_node *
+nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
+			     const struct nilfs_btree_path *path,
+			     int level)
+{
+	return (void *)path[level].bp_bh->b_data; /* node = block data */
+}
+
+static inline struct nilfs_btree_node *
+nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
+			 const struct nilfs_btree_path *path,
+			 int level)
+{
+	return (void *)path[level].bp_sib_bh->b_data; /* sibling's data */
+}
+
+static inline int nilfs_btree_height(const struct nilfs_btree *btree)
+{
+	return 1 + nilfs_btree_node_get_level(btree,	/* root level + 1 */
+					      nilfs_btree_get_root(btree));
+}
+
+static inline struct nilfs_btree_node *
+nilfs_btree_get_node(const struct nilfs_btree *btree,
+		     const struct nilfs_btree_path *path,
+		     int level)
+{
+	if (level != nilfs_btree_height(btree) - 1)
+		return nilfs_btree_get_nonroot_node(btree, path, level);
+	return nilfs_btree_get_root(btree);	/* top level is the root */
+}
+
+static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
+				 struct nilfs_btree_path *path,
+				 __u64 key, __u64 *ptrp, int minlevel)
+{
+	struct nilfs_btree_node *node;
+	__u64 ptr;
+	int level, index, found, ret;
+
+	node = nilfs_btree_get_root(btree);
+	level = nilfs_btree_node_get_level(btree, node);
+	if ((level < minlevel) ||
+	    (nilfs_btree_node_get_nchildren(btree, node) <= 0))
+		return -ENOENT;	/* tree too shallow, or empty root */
+
+	found = nilfs_btree_node_lookup(btree, node, key, &index);
+	ptr = nilfs_btree_node_get_ptr(btree, node, index);
+	path[level].bp_bh = NULL;	/* root has no buffer head */
+	path[level].bp_index = index;
+
+	for (level--; level >= minlevel; level--) {	/* descend */
+		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
+					   &path[level].bp_bh);
+		if (ret < 0)
+			return ret;	/* path keeps the blocks got so far */
+		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
+		if (!found)
+			found = nilfs_btree_node_lookup(btree, node, key,
+							&index);
+		else
+			index = 0;	/* matched above: take first child */
+		if (index < nilfs_btree_node_nchildren_max(btree, node))
+			ptr = nilfs_btree_node_get_ptr(btree, node, index);
+		else {
+			WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
+			/* insert */
+			ptr = NILFS_BMAP_INVALID_PTR;
+		}
+		path[level].bp_index = index;
+	}
+	if (!found)
+		return -ENOENT;	/* *ptrp is left untouched */
+
+	if (ptrp != NULL)
+		*ptrp = ptr;
+
+	return 0;
+}
+
+static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
+				      struct nilfs_btree_path *path,
+				      __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node;
+	__u64 ptr;
+	int index, level, ret;
+
+	node = nilfs_btree_get_root(btree);
+	index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+	if (index < 0)
+		return -ENOENT;	/* empty root */
+	level = nilfs_btree_node_get_level(btree, node);
+	ptr = nilfs_btree_node_get_ptr(btree, node, index);
+	path[level].bp_bh = NULL;	/* root has no buffer head */
+	path[level].bp_index = index;
+
+	for (level--; level > 0; level--) {	/* follow the last child */
+		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
+					   &path[level].bp_bh);
+		if (ret < 0)
+			return ret;
+		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		BUG_ON(level != nilfs_btree_node_get_level(btree, node));
+		index = nilfs_btree_node_get_nchildren(btree, node) - 1;
+		ptr = nilfs_btree_node_get_ptr(btree, node, index);
+		path[level].bp_index = index;
+	}
+
+	if (keyp != NULL)	/* last entry of the last node visited */
+		*keyp = nilfs_btree_node_get_key(btree, node, index);
+	if (ptrp != NULL)
+		*ptrp = ptr;
+
+	return 0;
+}
+
+static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
+			      __u64 key, int level, __u64 *ptrp)
+{
+	struct nilfs_btree *btree;
+	struct nilfs_btree_path *path;
+	__u64 ptr;	/* written by do_lookup only on success */
+	int ret;
+
+	btree = (struct nilfs_btree *)bmap;
+	path = nilfs_btree_alloc_path(btree);
+	if (path == NULL)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
+	/* on failure ptr is indeterminate, so *ptrp is left untouched */
+	if (ret == 0 && ptrp != NULL)
+		*ptrp = ptr;
+
+	nilfs_btree_clear_path(btree, path);	/* put blocks held by path */
+	nilfs_btree_free_path(btree, path);
+
+	return ret;	/* 0, -ENOMEM, or the error from do_lookup */
+}
+
+static void nilfs_btree_promote_key(struct nilfs_btree *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 key)
+{
+	if (level < nilfs_btree_height(btree) - 1) {	/* below the root */
+		do {
+			lock_buffer(path[level].bp_bh);
+			nilfs_btree_node_set_key(
+				btree,
+				nilfs_btree_get_nonroot_node(
+					btree, path, level),
+				path[level].bp_index, key);
+			if (!buffer_dirty(path[level].bp_bh))
+				nilfs_btnode_mark_dirty(path[level].bp_bh);
+			unlock_buffer(path[level].bp_bh);
+		} while ((path[level].bp_index == 0) &&	/* first slot: go up */
+			 (++level < nilfs_btree_height(btree) - 1));
+	}
+
+	/* root */
+	if (level == nilfs_btree_height(btree) - 1) {
+		nilfs_btree_node_set_key(btree,
+					 nilfs_btree_get_root(btree),
+					 path[level].bp_index, key);
+	}
+}
+
+static void nilfs_btree_do_insert(struct nilfs_btree *btree,
+				  struct nilfs_btree_path *path,
+				  int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct buffer_head *bh = path[level].bp_bh;
+	struct nilfs_btree_node *node;
+
+	if (level >= nilfs_btree_height(btree) - 1) {
+		nilfs_btree_node_insert(btree, nilfs_btree_get_root(btree),
+					*keyp, *ptrp, path[level].bp_index);
+		return;	/* root node: no buffer to lock or dirty */
+	}
+
+	lock_buffer(bh);
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
+				path[level].bp_index);
+	if (!buffer_dirty(bh))
+		nilfs_btnode_mark_dirty(bh);
+	unlock_buffer(bh);
+	if (path[level].bp_index == 0)	/* new first key: tell the parent */
+		nilfs_btree_promote_key(btree, path, level + 1,
+					nilfs_btree_node_get_key(
+						btree, node, 0));
+}
+
+static void nilfs_btree_carry_left(struct nilfs_btree *btree,
+				   struct nilfs_btree_path *path,
+				   int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *left;
+	int nchildren, lnchildren, n, move;
+
+	lock_buffer(path[level].bp_bh);
+	lock_buffer(path[level].bp_sib_bh);
+
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	left = nilfs_btree_get_sib_node(btree, path, level);
+	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+	move = 0;
+
+	n = (nchildren + lnchildren + 1) / 2 - lnchildren; /* even out */
+	if (n > path[level].bp_index) {
+		/* move insert point */
+		n--;
+		move = 1;
+	}
+
+	nilfs_btree_node_move_left(btree, left, node, n);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+
+	unlock_buffer(path[level].bp_bh);
+	unlock_buffer(path[level].bp_sib_bh);
+
+	nilfs_btree_promote_key(btree, path, level + 1,	/* node's new 1st key */
+				nilfs_btree_node_get_key(btree, node, 0));
+
+	if (move) {	/* the insert now goes into the left sibling */
+		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		path[level].bp_bh = path[level].bp_sib_bh;
+		path[level].bp_sib_bh = NULL;
+		path[level].bp_index += lnchildren;
+		path[level + 1].bp_index--;
+	} else {	/* still this node; n entries left its head */
+		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		path[level].bp_sib_bh = NULL;
+		path[level].bp_index -= n;
+	}
+
+	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+}
+
+static void nilfs_btree_carry_right(struct nilfs_btree *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	int nchildren, rnchildren, n, move;
+
+	lock_buffer(path[level].bp_bh);
+	lock_buffer(path[level].bp_sib_bh);
+
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	right = nilfs_btree_get_sib_node(btree, path, level);
+	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+	move = 0;
+
+	n = (nchildren + rnchildren + 1) / 2 - rnchildren; /* even out */
+	if (n > nchildren - path[level].bp_index) {
+		/* move insert point */
+		n--;
+		move = 1;
+	}
+
+	nilfs_btree_node_move_right(btree, node, right, n);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+
+	unlock_buffer(path[level].bp_bh);
+	unlock_buffer(path[level].bp_sib_bh);
+
+	path[level + 1].bp_index++;	/* aim at the right sibling's slot */
+	nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(btree, right, 0));
+	path[level + 1].bp_index--;	/* and restore */
+
+	if (move) {	/* the insert now goes into the right sibling */
+		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		path[level].bp_bh = path[level].bp_sib_bh;
+		path[level].bp_sib_bh = NULL;
+		path[level].bp_index -=
+			nilfs_btree_node_get_nchildren(btree, node);
+		path[level + 1].bp_index++;
+	} else {
+		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		path[level].bp_sib_bh = NULL;
+	}
+
+	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+}
+
+/*
+ * nilfs_btree_split - split a node in two and insert the new entry
+ * @btree: B-tree being modified
+ * @path: lookup path; path[level].bp_bh is the node to split and
+ *        path[level].bp_sib_bh the new (empty) right node
+ * @level: level of the node
+ * @keyp: in: key to insert; out: first key of the new right node
+ * @ptrp: in: pointer to insert; out: pointer of the new right node, taken
+ *        from path[level].bp_newreq
+ *
+ * Moves about half of the entries into the new right node, inserts the
+ * entry into whichever half contains the insert point, and rewrites
+ * *keyp / *ptrp so that the operation of the next level up inserts the
+ * new right node.  The parent index is advanced by one for that purpose.
+ */
+static void nilfs_btree_split(struct nilfs_btree *btree,
+			      struct nilfs_btree_path *path,
+			      int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	int nchildren, n, move;
+
+	lock_buffer(path[level].bp_bh);
+	lock_buffer(path[level].bp_sib_bh);
+
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	right = nilfs_btree_get_sib_node(btree, path, level);
+	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	move = 0;
+
+	n = (nchildren + 1) / 2;
+	if (n > nchildren - path[level].bp_index) {
+		/* the insert point falls into the entries being moved */
+		n--;
+		move = 1;
+	}
+
+	nilfs_btree_node_move_right(btree, node, right, n);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+
+	unlock_buffer(path[level].bp_bh);
+	unlock_buffer(path[level].bp_sib_bh);
+
+	if (move) {
+		/* insert into the new right node and make it current */
+		path[level].bp_index -=
+			nilfs_btree_node_get_nchildren(btree, node);
+		nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
+					path[level].bp_index);
+
+		/* read the key only now: the insert may have changed it */
+		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*ptrp = path[level].bp_newreq.bpr_ptr;
+
+		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
+		path[level].bp_bh = path[level].bp_sib_bh;
+		path[level].bp_sib_bh = NULL;
+	} else {
+		/* insert into the original node and drop the right one */
+		nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+
+		*keyp = nilfs_btree_node_get_key(btree, right, 0);
+		*ptrp = path[level].bp_newreq.bpr_ptr;
+
+		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		path[level].bp_sib_bh = NULL;
+	}
+
+	path[level + 1].bp_index++;
+}
+
+/*
+ * nilfs_btree_grow - increase the height of the tree by one level
+ * @btree: B-tree being modified
+ * @path: lookup path; path[level].bp_sib_bh is the new child block
+ * @level: level the new child node is created at
+ * @keyp: in: key to insert; out: first key of the new child
+ * @ptrp: in: pointer to insert; out: pointer of the new child, taken from
+ *        path[level].bp_newreq
+ *
+ * Moves every entry of the root into the new child, raises the root's
+ * level to @level + 1 and inserts the entry into the child.  The child
+ * becomes the current node of @level in the path.
+ */
+static void nilfs_btree_grow(struct nilfs_btree *btree,
+			     struct nilfs_btree_path *path,
+			     int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *root, *child;
+	int n;
+
+	lock_buffer(path[level].bp_sib_bh);
+
+	root = nilfs_btree_get_root(btree);
+	child = nilfs_btree_get_sib_node(btree, path, level);
+
+	n = nilfs_btree_node_get_nchildren(btree, root);
+
+	/* empty the root into the child, then lift the root one level */
+	nilfs_btree_node_move_right(btree, root, child, n);
+	nilfs_btree_node_set_level(btree, root, level + 1);
+
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+
+	unlock_buffer(path[level].bp_sib_bh);
+
+	path[level].bp_bh = path[level].bp_sib_bh;
+	path[level].bp_sib_bh = NULL;
+
+	nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
+
+	/* hand the child itself to the operation of the next level */
+	*keyp = nilfs_btree_node_get_key(btree, child, 0);
+	*ptrp = path[level].bp_newreq.bpr_ptr;
+}
+
+/*
+ * nilfs_btree_find_near - pick a pointer stored close to the lookup position
+ * @btree: B-tree
+ * @path: lookup path, may be NULL
+ *
+ * Returns the pointer of the entry left of the current position in the
+ * lowest node level if there is one, otherwise the pointer at the current
+ * position one level up if that level exists, otherwise
+ * NILFS_BMAP_INVALID_PTR (also when @path is NULL).
+ */
+static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
+				   const struct nilfs_btree_path *path)
+{
+	const int lowest = NILFS_BTREE_LEVEL_NODE_MIN;
+	const int above = NILFS_BTREE_LEVEL_NODE_MIN + 1;
+
+	if (!path)
+		return NILFS_BMAP_INVALID_PTR;
+
+	/* left sibling */
+	if (path[lowest].bp_index > 0)
+		return nilfs_btree_node_get_ptr(
+			btree, nilfs_btree_get_node(btree, path, lowest),
+			path[lowest].bp_index - 1);
+
+	/* parent */
+	if (above > nilfs_btree_height(btree) - 1)
+		return NILFS_BMAP_INVALID_PTR;
+
+	return nilfs_btree_node_get_ptr(
+		btree, nilfs_btree_get_node(btree, path, above),
+		path[above].bp_index);
+}
+
+/*
+ * nilfs_btree_find_target_v - choose an allocation target pointer for @key
+ * @btree: B-tree
+ * @path: lookup path, passed on to nilfs_btree_find_near()
+ * @key: key about to be inserted
+ *
+ * Tries nilfs_bmap_find_target_seq(), then nilfs_btree_find_near(), and
+ * falls back to nilfs_bmap_find_target_in_group() when both return
+ * NILFS_BMAP_INVALID_PTR.
+ */
+static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
+				       const struct nilfs_btree_path *path,
+				       __u64 key)
+{
+	__u64 target;
+
+	/* sequential access */
+	target = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
+	if (target == NILFS_BMAP_INVALID_PTR) {
+		/* near */
+		target = nilfs_btree_find_near(btree, path);
+		if (target == NILFS_BMAP_INVALID_PTR) {
+			/* block group */
+			target = nilfs_bmap_find_target_in_group(
+				&btree->bt_bmap);
+		}
+	}
+
+	return target;
+}
+
+/*
+ * nilfs_btree_set_target_v - record the most recently allocated key/pointer
+ * @btree: B-tree
+ * @key: key that was just allocated
+ * @ptr: pointer that was just allocated
+ */
+static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
+				     __u64 ptr)
+{
+	struct nilfs_bmap *bmap = &btree->bt_bmap;
+
+	bmap->b_last_allocated_ptr = ptr;
+	bmap->b_last_allocated_key = key;
+}
+
+/*
+ * nilfs_btree_prepare_insert - plan an insertion and reserve its resources
+ * @btree: B-tree
+ * @path: lookup path positioned at the insert point
+ * @levelp: out: highest level an operation was scheduled for
+ * @key: key to insert
+ * @ptr: not referenced by this function
+ * @stats: out: bs_nblocks receives the number of blocks that will be added
+ *
+ * Reserves a pointer for the data block and then walks up from the lowest
+ * node level, storing in path[level].bp_op the operation to run at commit
+ * time: a plain insert if the node has room, a carry to the left or right
+ * sibling if that sibling has room, otherwise a split (which continues one
+ * level up).  At the root the choice is between a plain insert and growing
+ * the tree.
+ *
+ * Returns a non-negative value on success.  On failure the reservations
+ * made so far are aborted, new sibling blocks are deleted, bs_nblocks is
+ * reset to 0 and the negative error code is returned.
+ */
+static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
+				      struct nilfs_btree_path *path,
+				      int *levelp, __u64 key, __u64 ptr,
+				      struct nilfs_bmap_stats *stats)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree_node *node, *parent, *sib;
+	__u64 sibptr;
+	int pindex, level, ret;
+
+	stats->bs_nblocks = 0;
+	level = NILFS_BTREE_LEVEL_DATA;
+
+	/* allocate a new ptr for data block */
+	if (btree->bt_ops->btop_find_target != NULL)
+		path[level].bp_newreq.bpr_ptr =
+			btree->bt_ops->btop_find_target(btree, path, key);
+
+	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+		&btree->bt_bmap, &path[level].bp_newreq);
+	if (ret < 0)
+		goto err_out_data;
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+	     level < nilfs_btree_height(btree) - 1;
+	     level++) {
+		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		if (nilfs_btree_node_get_nchildren(btree, node) <
+		    nilfs_btree_node_nchildren_max(btree, node)) {
+			/* room left in this node: insert and stop here */
+			path[level].bp_op = nilfs_btree_do_insert;
+			stats->bs_nblocks++;
+			goto out;
+		}
+
+		parent = nilfs_btree_get_node(btree, path, level + 1);
+		pindex = path[level + 1].bp_index;
+
+		/* left sibling */
+		if (pindex > 0) {
+			sibptr = nilfs_btree_node_get_ptr(btree, parent,
+							  pindex - 1);
+			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+						   &bh);
+			if (ret < 0)
+				goto err_out_child_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(btree, sib) <
+			    nilfs_btree_node_nchildren_max(btree, sib)) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_carry_left;
+				stats->bs_nblocks++;
+				goto out;
+			} else
+				nilfs_bmap_put_block(&btree->bt_bmap, bh);
+		}
+
+		/* right sibling */
+		if (pindex <
+		    nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+			sibptr = nilfs_btree_node_get_ptr(btree, parent,
+							  pindex + 1);
+			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+						   &bh);
+			if (ret < 0)
+				goto err_out_child_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(btree, sib) <
+			    nilfs_btree_node_nchildren_max(btree, sib)) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_carry_right;
+				stats->bs_nblocks++;
+				goto out;
+			} else
+				nilfs_bmap_put_block(&btree->bt_bmap, bh);
+		}
+
+		/* split */
+		/* seed the request with the pointer after the lower level's */
+		path[level].bp_newreq.bpr_ptr =
+			path[level - 1].bp_newreq.bpr_ptr + 1;
+		ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+			&btree->bt_bmap, &path[level].bp_newreq);
+		if (ret < 0)
+			goto err_out_child_node;
+		ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+					       path[level].bp_newreq.bpr_ptr,
+					       &bh);
+		if (ret < 0)
+			goto err_out_curr_node;
+
+		stats->bs_nblocks++;
+
+		/* the new sibling starts out as an empty node of this level */
+		lock_buffer(bh);
+		nilfs_btree_node_init(btree,
+				      (struct nilfs_btree_node *)bh->b_data,
+				      0, level, 0, NULL, NULL);
+		unlock_buffer(bh);
+		path[level].bp_sib_bh = bh;
+		path[level].bp_op = nilfs_btree_split;
+	}
+
+	/* root */
+	node = nilfs_btree_get_root(btree);
+	if (nilfs_btree_node_get_nchildren(btree, node) <
+	    nilfs_btree_node_nchildren_max(btree, node)) {
+		path[level].bp_op = nilfs_btree_do_insert;
+		stats->bs_nblocks++;
+		goto out;
+	}
+
+	/* grow */
+	path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
+	ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
+		&btree->bt_bmap, &path[level].bp_newreq);
+	if (ret < 0)
+		goto err_out_child_node;
+	ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
+				       path[level].bp_newreq.bpr_ptr, &bh);
+	if (ret < 0)
+		goto err_out_curr_node;
+
+	lock_buffer(bh);
+	nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
+			      0, level, 0, NULL, NULL);
+	unlock_buffer(bh);
+	path[level].bp_sib_bh = bh;
+	path[level].bp_op = nilfs_btree_grow;
+
+	/* the raised root receives the new child through a plain insert */
+	level++;
+	path[level].bp_op = nilfs_btree_do_insert;
+
+	/* a newly-created node block and a data block are added */
+	stats->bs_nblocks += 2;
+
+	/* success */
+ out:
+	*levelp = level;
+	return ret;
+
+	/* error */
+ err_out_curr_node:
+	/* the pointer of the current level was reserved but has no block */
+	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
+						    &path[level].bp_newreq);
+ err_out_child_node:
+	/* every lower node level was scheduled for a split: undo them */
+	for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
+		nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
+			&btree->bt_bmap, &path[level].bp_newreq);
+
+	}
+
+	/* finally release the reservation made for the data block */
+	btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
+						       &path[level].bp_newreq);
+ err_out_data:
+	*levelp = level;
+	stats->bs_nblocks = 0;
+	return ret;
+}
+
+/*
+ * nilfs_btree_commit_insert - carry out an insertion that was prepared
+ * @btree: B-tree
+ * @path: lookup path filled in by nilfs_btree_prepare_insert()
+ * @maxlevel: highest level that has an operation scheduled
+ * @key: key to insert
+ * @ptr: on entry a buffer head pointer carried in an integer; it is marked
+ *       volatile and then replaced by the pointer reserved for the data
+ *       block
+ *
+ * Runs the scheduled operation of every level from the lowest node level
+ * up to @maxlevel, committing the pointer reserved one level below first,
+ * and marks the bmap dirty.
+ */
+static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
+				      struct nilfs_btree_path *path,
+				      int maxlevel, __u64 key, __u64 ptr)
+{
+	struct nilfs_bmap *bmap = &btree->bt_bmap;
+	struct buffer_head *data_bh = (struct buffer_head *)((unsigned long)ptr);
+	int lv = NILFS_BTREE_LEVEL_NODE_MIN;
+
+	set_buffer_nilfs_volatile(data_bh);
+
+	ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
+	if (btree->bt_ops->btop_set_target)
+		btree->bt_ops->btop_set_target(btree, key, ptr);
+
+	while (lv <= maxlevel) {
+		if (bmap->b_pops->bpop_commit_alloc_ptr)
+			bmap->b_pops->bpop_commit_alloc_ptr(
+				bmap, &path[lv - 1].bp_newreq);
+		/* each operation may rewrite key/ptr for the next level */
+		path[lv].bp_op(btree, path, lv, &key, &ptr);
+		lv++;
+	}
+
+	if (!nilfs_bmap_dirty(bmap))
+		nilfs_bmap_set_dirty(bmap);
+}
+
+/*
+ * nilfs_btree_insert - insert a key/pointer pair into the B-tree
+ * @bmap: bmap, cast to the enclosing B-tree
+ * @key: key to insert
+ * @ptr: pointer to insert
+ *
+ * Returns -ENOMEM if no path can be allocated, -EEXIST if @key is already
+ * present, another negative value if the lookup or the preparation fails,
+ * and otherwise the result of nilfs_btree_prepare_insert().
+ */
+static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct nilfs_bmap_stats stats;
+	int level, ret;
+
+	path = nilfs_btree_alloc_path(btree);
+	if (!path)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
+				    NILFS_BTREE_LEVEL_NODE_MIN);
+	if (ret == 0) {
+		/* the key is already mapped */
+		ret = -EEXIST;
+	} else if (ret == -ENOENT) {
+		ret = nilfs_btree_prepare_insert(btree, path, &level, key,
+						 ptr, &stats);
+		if (ret >= 0) {
+			nilfs_btree_commit_insert(btree, path, level, key,
+						  ptr);
+			nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+		}
+	}
+
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+	return ret;
+}
+
+/*
+ * nilfs_btree_do_delete - remove the entry at the current path position
+ * @btree: B-tree
+ * @path: lookup path
+ * @level: level to delete from
+ * @keyp: passed through to nilfs_btree_node_delete()
+ * @ptrp: passed through to nilfs_btree_node_delete()
+ *
+ * For a non-root node the buffer is locked around the change and marked
+ * dirty, and if the first entry was removed the node's new first key is
+ * promoted to the parent.  The root is modified directly.
+ */
+static void nilfs_btree_do_delete(struct nilfs_btree *btree,
+				  struct nilfs_btree_path *path,
+				  int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node;
+	struct buffer_head *node_bh;
+
+	if (level >= nilfs_btree_height(btree) - 1) {
+		/* root node */
+		node = nilfs_btree_get_root(btree);
+		nilfs_btree_node_delete(btree, node, keyp, ptrp,
+					path[level].bp_index);
+		return;
+	}
+
+	node_bh = path[level].bp_bh;
+
+	lock_buffer(node_bh);
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	nilfs_btree_node_delete(btree, node, keyp, ptrp,
+				path[level].bp_index);
+	if (!buffer_dirty(node_bh))
+		nilfs_btnode_mark_dirty(node_bh);
+	unlock_buffer(node_bh);
+
+	if (path[level].bp_index == 0)
+		nilfs_btree_promote_key(btree, path, level + 1,
+			nilfs_btree_node_get_key(btree, node, 0));
+}
+
+/*
+ * nilfs_btree_borrow_left - delete an entry, then refill from the left sibling
+ * @btree: B-tree
+ * @path: lookup path; path[level].bp_sib_bh is the left sibling
+ * @level: level of the node
+ * @keyp: passed through to nilfs_btree_do_delete()
+ * @ptrp: passed through to nilfs_btree_do_delete()
+ *
+ * After the deletion, moves entries from the end of the left sibling into
+ * the node so that both hold about half of their combined entries,
+ * promotes the node's new first key to the parent and releases the sibling.
+ */
+static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *left;
+	int nchildren, lnchildren, n;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	lock_buffer(path[level].bp_bh);
+	lock_buffer(path[level].bp_sib_bh);
+
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	left = nilfs_btree_get_sib_node(btree, path, level);
+	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	lnchildren = nilfs_btree_node_get_nchildren(btree, left);
+
+	/* entries needed to bring the node up to half of the total */
+	n = (nchildren + lnchildren) / 2 - nchildren;
+
+	nilfs_btree_node_move_right(btree, left, node, n);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+
+	unlock_buffer(path[level].bp_bh);
+	unlock_buffer(path[level].bp_sib_bh);
+
+	nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(btree, node, 0));
+
+	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	path[level].bp_sib_bh = NULL;
+	/* n entries were added in front of the current position */
+	path[level].bp_index += n;
+}
+
+/*
+ * nilfs_btree_borrow_right - delete an entry, then refill from the right
+ *                            sibling
+ * @btree: B-tree
+ * @path: lookup path; path[level].bp_sib_bh is the right sibling
+ * @level: level of the node
+ * @keyp: passed through to nilfs_btree_do_delete()
+ * @ptrp: passed through to nilfs_btree_do_delete()
+ *
+ * After the deletion, moves entries from the front of the right sibling
+ * into the node so that both hold about half of their combined entries,
+ * promotes the sibling's new first key to the parent and releases the
+ * sibling.
+ */
+static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
+				     struct nilfs_btree_path *path,
+				     int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	int nchildren, rnchildren, n;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	lock_buffer(path[level].bp_bh);
+	lock_buffer(path[level].bp_sib_bh);
+
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	right = nilfs_btree_get_sib_node(btree, path, level);
+	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	rnchildren = nilfs_btree_node_get_nchildren(btree, right);
+
+	/* entries needed to bring the node up to half of the total */
+	n = (nchildren + rnchildren) / 2 - nchildren;
+
+	nilfs_btree_node_move_left(btree, node, right, n);
+
+	if (!buffer_dirty(path[level].bp_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_bh);
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+
+	unlock_buffer(path[level].bp_bh);
+	unlock_buffer(path[level].bp_sib_bh);
+
+	/*
+	 * Promote the sibling's new first key with the parent index
+	 * temporarily advanced to the slot right of the current one.
+	 */
+	path[level + 1].bp_index++;
+	nilfs_btree_promote_key(btree, path, level + 1,
+				nilfs_btree_node_get_key(btree, right, 0));
+	path[level + 1].bp_index--;
+
+	nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	path[level].bp_sib_bh = NULL;
+}
+
+/*
+ * nilfs_btree_concat_left - delete an entry, then merge into the left sibling
+ * @btree: B-tree
+ * @path: lookup path; path[level].bp_sib_bh is the left sibling
+ * @level: level of the node
+ * @keyp: passed through to nilfs_btree_do_delete()
+ * @ptrp: passed through to nilfs_btree_do_delete()
+ *
+ * After the deletion, moves all remaining entries of the node into the
+ * left sibling, deletes the node's block and makes the sibling the current
+ * node of @level in the path.
+ */
+static void nilfs_btree_concat_left(struct nilfs_btree *btree,
+				    struct nilfs_btree_path *path,
+				    int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *left;
+	int n;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	lock_buffer(path[level].bp_bh);
+	lock_buffer(path[level].bp_sib_bh);
+
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	left = nilfs_btree_get_sib_node(btree, path, level);
+
+	n = nilfs_btree_node_get_nchildren(btree, node);
+
+	nilfs_btree_node_move_left(btree, left, node, n);
+
+	/* only the sibling is marked dirty; the node's block is deleted */
+	if (!buffer_dirty(path[level].bp_sib_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+
+	unlock_buffer(path[level].bp_bh);
+	unlock_buffer(path[level].bp_sib_bh);
+
+	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	path[level].bp_bh = path[level].bp_sib_bh;
+	path[level].bp_sib_bh = NULL;
+	/* NOTE(review): adds the sibling's count after the merge - confirm */
+	path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
+}
+
+/*
+ * nilfs_btree_concat_right - delete an entry, then absorb the right sibling
+ * @btree: B-tree
+ * @path: lookup path; path[level].bp_sib_bh is the right sibling
+ * @level: level of the node
+ * @keyp: passed through to nilfs_btree_do_delete()
+ * @ptrp: passed through to nilfs_btree_do_delete()
+ *
+ * After the deletion, moves all entries of the right sibling into the
+ * node, deletes the sibling's block and advances the parent index by one
+ * so that the operation of the next level up acts on the sibling's slot.
+ */
+static void nilfs_btree_concat_right(struct nilfs_btree *btree,
+				     struct nilfs_btree_path *path,
+				     int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *node, *right;
+	int n;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	lock_buffer(path[level].bp_bh);
+	lock_buffer(path[level].bp_sib_bh);
+
+	node = nilfs_btree_get_nonroot_node(btree, path, level);
+	right = nilfs_btree_get_sib_node(btree, path, level);
+
+	n = nilfs_btree_node_get_nchildren(btree, right);
+
+	nilfs_btree_node_move_left(btree, node, right, n);
+
+	/* only the node is marked dirty; the sibling's block is deleted */
+	if (!buffer_dirty(path[level].bp_bh))
+		nilfs_btnode_mark_dirty(path[level].bp_bh);
+
+	unlock_buffer(path[level].bp_bh);
+	unlock_buffer(path[level].bp_sib_bh);
+
+	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
+	path[level].bp_sib_bh = NULL;
+	path[level + 1].bp_index++;
+}
+
+/*
+ * nilfs_btree_shrink - delete an entry, then reduce the tree height by one
+ * @btree: B-tree
+ * @path: lookup path; path[level].bp_bh is the child being folded into
+ *        the root
+ * @level: level of that child
+ * @keyp: passed through to nilfs_btree_do_delete()
+ * @ptrp: passed through to nilfs_btree_do_delete()
+ *
+ * After the deletion, removes the root's entry at index 0, lowers the
+ * root's level to @level, moves all entries of the child into the root
+ * and deletes the child's block.
+ */
+static void nilfs_btree_shrink(struct nilfs_btree *btree,
+			       struct nilfs_btree_path *path,
+			       int level, __u64 *keyp, __u64 *ptrp)
+{
+	struct nilfs_btree_node *root, *child;
+	int n;
+
+	nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
+
+	lock_buffer(path[level].bp_bh);
+	root = nilfs_btree_get_root(btree);
+	child = nilfs_btree_get_nonroot_node(btree, path, level);
+
+	/* drop the root's reference to the child before taking it over */
+	nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
+	nilfs_btree_node_set_level(btree, root, level);
+	n = nilfs_btree_node_get_nchildren(btree, child);
+	nilfs_btree_node_move_left(btree, root, child, n);
+	unlock_buffer(path[level].bp_bh);
+
+	nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
+	path[level].bp_bh = NULL;
+}
+
+
+/*
+ * nilfs_btree_prepare_delete - plan a deletion and reserve its resources
+ * @btree: B-tree
+ * @path: lookup path positioned at the entry to delete
+ * @levelp: out: highest level an operation was scheduled for
+ * @stats: out: bs_nblocks receives the number of blocks that will go away
+ *
+ * Walks up from the lowest node level.  For every level the pointer at the
+ * current position is recorded in bp_oldreq and, if the bmap provides
+ * bpop_prepare_end_ptr, prepared for release.  path[level].bp_op is set to
+ * a plain delete if the node stays above its minimum, to a borrow if a
+ * sibling can spare entries, to a concatenation otherwise (which continues
+ * one level up), or to a shrink for the only child of the root when its
+ * remaining entries fit into the root.
+ *
+ * Returns a non-negative value on success.  On failure the prepared
+ * requests are aborted, sibling buffers are released, bs_nblocks is reset
+ * to 0 and the negative error code is returned.
+ */
+static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
+				      struct nilfs_btree_path *path,
+				      int *levelp,
+				      struct nilfs_bmap_stats *stats)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree_node *node, *parent, *sib;
+	__u64 sibptr;
+	int pindex, level, ret;
+
+	ret = 0;
+	stats->bs_nblocks = 0;
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+	     level < nilfs_btree_height(btree) - 1;
+	     level++) {
+		node = nilfs_btree_get_nonroot_node(btree, path, level);
+		path[level].bp_oldreq.bpr_ptr =
+			nilfs_btree_node_get_ptr(btree, node,
+						 path[level].bp_index);
+		if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
+			ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
+				&btree->bt_bmap, &path[level].bp_oldreq);
+			if (ret < 0)
+				goto err_out_child_node;
+		}
+
+		if (nilfs_btree_node_get_nchildren(btree, node) >
+		    nilfs_btree_node_nchildren_min(btree, node)) {
+			/* the node stays sufficiently filled: stop here */
+			path[level].bp_op = nilfs_btree_do_delete;
+			stats->bs_nblocks++;
+			goto out;
+		}
+
+		parent = nilfs_btree_get_node(btree, path, level + 1);
+		pindex = path[level + 1].bp_index;
+
+		if (pindex > 0) {
+			/* left sibling */
+			sibptr = nilfs_btree_node_get_ptr(btree, parent,
+							  pindex - 1);
+			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+						   &bh);
+			if (ret < 0)
+				goto err_out_curr_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(btree, sib) >
+			    nilfs_btree_node_nchildren_min(btree, sib)) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_borrow_left;
+				stats->bs_nblocks++;
+				goto out;
+			} else {
+				/* merging removes a parent entry as well */
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_concat_left;
+				stats->bs_nblocks++;
+				/* continue; */
+			}
+		} else if (pindex <
+			   nilfs_btree_node_get_nchildren(btree, parent) - 1) {
+			/* right sibling */
+			sibptr = nilfs_btree_node_get_ptr(btree, parent,
+							  pindex + 1);
+			ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
+						   &bh);
+			if (ret < 0)
+				goto err_out_curr_node;
+			sib = (struct nilfs_btree_node *)bh->b_data;
+			if (nilfs_btree_node_get_nchildren(btree, sib) >
+			    nilfs_btree_node_nchildren_min(btree, sib)) {
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_borrow_right;
+				stats->bs_nblocks++;
+				goto out;
+			} else {
+				/* merging removes a parent entry as well */
+				path[level].bp_sib_bh = bh;
+				path[level].bp_op = nilfs_btree_concat_right;
+				stats->bs_nblocks++;
+				/* continue; */
+			}
+		} else {
+			/* no siblings */
+			/* the only child of the root node */
+			WARN_ON(level != nilfs_btree_height(btree) - 2);
+			if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
+			    NILFS_BTREE_ROOT_NCHILDREN_MAX) {
+				/* what remains fits into the root */
+				path[level].bp_op = nilfs_btree_shrink;
+				stats->bs_nblocks += 2;
+			} else {
+				path[level].bp_op = nilfs_btree_do_delete;
+				stats->bs_nblocks++;
+			}
+
+			goto out;
+
+		}
+	}
+
+	node = nilfs_btree_get_root(btree);
+	path[level].bp_oldreq.bpr_ptr =
+		nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
+	if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
+		ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
+			&btree->bt_bmap, &path[level].bp_oldreq);
+		if (ret < 0)
+			goto err_out_child_node;
+	}
+	/* child of the root node is deleted */
+	path[level].bp_op = nilfs_btree_do_delete;
+	stats->bs_nblocks++;
+
+	/* success */
+ out:
+	*levelp = level;
+	return ret;
+
+	/* error */
+ err_out_curr_node:
+	/* the current level was prepared but has no sibling buffer yet */
+	if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
+		btree->bt_bmap.b_pops->bpop_abort_end_ptr(
+			&btree->bt_bmap, &path[level].bp_oldreq);
+ err_out_child_node:
+	/* every lower level was scheduled for a concatenation: undo them */
+	for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
+		nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
+		if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
+			btree->bt_bmap.b_pops->bpop_abort_end_ptr(
+				&btree->bt_bmap, &path[level].bp_oldreq);
+	}
+	*levelp = level;
+	stats->bs_nblocks = 0;
+	return ret;
+}
+
+/*
+ * nilfs_btree_commit_delete - carry out a deletion that was prepared
+ * @btree: B-tree
+ * @path: lookup path filled in by nilfs_btree_prepare_delete()
+ * @maxlevel: highest level that has an operation scheduled
+ *
+ * For every level from the lowest node level up to @maxlevel, commits the
+ * release of the old pointer (if the bmap provides bpop_commit_end_ptr)
+ * and runs the scheduled operation; finally marks the bmap dirty.
+ */
+static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
+				      struct nilfs_btree_path *path,
+				      int maxlevel)
+{
+	struct nilfs_bmap *bmap = &btree->bt_bmap;
+	int lv = NILFS_BTREE_LEVEL_NODE_MIN;
+
+	while (lv <= maxlevel) {
+		if (bmap->b_pops->bpop_commit_end_ptr)
+			bmap->b_pops->bpop_commit_end_ptr(
+				bmap, &path[lv].bp_oldreq);
+		path[lv].bp_op(btree, path, lv, NULL, NULL);
+		lv++;
+	}
+
+	if (!nilfs_bmap_dirty(bmap))
+		nilfs_bmap_set_dirty(bmap);
+}
+
+/*
+ * nilfs_btree_delete - remove the entry for @key from the B-tree
+ * @bmap: bmap, cast to the enclosing B-tree
+ * @key: key to delete
+ *
+ * Returns -ENOMEM if no path can be allocated, the negative error of the
+ * lookup or of the preparation if either fails, and otherwise the result
+ * of nilfs_btree_prepare_delete().
+ */
+static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct nilfs_bmap_stats stats;
+	int level, ret;
+
+	path = nilfs_btree_alloc_path(btree);
+	if (!path)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL,
+				    NILFS_BTREE_LEVEL_NODE_MIN);
+	if (ret >= 0) {
+		ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
+		if (ret >= 0) {
+			nilfs_btree_commit_delete(btree, path, level);
+			nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+		}
+	}
+
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+	return ret;
+}
+
+/*
+ * nilfs_btree_last_key - look up the last key of the B-tree
+ * @bmap: bmap, cast to the enclosing B-tree
+ * @keyp: passed to nilfs_btree_do_lookup_last() to receive the key
+ *
+ * Returns -ENOMEM if no path can be allocated, otherwise the result of
+ * nilfs_btree_do_lookup_last().
+ */
+static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path = nilfs_btree_alloc_path(btree);
+	int err;
+
+	if (!path)
+		return -ENOMEM;
+
+	nilfs_btree_init_path(btree, path);
+	err = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+
+	return err;
+}
+
+/*
+ * nilfs_btree_check_delete - test @key against the largest keys of the tree
+ * @bmap: bmap, cast to the enclosing B-tree
+ * @key: key about to be deleted
+ *
+ * Only trees of height 2, or of height 3 whose root has at most one child,
+ * are examined; any other shape yields 0.  In the examined node the result
+ * is non-zero when @key equals the largest key and the second largest key
+ * (0 if there is none) is below bmap->b_low.  A negative error code is
+ * returned if the child block cannot be read.
+ */
+static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_node *root = nilfs_btree_get_root(btree);
+	struct nilfs_btree_node *node;
+	struct buffer_head *bh = NULL;
+	__u64 maxkey, nextmaxkey, ptr;
+	int height, nchildren, ret;
+
+	height = nilfs_btree_height(btree);
+	if (height == 2) {
+		/* the root itself holds the lowest-level entries */
+		node = root;
+	} else if (height == 3) {
+		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		if (nchildren > 1)
+			return 0;
+		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
+		ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+		if (ret < 0)
+			return ret;
+		node = (struct nilfs_btree_node *)bh->b_data;
+	} else {
+		return 0;
+	}
+
+	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
+	nextmaxkey = 0;
+	if (nchildren > 1)
+		nextmaxkey = nilfs_btree_node_get_key(btree, node,
+						      nchildren - 2);
+	if (bh)
+		nilfs_bmap_put_block(bmap, bh);
+
+	return (maxkey == key) && (nextmaxkey < bmap->b_low);
+}
+
+/*
+ * nilfs_btree_gather_data - copy the entries of a small tree into arrays
+ * @bmap: bmap, cast to the enclosing B-tree
+ * @keys: out: receives the keys
+ * @ptrs: out: receives the pointers
+ * @nitems: capacity of @keys and @ptrs
+ *
+ * Reads from the root when the height is 2 and from the root's last child
+ * when the height is 3; other heights are rejected with -EINVAL.  At most
+ * @nitems entries are converted from on-disk format and copied.
+ *
+ * Returns the number of entries copied, or a negative error code.
+ */
+static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
+				   __u64 *keys, __u64 *ptrs, int nitems)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree *btree;
+	struct nilfs_btree_node *node, *root;
+	__le64 *dkeys;
+	__le64 *dptrs;
+	__u64 ptr;
+	int nchildren, i, ret;
+
+	btree = (struct nilfs_btree *)bmap;
+	root = nilfs_btree_get_root(btree);
+	switch (nilfs_btree_height(btree)) {
+	case 2:
+		bh = NULL;
+		node = root;
+		break;
+	case 3:
+		nchildren = nilfs_btree_node_get_nchildren(btree, root);
+		/* more than one child is unexpected here, but only warned on */
+		WARN_ON(nchildren > 1);
+		ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
+		ret = nilfs_bmap_get_block(bmap, ptr, &bh);
+		if (ret < 0)
+			return ret;
+		node = (struct nilfs_btree_node *)bh->b_data;
+		break;
+	default:
+		node = NULL;
+		return -EINVAL;
+	}
+
+	nchildren = nilfs_btree_node_get_nchildren(btree, node);
+	/* never copy more than the node actually holds */
+	if (nchildren < nitems)
+		nitems = nchildren;
+	dkeys = nilfs_btree_node_dkeys(btree, node);
+	dptrs = nilfs_btree_node_dptrs(btree, node);
+	for (i = 0; i < nitems; i++) {
+		keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
+		ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
+	}
+
+	if (bh != NULL)
+		nilfs_bmap_put_block(bmap, bh);
+
+	return nitems;
+}
+
+/*
+ * nilfs_btree_prepare_convert_and_insert - reserve resources for a conversion
+ * @bmap: bmap being converted
+ * @key: key that will be inserted
+ * @dreq: allocation request for the data block
+ * @nreq: allocation request for a child node block, or NULL if none is
+ *        needed
+ * @bhp: out: buffer head of the new child node block, NULL if @nreq is NULL
+ * @stats: out: bs_nblocks receives the number of blocks that will be added
+ *
+ * Returns 0 on success.  On failure every reservation made here is
+ * aborted, bs_nblocks is reset to 0 and the negative error code is
+ * returned.
+ */
+static int
+nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
+				       union nilfs_bmap_ptr_req *dreq,
+				       union nilfs_bmap_ptr_req *nreq,
+				       struct buffer_head **bhp,
+				       struct nilfs_bmap_stats *stats)
+{
+	struct buffer_head *bh;
+	struct nilfs_btree *btree;
+	int ret;
+
+	btree = (struct nilfs_btree *)bmap;
+	stats->bs_nblocks = 0;
+
+	/* for data */
+	/* cannot find near ptr */
+	if (btree->bt_ops->btop_find_target != NULL)
+		dreq->bpr_ptr
+			= btree->bt_ops->btop_find_target(btree, NULL, key);
+	ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
+	if (ret < 0)
+		return ret;
+
+	*bhp = NULL;
+	stats->bs_nblocks++;
+	if (nreq != NULL) {
+		/* seed the node request with the pointer after the data's */
+		nreq->bpr_ptr = dreq->bpr_ptr + 1;
+		ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
+		if (ret < 0)
+			goto err_out_dreq;
+
+		ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
+		if (ret < 0)
+			goto err_out_nreq;
+
+		*bhp = bh;
+		stats->bs_nblocks++;
+	}
+
+	/* success */
+	return 0;
+
+	/* error */
+ err_out_nreq:
+	bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
+ err_out_dreq:
+	bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
+	stats->bs_nblocks = 0;
+	return ret;
+
+}
+
+/*
+ * nilfs_btree_commit_convert_and_insert - build the B-tree and insert @key
+ * @bmap: bmap being converted
+ * @key: key to insert
+ * @ptr: buffer head pointer carried in an integer; it is marked volatile
+ * @keys: keys of the existing entries
+ * @ptrs: pointers of the existing entries
+ * @n: number of existing entries
+ * @low: passed to nilfs_btree_init()
+ * @high: passed to nilfs_btree_init()
+ * @dreq: prepared allocation request for the data block
+ * @nreq: prepared allocation request for a child node, or NULL
+ * @bh: buffer head of the new child node block (used only if @nreq is set)
+ *
+ * With @nreq the entries go into a level 1 child node in @bh and the root
+ * becomes a level 2 node with that child as its single entry; without it
+ * they go directly into a level 1 root.  The new entry is inserted at
+ * index @n in either case.
+ */
+static void
+nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
+				      __u64 key, __u64 ptr,
+				      const __u64 *keys, const __u64 *ptrs,
+				      int n, __u64 low, __u64 high,
+				      union nilfs_bmap_ptr_req *dreq,
+				      union nilfs_bmap_ptr_req *nreq,
+				      struct buffer_head *bh)
+{
+	struct nilfs_btree *btree;
+	struct nilfs_btree_node *node;
+	__u64 tmpptr;
+
+	/* free resources */
+	if (bmap->b_ops->bop_clear != NULL)
+		bmap->b_ops->bop_clear(bmap);
+
+	/* ptr must be a pointer to a buffer head. */
+	set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
+
+	/* convert and insert */
+	btree = (struct nilfs_btree *)bmap;
+	nilfs_btree_init(bmap, low, high);
+	if (nreq != NULL) {
+		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
+			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
+			bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
+		}
+
+		/* create child node at level 1 */
+		lock_buffer(bh);
+		node = (struct nilfs_btree_node *)bh->b_data;
+		nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
+		nilfs_btree_node_insert(btree, node,
+					key, dreq->bpr_ptr, n);
+		if (!buffer_dirty(bh))
+			nilfs_btnode_mark_dirty(bh);
+		if (!nilfs_bmap_dirty(bmap))
+			nilfs_bmap_set_dirty(bmap);
+
+		unlock_buffer(bh);
+		nilfs_bmap_put_block(bmap, bh);
+
+		/* create root node at level 2 */
+		node = nilfs_btree_get_root(btree);
+		/* the root's only entry: first existing key -> child node */
+		tmpptr = nreq->bpr_ptr;
+		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
+				      2, 1, &keys[0], &tmpptr);
+	} else {
+		if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
+			bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
+
+		/* create root node at level 1 */
+		node = nilfs_btree_get_root(btree);
+		nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
+				      1, n, keys, ptrs);
+		nilfs_btree_node_insert(btree, node,
+					key, dreq->bpr_ptr, n);
+		if (!nilfs_bmap_dirty(bmap))
+			nilfs_bmap_set_dirty(bmap);
+	}
+
+	if (btree->bt_ops->btop_set_target != NULL)
+		btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
+}
+
+/**
+ * nilfs_btree_convert_and_insert - convert a bmap to a B-tree and insert a key
+ * @bmap: bmap to convert
+ * @key: key of the entry to insert
+ * @ptr: pointer of the entry to insert; treated as a buffer head pointer
+ *       by the commit step
+ * @keys: keys of the entries the bmap already holds
+ * @ptrs: pointers of the entries the bmap already holds
+ * @n: number of entries in @keys and @ptrs
+ * @low: passed to nilfs_btree_init()
+ * @high: passed to nilfs_btree_init()
+ *
+ * If @n + 1 entries fit into the root, only a data pointer is allocated.
+ * If they fit into a single non-root node, a child node block is allocated
+ * as well.  Larger values of @n trigger BUG().
+ *
+ * Return: 0 on success, or the negative error code of the preparation step.
+ */
+int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
+				   __u64 key, __u64 ptr,
+				   const __u64 *keys, const __u64 *ptrs,
+				   int n, __u64 low, __u64 high)
+{
+	struct buffer_head *bh;
+	union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
+	struct nilfs_bmap_stats stats;
+	int ret;
+
+	if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
+		/* everything fits into the root: no child node needed */
+		di = &dreq;
+		ni = NULL;
+	} else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
+			   1 << bmap->b_inode->i_blkbits)) {
+		/* everything fits into one child node below the root */
+		di = &dreq;
+		ni = &nreq;
+	} else {
+		di = NULL;
+		ni = NULL;
+		BUG();
+	}
+
+	ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
+						     &stats);
+	if (ret < 0)
+		return ret;
+	nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
+					      low, high, di, ni, bh);
+	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+	return 0;
+}
+
+/*
+ * nilfs_btree_propagate_p - mark the clean ancestors of a block dirty
+ * @btree: B-tree
+ * @path: lookup path
+ * @level: level of the block the propagation starts from
+ * @bh: not referenced by this function
+ *
+ * Walks up the non-root levels above @level and marks each node buffer
+ * dirty, stopping at the first one that is already dirty.  Always
+ * returns 0.
+ */
+static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
+				   struct nilfs_btree_path *path,
+				   int level,
+				   struct buffer_head *bh)
+{
+	for (level++; level < nilfs_btree_height(btree) - 1; level++) {
+		struct buffer_head *node_bh = path[level].bp_bh;
+
+		if (buffer_dirty(node_bh))
+			break;
+		nilfs_btnode_mark_dirty(node_bh);
+	}
+
+	return 0;
+}
+
+/*
+ * nilfs_btree_prepare_update_v - prepare to give a block a new pointer
+ * @btree: B-tree
+ * @path: lookup path
+ * @level: level of the block whose pointer changes
+ *
+ * Reads the block's current pointer from its parent into bp_oldreq, seeds
+ * bp_newreq with that value plus one and prepares the bmap update.  If the
+ * block is a B-tree node buffer, the matching key change in the btnode
+ * cache is prepared too; should that fail, the bmap update is aborted.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
+					struct nilfs_btree_path *path,
+					int level)
+{
+	struct nilfs_btree_node *parent;
+	int ret;
+
+	parent = nilfs_btree_get_node(btree, path, level + 1);
+	path[level].bp_oldreq.bpr_ptr =
+		nilfs_btree_node_get_ptr(btree, parent,
+					 path[level + 1].bp_index);
+	path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
+	ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
+					&path[level].bp_oldreq,
+					&path[level].bp_newreq);
+	if (ret < 0)
+		return ret;
+
+	if (buffer_nilfs_node(path[level].bp_bh)) {
+		/* the btnode cache change uses the pointers as old/new key */
+		path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
+		path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
+		path[level].bp_ctxt.bh = path[level].bp_bh;
+		ret = nilfs_btnode_prepare_change_key(
+			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		if (ret < 0) {
+			nilfs_bmap_abort_update(&btree->bt_bmap,
+						&path[level].bp_oldreq,
+						&path[level].bp_newreq);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * nilfs_btree_commit_update_v - commit a pointer change that was prepared
+ * @btree: B-tree
+ * @path: lookup path prepared by nilfs_btree_prepare_update_v()
+ * @level: level of the block whose pointer changes
+ *
+ * Commits the bmap update and, for a B-tree node buffer, the key change in
+ * the btnode cache.  The buffer is then marked volatile and the new
+ * pointer is stored in the parent node.
+ */
+static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
+					struct nilfs_btree_path *path,
+					int level)
+{
+	struct nilfs_btree_node *parent;
+
+	nilfs_bmap_commit_update(&btree->bt_bmap,
+				 &path[level].bp_oldreq,
+				 &path[level].bp_newreq);
+
+	if (buffer_nilfs_node(path[level].bp_bh)) {
+		nilfs_btnode_commit_change_key(
+			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		/* pick up the buffer head recorded in the change context */
+		path[level].bp_bh = path[level].bp_ctxt.bh;
+	}
+	set_buffer_nilfs_volatile(path[level].bp_bh);
+
+	parent = nilfs_btree_get_node(btree, path, level + 1);
+	nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
+				 path[level].bp_newreq.bpr_ptr);
+}
+
+/*
+ * nilfs_btree_abort_update_v - cancel a pointer change that was prepared
+ * @btree: B-tree
+ * @path: lookup path prepared by nilfs_btree_prepare_update_v()
+ * @level: level of the block whose pointer change is cancelled
+ *
+ * Aborts the bmap update and, for a B-tree node buffer, the key change in
+ * the btnode cache.
+ */
+static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
+				       struct nilfs_btree_path *path,
+				       int level)
+{
+	struct nilfs_bmap *bmap = &btree->bt_bmap;
+	struct nilfs_btree_path *p = &path[level];
+
+	nilfs_bmap_abort_update(bmap, &p->bp_oldreq, &p->bp_newreq);
+
+	if (!buffer_nilfs_node(p->bp_bh))
+		return;
+
+	nilfs_btnode_abort_change_key(&NILFS_BMAP_I(bmap)->i_btnode_cache,
+				      &p->bp_ctxt);
+}
+
+/*
+ * nilfs_btree_prepare_propagate_v - prepare pointer updates up the tree
+ * @btree: B-tree
+ * @path: lookup path
+ * @minlevel: level of the block the propagation starts from
+ * @maxlevelp: out: highest level an update was prepared for
+ *
+ * Prepares an update for the block at @minlevel unless its buffer is
+ * already volatile, then for each non-root ancestor whose buffer is not
+ * dirty yet, stopping at the first dirty one.
+ *
+ * Returns 0 on success.  If a preparation fails, the ones made so far are
+ * aborted and the negative error code is returned.
+ */
+static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
+					   struct nilfs_btree_path *path,
+					   int minlevel,
+					   int *maxlevelp)
+{
+	int level, ret;
+
+	level = minlevel;
+	if (!buffer_nilfs_volatile(path[level].bp_bh)) {
+		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		if (ret < 0)
+			return ret;
+	}
+	while ((++level < nilfs_btree_height(btree) - 1) &&
+	       !buffer_dirty(path[level].bp_bh)) {
+
+		/* a clean buffer is not expected to be volatile */
+		WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
+		ret = nilfs_btree_prepare_update_v(btree, path, level);
+		if (ret < 0)
+			goto out;
+	}
+
+	/* success */
+	*maxlevelp = level - 1;
+	return 0;
+
+	/* error */
+ out:
+	/* undo the levels above minlevel that were prepared ... */
+	while (--level > minlevel)
+		nilfs_btree_abort_update_v(btree, path, level);
+	/* ... and minlevel itself only if it was prepared at all */
+	if (!buffer_nilfs_volatile(path[level].bp_bh))
+		nilfs_btree_abort_update_v(btree, path, level);
+	return ret;
+}
+
+/* Commit prepared updates: @minlevel unless volatile, then up to @maxlevel. */
+static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
+					   struct nilfs_btree_path *path,
+					   int minlevel, int maxlevel,
+					   struct buffer_head *bh)
+{
+	int lv = minlevel;
+
+	if (!buffer_nilfs_volatile(path[lv].bp_bh))
+		nilfs_btree_commit_update_v(btree, path, lv);
+
+	while (++lv <= maxlevel)
+		nilfs_btree_commit_update_v(btree, path, lv);
+}
+
+static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
+				   struct nilfs_btree_path *path,
+				   int level,
+				   struct buffer_head *bh)
+{
+	int maxlevel, ret;
+	struct nilfs_btree_node *parent;
+	__u64 ptr;
+
+	get_bh(bh);	/* balanced by the brelse() of path[level].bp_bh below */
+	path[level].bp_bh = bh;
+	ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
+	if (ret < 0)
+		goto out;
+	/* if bh is already volatile, mark the ptr in its parent slot dirty */
+	if (buffer_nilfs_volatile(path[level].bp_bh)) {
+		parent = nilfs_btree_get_node(btree, path, level + 1);
+		ptr = nilfs_btree_node_get_ptr(btree, parent,
+					       path[level + 1].bp_index);
+		ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
+		if (ret < 0)
+			goto out;
+	}
+
+	nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
+
+ out:
+	brelse(path[level].bp_bh);
+	path[level].bp_bh = NULL;
+	return ret;
+}
+
+/*
+ * bop_propagate for the B-tree bmap: derive key and level from the dirty
+ * buffer @bh, run nilfs_btree_do_lookup() with level + 1 as the minimum
+ * level, then hand over to the btop_propagate hook.  Returns -ENOMEM when
+ * no path can be allocated, otherwise the lookup or hook result.
+ */
+static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
+				 struct buffer_head *bh)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	__u64 key;
+	int level, ret;
+
+	WARN_ON(!buffer_dirty(bh));
+	path = nilfs_btree_alloc_path(btree);
+	if (path == NULL)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+
+	if (!buffer_nilfs_node(bh)) {
+		level = NILFS_BTREE_LEVEL_DATA;
+		key = nilfs_bmap_data_get_key(bmap, bh);
+	} else {
+		node = (struct nilfs_btree_node *)bh->b_data;
+		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(btree, node, 0);
+	}
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
+	if (unlikely(ret == -ENOENT))
+		printk(KERN_CRIT "%s: key = %llu, level == %d\n",
+		       __func__, (unsigned long long)key, level);
+	if (ret >= 0)
+		ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
+
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+
+	return ret;
+}
+
+static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
+				    struct buffer_head *bh)
+{	/* GC flavour (nilfs_btree_ops_gc): only marks bh->b_blocknr dirty */
+	return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
+}
+
+/* Queue dirty node @bh on lists[its level], keeping the list sorted by key. */
+static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
+					 struct list_head *lists,
+					 struct buffer_head *bh)
+{
+	struct nilfs_btree_node *node = (struct nilfs_btree_node *)bh->b_data;
+	__u64 key = nilfs_btree_node_get_key(btree, node, 0);
+	int level = nilfs_btree_node_get_level(btree, node);
+	struct list_head *head;
+	struct buffer_head *cbh;
+
+	if (WARN_ON(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+		    level >= NILFS_BTREE_LEVEL_MAX))
+		return;	/* level outside the initialized lists[] range */
+	get_bh(bh);
+	list_for_each(head, &lists[level]) {
+		cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
+		node = (struct nilfs_btree_node *)cbh->b_data;
+		if (key < nilfs_btree_node_get_key(btree, node, 0))
+			break;
+	}
+	list_add_tail(&bh->b_assoc_buffers, head);
+}
+
+static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
+					     struct list_head *listp)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
+	struct list_head lists[NILFS_BTREE_LEVEL_MAX];	/* one per level */
+	struct pagevec pvec;
+	struct buffer_head *bh, *head;
+	pgoff_t index = 0;
+	int level, i;
+
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+	     level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		INIT_LIST_HEAD(&lists[level]);
+
+	pagevec_init(&pvec, 0);
+	/* sort every dirty buffer of the btnode cache into its level's list */
+	while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
+				  PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			bh = head = page_buffers(pvec.pages[i]);
+			do {
+				if (buffer_dirty(bh))
+					nilfs_btree_add_dirty_buffer(btree,
+								     lists, bh);
+			} while ((bh = bh->b_this_page) != head);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	/* append the per-level lists to @listp, lowest level first */
+	for (level = NILFS_BTREE_LEVEL_NODE_MIN;
+	     level < NILFS_BTREE_LEVEL_MAX;
+	     level++)
+		list_splice(&lists[level], listp->prev);
+}
+
+static int nilfs_btree_assign_p(struct nilfs_btree *btree,
+				struct nilfs_btree_path *path,
+				int level,
+				struct buffer_head **bh,
+				sector_t blocknr,
+				union nilfs_binfo *binfo)
+{
+	struct nilfs_btree_node *parent;
+	__u64 key;
+	__u64 ptr;
+	int ret;
+
+	parent = nilfs_btree_get_node(btree, path, level + 1);
+	ptr = nilfs_btree_node_get_ptr(btree, parent,
+				       path[level + 1].bp_index);
+	if (buffer_nilfs_node(*bh)) {
+		path[level].bp_ctxt.oldkey = ptr;
+		path[level].bp_ctxt.newkey = blocknr;
+		path[level].bp_ctxt.bh = *bh;
+		ret = nilfs_btnode_prepare_change_key(
+			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		if (ret < 0)
+			return ret;
+		nilfs_btnode_commit_change_key(
+			&NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
+			&path[level].bp_ctxt);
+		*bh = path[level].bp_ctxt.bh;
+	}
+	/* point the parent's slot at the assigned block number */
+	nilfs_btree_node_set_ptr(btree, parent,
+				 path[level + 1].bp_index, blocknr);
+
+	key = nilfs_btree_node_get_key(btree, parent,
+				       path[level + 1].bp_index);
+	/* on-disk format: block offset (key) and tree level of the block */
+	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_dat.bi_level = level;
+
+	return 0;
+}
+
+static int nilfs_btree_assign_v(struct nilfs_btree *btree,
+				struct nilfs_btree_path *path,
+				int level,
+				struct buffer_head **bh,
+				sector_t blocknr,
+				union nilfs_binfo *binfo)
+{
+	struct nilfs_btree_node *parent;
+	__u64 key;
+	__u64 ptr;
+	union nilfs_bmap_ptr_req req;
+	int ret;
+
+	parent = nilfs_btree_get_node(btree, path, level + 1);
+	ptr = nilfs_btree_node_get_ptr(btree, parent,
+				       path[level + 1].bp_index);
+	req.bpr_ptr = ptr;	/* pointer currently stored in the parent */
+	ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
+							       &req);
+	if (ret < 0)
+		return ret;
+	btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
+							&req, blocknr);
+
+	key = nilfs_btree_node_get_key(btree, parent,
+				       path[level + 1].bp_index);
+	/* on-disk format: the parent's pointer and key for this block */
+	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+
+	return 0;
+}
+
+/*
+ * bop_assign for the B-tree bmap: derive key and level from *@bh, look the
+ * entry up with nilfs_btree_do_lookup() and pass @blocknr and @binfo on to
+ * the btop_assign hook.  Returns -ENOMEM when no path can be allocated.
+ */
+static int nilfs_btree_assign(struct nilfs_bmap *bmap,
+			      struct buffer_head **bh,
+			      sector_t blocknr,
+			      union nilfs_binfo *binfo)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct nilfs_btree_node *node;
+	__u64 key;
+	int level, ret;
+
+	path = nilfs_btree_alloc_path(btree);
+	if (path == NULL)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+
+	if (!buffer_nilfs_node(*bh)) {
+		level = NILFS_BTREE_LEVEL_DATA;
+		key = nilfs_bmap_data_get_key(bmap, *bh);
+	} else {
+		node = (struct nilfs_btree_node *)(*bh)->b_data;
+		level = nilfs_btree_node_get_level(btree, node);
+		key = nilfs_btree_node_get_key(btree, node, 0);
+	}
+
+	ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
+	WARN_ON(ret == -ENOENT);
+	if (ret >= 0)
+		ret = btree->bt_ops->btop_assign(btree, path, level, bh,
+						 blocknr, binfo);
+
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+
+	return ret;
+}
+
+static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
+				 struct buffer_head **bh,
+				 sector_t blocknr,
+				 union nilfs_binfo *binfo)
+{
+	struct nilfs_btree *btree;
+	struct nilfs_btree_node *node;
+	__u64 key;
+	int ret;
+
+	btree = (struct nilfs_btree *)bmap;
+	ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
+	if (ret < 0)
+		return ret;
+	/* key: first key of a node buffer, or the data buffer's own key */
+	if (buffer_nilfs_node(*bh)) {
+		node = (struct nilfs_btree_node *)(*bh)->b_data;
+		key = nilfs_btree_node_get_key(btree, node, 0);
+	} else
+		key = nilfs_bmap_data_get_key(bmap, *bh);
+
+	/* on-disk format: b_blocknr is recorded as the virtual block number */
+	binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
+	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+
+	return 0;
+}
+
+/*
+ * bop_mark: look up the block for @key at @level, mark its buffer dirty
+ * and flag the bmap itself as dirty.
+ */
+static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+	struct nilfs_btree_path *path;
+	struct buffer_head *bh;
+	__u64 ptr;
+	int ret;
+
+	path = nilfs_btree_alloc_path(btree);
+	if (path == NULL)
+		return -ENOMEM;
+	nilfs_btree_init_path(btree, path);
+
+	ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
+	if (ret >= 0)
+		ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
+	if (ret < 0) {
+		WARN_ON(ret == -ENOENT);
+		goto out;
+	}
+
+	if (!buffer_dirty(bh))
+		nilfs_btnode_mark_dirty(bh);
+	nilfs_bmap_put_block(&btree->bt_bmap, bh);
+	if (!nilfs_bmap_dirty(&btree->bt_bmap))
+		nilfs_bmap_set_dirty(&btree->bt_bmap);
+
+ out:
+	nilfs_btree_clear_path(btree, path);
+	nilfs_btree_free_path(btree, path);
+	return ret;
+}
+
+static const struct nilfs_bmap_operations nilfs_btree_ops = {
+	.bop_lookup		=	nilfs_btree_lookup,
+	.bop_insert		=	nilfs_btree_insert,
+	.bop_delete		=	nilfs_btree_delete,
+	.bop_clear		=	NULL,
+
+	.bop_propagate		=	nilfs_btree_propagate,
+
+	.bop_lookup_dirty_buffers =	nilfs_btree_lookup_dirty_buffers,
+
+	.bop_assign		=	nilfs_btree_assign,
+	.bop_mark		=	nilfs_btree_mark,
+
+	.bop_last_key		=	nilfs_btree_last_key,
+	.bop_check_insert	=	NULL,
+	.bop_check_delete	=	nilfs_btree_check_delete,
+	.bop_gather_data	=	nilfs_btree_gather_data,
+};	/* installed as bmap->b_ops by nilfs_btree_init() */
+
+static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
+	.bop_lookup		=	NULL,
+	.bop_insert		=	NULL,
+	.bop_delete		=	NULL,
+	.bop_clear		=	NULL,
+	/* only propagate, dirty-buffer lookup and assign are provided */
+	.bop_propagate		=	nilfs_btree_propagate_gc,
+
+	.bop_lookup_dirty_buffers =	nilfs_btree_lookup_dirty_buffers,
+
+	.bop_assign		=	nilfs_btree_assign_gc,
+	.bop_mark		=	NULL,
+
+	.bop_last_key		=	NULL,
+	.bop_check_insert	=	NULL,
+	.bop_check_delete	=	NULL,
+	.bop_gather_data	=	NULL,
+};	/* installed as bmap->b_ops by nilfs_btree_init_gc() */
+
+static const struct nilfs_btree_operations nilfs_btree_ops_v = {
+	.btop_find_target	=	nilfs_btree_find_target_v,
+	.btop_set_target	=	nilfs_btree_set_target_v,
+	.btop_propagate		=	nilfs_btree_propagate_v,
+	.btop_assign		=	nilfs_btree_assign_v,
+};	/* nilfs_btree_init(): every inode other than NILFS_DAT_INO */
+
+static const struct nilfs_btree_operations nilfs_btree_ops_p = {
+	.btop_find_target	=	NULL,
+	.btop_set_target	=	NULL,
+	.btop_propagate		=	nilfs_btree_propagate_p,
+	.btop_assign		=	nilfs_btree_assign_p,
+};	/* nilfs_btree_init(): used for the NILFS_DAT_INO inode */
+
+/*
+ * Set up @bmap as a B-tree: install nilfs_btree_ops, store @low and @high
+ * in b_low/b_high, then pick the "_p" operation set for the DAT inode and
+ * the "_v" set for any other inode.  Always returns 0.
+ */
+int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+{
+	struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
+
+	bmap->b_ops = &nilfs_btree_ops;
+	bmap->b_low = low;
+	bmap->b_high = high;
+	if (bmap->b_inode->i_ino == NILFS_DAT_INO)
+		btree->bt_ops = &nilfs_btree_ops_p;
+	else
+		btree->bt_ops = &nilfs_btree_ops_v;
+
+	return 0;
+}
+
+void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
+{	/* GC setup: fixed LARGE bounds, bt_ops is left untouched here */
+	bmap->b_low = NILFS_BMAP_LARGE_LOW;
+	bmap->b_high = NILFS_BMAP_LARGE_HIGH;
+	bmap->b_ops = &nilfs_btree_ops_gc;
+}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
new file mode 100644
index 00000000000..4766deb52fb
--- /dev/null
+++ b/fs/nilfs2/btree.h
@@ -0,0 +1,117 @@
+/*
+ * btree.h - NILFS B-tree.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_BTREE_H
+#define _NILFS_BTREE_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/list.h>
+#include <linux/nilfs2_fs.h>
+#include "btnode.h"
+#include "bmap.h"
+
+struct nilfs_btree;
+struct nilfs_btree_path;
+
+/**
+ * struct nilfs_btree_operations - B-tree operation table
+ */
+struct nilfs_btree_operations {
+	__u64 (*btop_find_target)(const struct nilfs_btree *,
+				  const struct nilfs_btree_path *, __u64);
+	void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
+	/* NOTE(review): not set by nilfs_btree_ops_v/_p in btree.c */
+	struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
+	/* hooks invoked by nilfs_btree_propagate() and nilfs_btree_assign() */
+	int (*btop_propagate)(struct nilfs_btree *,
+			      struct nilfs_btree_path *,
+			      int,
+			      struct buffer_head *);
+	int (*btop_assign)(struct nilfs_btree *,
+			   struct nilfs_btree_path *,
+			   int,
+			   struct buffer_head **,
+			   sector_t,
+			   union nilfs_binfo *);
+};
+
+/**
+ * struct nilfs_btree_node - B-tree node
+ * @bn_flags: flags (NILFS_BTREE_NODE_*)
+ * @bn_level: level (NILFS_BTREE_LEVEL_*)
+ * @bn_nchildren: number of children (little endian)
+ * @bn_pad: padding
+ */
+struct nilfs_btree_node {
+	__u8 bn_flags;
+	__u8 bn_level;
+	__le16 bn_nchildren;
+	__le32 bn_pad;
+};
+
+/* flags (bn_flags) */
+#define NILFS_BTREE_NODE_ROOT	0x01
+
+/* level: 0 is the data level, nodes use NODE_MIN up to LEVEL_MAX - 1 */
+#define NILFS_BTREE_LEVEL_DATA		0
+#define NILFS_BTREE_LEVEL_NODE_MIN	(NILFS_BTREE_LEVEL_DATA + 1)
+#define NILFS_BTREE_LEVEL_MAX		14
+
+/**
+ * struct nilfs_btree - B-tree structure
+ * @bt_bmap: bmap base structure
+ * @bt_ops: B-tree operation table
+ */
+struct nilfs_btree {
+	struct nilfs_bmap bt_bmap;	/* first: bmap * is cast to btree * */
+
+	/* B-tree-specific members */
+	const struct nilfs_btree_operations *bt_ops;
+};
+
+/* node geometry: each child takes one __le64 key plus one __le64 pointer */
+#define NILFS_BTREE_ROOT_SIZE		NILFS_BMAP_SIZE
+#define NILFS_BTREE_ROOT_NCHILDREN_MAX					\
+	((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) /	\
+	 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
+#define NILFS_BTREE_ROOT_NCHILDREN_MIN	0
+#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE	(sizeof(__le64))
+#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize)			\
+	(((nodesize) - sizeof(struct nilfs_btree_node) -		\
+		NILFS_BTREE_NODE_EXTRA_PAD_SIZE) /			\
+	 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
+#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize)			\
+	((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
+#define NILFS_BTREE_KEY_MIN	((__u64)0)
+#define NILFS_BTREE_KEY_MAX	(~(__u64)0)
+
+
+int nilfs_btree_path_cache_init(void);
+void nilfs_btree_path_cache_destroy(void);
+int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
+				   const __u64 *, const __u64 *,
+				   int, __u64, __u64);
+void nilfs_btree_init_gc(struct nilfs_bmap *);
+
+#endif	/* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
new file mode 100644
index 00000000000..e90b60dfced
--- /dev/null
+++ b/fs/nilfs2/cpfile.c
@@ -0,0 +1,925 @@
+/*
+ * cpfile.c - NILFS checkpoint file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/errno.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+#include "cpfile.h"
+
+/* checkpoint entries per block, as recorded in the MDT info */
+static inline unsigned long
+nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
+{
+	return NILFS_MDT(cpfile)->mi_entries_per_block;
+}
+
+/* index of the cpfile block holding checkpoint @cno (do_div() quotient) */
+static unsigned long
+nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
+{
+	__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+	do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
+	return (unsigned long)tcno;
+}
+
+/* entry index of checkpoint @cno within its block (do_div() remainder) */
+static unsigned long
+nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
+{
+	__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
+	return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
+}
+
+/* entries from @curr to the end of its block, capped at @max - @curr */
+static unsigned long
+nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
+				  __u64 curr, __u64 max)
+{
+	unsigned long rest = nilfs_cpfile_checkpoints_per_block(cpfile) -
+		nilfs_cpfile_get_offset(cpfile, curr);
+
+	return min_t(__u64, rest, max - curr);
+}
+
+static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
+					   __u64 cno)
+{	/* nonzero when @cno lives in block 0, the block read as the header */
+	return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
+}
+
+/* add @n to cp_checkpoints_count of the entry at the start of the block */
+static unsigned int
+nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
+					 struct buffer_head *bh,
+					 void *kaddr, unsigned int n)
+{
+	struct nilfs_checkpoint *first = kaddr + bh_offset(bh);
+	unsigned int total;
+
+	total = le32_to_cpu(first->cp_checkpoints_count) + n;
+	first->cp_checkpoints_count = cpu_to_le32(total);
+	return total;
+}
+
+/* subtract @n from the block's cp_checkpoints_count (warns on underflow) */
+static unsigned int
+nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
+					 struct buffer_head *bh,
+					 void *kaddr, unsigned int n)
+{
+	struct nilfs_checkpoint *first = kaddr + bh_offset(bh);
+	unsigned int total = le32_to_cpu(first->cp_checkpoints_count);
+
+	WARN_ON(total < n);
+	total -= n;
+	first->cp_checkpoints_count = cpu_to_le32(total);
+	return total;
+}
+
+static inline struct nilfs_cpfile_header *
+nilfs_cpfile_block_get_header(const struct inode *cpfile,
+			      struct buffer_head *bh,
+			      void *kaddr)
+{	/* the header sits at the start of @bh's data within the mapped page */
+	return kaddr + bh_offset(bh);
+}
+
+static struct nilfs_checkpoint *
+nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
+				  struct buffer_head *bh,
+				  void *kaddr)
+{	/* entry address = block start + in-block index * mi_entry_size */
+	return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
+		NILFS_MDT(cpfile)->mi_entry_size;
+}
+
+/*
+ * Init callback given to nilfs_mdt_get_block(): mark all entries invalid.
+ */
+static void nilfs_cpfile_block_init(struct inode *cpfile,
+				    struct buffer_head *bh, void *kaddr)
+{
+	struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
+	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
+	int n = nilfs_cpfile_checkpoints_per_block(cpfile);
+
+	for (; n > 0; n--, cp = (void *)cp + cpsz)
+		nilfs_checkpoint_set_invalid(cp);
+}
+
+static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
+						struct buffer_head **bhp)
+{	/* header is block 0; requested without create flag or init callback */
+	return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
+}
+
+static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
+						    __u64 cno,
+						    int create,
+						    struct buffer_head **bhp)
+{	/* get the block holding @cno; block_init is passed as init callback */
+	return nilfs_mdt_get_block(cpfile,
+				   nilfs_cpfile_get_blkoff(cpfile, cno),
+				   create, nilfs_cpfile_block_init, bhp);
+}
+
+static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
+						       __u64 cno)
+{	/* delete the whole block that holds @cno ("make hole" in callers) */
+	return nilfs_mdt_delete_block(cpfile,
+				      nilfs_cpfile_get_blkoff(cpfile, cno));
+}
+
+/**
+ * nilfs_cpfile_get_checkpoint - get a checkpoint
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number
+ * @create: create flag
+ * @cpp: pointer to a checkpoint
+ * @bhp: pointer to a buffer head
+ *
+ * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
+ * specified by @cno. A new checkpoint will be created if @cno is the current
+ * checkpoint number and @create is nonzero.
+ *
+ * Return Value: On success, 0 is returned, and the checkpoint and the
+ * buffer head of the buffer on which the checkpoint is located are stored in
+ * the place pointed by @cpp and @bhp, respectively. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No such checkpoint.
+ *
+ * %-EINVAL - invalid checkpoint.
+ */
+int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
+				__u64 cno,
+				int create,
+				struct nilfs_checkpoint **cpp,
+				struct buffer_head **bhp)
+{
+	struct buffer_head *header_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
+		     (cno < nilfs_mdt_cno(cpfile) && create)))
+		return -EINVAL;
+
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
+	if (ret < 0)
+		goto out_header;
+	kaddr = kmap(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		if (!create) {
+			kunmap(cp_bh->b_page);
+			brelse(cp_bh);
+			ret = -ENOENT;
+			goto out_header;
+		}
+		/* a newly-created checkpoint */
+		nilfs_checkpoint_clear_invalid(cp);
+		if (!nilfs_cpfile_is_in_first(cpfile, cno))
+			nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
+								 kaddr, 1);
+		nilfs_mdt_mark_buffer_dirty(cp_bh);
+
+		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
+						       kaddr);
+		le64_add_cpu(&header->ch_ncheckpoints, 1);
+		kunmap_atomic(kaddr, KM_USER0);
+		nilfs_mdt_mark_buffer_dirty(header_bh);
+		nilfs_mdt_mark_dirty(cpfile);
+	}
+	/* success: cp_bh stays kmap'ed until nilfs_cpfile_put_checkpoint() */
+	if (cpp != NULL)
+		*cpp = cp;
+	*bhp = cp_bh;
+
+ out_header:
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_put_checkpoint - put a checkpoint
+ * @cpfile: inode of checkpoint file (not used by this function)
+ * @cno: checkpoint number (not used by this function)
+ * @bh: buffer head whose page is unmapped and which is then released
+ *
+ * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
+ * specified by @cno. @bh must be the buffer head which has been returned by
+ * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
+ */
+void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
+				 struct buffer_head *bh)
+{
+	kunmap(bh->b_page);
+	brelse(bh);
+}
+
+/**
+ * nilfs_cpfile_delete_checkpoints - delete checkpoints
+ * @cpfile: inode of checkpoint file
+ * @start: start checkpoint number
+ * @end: end checkpoint number
+ *
+ * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
+ * [@start, @end). Checkpoints which have been already deleted are ignored.
+ *
+ * Return Value: 0 on success, or one of the following negative error codes.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - invalid checkpoints.
+ *
+ * %-EPERM - @start is the latest checkpoint, which cannot be deleted.
+ */
+int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
+				    __u64 start,
+				    __u64 end)
+{
+	struct buffer_head *header_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
+	__u64 cno;
+	void *kaddr;
+	unsigned long tnicps;
+	int ret, ncps, nicps, count, i;
+
+	if (unlikely(start == 0 || start > end)) {
+		printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
+		       "[%llu, %llu)\n", __func__,
+		       (unsigned long long)start, (unsigned long long)end);
+		return -EINVAL;
+	}
+
+	/* cannot delete the latest checkpoint */
+	if (start == nilfs_mdt_cno(cpfile) - 1)
+		return -EPERM;
+
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+	tnicps = 0;
+
+	for (cno = start; cno < end; cno += ncps) {
+		ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				break;	/* header_bh is released below */
+			/* skip hole */
+			ret = 0;
+			continue;
+		}
+
+		kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+		cp = nilfs_cpfile_block_get_checkpoint(
+			cpfile, cno, cp_bh, kaddr);
+		nicps = 0;
+		for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
+			WARN_ON(nilfs_checkpoint_snapshot(cp));
+			if (!nilfs_checkpoint_invalid(cp)) {
+				nilfs_checkpoint_set_invalid(cp);
+				nicps++;
+			}
+		}
+		if (nicps > 0) {
+			tnicps += nicps;
+			nilfs_mdt_mark_buffer_dirty(cp_bh);
+			nilfs_mdt_mark_dirty(cpfile);
+			if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
+			    (count = nilfs_cpfile_block_sub_valid_checkpoints(
+				    cpfile, cp_bh, kaddr, nicps)) == 0) {
+				/* make hole */
+				kunmap_atomic(kaddr, KM_USER0);
+				brelse(cp_bh);
+				ret = nilfs_cpfile_delete_checkpoint_block(
+					cpfile, cno);
+				if (ret == 0)
+					continue;
+				printk(KERN_ERR "%s: cannot delete block\n",
+				       __func__);
+				break;	/* header_bh is released below */
+			}
+		}
+
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(cp_bh);
+	}
+	/* account for what was invalidated, even if the loop stopped early */
+	if (tnicps > 0) {
+		kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
+						       kaddr);
+		le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
+		nilfs_mdt_mark_buffer_dirty(header_bh);
+		nilfs_mdt_mark_dirty(cpfile);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
+					      struct nilfs_checkpoint *cp,
+					      struct nilfs_cpinfo *ci)
+{	/* copy the little-endian fields of @cp into @ci in CPU byte order */
+	ci->ci_flags = le32_to_cpu(cp->cp_flags);
+	ci->ci_cno = le64_to_cpu(cp->cp_cno);
+	ci->ci_create = le64_to_cpu(cp->cp_create);
+	ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
+	ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
+	ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
+	ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
+}
+
+static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
+					  struct nilfs_cpinfo *ci, size_t nci)
+{
+	struct nilfs_checkpoint *cp;
+	struct buffer_head *bh;
+	size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
+	__u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
+	void *kaddr;
+	int n, ret;
+	int ncps, i;
+
+	if (cno == 0)
+		return -ENOENT; /* checkpoint number 0 is invalid */
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+	/* scan block by block from *cnop until @ci is full or cur_cno is hit */
+	for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
+		ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				goto out;
+			continue; /* skip hole */
+		}
+
+		kaddr = kmap_atomic(bh->b_page, KM_USER0);
+		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
+		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
+			if (!nilfs_checkpoint_invalid(cp))
+				nilfs_cpfile_checkpoint_to_cpinfo(
+					cpfile, cp, &ci[n++]);
+		}
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(bh);
+	}
+	/* return the count; *cnop becomes one past the last cno reported */
+	ret = n;
+	if (n > 0)
+		*cnop = ci[n - 1].ci_cno + 1;
+
+ out:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
+					  struct nilfs_cpinfo *ci, size_t nci)
+{
+	struct buffer_head *bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	__u64 curr = *cnop, next;
+	unsigned long curr_blkoff, next_blkoff;
+	void *kaddr;
+	int n = 0, ret;
+
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+	/* *cnop == 0: start from the head of the header's snapshot list */
+	if (curr == 0) {
+		ret = nilfs_cpfile_get_header_block(cpfile, &bh);
+		if (ret < 0)
+			goto out;
+		kaddr = kmap_atomic(bh->b_page, KM_USER0);
+		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
+		curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(bh);
+		if (curr == 0) {
+			ret = 0;
+			goto out;
+		}
+	} else if (unlikely(curr == ~(__u64)0)) {
+		ret = 0;	/* terminator stored by an earlier call */
+		goto out;
+	}
+
+	curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT)
+			ret = 0; /* No snapshots (started from a hole block) */
+		goto out;
+	}
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	while (n < nci) {
+		cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
+		curr = ~(__u64)0; /* Terminator */
+		if (unlikely(nilfs_checkpoint_invalid(cp) ||
+			     !nilfs_checkpoint_snapshot(cp)))
+			break;
+		nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
+		next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
+		if (next == 0)
+			break; /* reach end of the snapshot list */
+
+		next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
+		if (curr_blkoff != next_blkoff) {
+			kunmap_atomic(kaddr, KM_USER0);
+			brelse(bh);
+			ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
+								0, &bh);
+			if (unlikely(ret < 0)) {
+				WARN_ON(ret == -ENOENT);
+				goto out;
+			}
+			kaddr = kmap_atomic(bh->b_page, KM_USER0);
+		}
+		curr = next;
+		curr_blkoff = next_blkoff;
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(bh);
+	*cnop = curr;
+	ret = n;
+
+ out:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_get_cpinfo - get information on checkpoints or snapshots
+ * @cpfile: inode of checkpoint file
+ * @cnop: checkpoint number to start from; may be advanced by the call
+ * @mode: NILFS_CHECKPOINT or NILFS_SNAPSHOT (anything else is -EINVAL)
+ * @ci: array receiving the checkpoint information
+ * @nci: number of entries available in @ci
+ */
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
+				struct nilfs_cpinfo *ci, size_t nci)
+{
+	switch (mode) {
+	case NILFS_CHECKPOINT:
+		return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
+	case NILFS_SNAPSHOT:
+		return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * nilfs_cpfile_delete_checkpoint - delete a single checkpoint
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number (-ENOENT if absent, -EPERM if snapshot or latest)
+ */
+int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
+{
+	struct nilfs_cpinfo ci;
+	__u64 tcno = cno;
+	ssize_t nci;
+	int ret;
+
+	nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
+	if (nci < 0)
+		return nci;
+	else if (nci == 0 || ci.ci_cno != cno)
+		return -ENOENT;
+
+	/* cannot delete the latest checkpoint nor snapshots */
+	ret = nilfs_cpinfo_snapshot(&ci);
+	if (ret < 0)
+		return ret;
+	else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
+		return -EPERM;
+
+	return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
+}
+
+/*
+ * Snapshot list of checkpoint @cno, or of the cpfile header if @cno is 0.
+ */
+static struct nilfs_snapshot_list *
+nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
+				     __u64 cno,
+				     struct buffer_head *bh,
+				     void *kaddr)
+{
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+
+	if (cno == 0) {
+		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
+		return &header->ch_snapshot_list;
+	}
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
+	return &cp->cp_snapshot_list;
+}
+/* Turn checkpoint @cno into a snapshot, linking it into the snapshot list. */
+static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
+{
+	struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	struct nilfs_snapshot_list *list;
+	__u64 curr, prev;
+	unsigned long curr_blkoff, prev_blkoff;
+	void *kaddr;
+	int ret;
+
+	if (cno == 0)
+		return -ENOENT; /* checkpoint number 0 is invalid */
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+	/* -ENOENT if @cno is not a valid checkpoint; 0 if already a snapshot */
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		ret = -ENOENT;
+		kunmap_atomic(kaddr, KM_USER0);
+		goto out_cp;
+	}
+	if (nilfs_checkpoint_snapshot(cp)) {
+		ret = 0;
+		kunmap_atomic(kaddr, KM_USER0);
+		goto out_cp;
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+	/* Start at the header (curr = 0) and follow ssl_prev while prev > cno */
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_cp;
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
+	list = &header->ch_snapshot_list;
+	curr_bh = header_bh;
+	get_bh(curr_bh);
+	curr = 0;
+	curr_blkoff = 0;
+	prev = le64_to_cpu(list->ssl_prev);
+	while (prev > cno) {
+		prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
+		curr = prev;
+		if (curr_blkoff != prev_blkoff) {
+			kunmap_atomic(kaddr, KM_USER0);
+			brelse(curr_bh);
+			ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
+								0, &curr_bh);
+			if (ret < 0)
+				goto out_header;
+			kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
+		}
+		curr_blkoff = prev_blkoff;
+		cp = nilfs_cpfile_block_get_checkpoint(
+			cpfile, curr, curr_bh, kaddr);
+		list = &cp->cp_snapshot_list;
+		prev = le64_to_cpu(list->ssl_prev);
+	}
+	kunmap_atomic(kaddr, KM_USER0);
+	/* The new entry goes between prev and curr; 0 stands for the header */
+	if (prev != 0) {
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
+							&prev_bh);
+		if (ret < 0)
+			goto out_curr;
+	} else {
+		prev_bh = header_bh;
+		get_bh(prev_bh);
+	}
+	/* Splice @cno in: fix curr's prev, @cno's links, then prev's next */
+	kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, curr, curr_bh, kaddr);
+	list->ssl_prev = cpu_to_le64(cno);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
+	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
+	nilfs_checkpoint_set_snapshot(cp);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, prev, prev_bh, kaddr);
+	list->ssl_next = cpu_to_le64(cno);
+	kunmap_atomic(kaddr, KM_USER0);
+	/* Account for the new snapshot in the header */
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
+	le64_add_cpu(&header->ch_nsnapshots, 1);
+	kunmap_atomic(kaddr, KM_USER0);
+	/* Every touched block is marked dirty (some may be the same buffer) */
+	nilfs_mdt_mark_buffer_dirty(prev_bh);
+	nilfs_mdt_mark_buffer_dirty(curr_bh);
+	nilfs_mdt_mark_buffer_dirty(cp_bh);
+	nilfs_mdt_mark_buffer_dirty(header_bh);
+	nilfs_mdt_mark_dirty(cpfile);
+
+	brelse(prev_bh);
+
+ out_curr:
+	brelse(curr_bh);
+
+ out_header:
+	brelse(header_bh);
+
+ out_cp:
+	brelse(cp_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+/* Revert snapshot @cno to a plain checkpoint and unlink it from the list. */
+static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
+{
+	struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
+	struct nilfs_cpfile_header *header;
+	struct nilfs_checkpoint *cp;
+	struct nilfs_snapshot_list *list;
+	__u64 next, prev;
+	void *kaddr;
+	int ret;
+
+	if (cno == 0)
+		return -ENOENT; /* checkpoint number 0 is invalid */
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+	/* -ENOENT if @cno is not a valid checkpoint; 0 if it is not a snapshot */
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		ret = -ENOENT;
+		kunmap_atomic(kaddr, KM_USER0);
+		goto out_cp;
+	}
+	if (!nilfs_checkpoint_snapshot(cp)) {
+		ret = 0;
+		kunmap_atomic(kaddr, KM_USER0);
+		goto out_cp;
+	}
+	/* Remember the neighbours before dropping the atomic mapping */
+	list = &cp->cp_snapshot_list;
+	next = le64_to_cpu(list->ssl_next);
+	prev = le64_to_cpu(list->ssl_prev);
+	kunmap_atomic(kaddr, KM_USER0);
+	/* Pin the neighbours' blocks; number 0 stands for the cpfile header */
+	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
+	if (ret < 0)
+		goto out_cp;
+	if (next != 0) {
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
+							&next_bh);
+		if (ret < 0)
+			goto out_header;
+	} else {
+		next_bh = header_bh;
+		get_bh(next_bh);
+	}
+	if (prev != 0) {
+		ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
+							&prev_bh);
+		if (ret < 0)
+			goto out_next;
+	} else {
+		prev_bh = header_bh;
+		get_bh(prev_bh);
+	}
+	/* Bypass @cno: next's ssl_prev = prev and prev's ssl_next = next */
+	kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, next, next_bh, kaddr);
+	list->ssl_prev = cpu_to_le64(prev);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
+	list = nilfs_cpfile_block_get_snapshot_list(
+		cpfile, prev, prev_bh, kaddr);
+	list->ssl_next = cpu_to_le64(next);
+	kunmap_atomic(kaddr, KM_USER0);
+	/* Reset @cno's own links and drop its snapshot flag */
+	kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
+	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
+	nilfs_checkpoint_clear_snapshot(cp);
+	kunmap_atomic(kaddr, KM_USER0);
+	/* One snapshot fewer in the header count */
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
+	le64_add_cpu(&header->ch_nsnapshots, -1);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_mdt_mark_buffer_dirty(next_bh);
+	nilfs_mdt_mark_buffer_dirty(prev_bh);
+	nilfs_mdt_mark_buffer_dirty(cp_bh);
+	nilfs_mdt_mark_buffer_dirty(header_bh);
+	nilfs_mdt_mark_dirty(cpfile);
+
+	brelse(prev_bh);
+
+ out_next:
+	brelse(next_bh);
+
+ out_header:
+	brelse(header_bh);
+
+ out_cp:
+	brelse(cp_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_is_snapshot - test whether a checkpoint is a snapshot
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number
+ *
+ * Return Value: On success, 1 is returned if the checkpoint specified by
+ * @cno is a snapshot, or 0 if not. On error, one of the following negative
+ * error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - No such checkpoint.
+ */
+int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
+{
+	struct buffer_head *bh;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (cno == 0)
+		return -ENOENT; /* checkpoint number 0 is invalid */
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
+	if (ret < 0)
+		goto out;
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
+	ret = -ENOENT;	/* an invalid entry means "no such checkpoint" */
+	if (!nilfs_checkpoint_invalid(cp))
+		ret = nilfs_checkpoint_snapshot(cp);
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(bh);
+
+ out:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_change_cpmode - change checkpoint mode
+ * @cpfile: inode of checkpoint file
+ * @cno: checkpoint number
+ * @mode: new mode of the checkpoint
+ *
+ * Description: nilfs_cpfile_change_cpmode() changes the mode of the checkpoint
+ * specified by @cno to @mode, which is NILFS_CHECKPOINT or NILFS_SNAPSHOT.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ * %-ENOMEM - Insufficient amount of memory available.
+ * %-ENOENT - No such checkpoint.
+ * %-EBUSY - @cno is mounted and cannot be changed to a plain checkpoint.
+ * %-EINVAL - @mode is neither NILFS_CHECKPOINT nor NILFS_SNAPSHOT.
+ */
+int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
+{
+	struct the_nilfs *nilfs;
+	int ret;
+
+	nilfs = NILFS_MDT(cpfile)->mi_nilfs;
+
+	switch (mode) {
+	case NILFS_CHECKPOINT:
+		/*
+		 * Check for protecting existing snapshot mounts:
+		 * bd_mount_sem is used to make this operation atomic and
+		 * exclusive with a new mount job.  Though it doesn't cover
+		 * umount, it's enough for the purpose.
+		 */
+		down(&nilfs->ns_bdev->bd_mount_sem);
+		if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
+			/* Current implementation does not have to protect
+			   plain read-only mounts since they are exclusive
+			   with a read/write mount and are protected from the
+			   cleaner. */
+			ret = -EBUSY;
+		} else
+			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
+		up(&nilfs->ns_bdev->bd_mount_sem);
+		return ret;
+	case NILFS_SNAPSHOT:
+		return nilfs_cpfile_set_snapshot(cpfile, cno);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * nilfs_cpfile_get_stat - get checkpoint statistics
+ * @cpfile: inode of checkpoint file
+ * @cpstat: pointer to a structure of checkpoint statistics
+ *
+ * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
+ *
+ * Return Value: On success, 0 is returned, and checkpoints information is
+ * stored in the place pointed to by @cpstat. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
+{
+	struct buffer_head *bh;
+	struct nilfs_cpfile_header *header;
+	void *kaddr;
+	int ret;
+
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+
+	ret = nilfs_cpfile_get_header_block(cpfile, &bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
+	cpstat->cs_cno = nilfs_mdt_cno(cpfile);
+	cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
+	cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(bh);
+
+ out_sem:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
new file mode 100644
index 00000000000..1a8a1008c34
--- /dev/null
+++ b/fs/nilfs2/cpfile.h
@@ -0,0 +1,45 @@
+/*
+ * cpfile.h - NILFS checkpoint file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_CPFILE_H
+#define _NILFS_CPFILE_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+
+#define NILFS_CPFILE_GFP	NILFS_MDT_GFP
+
+/* Operations on the checkpoint file (cpfile) inode */
+int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
+				struct nilfs_checkpoint **,
+				struct buffer_head **);
+void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
+int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
+int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
+int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
+int nilfs_cpfile_is_snapshot(struct inode *, __u64);
+int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
+ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
+				struct nilfs_cpinfo *, size_t);
+
+#endif	/* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
new file mode 100644
index 00000000000..bb8a5818e7f
--- /dev/null
+++ b/fs/nilfs2/dat.c
@@ -0,0 +1,430 @@
+/*
+ * dat.c - NILFS disk address translation.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "alloc.h"
+#include "dat.h"
+
+/* Extreme checkpoint numbers, stored in de_start/de_end of DAT entries */
+#define NILFS_CNO_MIN	((__u64)1)
+#define NILFS_CNO_MAX	(~(__u64)0)
+/* Fetch the block holding req->pr_entry_nr into req->pr_entry_bh. */
+static int nilfs_dat_prepare_entry(struct inode *dat,
+				   struct nilfs_palloc_req *req, int create)
+{
+	return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
+					    create, &req->pr_entry_bh);
+}
+/* Mark the entry block and the DAT inode dirty, then release the block. */
+static void nilfs_dat_commit_entry(struct inode *dat,
+				   struct nilfs_palloc_req *req)
+{
+	nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
+	nilfs_mdt_mark_dirty(dat);
+	brelse(req->pr_entry_bh);
+}
+/* Release the entry block without marking anything dirty. */
+static void nilfs_dat_abort_entry(struct inode *dat,
+				  struct nilfs_palloc_req *req)
+{
+	brelse(req->pr_entry_bh);
+}
+
+int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	int err = nilfs_palloc_prepare_alloc_entry(dat, req);
+
+	if (err < 0)
+		return err;
+
+	/* the entry block is needed as well; undo the allocation without it */
+	err = nilfs_dat_prepare_entry(dat, req, 1);
+	if (err >= 0)
+		return err;
+
+	nilfs_palloc_abort_alloc_entry(dat, req);
+	return err;
+}
+/* Finish an allocation: reset the entry to [CNO_MIN, CNO_MAX), block 0. */
+void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
+	entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
+	entry->de_blocknr = cpu_to_le64(0);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_palloc_commit_alloc_entry(dat, req);
+	nilfs_dat_commit_entry(dat, req);
+}
+/* Cancel a prepared allocation: release the block, then the palloc request. */
+void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	nilfs_dat_abort_entry(dat, req);
+	nilfs_palloc_abort_alloc_entry(dat, req);
+}
+
+int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	int err = nilfs_palloc_prepare_free_entry(dat, req);
+
+	if (err < 0)
+		return err;
+
+	/* the entry block must be at hand too; roll back if it is not */
+	err = nilfs_dat_prepare_entry(dat, req, 0);
+	if (err >= 0)
+		return 0;
+	nilfs_palloc_abort_free_entry(dat, req);
+	return err;
+}
+/* Finish a free: the entry becomes [CNO_MIN, CNO_MIN) with block number 0. */
+void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
+	entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
+	entry->de_blocknr = cpu_to_le64(0);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_dat_commit_entry(dat, req);
+	nilfs_palloc_commit_free_entry(dat, req);
+}
+/* Cancel a prepared free: release the block, then the palloc request. */
+void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	nilfs_dat_abort_entry(dat, req);
+	nilfs_palloc_abort_free_entry(dat, req);
+}
+/* Fetch the existing entry block; -ENOENT here is unexpected (WARN). */
+int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	int ret;
+
+	ret = nilfs_dat_prepare_entry(dat, req, 0);
+	WARN_ON(ret == -ENOENT);
+	return ret;
+}
+/* Set de_start to nilfs_mdt_cno(dat) and bind the entry to @blocknr. */
+void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
+			    sector_t blocknr)
+{
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
+	if (entry->de_blocknr != cpu_to_le64(0) ||
+	    entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
+		printk(KERN_CRIT
+		       "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
+		       __func__, (unsigned long long)req->pr_entry_nr,
+		       (unsigned long long)le64_to_cpu(entry->de_start),
+		       (unsigned long long)le64_to_cpu(entry->de_end),
+		       (unsigned long long)le64_to_cpu(entry->de_blocknr));
+	}
+	entry->de_blocknr = cpu_to_le64(blocknr);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_dat_commit_entry(dat, req);
+}
+/* Cancel a prepared start: only the entry block needs releasing. */
+void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	nilfs_dat_abort_entry(dat, req);
+}
+/* Prepare to end the lifetime of the entry for req->pr_entry_nr. */
+int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	sector_t blocknr;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_dat_prepare_entry(dat, req, 0);
+	if (ret < 0) {
+		WARN_ON(ret == -ENOENT);
+		return ret;
+	}
+
+	/* only the block number decides whether a free must be prepared */
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	blocknr = le64_to_cpu(entry->de_blocknr);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	/* block number 0: the entry will be freed, so reserve that as well */
+	if (blocknr == 0) {
+		ret = nilfs_palloc_prepare_free_entry(dat, req);
+		if (ret < 0) {
+			nilfs_dat_abort_entry(dat, req);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+/* Set de_end: the entry's own start if @dead, else nilfs_mdt_cno(dat). */
+void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
+			  int dead)
+{
+	struct nilfs_dat_entry *entry;
+	__u64 start, end;
+	sector_t blocknr;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	end = start = le64_to_cpu(entry->de_start);
+	if (!dead) {
+		end = nilfs_mdt_cno(dat);
+		WARN_ON(start > end);
+	}
+	entry->de_end = cpu_to_le64(end);
+	blocknr = le64_to_cpu(entry->de_blocknr);
+	kunmap_atomic(kaddr, KM_USER0);
+	/* an entry with block number 0 is freed instead of merely committed */
+	if (blocknr == 0)
+		nilfs_dat_commit_free(dat, req);
+	else
+		nilfs_dat_commit_entry(dat, req);
+}
+/* Cancel a prepared end, undoing the free reservation where it applies. */
+void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
+{
+	struct nilfs_dat_entry *entry;
+	__u64 start;
+	sector_t blocknr;
+	void *kaddr;
+
+	kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
+	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
+					     req->pr_entry_bh, kaddr);
+	start = le64_to_cpu(entry->de_start);
+	blocknr = le64_to_cpu(entry->de_blocknr);
+	kunmap_atomic(kaddr, KM_USER0);
+	/* abort the free only if blockless and started at nilfs_mdt_cno(dat) */
+	if (start == nilfs_mdt_cno(dat) && blocknr == 0)
+		nilfs_palloc_abort_free_entry(dat, req);
+	nilfs_dat_abort_entry(dat, req);
+}
+
+/**
+ * nilfs_dat_mark_dirty - mark the DAT block holding a virtual block dirty
+ * @dat: DAT file inode
+ * @vblocknr: virtual block number
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
+{
+	struct nilfs_palloc_req req;
+	int err;
+
+	req.pr_entry_nr = vblocknr;
+	err = nilfs_dat_prepare_entry(dat, &req, 0);
+	if (err)
+		return err;
+	/* committing marks both the entry block and @dat dirty */
+	nilfs_dat_commit_entry(dat, &req);
+	return 0;
+}
+
+/**
+ * nilfs_dat_freev - free virtual block numbers
+ * @dat: DAT file inode
+ * @vblocknrs: array of virtual block numbers
+ * @nitems: number of virtual block numbers
+ *
+ * Description: nilfs_dat_freev() frees the virtual block numbers specified by
+ * @vblocknrs and @nitems.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The virtual block number has not been allocated.
+ */
+int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
+{
+	return nilfs_palloc_freev(dat, vblocknrs, nitems);
+}
+
+/**
+ * nilfs_dat_move - change a block number
+ * @dat: DAT file inode
+ * @vblocknr: virtual block number
+ * @blocknr: block number
+ *
+ * Description: nilfs_dat_move() changes the block number associated with
+ * @vblocknr to @blocknr.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ * %-ENOMEM - Insufficient amount of memory available.
+ * %-EINVAL - The entry of @vblocknr has no block number assigned (it is 0).
+ */
+int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
+{
+	struct buffer_head *entry_bh;
+	struct nilfs_dat_entry *entry;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
+	if (ret < 0)
+		return ret;
+	kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
+	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
+	if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
+		printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
+		       (unsigned long long)vblocknr,
+		       (unsigned long long)le64_to_cpu(entry->de_start),
+		       (unsigned long long)le64_to_cpu(entry->de_end));
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(entry_bh);
+		return -EINVAL;
+	}
+	WARN_ON(blocknr == 0);
+	entry->de_blocknr = cpu_to_le64(blocknr);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_mdt_mark_buffer_dirty(entry_bh);
+	nilfs_mdt_mark_dirty(dat);
+
+	brelse(entry_bh);
+
+	return 0;
+}
+
+/**
+ * nilfs_dat_translate - translate a virtual block number to a block number
+ * @dat: DAT file inode
+ * @vblocknr: virtual block number
+ * @blocknrp: pointer to a block number
+ *
+ * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
+ * to the corresponding block number.
+ *
+ * Return Value: On success, 0 is returned and the block number associated
+ * with @vblocknr is stored in the place pointed to by @blocknrp. On error,
+ * one of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - A block number associated with @vblocknr does not exist.
+ */
+int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
+{
+	struct buffer_head *bh;
+	struct nilfs_dat_entry *de;
+	sector_t pbn;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &bh);
+	if (ret < 0)
+		return ret;
+
+	/* copy the block number out while the entry block is mapped */
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	de = nilfs_palloc_block_get_entry(dat, vblocknr, bh, kaddr);
+	pbn = le64_to_cpu(de->de_blocknr);
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(bh);
+
+	/* block number 0 marks an entry that has no block assigned */
+	if (pbn == 0)
+		return -ENOENT;
+
+	if (blocknrp)
+		*blocknrp = pbn;
+	return ret;
+}
+/* Fill in start/end/blocknr of each vinfo[] item from its DAT entry. */
+ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
+			    size_t nvi)
+{
+	struct buffer_head *entry_bh;
+	struct nilfs_dat_entry *entry;
+	__u64 first, last;
+	void *kaddr;
+	unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
+	int i, j, n, ret;
+
+	for (i = 0; i < nvi; i += n) {
+		ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
+						   0, &entry_bh);
+		if (ret < 0)
+			return ret;
+		kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
+		/* first and last virtual block numbers held by this block */
+		first = vinfo[i].vi_vblocknr;
+		do_div(first, entries_per_block);
+		first *= entries_per_block;
+		last = first + entries_per_block - 1;
+		for (j = i, n = 0;
+		     j < nvi && vinfo[j].vi_vblocknr >= first &&
+			     vinfo[j].vi_vblocknr <= last;
+		     j++, n++) {
+			entry = nilfs_palloc_block_get_entry(
+				dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
+			vinfo[j].vi_start = le64_to_cpu(entry->de_start);
+			vinfo[j].vi_end = le64_to_cpu(entry->de_end);
+			vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
+		}
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(entry_bh);
+	}
+
+	return nvi;
+}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
new file mode 100644
index 00000000000..d9560654a4b
--- /dev/null
+++ b/fs/nilfs2/dat.h
@@ -0,0 +1,52 @@
+/*
+ * dat.h - NILFS disk address translation.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_DAT_H
+#define _NILFS_DAT_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+
+#define NILFS_DAT_GFP	NILFS_MDT_GFP
+
+struct nilfs_palloc_req;
+
+int nilfs_dat_translate(struct inode *, __u64, sector_t *);
+/* prepare / commit / abort steps operating on one nilfs_palloc_req */
+int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
+			    sector_t);
+void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
+int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
+void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
+void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
+/* operations addressed directly by virtual block number */
+int nilfs_dat_mark_dirty(struct inode *, __u64);
+int nilfs_dat_freev(struct inode *, __u64 *, size_t);
+int nilfs_dat_move(struct inode *, __u64, sector_t);
+ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
+
+#endif	/* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
new file mode 100644
index 00000000000..54100acc110
--- /dev/null
+++ b/fs/nilfs2/dir.c
@@ -0,0 +1,711 @@
+/*
+ * dir.c - NILFS directory entry operations
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
+ */
+/*
+ *  linux/fs/ext2/dir.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/dir.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  ext2 directory handling functions
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *
+ * All code that works with directory layout had been switched to pagecache
+ * and moved here. AV
+ */
+
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include "nilfs.h"
+#include "page.h"
+
+/*
+ * Directory chunk size in bytes.  nilfs uses block-sized chunks; arguably,
+ * sector-sized ones would be more robust, but we have what we have.
+ */
+static inline unsigned nilfs_chunk_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static inline void nilfs_put_page(struct page *page)
+{	/* unmap and release a page obtained through nilfs_get_page() */
+	kunmap(page);
+	page_cache_release(page);
+}
+
+static inline unsigned long dir_pages(struct inode *inode)
+{	/* i_size expressed in page-cache pages, rounded up */
+	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
+}
+
+/*
+ * Return the offset into page `page_nr' of the last valid byte in that
+ * page, plus one.  This is i_size minus the byte offset at which the
+ * page starts, computed in `unsigned' arithmetic and capped at
+ * PAGE_CACHE_SIZE.
+ */
+static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	unsigned remain = inode->i_size;
+
+	remain -= page_nr << PAGE_CACHE_SHIFT;
+	return remain > PAGE_CACHE_SIZE ? PAGE_CACHE_SIZE : remain;
+}
+
+static int nilfs_prepare_chunk_uninterruptible(struct page *page,
+					       struct address_space *mapping,
+					       unsigned from, unsigned to)
+{	/* like nilfs_prepare_chunk(), but passes AOP_FLAG_UNINTERRUPTIBLE */
+	loff_t pos = page_offset(page) + from;	/* file offset of the range */
+	return block_write_begin(NULL, mapping, pos, to - from,
+				 AOP_FLAG_UNINTERRUPTIBLE, &page,
+				 NULL, nilfs_get_block);
+}
+
+static int nilfs_prepare_chunk(struct page *page,
+			       struct address_space *mapping,
+			       unsigned from, unsigned to)
+{	/* begin a write of bytes [from, to) of @page via block_write_begin() */
+	loff_t pos = page_offset(page) + from;	/* file offset of the range */
+	return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
+				 NULL, nilfs_get_block);
+}
+
+static int nilfs_commit_chunk(struct page *page,
+			      struct address_space *mapping,
+			      unsigned from, unsigned to)
+{	/* finish a write begun by nilfs_prepare_chunk*(); unlocks @page */
+	struct inode *dir = mapping->host;
+	struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
+	loff_t pos = page_offset(page) + from;
+	unsigned len = to - from;
+	unsigned nr_dirty, copied;
+	int err;
+
+	nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
+	copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
+	if (pos + copied > dir->i_size) {	/* the write grew the directory */
+		i_size_write(dir, pos + copied);
+		mark_inode_dirty(dir);
+	}
+	if (IS_DIRSYNC(dir))	/* dirsync directory: flag the transaction */
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+	err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
+	unlock_page(page);	/* done even when err is non-zero */
+	return err;
+}
+
+static void nilfs_check_page(struct page *page)
+{	/* validate all entries of a directory page; result kept in page flags */
+	struct inode *dir = page->mapping->host;
+	struct super_block *sb = dir->i_sb;
+	unsigned chunk_size = nilfs_chunk_size(dir);
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct nilfs_dir_entry *p;
+	char *error;
+
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;	/* page holding EOF */
+		if (limit & (chunk_size - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct nilfs_dir_entry *)(kaddr + offs);
+		rec_len = le16_to_cpu(p->rec_len);
+
+		if (rec_len < NILFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+			goto Espan;	/* first and last byte in different chunks */
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+	/* Too bad, we had an error */
+
+Ebadsize:
+	nilfs_error(sb, "nilfs_check_page",
+		    "size of directory #%lu is not a multiple of chunk size",
+		    dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+bad_entry:
+	nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
+		    "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+		    dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		    (unsigned long) le64_to_cpu(p->inode),
+		    rec_len, p->name_len);
+	goto fail;
+Eend:
+	p = (struct nilfs_dir_entry *)(kaddr + offs);
+	nilfs_error(sb, "nilfs_check_page",
+		    "entry in directory #%lu spans the page boundary "
+		    "offset=%lu, inode=%lu",
+		    dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		    (unsigned long) le64_to_cpu(p->inode));
+fail:
+	SetPageChecked(page);	/* checked, but remembered as bad */
+	SetPageError(page);
+}
+
+static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
+{	/* read, map and (once) validate directory page @n; ERR_PTR on failure */
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_cache_page(mapping, n,
+				(filler_t *)mapping->a_ops->readpage, NULL);
+	if (!IS_ERR(page)) {
+		wait_on_page_locked(page);
+		kmap(page);	/* undone by nilfs_put_page() */
+		if (!PageUptodate(page))
+			goto fail;
+		if (!PageChecked(page))	/* validate only on first use */
+			nilfs_check_page(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;	/* may still be the ERR_PTR from read_cache_page() */
+
+fail:
+	nilfs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * Name comparison for directory lookup.  Returns 1 when @de is in use and
+ * its name equals @name (of length @len), 0 otherwise -- note that this is
+ * the opposite sense of strncmp().  Callers guarantee de != NULL and
+ * len <= NILFS_NAME_LEN.
+ */
+static int
+nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
+{
+	int usable = de->inode && de->name_len == len;
+
+	/* memcmp() is only reached for live entries of matching length */
+	return usable && memcmp(name, de->name, len) == 0;
+}
+
+/* Entry following @p.  Caller ensures @p's header lies inside the page. */
+static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
+{
+	char *next = (char *)p + le16_to_cpu(p->rec_len);
+
+	return (struct nilfs_dir_entry *)next;
+}
+
+static unsigned char
+nilfs_filetype_table[NILFS_FT_MAX] = {	/* NILFS_FT_* -> DT_* (for readdir) */
+	[NILFS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[NILFS_FT_REG_FILE]	= DT_REG,
+	[NILFS_FT_DIR]		= DT_DIR,
+	[NILFS_FT_CHRDEV]	= DT_CHR,
+	[NILFS_FT_BLKDEV]	= DT_BLK,
+	[NILFS_FT_FIFO]		= DT_FIFO,
+	[NILFS_FT_SOCK]		= DT_SOCK,
+	[NILFS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12	/* shift applied to (mode & S_IFMT) to index the table */
+static unsigned char
+nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {	/* i_mode type -> NILFS_FT_* */
+	[S_IFREG >> S_SHIFT]	= NILFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= NILFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= NILFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= NILFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= NILFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= NILFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= NILFS_FT_SYMLINK,
+};
+
+static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
+{
+	unsigned int fmt = (inode->i_mode & S_IFMT) >> S_SHIFT;	/* type bits */
+
+	de->file_type = nilfs_type_by_mode[fmt];
+}
+
+static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{	/* feed directory entries from f_pos onward to @filldir; 0 or -EIO */
+	loff_t pos = filp->f_pos;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = dir_pages(inode);
+	/* f_pos is split into a page index (n) and a byte offset in that page */
+	unsigned char *types = NULL;
+	int ret;
+
+	if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
+		goto success;	/* no room left for even a minimal entry */
+
+	types = nilfs_filetype_table;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct nilfs_dir_entry *de;
+		struct page *page = nilfs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			nilfs_error(sb, __func__, "bad page in #%lu",
+				    inode->i_ino);
+			filp->f_pos += PAGE_CACHE_SIZE - offset;	/* skip page */
+			ret = -EIO;
+			goto done;
+		}
+		kaddr = page_address(page);
+		de = (struct nilfs_dir_entry *)(kaddr + offset);
+		limit = kaddr + nilfs_last_byte(inode, n) -
+			NILFS_DIR_REC_LEN(1);
+		for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
+			if (de->rec_len == 0) {	/* would never advance */
+				nilfs_error(sb, __func__,
+					    "zero-length directory entry");
+				ret = -EIO;
+				nilfs_put_page(page);
+				goto done;
+			}
+			if (de->inode) {	/* inode 0 marks an unused entry */
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (types && de->file_type < NILFS_FT_MAX)
+					d_type = types[de->file_type];
+
+				offset = (char *)de - kaddr;
+				over = filldir(dirent, de->name, de->name_len,
+						(n<<PAGE_CACHE_SHIFT) | offset,
+						le64_to_cpu(de->inode), d_type);
+				if (over) {	/* stop here; f_pos not advanced */
+					nilfs_put_page(page);
+					goto success;
+				}
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		nilfs_put_page(page);
+	}
+
+success:
+	ret = 0;
+done:
+	return ret;
+}
+
+/*
+ *	nilfs_find_entry()
+ *
+ * finds an entry in the specified directory with the wanted name. It
+ * returns the entry itself, and the page in which it was found through
+ * *res_page. Page is returned mapped and unlocked. Entry is guaranteed
+ * to be valid. Returns NULL, with *res_page set to NULL, if not found.
+ */
+struct nilfs_dir_entry *
+nilfs_find_entry(struct inode *dir, struct dentry *dentry,
+		 struct page **res_page)
+{
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = NILFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct nilfs_inode_info *ei = NILFS_I(dir);
+	struct nilfs_dir_entry *de;
+
+	/* clear the output first so every NULL return leaves it defined */
+	*res_page = NULL;
+
+	if (npages == 0)
+		goto out;
+
+	start = ei->i_dir_start_lookup;	/* resume at the page of the last hit */
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = nilfs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct nilfs_dir_entry *)kaddr;
+			kaddr += nilfs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->rec_len == 0) {
+					nilfs_error(dir->i_sb, __func__,
+						"zero-length directory entry");
+					nilfs_put_page(page);
+					goto out;
+				}
+				if (nilfs_match(namelen, name, de))
+					goto found;
+				de = nilfs_next_entry(de);
+			}
+			nilfs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;	/* wrap around to the first page */
+		/* next page is past the blocks we've got */
+		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+			nilfs_error(dir->i_sb, __func__,
+			       "dir %lu size %lld exceeds block count %llu",
+			       dir->i_ino, dir->i_size,
+			       (unsigned long long)dir->i_blocks);
+			goto out;
+		}
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	ei->i_dir_start_lookup = n;
+	return de;
+}
+
+struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
+{
+	struct nilfs_dir_entry *first;
+	struct page *page = nilfs_get_page(dir, 0);
+
+	if (IS_ERR(page))
+		return NULL;	/* *p is left untouched on failure */
+	/* the entry after the first one in page 0 is taken to be ".." */
+	first = (struct nilfs_dir_entry *)page_address(page);
+	*p = page;
+	return nilfs_next_entry(first);
+}
+
+ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	struct page *page;
+	struct nilfs_dir_entry *de = nilfs_find_entry(dir, dentry, &page);
+	ino_t ino;
+
+	if (!de)
+		return 0;	/* 0 means "no such entry" */
+
+	ino = le64_to_cpu(de->inode);
+	/* the entry's page came back mapped; unmap and release it */
+	nilfs_put_page(page);
+	return ino;
+}
+
+/* Point @de at @inode (number and file type).  Releases the page. */
+void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
+		    struct page *page, struct inode *inode)
+{
+	unsigned from = (char *) de - (char *) page_address(page);
+	unsigned to = from + le16_to_cpu(de->rec_len);
+	struct address_space *mapping = page->mapping;
+	int err;
+
+	lock_page(page);
+	err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
+	BUG_ON(err);
+	de->inode = cpu_to_le64(inode->i_ino);
+	nilfs_set_de_type(de, inode);
+	err = nilfs_commit_chunk(page, mapping, from, to);
+	nilfs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	/* NOTE(review): the nilfs_commit_chunk() result in err is not reported */
+	mark_inode_dirty(dir);
+}
+
+/*
+ * Add an entry for @inode under the name in @dentry.  Parent is locked.
+ */
+int nilfs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned chunk_size = nilfs_chunk_size(dir);
+	unsigned reclen = NILFS_DIR_REC_LEN(namelen);	/* space we need */
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct nilfs_dir_entry *de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	unsigned from, to;
+	int err;
+
+	/*
+	 * We take care of directory expansion in the same loop.
+	 * This code plays outside i_size, so it locks the page
+	 * to protect that region.
+	 */
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = nilfs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + nilfs_last_byte(dir, n);
+		de = (struct nilfs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;	/* last usable start */
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				/* We hit i_size */
+				name_len = 0;
+				rec_len = chunk_size;
+				de->rec_len = cpu_to_le16(chunk_size);
+				de->inode = 0;
+				goto got_it;
+			}
+			if (de->rec_len == 0) {
+				nilfs_error(dir->i_sb, __func__,
+					    "zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (nilfs_match(namelen, name, de))
+				goto out_unlock;
+			name_len = NILFS_DIR_REC_LEN(de->name_len);
+			rec_len = le16_to_cpu(de->rec_len);
+			if (!de->inode && rec_len >= reclen)	/* free slot */
+				goto got_it;
+			if (rec_len >= name_len + reclen)	/* has slack */
+				goto got_it;
+			de = (struct nilfs_dir_entry *)((char *)de + rec_len);
+		}
+		unlock_page(page);
+		nilfs_put_page(page);
+	}
+	BUG();	/* the loop above is expected to always leave via a goto */
+	return -EINVAL;
+
+got_it:
+	from = (char *)de - (char *)page_address(page);
+	to = from + rec_len;
+	err = nilfs_prepare_chunk(page, page->mapping, from, to);
+	if (err)
+		goto out_unlock;
+	if (de->inode) {	/* live entry: carve the new one out of its tail */
+		struct nilfs_dir_entry *de1;
+
+		de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
+		de1->rec_len = cpu_to_le16(rec_len - name_len);
+		de->rec_len = cpu_to_le16(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode = cpu_to_le64(inode->i_ino);
+	nilfs_set_de_type(de, inode);
+	err = nilfs_commit_chunk(page, page->mapping, from, to);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+/*	NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
+	mark_inode_dirty(dir);
+	/* OFFSET_CACHE */
+out_put:
+	nilfs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+/*
+ * nilfs_delete_entry deletes a directory entry by merging it with the
+ * previous entry. Page is up-to-date. Releases the page.
+ */
+int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
+{	/* NB: @dir is the entry to delete, not a directory inode */
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	char *kaddr = page_address(page);
+	unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
+	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	struct nilfs_dir_entry *pde = NULL;
+	struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
+	int err;
+
+	while ((char *)de < (char *)dir) {	/* find the entry before @dir */
+		if (de->rec_len == 0) {
+			nilfs_error(inode->i_sb, __func__,
+				    "zero-length directory entry");
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = nilfs_next_entry(de);
+	}
+	if (pde)
+		from = (char *)pde - (char *)page_address(page);
+	lock_page(page);
+	err = nilfs_prepare_chunk(page, mapping, from, to);
+	BUG_ON(err);
+	if (pde)	/* previous entry absorbs the deleted one */
+		pde->rec_len = cpu_to_le16(to - from);
+	dir->inode = 0;	/* inode 0 marks the entry as unused */
+	err = nilfs_commit_chunk(page, mapping, from, to);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+/*	NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
+	mark_inode_dirty(inode);
+out:
+	nilfs_put_page(page);
+	return err;
+}
+
+/*
+ * Set the first fragment of directory: write "." and ".." into chunk 0.
+ */
+int nilfs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
+	unsigned chunk_size = nilfs_chunk_size(inode);
+	struct nilfs_dir_entry *de;
+	int err;
+	void *kaddr;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
+	if (unlikely(err)) {
+		unlock_page(page);
+		goto fail;
+	}
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr, 0, chunk_size);
+	de = (struct nilfs_dir_entry *)kaddr;	/* first entry: "." */
+	de->name_len = 1;
+	de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
+	memcpy(de->name, ".\0\0", 4);
+	de->inode = cpu_to_le64(inode->i_ino);
+	nilfs_set_de_type(de, inode);
+
+	de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
+	de->name_len = 2;	/* second entry: "..", owning the rest of the chunk */
+	de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
+	de->inode = cpu_to_le64(parent->i_ino);
+	memcpy(de->name, "..\0", 4);
+	nilfs_set_de_type(de, inode);	/* NOTE(review): @inode, not @parent */
+	kunmap_atomic(kaddr, KM_USER0);
+	err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+/*
+ * routine to check that the specified directory is empty (for rmdir)
+ */
+int nilfs_empty_dir(struct inode *inode)
+{	/* returns 1 if only "." and ".." are live, 0 otherwise */
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct nilfs_dir_entry *de;
+
+		page = nilfs_get_page(inode, i);
+		if (IS_ERR(page))
+			continue;	/* unreadable pages are skipped */
+
+		kaddr = page_address(page);
+		de = (struct nilfs_dir_entry *)kaddr;
+		kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->rec_len == 0) {
+				nilfs_error(inode->i_sb, __func__,
+					    "zero-length directory entry "
+					    "(kaddr=%p, de=%p)", kaddr, de);
+				goto not_empty;
+			}
+			if (de->inode != 0) {
+				/* check for . and .. */
+				if (de->name[0] != '.')
+					goto not_empty;
+				if (de->name_len > 2)
+					goto not_empty;
+				if (de->name_len < 2) {
+					if (de->inode !=
+					    cpu_to_le64(inode->i_ino))
+						goto not_empty;
+				} else if (de->name[1] != '.')
+					goto not_empty;
+			}
+			de = nilfs_next_entry(de);
+		}
+		nilfs_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	nilfs_put_page(page);
+	return 0;
+}
+
+struct file_operations nilfs_dir_operations = {	/* directory file ops */
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= nilfs_readdir,
+	.unlocked_ioctl	= nilfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= nilfs_ioctl,	/* same handler as unlocked_ioctl */
+#endif	/* CONFIG_COMPAT */
+	.fsync		= nilfs_sync_file,
+
+};
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
new file mode 100644
index 00000000000..c6379e48278
--- /dev/null
+++ b/fs/nilfs2/direct.c
@@ -0,0 +1,436 @@
+/*
+ * direct.c - NILFS direct block pointer.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/errno.h>
+#include "nilfs.h"
+#include "page.h"
+#include "direct.h"
+#include "alloc.h"
+
+static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
+{	/* pointer slots start right after the nilfs_direct_node header */
+	return (__le64 *)
+		((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
+}
+
+static inline __u64
+nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
+{	/* read slot @key of the pointer array; no bounds check here */
+	return nilfs_bmap_dptr_to_ptr(nilfs_direct_dptrs(direct)[key]);
+}
+
+static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
+					__u64 key, __u64 ptr)
+{	/* store @ptr into slot @key of the pointer array; no bounds check */
+	nilfs_direct_dptrs(direct)[key] = nilfs_bmap_ptr_to_dptr(ptr);
+}
+
+static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
+			       __u64 key, int level, __u64 *ptrp)
+{
+	const struct nilfs_direct *direct = (const struct nilfs_direct *)bmap;
+	__u64 ptr;
+
+	/* any level other than 1 is absent; XXX: use macro for level 1 */
+	if (key > NILFS_DIRECT_KEY_MAX || level != 1)
+		return -ENOENT;
+
+	ptr = nilfs_direct_get_ptr(direct, key);
+	if (ptr == NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;	/* slot is in range but unmapped */
+	if (ptrp)
+		*ptrp = ptr;	/* ptrp may be NULL for a pure existence test */
+	return 0;
+}
+
+static __u64
+nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
+{
+	const struct nilfs_bmap *bmap = &direct->d_bmap;
+	__u64 target = nilfs_bmap_find_target_seq(bmap, key);
+
+	/*
+	 * Prefer the sequential-access hint; else use the block-group hint.
+	 */
+	if (target == NILFS_BMAP_INVALID_PTR)
+		target = nilfs_bmap_find_target_in_group(bmap);
+	return target;
+}
+
+static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
+				      __u64 key, __u64 ptr)
+{	/* record the most recent allocation (key and pointer) in the bmap */
+	direct->d_bmap.b_last_allocated_key = key;
+	direct->d_bmap.b_last_allocated_ptr = ptr;
+}
+
+static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
+				       __u64 key,
+				       union nilfs_bmap_ptr_req *req,
+				       struct nilfs_bmap_stats *stats)
+{	/* run the prepare_alloc_ptr hook on @req; sets bs_nblocks on success */
+	int ret;
+
+	if (direct->d_ops->dop_find_target != NULL)	/* optional hint hook */
+		req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
+	ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
+							       req);
+	if (ret < 0)
+		return ret;
+
+	stats->bs_nblocks = 1;
+	return 0;
+}
+
+static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
+				       union nilfs_bmap_ptr_req *req,
+				       __u64 key, __u64 ptr)
+{	/* second phase of an insert; returns nothing */
+	struct buffer_head *bh;
+
+	/* ptr must be a pointer to a buffer head. */
+	bh = (struct buffer_head *)((unsigned long)ptr);
+	set_buffer_nilfs_volatile(bh);
+
+	if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
+		direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
+			&direct->d_bmap, req);
+	nilfs_direct_set_ptr(direct, key, req->bpr_ptr);	/* not @ptr */
+
+	if (!nilfs_bmap_dirty(&direct->d_bmap))
+		nilfs_bmap_set_dirty(&direct->d_bmap);
+
+	if (direct->d_ops->dop_set_target != NULL)	/* optional hook */
+		direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
+}
+
+static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	struct nilfs_bmap_stats stats;
+	union nilfs_bmap_ptr_req req;
+	int err;
+
+	/* the key must be in range and its slot must still be empty */
+	if (key > NILFS_DIRECT_KEY_MAX)
+		return -ENOENT;
+	if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
+		return -EEXIST;
+
+	/* two-phase insert: commit is skipped entirely if prepare fails */
+	err = nilfs_direct_prepare_insert(direct, key, &req, &stats);
+	if (err < 0)
+		return err;
+	nilfs_direct_commit_insert(direct, &req, key, ptr);
+	nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
+	return 0;
+}
+
+static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
+				       union nilfs_bmap_ptr_req *req,
+				       __u64 key,
+				       struct nilfs_bmap_stats *stats)
+{	/* first phase of a delete; sets stats->bs_nblocks on success */
+	int ret;
+
+	if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
+		req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
+		ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
+			&direct->d_bmap, req);
+		if (ret < 0)
+			return ret;
+	}
+
+	stats->bs_nblocks = 1;
+	return 0;
+}
+
+static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
+				       union nilfs_bmap_ptr_req *req,
+				       __u64 key)
+{	/* second phase of a delete: run the optional hook, empty the slot */
+	if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
+		direct->d_bmap.b_pops->bpop_commit_end_ptr(
+			&direct->d_bmap, req);
+	nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+}
+
+static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	struct nilfs_bmap_stats stats;
+	union nilfs_bmap_ptr_req req;
+	int err;
+
+	/* out-of-range keys and empty slots are both "not found" */
+	if (key > NILFS_DIRECT_KEY_MAX)
+		return -ENOENT;
+	if (nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
+		return -ENOENT;
+
+	err = nilfs_direct_prepare_delete(direct, &req, key, &stats);
+	if (err < 0)
+		return err;
+	nilfs_direct_commit_delete(direct, &req, key);
+	nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
+	return 0;
+}
+
+static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+{
+	const struct nilfs_direct *direct = (const struct nilfs_direct *)bmap;
+	__u64 key;
+
+	/*
+	 * Walk the slots from the highest key downwards; the first mapped
+	 * slot met this way is the last key in use.
+	 */
+	for (key = NILFS_DIRECT_KEY_MAX + 1; key-- > NILFS_DIRECT_KEY_MIN; ) {
+		if (nilfs_direct_get_ptr(direct, key) !=
+		    NILFS_BMAP_INVALID_PTR) {
+			*keyp = key;
+			return 0;
+		}
+	}
+
+	return -ENOENT;	/* no slot is mapped; *keyp is left untouched */
+}
+
+static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
+{	/* non-zero when @key is beyond NILFS_DIRECT_KEY_MAX */
+	return key > NILFS_DIRECT_KEY_MAX;
+}
+
+static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
+				    __u64 *keys, __u64 *ptrs, int nitems)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+	int found = 0;
+	__u64 key;
+
+	/* never scan past the slots a direct mapping actually has */
+	if (nitems > NILFS_DIRECT_NBLOCKS)
+		nitems = NILFS_DIRECT_NBLOCKS;
+
+	for (key = 0; key < nitems; key++) {
+		__u64 ptr = nilfs_direct_get_ptr(direct, key);
+
+		if (ptr == NILFS_BMAP_INVALID_PTR)
+			continue;	/* unmapped slot */
+		keys[found] = key;
+		ptrs[found] = ptr;
+		found++;
+	}
+	return found;	/* number of (key, ptr) pairs stored */
+}
+
+int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
+				    __u64 key, __u64 *keys, __u64 *ptrs,
+				    int n, __u64 low, __u64 high)
+{	/* delete @key using the current ops, then turn @bmap into a direct map */
+	struct nilfs_direct *direct;
+	__le64 *dptrs;
+	int ret, i, j;
+
+	/* no need to allocate any resource for conversion */
+
+	/* delete */
+	ret = bmap->b_ops->bop_delete(bmap, key);
+	if (ret < 0)
+		return ret;
+
+	/* free resources */
+	if (bmap->b_ops->bop_clear != NULL)
+		bmap->b_ops->bop_clear(bmap);
+
+	/* convert: keys[] is consumed in step with i, so presumably ascending */
+	direct = (struct nilfs_direct *)bmap;
+	dptrs = nilfs_direct_dptrs(direct);
+	for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
+		if ((j < n) && (i == keys[j])) {
+			dptrs[i] = (i != key) ?	/* the deleted key stays empty */
+				nilfs_bmap_ptr_to_dptr(ptrs[j]) :
+				NILFS_BMAP_INVALID_PTR;
+			j++;
+		} else
+			dptrs[i] = NILFS_BMAP_INVALID_PTR;
+	}
+
+	nilfs_direct_init(bmap, low, high);	/* installs the direct ops */
+
+	return 0;
+}
+
+static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
+				    struct buffer_head *bh)
+{	/* propagate hook of the "_v" operation table */
+	union nilfs_bmap_ptr_req oldreq, newreq;
+	__u64 key;
+	__u64 ptr;
+	int ret;
+
+	key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
+	ptr = nilfs_direct_get_ptr(direct, key);
+	if (!buffer_nilfs_volatile(bh)) {	/* not yet flagged volatile */
+		oldreq.bpr_ptr = ptr;
+		newreq.bpr_ptr = ptr;
+		ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
+						&newreq);
+		if (ret < 0)
+			return ret;
+		nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
+		set_buffer_nilfs_volatile(bh);
+		nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
+	} else	/* already volatile: only mark the existing pointer dirty */
+		ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
+
+	return ret;
+}
+
+static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+				  struct buffer_head *bh)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+
+	/* a table without a propagate hook has nothing to do */
+	if (direct->d_ops->dop_propagate == NULL)
+		return 0;
+	return direct->d_ops->dop_propagate(direct, bh);
+}
+
+static int nilfs_direct_assign_v(struct nilfs_direct *direct,
+				 __u64 key, __u64 ptr,
+				 struct buffer_head **bh,
+				 sector_t blocknr,
+				 union nilfs_binfo *binfo)
+{	/* assign hook of the "_v" table; @bh is not used here */
+	union nilfs_bmap_ptr_req req;
+	int ret;
+
+	req.bpr_ptr = ptr;
+	ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
+		&direct->d_bmap, &req);
+	if (ret < 0)
+		return ret;
+	direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
+						     &req, blocknr);
+
+	binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
+	binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+
+	return 0;
+}
+
+static int nilfs_direct_assign_p(struct nilfs_direct *direct,
+				 __u64 key, __u64 ptr,
+				 struct buffer_head **bh,
+				 sector_t blocknr,
+				 union nilfs_binfo *binfo)
+{	/* assign hook of the "_p" table; @ptr and @bh are not used here */
+	nilfs_direct_set_ptr(direct, key, blocknr);	/* slot := block number */
+
+	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_dat.bi_level = 0;
+
+	return 0;
+}
+
+static int nilfs_direct_assign(struct nilfs_bmap *bmap,
+			       struct buffer_head **bh,
+			       sector_t blocknr,
+			       union nilfs_binfo *binfo)
+{	/* validate the key and slot of *bh, then call the dop_assign hook */
+	struct nilfs_direct *direct;
+	__u64 key;
+	__u64 ptr;
+
+	direct = (struct nilfs_direct *)bmap;
+	key = nilfs_bmap_data_get_key(bmap, *bh);
+	if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
+		printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
+		       (unsigned long long)key);
+		return -EINVAL;
+	}
+	ptr = nilfs_direct_get_ptr(direct, key);
+	if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {	/* slot is unmapped */
+		printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
+		       (unsigned long long)ptr);
+		return -EINVAL;
+	}
+
+	return direct->d_ops->dop_assign(direct, key, ptr, bh,
+					 blocknr, binfo);
+}
+
+static const struct nilfs_bmap_operations nilfs_direct_ops = {
+	.bop_lookup		=	nilfs_direct_lookup,
+	.bop_insert		=	nilfs_direct_insert,
+	.bop_delete		=	nilfs_direct_delete,
+	.bop_clear		=	NULL,	/* no hook provided */
+
+	.bop_propagate		=	nilfs_direct_propagate,
+
+	.bop_lookup_dirty_buffers	=	NULL,	/* no hook provided */
+
+	.bop_assign		=	nilfs_direct_assign,
+	.bop_mark		=	NULL,	/* no hook provided */
+
+	.bop_last_key		=	nilfs_direct_last_key,
+	.bop_check_insert	=	nilfs_direct_check_insert,
+	.bop_check_delete	=	NULL,	/* no hook provided */
+	.bop_gather_data	=	nilfs_direct_gather_data,
+};
+
+
+static const struct nilfs_direct_operations nilfs_direct_ops_v = {
+	.dop_find_target	=	nilfs_direct_find_target_v,
+	.dop_set_target		=	nilfs_direct_set_target_v,
+	.dop_propagate		=	nilfs_direct_propagate_v,
+	.dop_assign		=	nilfs_direct_assign_v,
+};	/* selected by nilfs_direct_init() for every inode but the DAT */
+
+static const struct nilfs_direct_operations nilfs_direct_ops_p = {
+	.dop_find_target	=	NULL,	/* callers test for NULL */
+	.dop_set_target		=	NULL,	/* callers test for NULL */
+	.dop_propagate		=	NULL,	/* callers test for NULL */
+	.dop_assign		=	nilfs_direct_assign_p,
+};	/* selected by nilfs_direct_init() for the DAT inode */
+
+int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
+{
+	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
+
+	bmap->b_ops = &nilfs_direct_ops;
+	bmap->b_low = low;
+	bmap->b_high = high;
+
+	/*
+	 * The bmap of the DAT inode gets the "_p" operation table; every
+	 * other inode gets the "_v" table.
+	 */
+	if (bmap->b_inode->i_ino == NILFS_DAT_INO)
+		direct->d_ops = &nilfs_direct_ops_p;
+	else
+		direct->d_ops = &nilfs_direct_ops_v;
+
+	return 0;	/* always succeeds */
+}
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
new file mode 100644
index 00000000000..45d2c5cda81
--- /dev/null
+++ b/fs/nilfs2/direct.h
@@ -0,0 +1,78 @@
+/*
+ * direct.h - NILFS direct block pointer.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_DIRECT_H
+#define _NILFS_DIRECT_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include "bmap.h"
+
+
+struct nilfs_direct;
+
+/**
+ * struct nilfs_direct_operations - direct mapping operation table
+ */
+struct nilfs_direct_operations {
+	__u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
+	void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
+	int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
+	int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
+			  struct buffer_head **, sector_t,
+			  union nilfs_binfo *);
+};	/* reached through nilfs_direct.d_ops; entries may be NULL */
+
+/**
+ * struct nilfs_direct_node - direct node
+ * @dn_flags: flags
+ * @pad: padding
+ */
+struct nilfs_direct_node {
+	__u8 dn_flags;
+	__u8 pad[7];
+};
+
+/**
+ * struct nilfs_direct - direct mapping
+ * @d_bmap: bmap structure
+ * @d_ops: direct mapping operation table
+ */
+struct nilfs_direct {
+	/* must stay first: nilfs_direct_init() casts the bmap pointer */
+	struct nilfs_bmap d_bmap;
+	/* direct-mapping-specific members */
+	const struct nilfs_direct_operations *d_ops;
+};
+
+
+#define NILFS_DIRECT_NBLOCKS	(NILFS_BMAP_SIZE / sizeof(__le64) - 1)
+#define NILFS_DIRECT_KEY_MIN	0
+#define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
+
+
+int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
+int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
+				    __u64 *, int, __u64, __u64);
+
+
+#endif	/* _NILFS_DIRECT_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
new file mode 100644
index 00000000000..6bd84a0d823
--- /dev/null
+++ b/fs/nilfs2/file.c
@@ -0,0 +1,160 @@
+/*
+ * file.c - NILFS regular file handling primitives including fsync().
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Amagai Yoshiji <amagai@osrg.net>,
+ *            Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/writeback.h>
+#include "nilfs.h"
+#include "segment.h"
+
+int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+{
+	/*
+	 * Called from the fsync() system call.
+	 *
+	 * A clean inode needs no work and yields 0.  Otherwise the result
+	 * of the segment constructor is returned: the dsync variant over
+	 * the range 0..LLONG_MAX when @datasync is set, the plain one
+	 * (called with just the super block) when it is not.
+	 *
+	 * This function should be revisited once the writeback function
+	 * is implemented.
+	 */
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+
+	if (!nilfs_inode_dirty(inode))
+		return 0;
+
+	if (!datasync)
+		return nilfs_construct_segment(sb);
+
+	return nilfs_construct_dsync_segment(sb, inode, 0, LLONG_MAX);
+}
+
+static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct nilfs_transaction_info ti;
+	int ret;
+
+	if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
+		return VM_FAULT_SIGBUS; /* -ENOSPC */
+
+	lock_page(page);
+	if (page->mapping != inode->i_mapping ||
+	    page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
+		unlock_page(page);
+		return VM_FAULT_NOPAGE; /* make the VM retry the fault */
+	}
+
+	/*
+	 * Fast path: the page is already fully mapped (no holes to fill).
+	 */
+	if (PageMappedToDisk(page)) {
+		unlock_page(page);
+		goto mapped;
+	}
+	if (page_has_buffers(page)) {
+		struct buffer_head *bh, *head;
+		int fully_mapped = 1;
+
+		bh = head = page_buffers(page);
+		do {	/* walk the page's circular buffer list once */
+			if (!buffer_mapped(bh)) {
+				fully_mapped = 0;
+				break;
+			}
+		} while (bh = bh->b_this_page, bh != head);
+
+		if (fully_mapped) {
+			SetPageMappedToDisk(page);	/* cache the result */
+			unlock_page(page);
+			goto mapped;
+		}
+	}
+	unlock_page(page);
+
+	/*
+	 * Slow path: fill hole blocks inside a nilfs transaction.
+	 */
+	ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+	/* never returns -ENOMEM, but may return -ENOSPC */
+	if (unlikely(ret))
+		return VM_FAULT_SIGBUS;
+
+	ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
+	if (unlikely(ret)) {
+		nilfs_transaction_abort(inode->i_sb);
+		return ret;
+	}
+	nilfs_transaction_commit(inode->i_sb);
+
+ mapped:
+	SetPageChecked(page);
+	wait_on_page_writeback(page);
+	return 0;
+}
+
+struct vm_operations_struct nilfs_file_vm_ops = {
+	.fault		= filemap_fault,
+	.page_mkwrite	= nilfs_page_mkwrite,
+};	/* attached to each vma by nilfs_file_mmap() */
+
+static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	file_accessed(file);	/* mmap() counts as an access to the file */
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+	vma->vm_ops = &nilfs_file_vm_ops;
+	return 0;
+}
+
+/*
+ * Regular-file operations: generic VFS helpers for seek/read/write/open,
+ * nilfs-specific handlers for ioctl, mmap and fsync.
+ */
+struct file_operations nilfs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.unlocked_ioctl	= nilfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= nilfs_ioctl,
+#endif	/* CONFIG_COMPAT */
+	.mmap		= nilfs_file_mmap,
+	.open		= generic_file_open,
+	/* .release	= nilfs_release_file, */
+	.fsync		= nilfs_sync_file,
+	.splice_read	= generic_file_splice_read,
+};
+
+struct inode_operations nilfs_file_inode_operations = {
+	.permission	= nilfs_permission,
+	.setattr	= nilfs_setattr,
+	.truncate	= nilfs_truncate,
+};
+
+/* end of file */
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
new file mode 100644
index 00000000000..93383c5cee9
--- /dev/null
+++ b/fs/nilfs2/gcdat.c
@@ -0,0 +1,84 @@
+/*
+ * gcdat.c - NILFS shadow DAT inode for GC
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
+ *            and Ryusuke Konishi <ryusuke@osrg.net>.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include "nilfs.h"
+#include "page.h"
+#include "mdt.h"
+
+int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
+{
+	struct inode *src = nilfs->ns_dat, *shadow = nilfs->ns_gc_dat;
+	struct nilfs_inode_info *sii = NILFS_I(src), *gii = NILFS_I(shadow);
+	int ret;
+
+	/* mirror the DAT inode's state into the shadow (GC) DAT inode */
+	shadow->i_state = 0;
+	shadow->i_blocks = src->i_blocks;
+	gii->i_flags = sii->i_flags;
+	gii->i_state = sii->i_state | (1 << NILFS_I_GCDAT);
+	gii->i_cno = 0;
+	nilfs_bmap_init_gcdat(gii->i_bmap, sii->i_bmap);
+	ret = nilfs_copy_dirty_pages(shadow->i_mapping, src->i_mapping);
+	if (likely(!ret))
+		ret = nilfs_copy_dirty_pages(&gii->i_btnode_cache,
+					     &sii->i_btnode_cache);
+	return ret;
+}
+
+void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
+{
+	struct inode *dat = nilfs->ns_dat;
+	struct inode *gcdat = nilfs->ns_gc_dat;
+	struct nilfs_inode_info *dii = NILFS_I(dat);
+	struct nilfs_inode_info *gii = NILFS_I(gcdat);
+
+	down_write(&NILFS_MDT(dat)->mi_sem);
+
+	/* inode state: adopt the shadow's values minus the GCDAT marker */
+	dat->i_blocks = gcdat->i_blocks;
+	dii->i_flags = gii->i_flags;
+	dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
+	nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
+
+	/* page caches; mdt dirty flags should be cleared by segctor */
+	nilfs_clear_dirty_pages(dat->i_mapping);
+	nilfs_copy_back_pages(dat->i_mapping, gcdat->i_mapping);
+	nilfs_clear_dirty_pages(&dii->i_btnode_cache);
+	nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
+
+	up_write(&NILFS_MDT(dat)->mi_sem);
+}
+
+void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
+{
+	struct inode *shadow = nilfs->ns_gc_dat;
+	struct nilfs_inode_info *shadow_ii = NILFS_I(shadow);
+
+	/* reset the shadow inode, then drop all of its cached pages */
+	shadow->i_state = I_CLEAR;
+	shadow_ii->i_flags = 0;
+	truncate_inode_pages(shadow->i_mapping, 0);
+	truncate_inode_pages(&shadow_ii->i_btnode_cache, 0);
+}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 00000000000..19d2102b6a6
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
+/*
+ * gcinode.c - dummy inodes to buffer blocks for garbage collection
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
+ *            and Ryusuke Konishi <ryusuke@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ *
+ */
+/*
+ * This file adds the cache of on-disk blocks to be moved in garbage
+ * collection.  The disk blocks are held with dummy inodes (called
+ * gcinodes), and this file provides lookup function of the dummy
+ * inodes and their buffer read function.
+ *
+ * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it
+ * has to treat blocks that belong to the same file but have different
+ * checkpoint numbers.  To avoid interference among generations, dummy
+ * inodes are managed separately from actual inodes, and their lookup
+ * function (nilfs_gc_iget) is designed to be specified with a
+ * checkpoint number argument as well as an inode number.
+ *
+ * Buffers and pages held by the dummy inodes will be released each
+ * time after they are copied to a new log.  Dirty blocks made on the
+ * current generation and the blocks to be moved by GC never overlap
+ * because the dirty blocks make a new generation; they rather must be
+ * written individually.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include "nilfs.h"
+#include "page.h"
+#include "mdt.h"
+#include "dat.h"
+#include "ifile.h"
+
+static const struct address_space_operations def_gcinode_aops = {};
+/* (empty a_ops table for gc inodes) XXX need def_gcinode_iops/fops? */
+
+/*
+ * nilfs_gccache_submit_read_data() - add data buffer and submit read request
+ * @inode - gc inode
+ * @blkoff - dummy offset treated as the key for the page cache
+ * @pbn - physical block number of the block, 0 to look it up from @vbn
+ * @vbn - virtual block number of the block, 0 for non-virtual block
+ * @out_bh - indirect pointer to a buffer_head struct to receive the results
+ *
+ * Description: nilfs_gccache_submit_read_data() registers the data buffer
+ * specified by @pbn to the GC pagecache with the key @blkoff.
+ * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The block specified with @pbn does not exist.
+ */
+int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
+				   sector_t pbn, __u64 vbn,
+				   struct buffer_head **out_bh)
+{
+	struct buffer_head *bh;
+	int err;
+
+	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
+	if (unlikely(!bh))
+		return -ENOMEM;
+
+	if (buffer_uptodate(bh))
+		goto out;
+
+	if (pbn == 0) {
+		struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
+					  /* use original dat, not gc dat. */
+		err = nilfs_dat_translate(dat_inode, vbn, &pbn);
+		if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
+			brelse(bh);
+			goto failed;
+		}
+	}
+
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		goto out;
+	}
+
+	if (!buffer_mapped(bh)) {
+		bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+		set_buffer_mapped(bh);
+	}
+	bh->b_blocknr = pbn;
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	if (vbn)
+		bh->b_blocknr = vbn;
+ out:
+	err = 0;
+	*out_bh = bh;
+	/* NOTE(review): "failed" is also reached after brelse(bh) above */
+ failed:
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	return err;
+}
+
+/*
+ * nilfs_gccache_submit_read_node() - add node buffer and submit read request
+ * @inode - gc inode
+ * @pbn - physical block number for the block
+ * @vbn - virtual block number for the block
+ * @out_bh - indirect pointer to a buffer_head struct to receive the results
+ *
+ * Description: nilfs_gccache_submit_read_node() registers the node buffer
+ * specified by @vbn to the GC pagecache.  @pbn can be supplied by the
+ * caller to avoid translation of the disk block address.
+ *
+ * Return Value: On success, 0 is returned. On Error, one of the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
+				   __u64 vbn, struct buffer_head **out_bh)
+{
+	struct address_space *btnc = &NILFS_I(inode)->i_btnode_cache;
+	__u64 key = vbn ? vbn : pbn;	/* cache key: @vbn unless it is 0 */
+	int err = nilfs_btnode_submit_block(btnc, key, pbn, out_bh, 0);
+
+	return err == -EEXIST ? 0 : err;	/* -EEXIST: cache hit */
+}
+
+int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
+{
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh))
+		return -EIO;		/* not up to date after waiting */
+	if (buffer_dirty(bh))
+		return -EEXIST;		/* already dirty */
+	/* node buffers and other buffers have separate dirty helpers */
+	if (!buffer_nilfs_node(bh))
+		nilfs_mdt_mark_buffer_dirty(bh);
+	else
+		nilfs_btnode_mark_dirty(bh);
+	return 0;
+}
+
+/*
+ * nilfs_init_gccache() - allocate and initialize gc_inode hash table
+ * @nilfs - the_nilfs
+ *
+ * Return Value: On success, 0.
+ * On error, a negative error code is returned.
+ */
+int nilfs_init_gccache(struct the_nilfs *nilfs)
+{
+	struct hlist_head *table;
+	int i;
+
+	BUG_ON(nilfs->ns_gc_inodes_h);
+
+	INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
+
+	table = kmalloc(sizeof(*table) * NILFS_GCINODE_HASH_SIZE, GFP_NOFS);
+	nilfs->ns_gc_inodes_h = table;
+	if (!table)
+		return -ENOMEM;
+
+	for (i = 0; i < NILFS_GCINODE_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(table + i);
+	return 0;
+}
+
+/*
+ * nilfs_destroy_gccache() - free gc_inode hash table
+ * @nilfs - the nilfs
+ */
+void nilfs_destroy_gccache(struct the_nilfs *nilfs)
+{
+	if (!nilfs->ns_gc_inodes_h)
+		return;		/* no hash table to free */
+	nilfs_remove_all_gcinode(nilfs);
+	kfree(nilfs->ns_gc_inodes_h);
+	nilfs->ns_gc_inodes_h = NULL;
+}
+
+static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
+				   __u64 cno)
+{
+	struct nilfs_inode_info *gc_ii;
+	struct inode *gc_inode;
+
+	gc_inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
+	if (!gc_inode)
+		return NULL;
+
+	gc_inode->i_op = NULL;
+	gc_inode->i_fop = NULL;
+	gc_inode->i_mapping->a_ops = &def_gcinode_aops;	/* empty table */
+
+	gc_ii = NILFS_I(gc_inode);
+	gc_ii->i_cno = cno;
+	gc_ii->i_flags = 0;
+	gc_ii->i_state = 1 << NILFS_I_GCINODE;
+	gc_ii->i_bh = NULL;
+	nilfs_bmap_init_gc(gc_ii->i_bmap);
+	return gc_inode;
+}
+
+static unsigned long ihash(ino_t ino, __u64 cno)
+{
+	unsigned long key = (unsigned long)((ino << 2) + cno);
+	return hash_long(key, NILFS_GCINODE_HASH_BITS);
+}
+
+/*
+ * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
+ */
+struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
+{
+	struct hlist_head *bucket = nilfs->ns_gc_inodes_h + ihash(ino, cno);
+	struct hlist_node *pos;
+	struct inode *gc_inode;
+
+	/* reuse an existing gc inode for this (ino, cno) pair if any */
+	hlist_for_each_entry(gc_inode, pos, bucket, i_hash)
+		if (gc_inode->i_ino == ino && NILFS_I(gc_inode)->i_cno == cno)
+			return gc_inode;
+
+	gc_inode = alloc_gcinode(nilfs, ino, cno);
+	if (unlikely(!gc_inode))
+		return NULL;
+	hlist_add_head(&gc_inode->i_hash, bucket);
+	list_add(&NILFS_I(gc_inode)->i_dirty, &nilfs->ns_gc_inodes);
+	return gc_inode;
+}
+
+/*
+ * nilfs_clear_gcinode() - clear and free a gc inode
+ */
+void nilfs_clear_gcinode(struct inode *inode)
+{
+	nilfs_mdt_clear(inode);		/* clear first ... */
+	nilfs_mdt_destroy(inode);	/* ... then free; do not use @inode after */
+}
+
+/*
+ * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs
+ */
+void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
+{
+	struct hlist_head *tbl = nilfs->ns_gc_inodes_h;
+	struct hlist_node *node, *n;
+	struct inode *inode;
+	int i;
+
+	for (i = 0; i < NILFS_GCINODE_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(inode, node, n, &tbl[i], i_hash) {
+			hlist_del_init(&inode->i_hash);
+			list_del_init(&NILFS_I(inode)->i_dirty);
+			nilfs_clear_gcinode(inode); /* might sleep */
+		}
+	}
+}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
new file mode 100644
index 00000000000..de86401f209
--- /dev/null
+++ b/fs/nilfs2/ifile.c
@@ -0,0 +1,150 @@
+/*
+ * ifile.c - NILFS inode file
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Amagai Yoshiji <amagai@osrg.net>.
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "alloc.h"
+#include "ifile.h"
+
+/**
+ * nilfs_ifile_create_inode - create a new disk inode
+ * @ifile: ifile inode
+ * @out_ino: pointer to a variable to store inode number
+ * @out_bh: buffer_head contains newly allocated disk inode
+ *
+ * Return Value: On success, 0 is returned and the newly allocated inode
+ * number is stored in the place pointed by @out_ino, and buffer_head pointer
+ * that contains newly allocated disk inode structure is stored in the
+ * place pointed by @out_bh
+ * On error, one of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - No inode left.
+ */
+int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
+			     struct buffer_head **out_bh)
+{
+	struct nilfs_palloc_req req;
+	int err;
+
+	/* entry number 0: search for a free inode from the start of a group */
+	req.pr_entry_nr = 0;
+	req.pr_entry_bh = NULL;
+
+	err = nilfs_palloc_prepare_alloc_entry(ifile, &req);
+	if (err == 0) {
+		err = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
+						   &req.pr_entry_bh);
+		if (err < 0)
+			nilfs_palloc_abort_alloc_entry(ifile, &req);
+	}
+	if (err < 0) {
+		brelse(req.pr_entry_bh);
+		return err;
+	}
+	nilfs_palloc_commit_alloc_entry(ifile, &req);
+	nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+	nilfs_mdt_mark_dirty(ifile);
+	*out_ino = (ino_t)req.pr_entry_nr;
+	*out_bh = req.pr_entry_bh;
+	return 0;
+}
+
+/**
+ * nilfs_ifile_delete_inode - delete a disk inode
+ * @ifile: ifile inode
+ * @ino: inode number
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The inode number @ino has not been allocated.
+ */
+int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
+{
+	struct nilfs_palloc_req req = {
+		.pr_entry_nr = ino, .pr_entry_bh = NULL
+	};
+	struct nilfs_inode *raw_inode;
+	void *kaddr;
+	int ret;
+
+	ret = nilfs_palloc_prepare_free_entry(ifile, &req);
+	if (!ret) {
+		ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
+						   &req.pr_entry_bh);
+		if (ret < 0)
+			nilfs_palloc_abort_free_entry(ifile, &req);
+	}
+	if (ret < 0) {
+		brelse(req.pr_entry_bh);
+		return ret;
+	}
+	/* zero i_flags of the on-disk inode through a short atomic mapping */
+	kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
+	raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
+						 req.pr_entry_bh, kaddr);
+	raw_inode->i_flags = 0;
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+	brelse(req.pr_entry_bh);
+	/* the buffer reference is dropped before the free is committed */
+	nilfs_palloc_commit_free_entry(ifile, &req);
+
+	return 0;
+}
+
+int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
+				struct buffer_head **out_bh)
+{
+	struct super_block *sb = ifile->i_sb;
+	int err;
+
+	if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
+		nilfs_error(sb, __func__, "bad inode number: %lu",
+			    (unsigned long) ino);
+		return -EINVAL;
+	}
+
+	err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
+	if (likely(!err))
+		return 0;
+	/* -EINVAL from the lookup is reported as a broken ifile */
+	if (err != -EINVAL)
+		nilfs_warning(sb, __func__, "unable to read inode: %lu",
+			      (unsigned long) ino);
+	else
+		nilfs_error(sb, __func__, "ifile is broken");
+	return err;
+}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
new file mode 100644
index 00000000000..5d30a35679b
--- /dev/null
+++ b/fs/nilfs2/ifile.h
@@ -0,0 +1,53 @@
+/*
+ * ifile.h - NILFS inode file
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Amagai Yoshiji <amagai@osrg.net>
+ * Revised by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#ifndef _NILFS_IFILE_H
+#define _NILFS_IFILE_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+#include "alloc.h"
+
+#define NILFS_IFILE_GFP  NILFS_MDT_GFP
+
+static inline struct nilfs_inode *
+nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
+{
+	return nilfs_palloc_block_get_entry(ifile, ino, ibh,
+					    kmap(ibh->b_page));
+}
+
+static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
+					   struct buffer_head *ibh)
+{
+	kunmap(ibh->b_page);	/* @ifile and @ino are not used */
+}
+
+int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
+int nilfs_ifile_delete_inode(struct inode *, ino_t);
+int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
+
+#endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
new file mode 100644
index 00000000000..49ab4a49bb4
--- /dev/null
+++ b/fs/nilfs2/inode.c
@@ -0,0 +1,785 @@
+/*
+ * inode.c - NILFS inode operations.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/uio.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "page.h"
+#include "mdt.h"
+#include "cpfile.h"
+#include "ifile.h"
+
+
+/**
+ * nilfs_get_block() - get a file block on the filesystem (callback function)
+ * @inode - inode struct of the target file
+ * @blkoff - file block number
+ * @bh_result - buffer head to be mapped on
+ * @create - indicate whether allocating the block or not when it has not
+ *      been allocated yet.
+ *
+ * This function does not issue actual read request of the specified data
+ * block. It is done by VFS.
+ * Bulk read for direct-io is not supported yet. (should be supported)
+ */
+int nilfs_get_block(struct inode *inode, sector_t blkoff,
+		    struct buffer_head *bh_result, int create)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	unsigned long blknum = 0;
+	int err = 0, ret;
+	struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
+
+	/* This exclusion control is a workaround; should be revised */
+	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	if (ret == 0) {	/* found */
+		map_bh(bh_result, inode->i_sb, blknum);
+		goto out;
+	}
+	/* lookup failed: -ENOENT means the block is not mapped yet */
+	if (ret == -ENOENT && create) {
+		struct nilfs_transaction_info ti;
+
+		bh_result->b_blocknr = 0;
+		err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
+		if (unlikely(err))
+			goto out;
+		err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
+					(unsigned long)bh_result);
+		if (unlikely(err != 0)) {
+			if (err == -EEXIST) {
+				/*
+				 * The get_block() function could be called
+				 * from multiple callers for an inode.
+				 * However, the page having this block must
+				 * be locked in this case.
+				 */
+				printk(KERN_WARNING
+				       "nilfs_get_block: a race condition "
+				       "while inserting a data block. "
+				       "(inode number=%lu, file block "
+				       "offset=%llu)\n",
+				       inode->i_ino,
+				       (unsigned long long)blkoff);
+				err = 0;	/* not treated as an error */
+			} else if (err == -EINVAL) {
+				nilfs_error(inode->i_sb, __func__,
+					    "broken bmap (inode=%lu)\n",
+					    inode->i_ino);
+				err = -EIO;
+			}
+			nilfs_transaction_abort(inode->i_sb);
+			goto out;
+		}
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+		/* Error handling should be detailed */
+		set_buffer_new(bh_result);
+		map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
+						      to proper value */
+	} else if (ret == -ENOENT) {
+		/* not found is not error (e.g. hole); must return without
+		   the mapped state flag. */
+		;
+	} else {
+		err = ret;
+	}
+
+ out:
+	return err;
+}
+
+/**
+ * nilfs_readpage() - implement readpage() method of nilfs_aops {}
+ * address_space_operations.
+ * @file - file struct of the file to be read
+ * @page - the page to be read
+ */
+static int nilfs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, nilfs_get_block);	/* @file is unused */
+}
+
+/**
+ * nilfs_readpages() - implement readpages() method of nilfs_aops {}
+ * address_space_operations.
+ * @file - file struct of the file to be read
+ * @mapping - address_space struct used for reading multiple pages
+ * @pages - the pages to be read
+ * @nr_pages - number of pages to be read
+ */
+static int nilfs_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{	/* @file is unused */
+	return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
+}
+
+static int nilfs_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+
+	/* only WB_SYNC_ALL writeback is acted upon here */
+	if (wbc->sync_mode != WB_SYNC_ALL)
+		return 0;
+
+	return nilfs_construct_dsync_segment(inode->i_sb, inode,
+					     wbc->range_start, wbc->range_end);
+}
+
+static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+
+	/* the page itself is only redirtied and unlocked here */
+	redirty_page_for_writepage(wbc, page);
+	unlock_page(page);
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return nilfs_construct_segment(sb);
+
+	if (wbc->for_reclaim)
+		nilfs_flush_segment(sb, inode->i_ino);
+
+	return 0;
+}
+
+static int nilfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode;
+	int ret = __set_page_dirty_buffers(page);
+
+	if (!ret)
+		return ret;
+	/* account the number of blocks that fit in one page */
+	inode = page->mapping->host;
+	nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode,
+			     1 << (PAGE_SHIFT - inode->i_blkbits));
+	return ret;
+}
+
+static int nilfs_write_begin(struct file *file, struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata)
+{
+	struct super_block *sb = mapping->host->i_sb;
+	int err;
+
+	err = nilfs_transaction_begin(sb, NULL, 1);
+	if (unlikely(err))
+		return err;
+
+	*pagep = NULL;
+	err = block_write_begin(file, mapping, pos, len, flags, pagep,
+				fsdata, nilfs_get_block);
+	if (unlikely(err))
+		nilfs_transaction_abort(sb);	/* undo the begin above */
+	return err;
+}
+
+static int nilfs_write_end(struct file *file, struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned nclean;
+	int err;
+
+	/* count the clean buffers before calling generic_write_end() */
+	nclean = nilfs_page_count_clean_buffers(page, start, start + copied);
+	copied = generic_write_end(file, mapping, pos, len, copied, page,
+				   fsdata);
+	nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nclean);
+	err = nilfs_transaction_commit(inode->i_sb);
+	return err ? err : copied;
+}
+
+static ssize_t
+nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+		loff_t offset, unsigned long nr_segs)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+	/*
+	 * Writes are not handled here; 0 is returned for them.
+	 */
+	if (rw == WRITE)
+		return 0;
+
+	/* Needs synchronization with the cleaner */
+	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				  offset, nr_segs, nilfs_get_block, NULL);
+}
+
+/*
+ * Address space operations that __nilfs_read_inode() installs on
+ * i_mapping->a_ops for regular files, directories and symlinks.
+ * The sync_page and releasepage hooks are left commented out.
+ */
+struct address_space_operations nilfs_aops = {
+	.writepage		= nilfs_writepage,
+	.readpage		= nilfs_readpage,
+	/* .sync_page		= nilfs_sync_page, */
+	.writepages		= nilfs_writepages,
+	.set_page_dirty		= nilfs_set_page_dirty,
+	.readpages		= nilfs_readpages,
+	.write_begin		= nilfs_write_begin,
+	.write_end		= nilfs_write_end,
+	/* .releasepage		= nilfs_releasepage, */
+	.invalidatepage		= block_invalidatepage,
+	.direct_IO		= nilfs_direct_IO,
+};
+
+/*
+ * nilfs_new_inode - allocate and initialize a new in-core inode
+ * @dir: parent directory; supplies the super block, the inherited nilfs
+ *       flags and, when its S_ISGID bit is set, the group id
+ * @mode: file type and permission bits for the new inode
+ *
+ * Gets a VFS inode from new_inode(), obtains an inode number and the
+ * inode block buffer from nilfs_ifile_create_inode(), fills in ownership,
+ * timestamps, flags and generation, hashes the inode and marks it dirty.
+ *
+ * Returns the new inode, or an ERR_PTR() value: -ENOMEM when new_inode()
+ * fails, otherwise the error code of the step that failed.
+ */
+struct inode *nilfs_new_inode(struct inode *dir, int mode)
+{
+	struct super_block *sb = dir->i_sb;
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct inode *inode;
+	struct nilfs_inode_info *ii;
+	int err = -ENOMEM;
+	ino_t ino;
+
+	inode = new_inode(sb);
+	if (unlikely(!inode))
+		goto failed;
+
+	/* clear __GFP_FS from the allocation mask of the page cache mapping */
+	mapping_set_gfp_mask(inode->i_mapping,
+			     mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+
+	ii = NILFS_I(inode);
+	ii->i_state = 1 << NILFS_I_NEW;
+
+	err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
+	if (unlikely(err))
+		goto failed_ifile_create_inode;
+	/* reference count of i_bh inherits from nilfs_mdt_read_block() */
+
+	atomic_inc(&sbi->s_inodes_count);
+
+	inode->i_uid = current_fsuid();
+	/* setgid parent: take its group and keep S_ISGID on new directories */
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+
+	inode->i_mode = mode;
+	inode->i_ino = ino;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
+		err = nilfs_bmap_read(ii->i_bmap, NULL);
+		if (err < 0)
+			goto failed_bmap;
+
+		set_bit(NILFS_I_BMAP, &ii->i_state);
+		/* No lock is needed; iget() ensures it. */
+	}
+
+	/*
+	 * Flags start as a copy of the parent's; symlinks drop the
+	 * immutable/append flags and non-directories drop dirsync.
+	 */
+	ii->i_flags = NILFS_I(dir)->i_flags;
+	if (S_ISLNK(mode))
+		ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
+	if (!S_ISDIR(mode))
+		ii->i_flags &= ~NILFS_DIRSYNC_FL;
+
+	/* ii->i_file_acl = 0; */
+	/* ii->i_dir_acl = 0; */
+	ii->i_dir_start_lookup = 0;
+#ifdef CONFIG_NILFS_FS_POSIX_ACL
+	ii->i_acl = NULL;
+	ii->i_default_acl = NULL;
+#endif
+	ii->i_cno = 0;
+	nilfs_set_inode_flags(inode);
+	/* generation numbers are handed out under s_next_gen_lock */
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+	insert_inode_hash(inode);
+
+	err = nilfs_init_acl(inode, dir);
+	if (unlikely(err))
+		goto failed_acl; /* never occur. When supporting
+				    nilfs_init_acl(), proper cancellation of
+				    above jobs should be considered */
+
+	mark_inode_dirty(inode);
+	return inode;
+
+ failed_acl:
+ failed_bmap:
+	inode->i_nlink = 0;
+	iput(inode);  /* raw_inode will be deleted through
+			 generic_delete_inode() */
+	goto failed;
+
+ failed_ifile_create_inode:
+	make_bad_inode(inode);
+	iput(inode);  /* if i_nlink == 1, generic_forget_inode() will be
+			 called */
+ failed:
+	return ERR_PTR(err);
+}
+
+/*
+ * nilfs_free_inode - release an inode and its entry in the ifile
+ * @inode: inode to free
+ *
+ * Calls clear_inode(), deletes the inode number from the ifile (the
+ * result of that call is deliberately ignored) and decrements the
+ * per-super-block inode counter.
+ */
+void nilfs_free_inode(struct inode *inode)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
+
+	clear_inode(inode);
+
+	/* XXX: check error code? Is there any thing I can do? */
+	(void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
+
+	atomic_dec(&sbi->s_inodes_count);
+}
+
+/*
+ * nilfs_set_inode_flags - mirror the nilfs inode flags into inode->i_flags
+ * @inode: inode to update
+ *
+ * Clears S_SYNC, S_APPEND, S_IMMUTABLE, S_NOATIME and S_DIRSYNC and sets
+ * each one again when the matching NILFS_*_FL bit is present.  When
+ * NILFS_ATIME_DISABLE is defined S_NOATIME is set unconditionally.
+ * __GFP_FS is also cleared from the mapping's allocation mask.
+ */
+void nilfs_set_inode_flags(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	unsigned int nilfs_flags = NILFS_I(inode)->i_flags;
+
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+			    S_DIRSYNC);
+
+	if (nilfs_flags & NILFS_SYNC_FL)
+		inode->i_flags |= S_SYNC;
+
+	if (nilfs_flags & NILFS_APPEND_FL)
+		inode->i_flags |= S_APPEND;
+
+	if (nilfs_flags & NILFS_IMMUTABLE_FL)
+		inode->i_flags |= S_IMMUTABLE;
+
+#ifdef NILFS_ATIME_DISABLE
+	inode->i_flags |= S_NOATIME;
+#else
+	if (nilfs_flags & NILFS_NOATIME_FL)
+		inode->i_flags |= S_NOATIME;
+#endif
+
+	if (nilfs_flags & NILFS_DIRSYNC_FL)
+		inode->i_flags |= S_DIRSYNC;
+
+	mapping_set_gfp_mask(mapping, mapping_gfp_mask(mapping) & ~__GFP_FS);
+}
+
+/*
+ * nilfs_read_inode_common - copy an on-disk inode into the in-core inode
+ * @inode: in-core inode to fill
+ * @raw_inode: little-endian on-disk inode to read from
+ *
+ * Converts the raw fields to CPU byte order and stores them in @inode and
+ * its nilfs_inode_info.  The access time is loaded from the raw
+ * modification time fields.  For regular files, directories and symlinks
+ * the block mapping is read as well and NILFS_I_BMAP is set.
+ *
+ * Returns 0 on success, -EINVAL when both the link count and the mode are
+ * zero (the inode is deleted), or a negative error from nilfs_bmap_read().
+ */
+int nilfs_read_inode_common(struct inode *inode,
+			    struct nilfs_inode *raw_inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int ret;
+
+	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+	inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
+	inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
+	inode->i_size = le64_to_cpu(raw_inode->i_size);
+
+	/* atime is taken from the raw mtime fields, like mtime itself */
+	inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+	inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+	inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+	inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+	inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
+	inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
+
+	if (inode->i_mode == 0 && inode->i_nlink == 0)
+		return -EINVAL; /* this inode is deleted */
+
+	inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+	ii->i_flags = le32_to_cpu(raw_inode->i_flags);
+#if 0
+	ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+	ii->i_dir_acl = S_ISREG(inode->i_mode) ?
+		0 : le32_to_cpu(raw_inode->i_dir_acl);
+#endif
+	ii->i_cno = 0;
+
+	/* only regular files, directories and symlinks carry a bmap */
+	if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
+	    !S_ISLNK(inode->i_mode))
+		return 0;
+
+	ret = nilfs_bmap_read(ii->i_bmap, raw_inode);
+	if (ret < 0)
+		return ret;
+
+	/* No lock is needed; iget() ensures it. */
+	set_bit(NILFS_I_BMAP, &ii->i_state);
+	return 0;
+}
+
+/*
+ * __nilfs_read_inode - fill a new in-core inode from its on-disk entry
+ * @sb: super block the inode belongs to
+ * @ino: inode number to read
+ * @inode: in-core inode to initialize
+ *
+ * Looks up the ifile block holding @ino, copies the raw inode into @inode
+ * with nilfs_read_inode_common() and installs the inode, file and address
+ * space operations matching the file type.  The DAT's mi_sem is held for
+ * reading around the whole operation.
+ *
+ * Returns 0 on success.  On failure the negative error code from
+ * nilfs_ifile_get_inode_block() or nilfs_read_inode_common() is returned.
+ * The latter has to be propagated: returning 0 would make nilfs_iget()
+ * unlock and hand out an inode whose raw entry was rejected (for example
+ * a deleted inode).
+ */
+static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
+			      struct inode *inode)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
+	struct buffer_head *bh;
+	struct nilfs_inode *raw_inode;
+	int err;
+
+	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
+	if (unlikely(err))
+		goto bad_inode;
+
+	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
+
+#ifdef CONFIG_NILFS_FS_POSIX_ACL
+	/* no local nilfs_inode_info pointer exists here; go through NILFS_I() */
+	NILFS_I(inode)->i_acl = NILFS_ACL_NOT_CACHED;
+	NILFS_I(inode)->i_default_acl = NILFS_ACL_NOT_CACHED;
+#endif
+	/* keep the error code so the failure path does not return 0 */
+	err = nilfs_read_inode_common(inode, raw_inode);
+	if (err)
+		goto failed_unmap;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &nilfs_file_inode_operations;
+		inode->i_fop = &nilfs_file_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &nilfs_dir_inode_operations;
+		inode->i_fop = &nilfs_dir_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &nilfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else {
+		/* everything else is set up as a special inode */
+		inode->i_op = &nilfs_special_inode_operations;
+		init_special_inode(
+			inode, inode->i_mode,
+			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+	}
+	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	brelse(bh);
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	nilfs_set_inode_flags(inode);
+	return 0;
+
+ failed_unmap:
+	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	brelse(bh);
+
+ bad_inode:
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	return err;
+}
+
+/*
+ * nilfs_iget - look up an inode by number, reading it in when it is new
+ * @sb: super block to search
+ * @ino: inode number
+ *
+ * An inode that iget_locked() returns without I_NEW set is handed back
+ * as is.  A new one is filled by __nilfs_read_inode() and unlocked; when
+ * that fails iget_failed() is called on it.
+ *
+ * Returns the inode, ERR_PTR(-ENOMEM) when iget_locked() returns NULL,
+ * or ERR_PTR() of the read error.
+ */
+struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode = iget_locked(sb, ino);
+	int err;
+
+	if (unlikely(!inode))
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		err = __nilfs_read_inode(sb, ino, inode);
+		if (unlikely(err)) {
+			iget_failed(inode);
+			return ERR_PTR(err);
+		}
+		unlock_new_inode(inode);
+	}
+	return inode;
+}
+
+/*
+ * nilfs_write_inode_common - store the in-core inode into an on-disk inode
+ * @inode: in-core inode to read from
+ * @raw_inode: on-disk inode to fill (little endian)
+ * @has_bmap: non-zero to write the block mapping with nilfs_bmap_write()
+ *
+ * With @has_bmap zero, the device number is stored instead, but only for
+ * character and block device inodes.
+ *
+ * When extending inode, nilfs->ns_inode_size should be checked for
+ * substitutions of appended fields.
+ */
+void nilfs_write_inode_common(struct inode *inode,
+			      struct nilfs_inode *raw_inode, int has_bmap)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+	raw_inode->i_uid = cpu_to_le32(inode->i_uid);
+	raw_inode->i_gid = cpu_to_le32(inode->i_gid);
+	raw_inode->i_size = cpu_to_le64(inode->i_size);
+	raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
+
+	raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+	raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+	raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+	raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	raw_inode->i_flags = cpu_to_le32(ii->i_flags);
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (has_bmap) {
+		nilfs_bmap_write(ii->i_bmap, raw_inode);
+		return;
+	}
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_device_code =
+			cpu_to_le64(new_encode_dev(inode->i_rdev));
+}
+
+/*
+ * nilfs_update_inode - copy the in-core inode into its ifile entry
+ * @inode: inode to write out
+ * @ibh: buffer holding the inode's entry
+ *
+ * The buffer is guarded with lock_buffer() by the caller.  The entry of
+ * an inode still flagged NILFS_I_NEW is zeroed first (and the flag is
+ * cleared); NILFS_I_INODE_DIRTY is then set and the fields are stored.
+ */
+void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_inode *entry;
+	ino_t ino = inode->i_ino;
+
+	entry = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
+
+	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
+		memset(entry, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
+	set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+
+	/*
+	 * XXX: call with has_bmap = 0 is a workaround to avoid
+	 * deadlock of bmap. This delays update of i_bmap to just
+	 * before writing
+	 */
+	nilfs_write_inode_common(inode, entry, 0);
+
+	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
+}
+
+/* upper bound on the number of blocks removed by one truncate step */
+#define NILFS_MAX_TRUNCATE_BLOCKS	16384  /* 64MB for 4KB block */
+
+/*
+ * nilfs_truncate_bmap - remove mappings at and beyond a block offset
+ * @ii: nilfs inode whose bmap is truncated
+ * @from: first block offset to remove
+ *
+ * Does nothing when the inode has no bmap (NILFS_I_BMAP clear).
+ * Otherwise the last key is looked up repeatedly and the bmap is cut back
+ * by at most NILFS_MAX_TRUNCATE_BLOCKS blocks per round until the last
+ * key drops below @from or the bmap is empty (-ENOENT).  A round that
+ * fails with -ENOMEM is retried once.  Failures are only reported:
+ * -EINVAL through nilfs_error(), anything else through nilfs_warning().
+ */
+static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
+				unsigned long from)
+{
+	unsigned long b;
+	int ret;
+
+	if (!test_bit(NILFS_I_BMAP, &ii->i_state))
+		return;
+ repeat:
+	ret = nilfs_bmap_last_key(ii->i_bmap, &b);
+	if (ret == -ENOENT)
+		return;
+	else if (ret < 0)
+		goto failed;
+
+	if (b < from)
+		return;
+
+	/* step back by the chunk limit, but never below @from */
+	b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
+	ret = nilfs_bmap_truncate(ii->i_bmap, b);
+	nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
+	/* continue on success, or when a single retry after -ENOMEM works */
+	if (!ret || (ret == -ENOMEM &&
+		     nilfs_bmap_truncate(ii->i_bmap, b) == 0))
+		goto repeat;
+
+ failed:
+	if (ret == -EINVAL)
+		nilfs_error(ii->vfs_inode.i_sb, __func__,
+			    "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
+	else
+		nilfs_warning(ii->vfs_inode.i_sb, __func__,
+			      "failed to truncate bmap (ino=%lu, err=%d)",
+			      ii->vfs_inode.i_ino, ret);
+}
+
+/*
+ * nilfs_truncate - cut a file's blocks back to its current i_size
+ * @inode: inode being truncated
+ *
+ * Returns without doing anything for inodes that have no bmap or that
+ * are append-only or immutable.  Otherwise, inside one transaction, the
+ * partial last block is handled by block_truncate_page(), the mappings
+ * from the first block past i_size onwards are removed, mtime and ctime
+ * are updated and the file is marked dirty.
+ */
+void nilfs_truncate(struct inode *inode)
+{
+	unsigned long blkoff;
+	unsigned int blocksize;
+	struct nilfs_transaction_info ti;
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	if (!test_bit(NILFS_I_BMAP, &ii->i_state))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+
+	blocksize = sb->s_blocksize;
+	/* i_size rounded up to whole blocks: first block offset to drop */
+	blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
+	nilfs_transaction_begin(sb, &ti, 0); /* never fails */
+
+	block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
+
+	nilfs_truncate_bmap(ii, blkoff);
+
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	if (IS_SYNC(inode))
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+
+	/* no additional dirty blocks are accounted (nr_dirty == 0) */
+	nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
+	nilfs_transaction_commit(sb);
+	/* May construct a logical segment and may fail in sync mode.
+	   But truncate has no return value. */
+}
+
+/*
+ * nilfs_delete_inode - drop all data of an inode and free it
+ * @inode: inode being deleted
+ *
+ * A bad inode only has its pages truncated and is cleared.  For any other
+ * inode the page cache is emptied, the whole bmap is truncated and the
+ * inode is released with nilfs_free_inode(), all inside one transaction.
+ */
+void nilfs_delete_inode(struct inode *inode)
+{
+	struct nilfs_transaction_info ti;
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	if (unlikely(is_bad_inode(inode))) {
+		if (inode->i_data.nrpages)
+			truncate_inode_pages(&inode->i_data, 0);
+		clear_inode(inode);
+		return;
+	}
+	nilfs_transaction_begin(sb, &ti, 0); /* never fails */
+
+	if (inode->i_data.nrpages)
+		truncate_inode_pages(&inode->i_data, 0);
+
+	/* from == 0: remove every mapped block */
+	nilfs_truncate_bmap(ii, 0);
+	nilfs_free_inode(inode);
+	/* nilfs_free_inode() marks inode buffer dirty */
+	if (IS_SYNC(inode))
+		nilfs_set_transaction_flag(NILFS_TI_SYNC);
+	nilfs_transaction_commit(sb);
+	/* May construct a logical segment and may fail in sync mode.
+	   But delete_inode has no return value. */
+}
+
+/*
+ * nilfs_setattr - change inode attributes inside a transaction
+ * @dentry: dentry of the inode to change
+ * @iattr: attributes to apply
+ *
+ * After inode_change_ok() accepts the request, inode_setattr() is run in
+ * a transaction, followed by nilfs_acl_chmod() when ATTR_MODE is among
+ * the valid attributes.  The transaction is committed on success and
+ * aborted otherwise.  Returns 0 or the first error encountered.
+ */
+int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_transaction_info ti;
+	int ret;
+
+	ret = inode_change_ok(inode, iattr);
+	if (ret)
+		return ret;
+
+	ret = nilfs_transaction_begin(sb, &ti, 0);
+	if (unlikely(ret))
+		return ret;
+
+	ret = inode_setattr(inode, iattr);
+	if (!ret && (iattr->ia_valid & ATTR_MODE))
+		ret = nilfs_acl_chmod(inode);
+
+	if (unlikely(ret)) {
+		nilfs_transaction_abort(sb);
+		return ret;
+	}
+	return nilfs_transaction_commit(sb);
+}
+
+/*
+ * nilfs_load_inode_block - get the buffer holding an inode's ifile entry
+ * @sbi: nilfs super block info; its s_inode_lock guards ii->i_bh
+ * @inode: inode whose block is wanted
+ * @pbh: place to store the buffer head
+ *
+ * Uses the buffer cached in ii->i_bh when there is one; otherwise the
+ * block is looked up with nilfs_ifile_get_inode_block() and cached.  The
+ * buffer stored in @pbh gets an extra reference through get_bh().
+ *
+ * Returns 0 on success or the error from nilfs_ifile_get_inode_block().
+ */
+int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
+			   struct buffer_head **pbh)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	int err;
+
+	spin_lock(&sbi->s_inode_lock);
+	/* s_inode_lock is taken here, so callers must not already hold it */
+	if (ii->i_bh == NULL) {
+		/* the lock is dropped around the block lookup */
+		spin_unlock(&sbi->s_inode_lock);
+		err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
+						  pbh);
+		if (unlikely(err))
+			return err;
+		spin_lock(&sbi->s_inode_lock);
+		/* recheck: i_bh may have been set while the lock was released */
+		if (ii->i_bh == NULL)
+			ii->i_bh = *pbh;
+		else {
+			brelse(*pbh);
+			*pbh = ii->i_bh;
+		}
+	} else
+		*pbh = ii->i_bh;
+
+	get_bh(*pbh);
+	spin_unlock(&sbi->s_inode_lock);
+	return 0;
+}
+
+/*
+ * nilfs_inode_dirty - test whether an inode is dirty or busy
+ * @inode: inode to test
+ *
+ * Returns 0 when the inode is on no dirty list.  Otherwise the
+ * NILFS_I_DIRTY and NILFS_I_BUSY bits are sampled under s_inode_lock and
+ * non-zero is returned when either one is set.
+ */
+int nilfs_inode_dirty(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
+	int dirty;
+
+	if (list_empty(&ii->i_dirty))
+		return 0;
+
+	spin_lock(&sbi->s_inode_lock);
+	dirty = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
+		test_bit(NILFS_I_BUSY, &ii->i_state);
+	spin_unlock(&sbi->s_inode_lock);
+
+	return dirty;
+}
+
+/*
+ * nilfs_set_file_dirty - account dirty blocks and queue the inode
+ * @sbi: nilfs super block info owning the dirty file list
+ * @inode: inode that got dirty blocks
+ * @nr_dirty: number of blocks to add to the ns_ndirtyblks counter
+ *
+ * The counter is always increased.  When NILFS_I_DIRTY was not set yet,
+ * and the inode is neither queued nor busy, the inode is moved to the
+ * tail of sbi->s_dirty_files and flagged NILFS_I_QUEUED; an inode that
+ * was on no list is pinned with igrab() first.
+ *
+ * Returns 0, or -EINVAL when igrab() fails.
+ */
+int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
+			 unsigned nr_dirty)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
+
+	/* already marked dirty: nothing more to do */
+	if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
+		return 0;
+
+	spin_lock(&sbi->s_inode_lock);
+	if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
+	    !test_bit(NILFS_I_BUSY, &ii->i_state)) {
+		/* Because this routine may race with nilfs_dispose_list(),
+		   we have to check NILFS_I_QUEUED here, too. */
+		if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
+			/* This will happen when somebody is freeing
+			   this inode. */
+			nilfs_warning(sbi->s_super, __func__,
+				      "cannot get inode (ino=%lu)\n",
+				      inode->i_ino);
+			spin_unlock(&sbi->s_inode_lock);
+			return -EINVAL; /* NILFS_I_DIRTY may remain for
+					   freeing inode */
+		}
+		list_del(&ii->i_dirty);
+		list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
+		set_bit(NILFS_I_QUEUED, &ii->i_state);
+	}
+	spin_unlock(&sbi->s_inode_lock);
+	return 0;
+}
+
+/*
+ * nilfs_mark_inode_dirty - write the inode into its block and dirty it
+ * @inode: inode to write
+ *
+ * Loads the inode block, updates the entry under lock_buffer(), then
+ * marks both the buffer and the ifile dirty.  Returns 0, or the error of
+ * nilfs_load_inode_block() after logging a warning.
+ */
+int nilfs_mark_inode_dirty(struct inode *inode)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
+	struct buffer_head *bh;
+	int ret;
+
+	ret = nilfs_load_inode_block(sbi, inode, &bh);
+	if (likely(!ret)) {
+		lock_buffer(bh);
+		nilfs_update_inode(inode, bh);
+		unlock_buffer(bh);
+
+		nilfs_mdt_mark_buffer_dirty(bh);
+		nilfs_mdt_mark_dirty(sbi->s_ifile);
+		brelse(bh);
+		return 0;
+	}
+
+	nilfs_warning(inode->i_sb, __func__,
+		      "failed to reget inode block.\n");
+	return ret;
+}
+
+/**
+ * nilfs_dirty_inode - reflect changes on given inode to an inode block.
+ * @inode: inode of the file to be registered.
+ *
+ * nilfs_dirty_inode() loads an inode block containing the specified
+ * @inode and copies data from a nilfs_inode to a corresponding inode
+ * entry in the inode block. This operation is excluded from the segment
+ * construction. This function can be called both as a single operation
+ * and as a part of indivisible file operations.
+ *
+ * A bad inode is not written; a warning and a stack dump are emitted
+ * instead.
+ */
+void nilfs_dirty_inode(struct inode *inode)
+{
+	struct nilfs_transaction_info trans;
+
+	if (!is_bad_inode(inode)) {
+		nilfs_transaction_begin(inode->i_sb, &trans, 0);
+		nilfs_mark_inode_dirty(inode);
+		nilfs_transaction_commit(inode->i_sb); /* never fails */
+		return;
+	}
+
+	nilfs_warning(inode->i_sb, __func__,
+		      "tried to mark bad_inode dirty. ignored.\n");
+	dump_stack();
+}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
new file mode 100644
index 00000000000..108d281ebca
--- /dev/null
+++ b/fs/nilfs2/ioctl.c
@@ -0,0 +1,654 @@
+/*
+ * ioctl.c - NILFS ioctl operations.
+ *
+ * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/fs.h>
+#include <linux/wait.h>
+#include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */
+#include <linux/capability.h>	/* capable() */
+#include <linux/uaccess.h>	/* copy_from_user(), copy_to_user() */
+#include <linux/nilfs2_fs.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "bmap.h"
+#include "cpfile.h"
+#include "sufile.h"
+#include "dat.h"
+
+
+/*
+ * nilfs_ioctl_wrap_copy - run an ioctl helper over a user-supplied vector
+ * @nilfs: nilfs object passed through to @dofunc
+ * @argv: descriptor of the user vector (v_base, v_size, v_nmembs, v_index,
+ *        v_flags); v_nmembs is overwritten with the number of items done
+ * @dir: _IOC_WRITE to copy items in before each @dofunc call, _IOC_READ to
+ *       copy results back out afterwards (both may be set)
+ * @dofunc: called once per chunk; returns the number of items it handled
+ *          or a negative error code
+ *
+ * The vector is processed in chunks of as many items as fit into one
+ * page-sized bounce buffer.  Processing stops early when @dofunc handles
+ * fewer items than it was given.
+ *
+ * Returns 0 on success (including an empty vector), -EINVAL when v_size
+ * is zero or larger than a page, -ENOMEM when the buffer cannot be
+ * allocated, -EFAULT on a failed user copy, or the error from @dofunc.
+ */
+static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
+				 struct nilfs_argv *argv, int dir,
+				 ssize_t (*dofunc)(struct the_nilfs *,
+						   __u64 *, int,
+						   void *, size_t, size_t))
+{
+	void *buf;
+	void __user *base = (void __user *)(unsigned long)argv->v_base;
+	size_t maxmembs, total, n, i;
+	ssize_t nr;
+	int ret;
+	__u64 pos, ppos;
+
+	if (argv->v_nmembs == 0)
+		return 0;
+
+	/*
+	 * v_size comes from user space: zero would divide by zero when
+	 * maxmembs is computed, and anything above a page cannot fit into
+	 * the bounce buffer.
+	 */
+	if (argv->v_size == 0 || argv->v_size > PAGE_SIZE)
+		return -EINVAL;
+
+	/*
+	 * Zero the page: v_size * nr bytes are copied out below no matter
+	 * how much of each item @dofunc actually filled in.
+	 */
+	buf = (void *)__get_free_pages(GFP_NOFS | __GFP_ZERO, 0);
+	if (unlikely(!buf))
+		return -ENOMEM;
+	maxmembs = PAGE_SIZE / argv->v_size;
+
+	ret = 0;
+	total = 0;
+	pos = argv->v_index;
+	/* the index is unsigned so it can cover the full range of v_nmembs */
+	for (i = 0; i < argv->v_nmembs; i += n) {
+		n = (argv->v_nmembs - i < maxmembs) ?
+			argv->v_nmembs - i : maxmembs;
+		if ((dir & _IOC_WRITE) &&
+		    copy_from_user(buf, base + argv->v_size * i,
+				   argv->v_size * n)) {
+			ret = -EFAULT;
+			break;
+		}
+		ppos = pos;
+		nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
+			       n);
+		if (nr < 0) {
+			ret = nr;
+			break;
+		}
+		if ((dir & _IOC_READ) &&
+		    copy_to_user(base + argv->v_size * i, buf,
+				 argv->v_size * nr)) {
+			ret = -EFAULT;
+			break;
+		}
+		total += nr;
+		if ((size_t)nr < n)
+			break;
+		/* advance the position here if @dofunc left it untouched */
+		if (pos == ppos)
+			pos += n;
+	}
+	argv->v_nmembs = total;
+
+	free_pages((unsigned long)buf, 0);
+	return ret;
+}
+
+/*
+ * nilfs_ioctl_change_cpmode - handler of NILFS_IOCTL_CHANGE_CPMODE
+ *
+ * Requires CAP_SYS_ADMIN.  Copies a struct nilfs_cpmode from @argp and
+ * applies it with nilfs_cpfile_change_cpmode() inside a transaction that
+ * is committed on success and aborted on a negative result.
+ *
+ * Returns the result of nilfs_cpfile_change_cpmode(), -EPERM or -EFAULT.
+ */
+static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
+				     unsigned int cmd, void __user *argp)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_transaction_info ti;
+	struct nilfs_cpmode cpmode;
+	struct inode *cpfile;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
+		return -EFAULT;
+
+	cpfile = NILFS_SB(sb)->s_nilfs->ns_cpfile;
+
+	nilfs_transaction_begin(sb, &ti, 0);
+	ret = nilfs_cpfile_change_cpmode(
+		cpfile, cpmode.cm_cno, cpmode.cm_mode);
+	if (likely(ret >= 0))
+		nilfs_transaction_commit(sb); /* never fails */
+	else
+		nilfs_transaction_abort(sb);
+	return ret;
+}
+
+/*
+ * nilfs_ioctl_delete_checkpoint - handler of NILFS_IOCTL_DELETE_CHECKPOINT
+ *
+ * Requires CAP_SYS_ADMIN.  Reads a checkpoint number from @argp and
+ * deletes it with nilfs_cpfile_delete_checkpoint() inside a transaction
+ * that is committed on success and aborted on a negative result.
+ *
+ * Returns the result of nilfs_cpfile_delete_checkpoint(), -EPERM or
+ * -EFAULT.
+ */
+static int
+nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
+			      unsigned int cmd, void __user *argp)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_transaction_info ti;
+	struct inode *cpfile;
+	__u64 cno;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&cno, argp, sizeof(cno)))
+		return -EFAULT;
+
+	cpfile = NILFS_SB(sb)->s_nilfs->ns_cpfile;
+
+	nilfs_transaction_begin(sb, &ti, 0);
+	ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
+	if (likely(ret >= 0))
+		nilfs_transaction_commit(sb); /* never fails */
+	else
+		nilfs_transaction_abort(sb);
+	return ret;
+}
+
+/*
+ * Chunk callback handed to nilfs_ioctl_wrap_copy() by
+ * nilfs_ioctl_get_cpinfo(): forwards to nilfs_cpfile_get_cpinfo() on
+ * nilfs->ns_cpfile.  @size is not used.
+ */
+static ssize_t
+nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			  void *buf, size_t size, size_t nmembs)
+{
+	return nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
+				       nmembs);
+}
+
+/*
+ * nilfs_ioctl_get_cpinfo - handler of NILFS_IOCTL_GET_CPINFO
+ *
+ * Copies a struct nilfs_argv from @argp, runs nilfs_ioctl_do_get_cpinfo()
+ * over it through nilfs_ioctl_wrap_copy() while holding ns_segctor_sem
+ * for reading, and writes the updated argv back to @argp.
+ *
+ * Returns 0, -EFAULT on a failed user copy, or the wrap_copy error.
+ */
+static int nilfs_ioctl_get_cpinfo(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct nilfs_argv argv;
+	struct the_nilfs *nilfs;
+	int err;
+
+	if (copy_from_user(&argv, argp, sizeof(argv)))
+		return -EFAULT;
+
+	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
+				    nilfs_ioctl_do_get_cpinfo);
+	up_read(&nilfs->ns_segctor_sem);
+
+	if (err >= 0 && copy_to_user(argp, &argv, sizeof(argv)))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * nilfs_ioctl_get_cpstat - handler of NILFS_IOCTL_GET_CPSTAT
+ *
+ * Fetches the checkpoint file statistics with nilfs_cpfile_get_stat()
+ * under ns_segctor_sem (read) and copies them to @argp.
+ *
+ * Returns the non-negative result of nilfs_cpfile_get_stat(), its
+ * negative error, or -EFAULT when the copy to user space fails.
+ */
+static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct nilfs_cpstat stat;
+	int err;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &stat);
+	up_read(&nilfs->ns_segctor_sem);
+
+	if (err >= 0 && copy_to_user(argp, &stat, sizeof(stat)))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * Chunk callback handed to nilfs_ioctl_wrap_copy() by
+ * nilfs_ioctl_get_suinfo(): forwards to nilfs_sufile_get_suinfo() on
+ * nilfs->ns_sufile.  The position is passed by value, so *posp is not
+ * modified here; @flags and @size are not used.
+ */
+static ssize_t
+nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			  void *buf, size_t size, size_t nmembs)
+{
+	return nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
+}
+
+/*
+ * nilfs_ioctl_get_suinfo - handler of NILFS_IOCTL_GET_SUINFO
+ *
+ * Copies a struct nilfs_argv from @argp, runs nilfs_ioctl_do_get_suinfo()
+ * over it through nilfs_ioctl_wrap_copy() while holding ns_segctor_sem
+ * for reading, and writes the updated argv back to @argp.
+ *
+ * Returns 0, -EFAULT on a failed user copy, or the wrap_copy error.
+ */
+static int nilfs_ioctl_get_suinfo(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct nilfs_argv argv;
+	struct the_nilfs *nilfs;
+	int err;
+
+	if (copy_from_user(&argv, argp, sizeof(argv)))
+		return -EFAULT;
+
+	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
+				    nilfs_ioctl_do_get_suinfo);
+	up_read(&nilfs->ns_segctor_sem);
+
+	if (err >= 0 && copy_to_user(argp, &argv, sizeof(argv)))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * nilfs_ioctl_get_sustat - handler of NILFS_IOCTL_GET_SUSTAT
+ *
+ * Fetches the segment usage file statistics with nilfs_sufile_get_stat()
+ * under ns_segctor_sem (read) and copies them to @argp.
+ *
+ * Returns the non-negative result of nilfs_sufile_get_stat(), its
+ * negative error, or -EFAULT when the copy to user space fails.
+ */
+static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+	struct nilfs_sustat stat;
+	int err;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_sufile_get_stat(nilfs->ns_sufile, &stat);
+	up_read(&nilfs->ns_segctor_sem);
+
+	if (err >= 0 && copy_to_user(argp, &stat, sizeof(stat)))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * Chunk callback handed to nilfs_ioctl_wrap_copy() by
+ * nilfs_ioctl_get_vinfo(): forwards @buf and @nmembs to
+ * nilfs_dat_get_vinfo() on the DAT inode.  @posp, @flags and @size are
+ * not used.
+ */
+static ssize_t
+nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			 void *buf, size_t size, size_t nmembs)
+{
+	return nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
+}
+
+/*
+ * nilfs_ioctl_get_vinfo - handler of NILFS_IOCTL_GET_VINFO
+ *
+ * Copies a struct nilfs_argv from @argp, runs nilfs_ioctl_do_get_vinfo()
+ * over it through nilfs_ioctl_wrap_copy() while holding ns_segctor_sem
+ * for reading, and writes the updated argv back to @argp.
+ *
+ * Returns 0, -EFAULT on a failed user copy, or the wrap_copy error.
+ */
+static int nilfs_ioctl_get_vinfo(struct inode *inode, struct file *filp,
+				 unsigned int cmd, void __user *argp)
+{
+	struct nilfs_argv argv;
+	struct the_nilfs *nilfs;
+	int err;
+
+	if (copy_from_user(&argv, argp, sizeof(argv)))
+		return -EFAULT;
+
+	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
+				    nilfs_ioctl_do_get_vinfo);
+	up_read(&nilfs->ns_segctor_sem);
+
+	if (err >= 0 && copy_to_user(argp, &argv, sizeof(argv)))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * Chunk callback handed to nilfs_ioctl_wrap_copy() by
+ * nilfs_ioctl_get_bdescs().  For every descriptor in @buf the DAT bmap is
+ * looked up at bd_offset / bd_level + 1 and the result is stored in
+ * bd_blocknr; a lookup that fails with -ENOENT stores 0 instead.
+ *
+ * Returns @nmembs, or the first lookup error other than -ENOENT.
+ */
+static ssize_t
+nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			  void *buf, size_t size, size_t nmembs)
+{
+	struct nilfs_bmap *bmap = NILFS_I(nilfs_dat_inode(nilfs))->i_bmap;
+	struct nilfs_bdesc *bd = buf;
+	struct nilfs_bdesc *end = bd + nmembs;
+	int ret;
+
+	for (; bd < end; bd++) {
+		ret = nilfs_bmap_lookup_at_level(bmap, bd->bd_offset,
+						 bd->bd_level + 1,
+						 &bd->bd_blocknr);
+		if (ret >= 0)
+			continue;
+		if (ret != -ENOENT)
+			return ret;
+		bd->bd_blocknr = 0;
+	}
+	return nmembs;
+}
+
+/*
+ * nilfs_ioctl_get_bdescs - handler of NILFS_IOCTL_GET_BDESCS
+ *
+ * Copies a struct nilfs_argv from @argp, runs nilfs_ioctl_do_get_bdescs()
+ * over it through nilfs_ioctl_wrap_copy() while holding ns_segctor_sem
+ * for reading, and writes the updated argv back to @argp.
+ *
+ * Returns 0, -EFAULT on a failed user copy, or the wrap_copy error.
+ */
+static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
+				  unsigned int cmd, void __user *argp)
+{
+	struct nilfs_argv argv;
+	struct the_nilfs *nilfs;
+	int err;
+
+	if (copy_from_user(&argv, argp, sizeof(argv)))
+		return -EFAULT;
+
+	nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
+
+	down_read(&nilfs->ns_segctor_sem);
+	err = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
+				    nilfs_ioctl_do_get_bdescs);
+	up_read(&nilfs->ns_segctor_sem);
+
+	if (err >= 0 && copy_to_user(argp, &argv, sizeof(argv)))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * nilfs_ioctl_move_inode_block - submit the read of one block to be moved
+ * @inode: GC inode the block belongs to
+ * @vdesc: descriptor of the block; vd_flags == 0 selects a data block,
+ *         anything else a node block
+ * @buffers: list that collects the submitted buffer heads
+ *
+ * On success the buffer remembers @vdesc in b_private and is appended to
+ * @buffers.  Returns 0 or the negative error of the submit call; -ENOENT
+ * is additionally logged at KERN_CRIT.
+ */
+static int nilfs_ioctl_move_inode_block(struct inode *inode,
+					struct nilfs_vdesc *vdesc,
+					struct list_head *buffers)
+{
+	struct buffer_head *bh;
+	int err;
+
+	if (vdesc->vd_flags != 0)
+		err = nilfs_gccache_submit_read_node(
+			inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
+	else
+		err = nilfs_gccache_submit_read_data(
+			inode, vdesc->vd_offset, vdesc->vd_blocknr,
+			vdesc->vd_vblocknr, &bh);
+
+	if (likely(err >= 0)) {
+		bh->b_private = vdesc;
+		list_add_tail(&bh->b_assoc_buffers, buffers);
+		return 0;
+	}
+
+	if (err == -ENOENT)
+		printk(KERN_CRIT
+		       "%s: invalid virtual block address (%s): "
+		       "ino=%llu, cno=%llu, offset=%llu, "
+		       "blocknr=%llu, vblocknr=%llu\n",
+		       __func__, vdesc->vd_flags ? "node" : "data",
+		       (unsigned long long)vdesc->vd_ino,
+		       (unsigned long long)vdesc->vd_cno,
+		       (unsigned long long)vdesc->vd_offset,
+		       (unsigned long long)vdesc->vd_blocknr,
+		       (unsigned long long)vdesc->vd_vblocknr);
+	return err;
+}
+
+/*
+ * Chunk callback used by nilfs_ioctl_move_blocks().  @buf holds @nmembs
+ * struct nilfs_vdesc entries.
+ *
+ * Phase one walks the descriptors, gets one GC inode per run of entries
+ * sharing the same (vd_ino, vd_cno) pair and submits a read for each
+ * block, collecting the buffers on a local list.  Phase two waits for
+ * every buffer and marks it dirty through
+ * nilfs_gccache_wait_and_mark_dirty(), releasing each buffer as it goes.
+ *
+ * Returns @nmembs on success.  On failure the buffers still on the list
+ * are released and the negative error is returned (-ENOMEM when
+ * nilfs_gc_iget() returns NULL); -EEXIST is also logged at KERN_CRIT.
+ */
+static ssize_t
+nilfs_ioctl_do_move_blocks(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			   void *buf, size_t size, size_t nmembs)
+{
+	struct inode *inode;
+	struct nilfs_vdesc *vdesc;
+	struct buffer_head *bh, *n;
+	LIST_HEAD(buffers);
+	ino_t ino;
+	__u64 cno;
+	int i, ret;
+
+	/* i and vdesc are advanced by the inner loop only */
+	for (i = 0, vdesc = buf; i < nmembs; ) {
+		ino = vdesc->vd_ino;
+		cno = vdesc->vd_cno;
+		inode = nilfs_gc_iget(nilfs, ino, cno);
+		if (unlikely(inode == NULL)) {
+			ret = -ENOMEM;
+			goto failed;
+		}
+		/* consume the run of descriptors that map to this GC inode */
+		do {
+			ret = nilfs_ioctl_move_inode_block(inode, vdesc,
+							   &buffers);
+			if (unlikely(ret < 0))
+				goto failed;
+			vdesc++;
+		} while (++i < nmembs &&
+			 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
+	}
+
+	list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
+		ret = nilfs_gccache_wait_and_mark_dirty(bh);
+		if (unlikely(ret < 0)) {
+			if (ret == -EEXIST) {
+				vdesc = bh->b_private;
+				printk(KERN_CRIT
+				       "%s: conflicting %s buffer: "
+				       "ino=%llu, cno=%llu, offset=%llu, "
+				       "blocknr=%llu, vblocknr=%llu\n",
+				       __func__,
+				       vdesc->vd_flags ? "node" : "data",
+				       (unsigned long long)vdesc->vd_ino,
+				       (unsigned long long)vdesc->vd_cno,
+				       (unsigned long long)vdesc->vd_offset,
+				       (unsigned long long)vdesc->vd_blocknr,
+				       (unsigned long long)vdesc->vd_vblocknr);
+			}
+			goto failed;
+		}
+		list_del_init(&bh->b_assoc_buffers);
+		bh->b_private = NULL;
+		brelse(bh);
+	}
+	return nmembs;
+
+ failed:
+	/* drop whatever is still queued, including the buffer that failed */
+	list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
+		list_del_init(&bh->b_assoc_buffers);
+		bh->b_private = NULL;
+		brelse(bh);
+	}
+	return ret;
+}
+
+/*
+ * Runs nilfs_ioctl_do_move_blocks() over the user vector described by
+ * @argv via nilfs_ioctl_wrap_copy(); returns that function's result.
+ */
+static inline int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
+					  struct nilfs_argv *argv,
+					  int dir)
+{
+	return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
+				     nilfs_ioctl_do_move_blocks);
+}
+
+/*
+ * Chunk callback used by nilfs_ioctl_delete_checkpoints().  @buf holds
+ * @nmembs struct nilfs_period entries; each [p_start, p_end] pair is
+ * passed to nilfs_cpfile_delete_checkpoints() on nilfs->ns_cpfile.
+ *
+ * Returns @nmembs, or the first negative error.
+ */
+static ssize_t
+nilfs_ioctl_do_delete_checkpoints(struct the_nilfs *nilfs, __u64 *posp,
+				  int flags, void *buf, size_t size,
+				  size_t nmembs)
+{
+	struct inode *cpfile = nilfs->ns_cpfile;
+	struct nilfs_period *period = buf;
+	size_t remaining;
+	int err;
+
+	for (remaining = nmembs; remaining > 0; remaining--, period++) {
+		err = nilfs_cpfile_delete_checkpoints(
+			cpfile, period->p_start, period->p_end);
+		if (err < 0)
+			return err;
+	}
+	return nmembs;
+}
+
+/*
+ * Runs nilfs_ioctl_do_delete_checkpoints() over the user vector described
+ * by @argv via nilfs_ioctl_wrap_copy(); returns that function's result.
+ */
+static inline int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
+						 struct nilfs_argv *argv,
+						 int dir)
+{
+	return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
+				     nilfs_ioctl_do_delete_checkpoints);
+}
+
+/*
+ * Chunk callback used by nilfs_ioctl_free_vblocknrs(): hands @buf and
+ * @nmembs to nilfs_dat_freev() on the DAT inode.
+ *
+ * Returns @nmembs, or the negative error from nilfs_dat_freev().
+ */
+static ssize_t
+nilfs_ioctl_do_free_vblocknrs(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			      void *buf, size_t size, size_t nmembs)
+{
+	int err;
+
+	err = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
+	if (err < 0)
+		return err;
+	return nmembs;
+}
+
+/*
+ * Runs nilfs_ioctl_do_free_vblocknrs() over the user vector described by
+ * @argv via nilfs_ioctl_wrap_copy(); returns that function's result.
+ */
+static inline int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
+					     struct nilfs_argv *argv,
+					     int dir)
+{
+	return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
+				     nilfs_ioctl_do_free_vblocknrs);
+}
+
+/*
+ * Chunk callback used by nilfs_ioctl_mark_blocks_dirty().  @buf holds
+ * @nmembs struct nilfs_bdesc entries describing DAT blocks.
+ *
+ * Each descriptor's current block number is looked up in the DAT bmap
+ * (0 is stored when the lookup fails with -ENOENT).  Entries whose
+ * current block number differs from bd_oblocknr are skipped as dead.
+ * The remaining ones are marked dirty: level-0 blocks through
+ * nilfs_mdt_mark_block_dirty(), higher levels through nilfs_bmap_mark().
+ *
+ * Returns @nmembs, or the first negative error.
+ */
+static ssize_t
+nilfs_ioctl_do_mark_blocks_dirty(struct the_nilfs *nilfs, __u64 *posp,
+				 int flags, void *buf, size_t size,
+				 size_t nmembs)
+{
+	struct inode *dat = nilfs_dat_inode(nilfs);
+	struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
+	struct nilfs_bdesc *bdescs = buf;
+	int ret, i;
+
+	for (i = 0; i < nmembs; i++) {
+		/* XXX: use macro or inline func to check liveness */
+		ret = nilfs_bmap_lookup_at_level(bmap,
+						 bdescs[i].bd_offset,
+						 bdescs[i].bd_level + 1,
+						 &bdescs[i].bd_blocknr);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				return ret;
+			bdescs[i].bd_blocknr = 0;
+		}
+		if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
+			/* skip dead block */
+			continue;
+		if (bdescs[i].bd_level == 0) {
+			ret = nilfs_mdt_mark_block_dirty(dat,
+							 bdescs[i].bd_offset);
+			if (ret < 0) {
+				/* the lookup above just succeeded for this block */
+				WARN_ON(ret == -ENOENT);
+				return ret;
+			}
+		} else {
+			ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
+					      bdescs[i].bd_level);
+			if (ret < 0) {
+				WARN_ON(ret == -ENOENT);
+				return ret;
+			}
+		}
+	}
+	return nmembs;
+}
+
+/*
+ * Runs nilfs_ioctl_do_mark_blocks_dirty() over the user vector described
+ * by @argv via nilfs_ioctl_wrap_copy(); returns that function's result.
+ */
+static inline int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
+						struct nilfs_argv *argv,
+						int dir)
+{
+	return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
+				     nilfs_ioctl_do_mark_blocks_dirty);
+}
+
+/*
+ * Chunk callback used by nilfs_ioctl_free_segments(): registers the
+ * segments listed in @buf with the writer's segment constructor through
+ * nilfs_segctor_add_segments_to_be_freed().
+ *
+ * Returns @nmembs, -EROFS when nilfs_get_writer() yields no writer, or
+ * the negative error from the segment constructor.
+ */
+static ssize_t
+nilfs_ioctl_do_free_segments(struct the_nilfs *nilfs, __u64 *posp, int flags,
+			     void *buf, size_t size, size_t nmembs)
+{
+	struct nilfs_sb_info *writer = nilfs_get_writer(nilfs);
+	int err;
+
+	if (unlikely(!writer))
+		return -EROFS;
+
+	err = nilfs_segctor_add_segments_to_be_freed(
+		NILFS_SC(writer), buf, nmembs);
+	nilfs_put_writer(nilfs);
+
+	if (err < 0)
+		return err;
+	return nmembs;
+}
+
+/*
+ * Runs nilfs_ioctl_do_free_segments() over the user vector described by
+ * @argv via nilfs_ioctl_wrap_copy(); returns that function's result.
+ */
+static inline int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
+					     struct nilfs_argv *argv,
+					     int dir)
+{
+	return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
+				     nilfs_ioctl_do_free_segments);
+}
+
+/*
+ * nilfs_ioctl_prepare_clean_segments - run the preparation steps of GC
+ * @nilfs: nilfs object
+ * @argp: user pointer to an array of five struct nilfs_argv
+ *
+ * The five vectors are consumed in order, each with direction _IOC_WRITE:
+ * argv[0] blocks to move, argv[1] checkpoint periods to delete, argv[2]
+ * virtual block numbers to free, argv[3] blocks to mark dirty and argv[4]
+ * segments to free.  The first failing step ends the sequence: all GC
+ * inodes are removed, the failure is logged and its error is returned.
+ *
+ * Returns 0 on success, -EFAULT when the argv array cannot be copied in,
+ * or the negative error of the failing step.
+ */
+int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
+				       void __user *argp)
+{
+	struct nilfs_argv argv[5];
+	const char *msg;
+	int dir, ret;
+
+	if (copy_from_user(argv, argp, sizeof(argv)))
+		return -EFAULT;
+
+	dir = _IOC_WRITE;
+	ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], dir);
+	if (ret < 0) {
+		msg = "cannot read source blocks";
+		goto failed;
+	}
+	ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], dir);
+	if (ret < 0) {
+		/*
+		 * can safely abort because checkpoints can be removed
+		 * independently.
+		 */
+		msg = "cannot delete checkpoints";
+		goto failed;
+	}
+	ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], dir);
+	if (ret < 0) {
+		/*
+		 * can safely abort because DAT file is updated atomically
+		 * using a copy-on-write technique.
+		 */
+		msg = "cannot delete virtual blocks from DAT file";
+		goto failed;
+	}
+	ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], dir);
+	if (ret < 0) {
+		/*
+		 * can safely abort because the operation is nondestructive.
+		 */
+		msg = "cannot mark copying blocks dirty";
+		goto failed;
+	}
+	ret = nilfs_ioctl_free_segments(nilfs, &argv[4], dir);
+	if (ret < 0) {
+		/*
+		 * can safely abort because this operation is atomic.
+		 */
+		msg = "cannot set segments to be freed";
+		goto failed;
+	}
+	return 0;
+
+ failed:
+	nilfs_remove_all_gcinode(nilfs);
+	printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
+	       msg, ret);
+	return ret;
+}
+
+/*
+ * nilfs_ioctl_clean_segments - handler of NILFS_IOCTL_CLEAN_SEGMENTS
+ *
+ * Requires CAP_SYS_ADMIN (-EPERM otherwise) and then passes @argp to
+ * nilfs_clean_segments() for the inode's super block.
+ */
+static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
+				      unsigned int cmd, void __user *argp)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	return nilfs_clean_segments(inode->i_sb, argp);
+}
+
+/*
+ * nilfs_ioctl_sync - handler of NILFS_IOCTL_SYNC
+ *
+ * Constructs a segment for the super block.  When @argp is not NULL the
+ * value ns_cno - 1 is then copied to it.
+ *
+ * Returns 0, the negative error from nilfs_construct_segment(), or
+ * -EFAULT when the copy to user space fails.
+ */
+static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
+			    unsigned int cmd, void __user *argp)
+{
+	__u64 last_cno;
+	int err;
+
+	err = nilfs_construct_segment(inode->i_sb);
+	if (err < 0)
+		return err;
+
+	if (argp == NULL)
+		return 0;
+
+	last_cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
+	if (copy_to_user(argp, &last_cno, sizeof(last_cno)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * nilfs_ioctl - ioctl entry point of NILFS
+ * @filp: file the ioctl was issued on
+ * @cmd: ioctl command number
+ * @arg: command argument, interpreted as a user-space pointer
+ *
+ * Dispatches the NILFS_IOCTL_* commands to their handlers and returns the
+ * handler's result.  Unknown commands yield -ENOTTY.
+ */
+long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	/*
+	 * The cast must produce "void __user *"; "void * __user *" would be
+	 * a pointer to a user pointer and carry the wrong address space.
+	 */
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case NILFS_IOCTL_CHANGE_CPMODE:
+		return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
+	case NILFS_IOCTL_DELETE_CHECKPOINT:
+		return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_CPINFO:
+		return nilfs_ioctl_get_cpinfo(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_CPSTAT:
+		return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_SUINFO:
+		return nilfs_ioctl_get_suinfo(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_SUSTAT:
+		return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_VINFO:
+		/* XXX: rename to ??? */
+		return nilfs_ioctl_get_vinfo(inode, filp, cmd, argp);
+	case NILFS_IOCTL_GET_BDESCS:
+		return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
+	case NILFS_IOCTL_CLEAN_SEGMENTS:
+		return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
+	case NILFS_IOCTL_SYNC:
+		return nilfs_ioctl_sync(inode, filp, cmd, argp);
+	default:
+		return -ENOTTY;
+	}
+}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
new file mode 100644
index 00000000000..47dd815433f
--- /dev/null
+++ b/fs/nilfs2/mdt.c
@@ -0,0 +1,563 @@
+/*
+ * mdt.c - meta data file for NILFS
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/mm.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/swap.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "page.h"
+#include "mdt.h"
+
+
+#define NILFS_MDT_MAX_RA_BLOCKS		(16 - 1)
+
+#define INIT_UNUSED_INODE_FIELDS
+
+static int
+nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
+			   struct buffer_head *bh,
+			   void (*init_block)(struct inode *,
+					      struct buffer_head *, void *))
+{
+	struct nilfs_bmap *bmap = NILFS_I(inode)->i_bmap;
+	void *page_addr;
+	int err;
+
+	/* The caller excludes read accesses by holding the page lock. */
+	/* set_buffer_new(bh); */
+	bh->b_blocknr = 0;
+
+	err = nilfs_bmap_insert(bmap, block, (unsigned long)bh);
+	if (unlikely(err))
+		return err;
+
+	set_buffer_mapped(bh);
+
+	/* zero-fill the block, then let the optional initializer format it */
+	page_addr = kmap_atomic(bh->b_page, KM_USER0);
+	memset(page_addr + bh_offset(bh), 0, 1 << inode->i_blkbits);
+	if (init_block)
+		init_block(inode, bh, page_addr);
+	flush_dcache_page(bh->b_page);
+	kunmap_atomic(page_addr, KM_USER0);
+
+	set_buffer_uptodate(bh);
+	nilfs_mark_buffer_dirty(bh);
+	nilfs_mdt_mark_dirty(inode);
+	return 0;
+}
+
+static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
+				  struct buffer_head **out_bh,
+				  void (*init_block)(struct inode *,
+						     struct buffer_head *,
+						     void *))
+{
+	struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
+	struct nilfs_sb_info *writer = NULL;
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_transaction_info ti;
+	struct buffer_head *bh;
+	int err;
+
+	if (!sb) {	/* no super block on this inode: use the writer's */
+		writer = nilfs_get_writer(nilfs);
+		if (!writer) {
+			err = -EROFS;
+			goto out;
+		}
+		sb = writer->s_super;
+	}
+
+	nilfs_transaction_begin(sb, &ti, 0);
+	/* the page of the grabbed buffer is unlocked/released at failed_bh */
+	err = -ENOMEM;
+	bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
+	if (unlikely(!bh))
+		goto failed_unlock;
+	/* an uptodate or mapped buffer is reported as -EEXIST, not re-created */
+	err = -EEXIST;
+	if (buffer_uptodate(bh) || buffer_mapped(bh))
+		goto failed_bh;
+#if 0
+	/* The uptodate flag is not protected by the page lock, but
+	   the mapped flag is.  Thus, we don't have to wait the buffer. */
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
+		goto failed_bh;
+#endif
+
+	bh->b_bdev = nilfs->ns_bdev;
+	err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
+	if (likely(!err)) {
+		get_bh(bh);	/* reference handed out through *out_bh */
+		*out_bh = bh;
+	}
+
+ failed_bh:
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	brelse(bh);
+
+ failed_unlock:
+	if (likely(!err))
+		err = nilfs_transaction_commit(sb);
+	else
+		nilfs_transaction_abort(sb);
+	if (writer)
+		nilfs_put_writer(nilfs);
+ out:
+	return err;
+}
+
+static int
+nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
+		       int mode, struct buffer_head **out_bh)
+{
+	struct buffer_head *bh;
+	unsigned long blknum = 0;
+	int ret = -ENOMEM;
+
+	bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
+	if (unlikely(!bh))
+		goto failed;
+	/* an uptodate buffer needs no I/O; -EEXIST reports that to the caller */
+	ret = -EEXIST; /* internal code */
+	if (buffer_uptodate(bh))
+		goto out;
+	/* readahead only tries the buffer lock; a plain read waits for it */
+	if (mode == READA) {
+		if (!trylock_buffer(bh)) {
+			ret = -EBUSY;
+			goto failed_bh;
+		}
+	} else /* mode == READ */
+		lock_buffer(bh);
+	/* re-check now that the buffer lock is held */
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		goto out;
+	}
+	if (!buffer_mapped(bh)) { /* unused buffer */
+		ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
+					&blknum);
+		if (unlikely(ret)) {
+			unlock_buffer(bh);
+			goto failed_bh;
+		}
+		bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
+		bh->b_blocknr = blknum;
+		set_buffer_mapped(bh);
+	}
+
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);	/* extra reference taken before submit_bh() */
+	submit_bh(mode, bh);
+	ret = 0;
+ out:
+	get_bh(bh);	/* reference handed out through *out_bh */
+	*out_bh = bh;
+
+ failed_bh:
+	unlock_page(bh->b_page);
+	page_cache_release(bh->b_page);
+	brelse(bh);
+ failed:
+	return ret;
+}
+
+static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
+				struct buffer_head **out_bh)
+{
+	struct buffer_head *first_bh, *bh;
+	unsigned long blkoff;
+	int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
+	int err;
+
+	err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
+	if (err == -EEXIST) /* internal code */
+		goto out;
+
+	if (unlikely(err))
+		goto failed;
+	/* read ahead up to NILFS_MDT_MAX_RA_BLOCKS blocks after @block */
+	blkoff = block + 1;
+	for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
+		err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
+		if (likely(!err || err == -EEXIST))
+			brelse(bh);
+		else if (err != -EBUSY)
+			break; /* abort readahead if bmap lookup failed */
+		/* stop reading ahead once the first buffer is no longer locked */
+		if (!buffer_locked(first_bh))
+			goto out_no_wait;
+	}
+
+	wait_on_buffer(first_bh);
+
+ out_no_wait:
+	err = -EIO;
+	if (!buffer_uptodate(first_bh))
+		goto failed_bh;
+ out:
+	*out_bh = first_bh;	/* the caller inherits the reference */
+	return 0;
+
+ failed_bh:
+	brelse(first_bh);
+ failed:
+	return err;
+}
+
+/**
+ * nilfs_mdt_get_block - read or create a buffer on meta data file.
+ * @inode: inode of the meta data file
+ * @blkoff: block offset
+ * @create: create flag
+ * @init_block: initializer used for newly allocated block
+ * @out_bh: output of a pointer to the buffer_head
+ *
+ * nilfs_mdt_get_block() looks up the specified buffer and tries to create
+ * a new buffer if @create is not zero.  On success, the returned buffer is
+ * assured to be either existing or formatted using a buffer lock.
+ * @out_bh is substituted only when zero is returned.
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ *
+ * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
+ *
+ * %-EROFS - Read only filesystem (for create mode)
+ */
+int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
+			void (*init_block)(struct inode *,
+					   struct buffer_head *, void *),
+			struct buffer_head **out_bh)
+{
+	int err;
+
+	/* Should be rewritten with merging nilfs_mdt_read_block() */
+	for (;;) {
+		err = nilfs_mdt_read_block(inode, blkoff, out_bh);
+		if (err != -ENOENT || !create)
+			break;	/* found, failed, or a hole without @create */
+
+		err = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
+		if (likely(err != -EEXIST))
+			break;
+		/* create = 0; */  /* limit read-create loop retries */
+	}
+	return err;
+}
+
+/**
+ * nilfs_mdt_delete_block - make a hole on the meta data file.
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * Return Value: On success, zero is returned.
+ * On error, one of the following negative error codes is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
+ */
+int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
+{
+	int ret = nilfs_bmap_delete(NILFS_I(inode)->i_bmap, block);
+
+	if (unlikely(ret))
+		return ret;
+
+	/* mapping removed: flag the file dirty and forget the cached block */
+	nilfs_mdt_mark_dirty(inode);
+	nilfs_mdt_forget_block(inode, block);
+	return 0;
+}
+
+/**
+ * nilfs_mdt_forget_block - discard dirty state and try to remove the page
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and
+ * tries to release the page including the buffer from a page cache.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EBUSY - page has an active buffer.
+ *
+ * %-ENOENT - page cache has no page addressed by the offset.
+ */
+int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
+{
+	const unsigned int shift = PAGE_CACHE_SHIFT - inode->i_blkbits;
+	pgoff_t index = (pgoff_t)block >> shift;
+	struct page *page;
+	int dirty;
+
+	page = find_lock_page(inode->i_mapping, index);
+	if (!page)
+		return -ENOENT;
+
+	wait_on_page_writeback(page);
+
+	if (page_has_buffers(page)) {
+		/* position of @block among the buffers of its page */
+		unsigned long nth = block - ((unsigned long)index << shift);
+
+		nilfs_forget_buffer(nilfs_page_get_nth_block(page, nth));
+	}
+
+	/* sample the dirty state before the page lock is dropped */
+	dirty = PageDirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+
+	/* -EBUSY if the page is still dirty or could not be invalidated */
+	if (dirty)
+		return -EBUSY;
+	if (invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
+		return -EBUSY;
+	return 0;
+}
+
+/**
+ * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
+ * @inode: inode of the meta data file
+ * @block: block offset
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ *
+ * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
+ */
+int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
+{
+	struct buffer_head *bh;
+	int ret = nilfs_mdt_read_block(inode, block, &bh);
+
+	if (likely(!ret)) {
+		/* dirty both the buffer and the meta data file itself */
+		nilfs_mark_buffer_dirty(bh);
+		nilfs_mdt_mark_dirty(inode);
+		brelse(bh);
+	}
+	return ret;
+}
+
+int nilfs_mdt_fetch_dirty(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	/* bmap was not dirty: just report the inode's own dirty flag */
+	if (!nilfs_bmap_test_and_clear_dirty(ii->i_bmap))
+		return test_bit(NILFS_I_DIRTY, &ii->i_state);
+	set_bit(NILFS_I_DIRTY, &ii->i_state);
+	return 1;
+}
+
+static int
+nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
+{
+	struct inode *inode = container_of(page->mapping,
+					   struct inode, i_data);
+	struct super_block *sb = inode->i_sb;
+	struct nilfs_sb_info *writer = NULL;
+	int err = 0;
+
+	redirty_page_for_writepage(wbc, page);	/* the page stays dirty */
+	unlock_page(page);
+
+	if (page->mapping->assoc_mapping)
+		return 0; /* Do not request flush for shadow page cache */
+	if (!sb) {	/* no super block on this inode: use the writer's */
+		writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
+		if (!writer)
+			return -EROFS;
+		sb = writer->s_super;
+	}
+	/* sync writeback constructs a segment; reclaim requests a flush */
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		err = nilfs_construct_segment(sb);
+	else if (wbc->for_reclaim)
+		nilfs_flush_segment(sb, inode->i_ino);
+
+	if (writer)
+		nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
+	return err;
+}
+
+
+static struct address_space_operations def_mdt_aops = {
+	.writepage		= nilfs_mdt_write_page,
+};
+
+static struct inode_operations def_mdt_iops;	/* no methods set */
+static struct file_operations def_mdt_fops;	/* no methods set */
+
+/*
+ * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
+ * ifile, or gcinodes.  This allows the B-tree code and segment constructor
+ * to treat them like regular files, and this helps to simplify the
+ * implementation.
+ *   On the other hand, some of the pseudo inodes have an irregular point:
+ * They don't have valid inode->i_sb pointer because their lifetimes are
+ * longer than those of the super block structs; they may continue for
+ * several consecutive mounts/umounts.  This would need discussions.
+ */
+struct inode *
+nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
+		     ino_t ino, gfp_t gfp_mask)
+{
+	struct inode *inode = nilfs_alloc_inode(sb);
+	struct address_space *mapping;
+	struct nilfs_mdt_info *mi;
+
+	if (!inode)
+		return NULL;
+
+	mi = kzalloc(sizeof(*mi), GFP_NOFS);
+	if (!mi) {
+		nilfs_destroy_inode(inode);
+		return NULL;
+	}
+	mi->mi_nilfs = nilfs;
+	init_rwsem(&mi->mi_sem);
+
+	inode->i_sb = sb; /* sb may be NULL for some meta data files */
+	inode->i_blkbits = nilfs->ns_blocksize_bits;
+	inode->i_flags = 0;
+	atomic_set(&inode->i_count, 1);
+	inode->i_nlink = 1;
+	inode->i_ino = ino;
+	inode->i_mode = S_IFREG;
+	inode->i_private = mi;
+
+#ifdef INIT_UNUSED_INODE_FIELDS
+	atomic_set(&inode->i_writecount, 0);
+	inode->i_size = 0;
+	inode->i_blocks = 0;
+	inode->i_bytes = 0;
+	inode->i_generation = 0;
+#ifdef CONFIG_QUOTA
+	memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+#endif
+	inode->i_pipe = NULL;
+	inode->i_bdev = NULL;
+	inode->i_cdev = NULL;
+	inode->i_rdev = 0;
+#ifdef CONFIG_SECURITY
+	inode->i_security = NULL;
+#endif
+	inode->dirtied_when = 0;
+
+	INIT_LIST_HEAD(&inode->i_list);
+	INIT_LIST_HEAD(&inode->i_sb_list);
+	inode->i_state = 0;
+#endif
+
+	spin_lock_init(&inode->i_lock);
+	mutex_init(&inode->i_mutex);
+	init_rwsem(&inode->i_alloc_sem);
+
+	/* set up the embedded page cache; note that host is left NULL */
+	mapping = &inode->i_data;
+	mapping->host = NULL;  /* instead of inode */
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, gfp_mask);
+	mapping->assoc_mapping = NULL;
+	mapping->backing_dev_info = nilfs->ns_bdi;
+
+	inode->i_mapping = mapping;
+	return inode;
+}
+
+struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
+			    ino_t ino, gfp_t gfp_mask)
+{
+	struct inode *mdt = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
+
+	if (mdt) {
+		/* attach the default operation tables of meta data files */
+		mdt->i_op = &def_mdt_iops;
+		mdt->i_fop = &def_mdt_fops;
+		mdt->i_mapping->a_ops = &def_mdt_aops;
+	}
+	return mdt;
+}
+
+void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
+			      unsigned header_size)
+{
+	struct nilfs_mdt_info *info = NILFS_MDT(inode);
+
+	info->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
+	info->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
+	info->mi_entry_size = entry_size;	/* bytes per entry */
+}
+
+void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
+{
+	NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
+		&NILFS_I(orig)->i_btnode_cache;	/* B-tree node cache */
+	shadow->i_mapping->assoc_mapping = orig->i_mapping; /* page cache */
+}
+
+void nilfs_mdt_clear(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+	invalidate_mapping_pages(mapping, 0, -1);
+	truncate_inode_pages(mapping, 0);	/* then drop whatever is left */
+	nilfs_bmap_clear(ii->i_bmap);
+	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+}
+
+void nilfs_mdt_destroy(struct inode *inode)
+{
+	struct nilfs_mdt_info *mi = inode->i_private;
+
+	kfree(mi->mi_bgl);	/* may be NULL, which kfree() accepts */
+	kfree(mi);
+	nilfs_destroy_inode(inode);
+}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
new file mode 100644
index 00000000000..df683e0bca6
--- /dev/null
+++ b/fs/nilfs2/mdt.h
@@ -0,0 +1,125 @@
+/*
+ * mdt.h - NILFS meta data file prototype and definitions
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#ifndef _NILFS_MDT_H
+#define _NILFS_MDT_H
+
+#include <linux/buffer_head.h>
+#include <linux/blockgroup_lock.h>
+#include "nilfs.h"
+#include "page.h"
+
+/**
+ * struct nilfs_mdt_info - on-memory private data of meta data files
+ * @mi_nilfs: back pointer to the_nilfs struct
+ * @mi_sem: reader/writer semaphore for meta data operations
+ * @mi_bgl: per-blockgroup locking (NULL until a lock array is attached)
+ * @mi_entry_size: size of an entry in bytes
+ * @mi_first_entry_offset: offset to the first entry, counted in entries
+ * @mi_entries_per_block: number of entries in a block
+ * @mi_blocks_per_group: number of blocks in a group
+ * @mi_blocks_per_desc_block: number of blocks per descriptor block
+ */
+struct nilfs_mdt_info {
+	struct the_nilfs       *mi_nilfs;
+	struct rw_semaphore	mi_sem;
+	struct blockgroup_lock *mi_bgl;
+	unsigned		mi_entry_size;
+	unsigned		mi_first_entry_offset;
+	unsigned long		mi_entries_per_block;
+	unsigned long		mi_blocks_per_group;
+	unsigned long		mi_blocks_per_desc_block;
+};
+
+static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
+{
+	return inode->i_private;	/* the mdt info hangs off i_private */
+}
+
+static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
+{
+	if (!inode->i_sb)	/* meta data file without a super block */
+		return NILFS_MDT(inode)->mi_nilfs;
+	return NILFS_SB(inode->i_sb)->s_nilfs;
+}
+
+/* Default GFP flags using highmem */
+#define NILFS_MDT_GFP      (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+
+int nilfs_mdt_get_block(struct inode *, unsigned long, int,
+			void (*init_block)(struct inode *,
+					   struct buffer_head *, void *),
+			struct buffer_head **);
+int nilfs_mdt_delete_block(struct inode *, unsigned long);
+int nilfs_mdt_forget_block(struct inode *, unsigned long);
+int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
+int nilfs_mdt_fetch_dirty(struct inode *);
+
+struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
+			    gfp_t);
+struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
+				   ino_t, gfp_t);
+void nilfs_mdt_destroy(struct inode *);
+void nilfs_mdt_clear(struct inode *);
+void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
+void nilfs_mdt_set_shadow(struct inode *, struct inode *);
+
+
+#define nilfs_mdt_mark_buffer_dirty(bh)	nilfs_mark_buffer_dirty(bh)
+
+static inline void nilfs_mdt_mark_dirty(struct inode *inode)
+{
+	if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) /* unset? */
+		set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
+}
+
+static inline void nilfs_mdt_clear_dirty(struct inode *inode)
+{
+	clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); /* unconditional */
+}
+
+static inline __u64 nilfs_mdt_cno(struct inode *inode)
+{
+	return NILFS_MDT(inode)->mi_nilfs->ns_cno; /* no lock taken here */
+}
+
+#define nilfs_mdt_bgl_lock(inode, bg) \
+	(&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
+
+
+static inline int nilfs_mdt_read_inode_direct(struct inode *inode,
+					struct buffer_head *bh, unsigned n)
+{
+	/* the struct nilfs_inode sits @n bytes into the buffer's data */
+	return nilfs_read_inode_common(inode,
+				       (struct nilfs_inode *)(bh->b_data + n));
+}
+
+static inline void nilfs_mdt_write_inode_direct(struct inode *inode,
+					struct buffer_head *bh, unsigned n)
+{
+	/* the struct nilfs_inode sits @n bytes into the buffer's data */
+	nilfs_write_inode_common(inode,
+				 (struct nilfs_inode *)(bh->b_data + n), 1);
+}
+
+#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
new file mode 100644
index 00000000000..df70dadb336
--- /dev/null
+++ b/fs/nilfs2/namei.c
@@ -0,0 +1,474 @@
+/*
+ * namei.c - NILFS pathname lookup operations.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
+ *                       Ryusuke Konishi <ryusuke@osrg.net>
+ */
+/*
+ *  linux/fs/ext2/namei.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/namei.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/pagemap.h>
+#include "nilfs.h"
+
+
+static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int ret = nilfs_add_link(dentry, inode);
+	if (ret) {	/* failed: give back the link count and the inode */
+		inode_dec_link_count(inode);
+		iput(inode);
+		return ret;
+	}
+	d_instantiate(dentry, inode);
+	return 0;
+}
+
+/*
+ * Methods themselves.
+ */
+
+static struct dentry *
+nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	struct inode *inode = NULL;
+	ino_t ino;
+
+	if (dentry->d_name.len > NILFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	/* ino == 0: name not found, so d_splice_alias() gets a NULL inode */
+	ino = nilfs_inode_by_name(dir, dentry);
+	if (ino) {
+		inode = nilfs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+struct dentry *nilfs_get_parent(struct dentry *child)
+{
+	struct inode *dir = child->d_inode;
+	struct inode *parent;
+	struct dentry dotdot;
+	unsigned long ino;
+
+	/* only d_name of this on-stack dentry is filled in for the lookup */
+	dotdot.d_name.name = "..";
+	dotdot.d_name.len = 2;
+
+	ino = nilfs_inode_by_name(dir, &dotdot);
+	if (!ino)
+		return ERR_PTR(-ENOENT);
+
+	parent = nilfs_iget(dir->i_sb, ino);
+	return IS_ERR(parent) ? ERR_CAST(parent) : d_obtain_alias(parent);
+}
+
+/*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+ * is so far negative - it has no inode.
+ *
+ * If the create succeeds, we fill in the inode information
+ * with d_instantiate().
+ */
+static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
+			struct nameidata *nd)
+{
+	struct nilfs_transaction_info ti;
+	struct inode *inode;
+	int err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+
+	if (err)
+		return err;
+
+	inode = nilfs_new_inode(dir, mode);
+	if (IS_ERR(inode)) {	/* no inode: just cancel the transaction */
+		nilfs_transaction_abort(dir->i_sb);
+		return PTR_ERR(inode);
+	}
+	inode->i_op = &nilfs_file_inode_operations;
+	inode->i_fop = &nilfs_file_operations;
+	inode->i_mapping->a_ops = &nilfs_aops;
+	mark_inode_dirty(inode);
+	err = nilfs_add_nondir(dentry, inode);
+	if (err)
+		nilfs_transaction_abort(dir->i_sb);
+	else
+		err = nilfs_transaction_commit(dir->i_sb);
+	return err;
+}
+
+static int
+nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
+{
+	struct inode *inode;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+	inode = nilfs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		/* give the node the nilfs setattr/permission methods */
+		inode->i_op = &nilfs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, rdev);
+		mark_inode_dirty(inode);
+		err = nilfs_add_nondir(dentry, inode);
+	}
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+	return err;
+}
+
+static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct nilfs_transaction_info ti;
+	struct super_block *sb = dir->i_sb;
+	unsigned l = strlen(symname)+1;	/* length including the NUL */
+	struct inode *inode;
+	int err;
+
+	if (l > sb->s_blocksize)	/* target must fit in one block */
+		return -ENAMETOOLONG;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+
+	inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	/* slow symlink */
+	inode->i_op = &nilfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &nilfs_aops;
+	err = page_symlink(inode, symname, l);
+	if (err)
+		goto out_fail;
+
+	/* mark_inode_dirty(inode); */
+	/* nilfs_new_inode() and page_symlink() do this */
+
+	err = nilfs_add_nondir(dentry, inode);
+out:
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	iput(inode);
+	goto out;	/* err is non-zero here, so the transaction aborts */
+}
+
+static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct inode *target = old_dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct nilfs_transaction_info ti;
+	int ret;
+
+	if (target->i_nlink >= NILFS_LINK_MAX)
+		return -EMLINK;
+
+	ret = nilfs_transaction_begin(sb, &ti, 1);
+	if (ret)
+		return ret;
+
+	/* the new name takes its own link count and inode reference */
+	target->i_ctime = CURRENT_TIME;
+	inode_inc_link_count(target);
+	atomic_inc(&target->i_count);
+	ret = nilfs_add_nondir(dentry, target);
+	if (ret) {
+		nilfs_transaction_abort(sb);
+		return ret;
+	}
+	return nilfs_transaction_commit(sb);
+}
+
+static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	if (dir->i_nlink >= NILFS_LINK_MAX)
+		return -EMLINK;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
+	if (err)
+		return err;
+
+	inode_inc_link_count(dir);	/* undone at out_dir on failure */
+
+	inode = nilfs_new_inode(dir, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &nilfs_dir_inode_operations;
+	inode->i_fop = &nilfs_dir_operations;
+	inode->i_mapping->a_ops = &nilfs_aops;
+
+	inode_inc_link_count(inode);
+
+	err = nilfs_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = nilfs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+out:
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);	/* undo the increment made above */
+	inode_dec_link_count(inode);	/* NOTE(review): presumably the initial link */
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);	/* undo the increment made above */
+	goto out;
+}
+
+static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode;
+	struct nilfs_dir_entry *de;
+	struct page *page;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
+	if (err)
+		return err;
+
+	err = -ENOENT;
+	de = nilfs_find_entry(dir, dentry, &page);
+	if (!de)
+		goto out;
+
+	inode = dentry->d_inode;
+	err = -EIO;
+	if (le64_to_cpu(de->inode) != inode->i_ino) {
+		/* nilfs_delete_entry() is skipped: release the page here */
+		kunmap(page);
+		page_cache_release(page);
+		goto out;
+	}
+	if (!inode->i_nlink) {
+		nilfs_warning(inode->i_sb, __func__,
+			      "deleting nonexistent file (%lu), %d\n",
+			      inode->i_ino, inode->i_nlink);
+		inode->i_nlink = 1;
+	}
+	err = nilfs_delete_entry(de, page);
+	if (err)
+		goto out;
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+out:
+	if (!err)
+		err = nilfs_transaction_commit(dir->i_sb);
+	else
+		nilfs_transaction_abort(dir->i_sb);
+	return err;
+}
+
+static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *victim = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(sb, &ti, 0);
+	if (err)
+		return err;
+	/* nilfs_unlink() begins its own transaction inside this one */
+	if (nilfs_empty_dir(victim))
+		err = nilfs_unlink(dir, dentry);
+	else
+		err = -ENOTEMPTY;
+	if (err) {
+		nilfs_transaction_abort(sb);
+		return err;
+	}
+
+	/* nilfs_unlink() dropped one link; drop one more, and one of @dir */
+	victim->i_size = 0;
+	inode_dec_link_count(victim);
+	inode_dec_link_count(dir);
+	return nilfs_transaction_commit(sb);
+}
+
+static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			struct inode *new_dir,	struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct page *dir_page = NULL;
+	struct nilfs_dir_entry *dir_de = NULL;
+	struct page *old_page;
+	struct nilfs_dir_entry *old_de;
+	struct nilfs_transaction_info ti;
+	int err;
+
+	err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
+	if (unlikely(err))
+		return err;
+
+	err = -ENOENT;
+	old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
+	if (!old_de)
+		goto out;
+	/* moving a directory: fetch its ".." entry, re-pointed further below */
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = nilfs_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {	/* the destination name already exists */
+		struct page *new_page;
+		struct nilfs_dir_entry *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !nilfs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
+		if (!new_de)
+			goto out_dir;
+		inode_inc_link_count(old_inode);
+		nilfs_set_link(new_dir, new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+	} else {
+		if (dir_de) {
+			err = -EMLINK;
+			if (new_dir->i_nlink >= NILFS_LINK_MAX)
+				goto out_dir;
+		}
+		inode_inc_link_count(old_inode);
+		err = nilfs_add_link(new_dentry, old_inode);
+		if (err) {
+			inode_dec_link_count(old_inode);
+			goto out_dir;
+		}
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	/*
+	 * Like most other Unix systems, set the ctime for inodes on a
+	 * rename.
+	 * inode_dec_link_count() will mark the inode dirty.
+	 */
+	old_inode->i_ctime = CURRENT_TIME;
+
+	nilfs_delete_entry(old_de, old_page);	/* NOTE(review): result ignored */
+	inode_dec_link_count(old_inode);
+
+	if (dir_de) {
+		nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+	}
+
+	err = nilfs_transaction_commit(old_dir->i_sb);
+	return err;
+	/* error unwinding: release the pages obtained by the lookups above */
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	nilfs_transaction_abort(old_dir->i_sb);
+	return err;
+}
+
+struct inode_operations nilfs_dir_inode_operations = {	/* directories */
+	.create		= nilfs_create,
+	.lookup		= nilfs_lookup,
+	.link		= nilfs_link,
+	.unlink		= nilfs_unlink,
+	.symlink	= nilfs_symlink,
+	.mkdir		= nilfs_mkdir,
+	.rmdir		= nilfs_rmdir,
+	.mknod		= nilfs_mknod,
+	.rename		= nilfs_rename,
+	.setattr	= nilfs_setattr,
+	.permission	= nilfs_permission,
+};
+
+struct inode_operations nilfs_special_inode_operations = {
+	.setattr	= nilfs_setattr,
+	.permission	= nilfs_permission,
+};
+
+struct inode_operations nilfs_symlink_inode_operations = { /* slow symlinks */
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
new file mode 100644
index 00000000000..3d0c18a16db
--- /dev/null
+++ b/fs/nilfs2/nilfs.h
@@ -0,0 +1,313 @@
+/*
+ * nilfs.h - NILFS local header file.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>
+ *            Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#ifndef _NILFS_H
+#define _NILFS_H
+
+#include <linux/kernel.h>
+#include <linux/buffer_head.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/nilfs2_fs.h>
+#include "the_nilfs.h"
+#include "sb.h"
+#include "bmap.h"
+#include "bmap_union.h"
+
+/*
+ * nilfs inode data in memory; embeds the VFS inode (see NILFS_I() below)
+ */
+struct nilfs_inode_info {
+	__u32 i_flags;
+	unsigned long  i_state;		/* Dynamic state flags */
+	struct nilfs_bmap *i_bmap;
+	union nilfs_bmap_union i_bmap_union;
+	__u64 i_xattr;	/* sector_t ??? */
+	__u32 i_dir_start_lookup;
+	__u64 i_cno;		/* check point number for GC inode */
+	struct address_space i_btnode_cache;	/* see NILFS_BTNC_I() */
+	struct list_head i_dirty;	/* List for connecting dirty files */
+
+#ifdef CONFIG_NILFS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_sem even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
+#ifdef CONFIG_NILFS_POSIX_ACL
+	struct posix_acl *i_acl;
+	struct posix_acl *i_default_acl;
+#endif
+	struct buffer_head *i_bh;	/* i_bh contains a new or dirty
+					   disk inode */
+	struct inode vfs_inode;		/* container_of() anchor for NILFS_I() */
+};
+
+static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
+{
+	return container_of(inode, struct nilfs_inode_info, vfs_inode);
+}
+
+static inline struct nilfs_inode_info *
+NILFS_BMAP_I(const struct nilfs_bmap *bmap)
+{
+	union nilfs_bmap_union *bu = (union nilfs_bmap_union *)bmap;
+
+	return container_of(bu, struct nilfs_inode_info, i_bmap_union);
+}
+
+static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
+{
+	/* btnc is taken to be the i_btnode_cache member of an inode info */
+	return &container_of(btnc, struct nilfs_inode_info,
+			     i_btnode_cache)->vfs_inode;
+}
+
+static inline struct inode *NILFS_AS_I(struct address_space *mapping)
+{
+	struct inode *host = mapping->host;	/* NULL: fall back to i_data */
+	return host ? host : container_of(mapping, struct inode, i_data);
+}
+
+/*
+ * Dynamic state flags of NILFS on-memory inode (i_state)
+ */
+enum {
+	NILFS_I_NEW = 0,		/* Inode is newly created */
+	NILFS_I_DIRTY,			/* The file is dirty */
+	NILFS_I_QUEUED,			/* inode is in dirty_files list */
+	NILFS_I_BUSY,			/* inode is grabbed by a segment
+					   constructor */
+	NILFS_I_COLLECTED,		/* All dirty blocks are collected */
+	NILFS_I_UPDATED,		/* The file has been written back */
+	NILFS_I_INODE_DIRTY,		/* write_inode is requested */
+	NILFS_I_BMAP,			/* has bmap and btnode_cache */
+	NILFS_I_GCINODE,		/* inode for GC, on memory only */
+	NILFS_I_GCDAT,			/* shadow DAT, on memory only */
+};
+
+/*
+ * Macros to check inode numbers (bit tests assume system inos below 32)
+ */
+#define NILFS_MDT_INO_BITS   \
+  ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO |		\
+		  1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO |	\
+		  1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
+
+#define NILFS_SYS_INO_BITS   \
+  ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
+
+#define NILFS_FIRST_INO(sb)  (NILFS_SB(sb)->s_nilfs->ns_first_ino)
+
+#define NILFS_MDT_INODE(sb, ino) \
+  ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
+#define NILFS_VALID_INODE(sb, ino) \
+  ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
+
+/**
+ * struct nilfs_transaction_info: context information for synchronization
+ * @ti_magic: Magic number (NILFS_TI_MAGIC)
+ * @ti_save: Backup of journal_info field of task_struct
+ * @ti_flags: Flags (NILFS_TI_* bits)
+ * @ti_count: Nest level
+ * @ti_garbage:	List of inodes to be put when releasing semaphore
+ */
+struct nilfs_transaction_info {
+	u32			ti_magic;
+	void		       *ti_save;
+				/* This should never be used. If it is,
+				   one of the other filesystems has a bug. */
+	unsigned short		ti_flags;
+	unsigned short		ti_count;
+	struct list_head	ti_garbage;
+};
+
+/* ti_magic */
+#define NILFS_TI_MAGIC		0xd9e392fb
+
+/* ti_flags */
+#define NILFS_TI_DYNAMIC_ALLOC	0x0001  /* Allocated from slab */
+#define NILFS_TI_SYNC		0x0002	/* Force to construct segment at the
+					   end of transaction. */
+#define NILFS_TI_GC		0x0004	/* GC context */
+#define NILFS_TI_COMMIT		0x0008	/* Change happened or not */
+#define NILFS_TI_WRITER		0x0010	/* Constructor context */
+
+
+int nilfs_transaction_begin(struct super_block *,
+			    struct nilfs_transaction_info *, int);
+int nilfs_transaction_commit(struct super_block *);
+void nilfs_transaction_abort(struct super_block *);
+
+static inline void nilfs_set_transaction_flag(unsigned int flag)
+{
+	struct nilfs_transaction_info *info = current->journal_info;
+
+	info->ti_flags = info->ti_flags | flag;	/* no NULL/magic check */
+}
+
+static inline int nilfs_test_transaction_flag(unsigned int flag)
+{
+	struct nilfs_transaction_info *info = current->journal_info;
+
+	if (info != NULL && info->ti_magic == NILFS_TI_MAGIC)
+		return (info->ti_flags & flag) != 0;
+	return 0;	/* journal_info is unset or lacks the NILFS magic */
+}
+
+static inline int nilfs_doing_gc(void)
+{
+	return nilfs_test_transaction_flag(NILFS_TI_GC); /* 0 if no NILFS ti */
+}
+
+static inline int nilfs_doing_construction(void)
+{
+	return nilfs_test_transaction_flag(NILFS_TI_WRITER); /* 0 if no ti */
+}
+
+static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
+{
+	return !nilfs_doing_gc() ? nilfs->ns_dat : nilfs->ns_gc_dat;
+}
+
+/*
+ * function prototype
+ */
+#ifdef CONFIG_NILFS_POSIX_ACL
+#error "NILFS: not yet supported POSIX ACL"
+extern int nilfs_permission(struct inode *, int, struct nameidata *);
+extern int nilfs_acl_chmod(struct inode *);
+extern int nilfs_init_acl(struct inode *, struct inode *);
+#else
+#define nilfs_permission   NULL
+
+static inline int nilfs_acl_chmod(struct inode *inode)
+{
+	return 0;	/* stub used when CONFIG_NILFS_POSIX_ACL is not set */
+}
+
+static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	inode->i_mode = inode->i_mode & ~current_umask(); /* umask only */
+	return 0;
+}
+#endif
+
+#define NILFS_ATIME_DISABLE
+
+/* dir.c */
+extern int nilfs_add_link(struct dentry *, struct inode *);
+extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
+extern int nilfs_make_empty(struct inode *, struct inode *);
+extern struct nilfs_dir_entry *
+nilfs_find_entry(struct inode *, struct dentry *, struct page **);
+extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
+extern int nilfs_empty_dir(struct inode *);
+extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
+extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
+			   struct page *, struct inode *);
+
+/* file.c */
+extern int nilfs_sync_file(struct file *, struct dentry *, int);
+
+/* ioctl.c */
+long nilfs_ioctl(struct file *, unsigned int, unsigned long);
+int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *);
+
+/* inode.c */
+extern struct inode *nilfs_new_inode(struct inode *, int);
+extern void nilfs_free_inode(struct inode *);
+extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
+extern void nilfs_set_inode_flags(struct inode *);
+extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
+extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+extern struct inode *nilfs_iget(struct super_block *, unsigned long);
+extern void nilfs_update_inode(struct inode *, struct buffer_head *);
+extern void nilfs_truncate(struct inode *);
+extern void nilfs_delete_inode(struct inode *);
+extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
+				  struct buffer_head **);
+extern int nilfs_inode_dirty(struct inode *);
+extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
+				unsigned);
+extern int nilfs_mark_inode_dirty(struct inode *);
+extern void nilfs_dirty_inode(struct inode *);
+
+/* namei.c */
+extern struct dentry *nilfs_get_parent(struct dentry *);
+
+/* super.c */
+extern struct inode *nilfs_alloc_inode(struct super_block *);
+extern void nilfs_destroy_inode(struct inode *);
+extern void nilfs_error(struct super_block *, const char *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
+extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
+extern struct nilfs_super_block *
+nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
+extern int nilfs_store_magic_and_option(struct super_block *,
+					struct nilfs_super_block *, char *);
+extern int nilfs_commit_super(struct nilfs_sb_info *, int);
+extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
+extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
+
+/* gcinode.c */
+int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
+				   struct buffer_head **);
+int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
+				   struct buffer_head **);
+int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
+int nilfs_init_gccache(struct the_nilfs *);
+void nilfs_destroy_gccache(struct the_nilfs *);
+void nilfs_clear_gcinode(struct inode *);
+struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
+void nilfs_remove_all_gcinode(struct the_nilfs *);
+
+/* gcdat.c */
+int nilfs_init_gcdat_inode(struct the_nilfs *);
+void nilfs_commit_gcdat_inode(struct the_nilfs *);
+void nilfs_clear_gcdat_inode(struct the_nilfs *);
+
+/*
+ * Inodes and files operations
+ */
+extern struct file_operations nilfs_dir_operations;
+extern struct inode_operations nilfs_file_inode_operations;
+extern struct file_operations nilfs_file_operations;
+extern struct address_space_operations nilfs_aops;
+extern struct inode_operations nilfs_dir_inode_operations;
+extern struct inode_operations nilfs_special_inode_operations;
+extern struct inode_operations nilfs_symlink_inode_operations;
+
+/*
+ * filesystem type
+ */
+extern struct file_system_type nilfs_fs_type;
+
+
+#endif	/* _NILFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
new file mode 100644
index 00000000000..1bfbba9c0e9
--- /dev/null
+++ b/fs/nilfs2/page.c
@@ -0,0 +1,540 @@
+/*
+ * page.c - buffer/page management specific to NILFS
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>,
+ *            Seiji Kihara <kihara@osrg.net>.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/swap.h>
+#include <linux/bitops.h>
+#include <linux/page-flags.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/pagevec.h>
+#include "nilfs.h"
+#include "page.h"
+#include "mdt.h"
+
+
+#define NILFS_BUFFER_INHERENT_BITS  \
+	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
+
+static struct buffer_head *
+__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
+		       int blkbits, unsigned long b_state)
+
+{
+	unsigned long page_base;	/* block number of the page's buffer 0 */
+	struct buffer_head *bh;
+
+	page_base = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << blkbits, b_state);
+	bh = nilfs_page_get_nth_block(page, block - page_base);
+
+	touch_buffer(bh);
+	wait_on_buffer(bh);
+	return bh;
+}
+
+/*
+ * B-tree node page caches and the data page caches of pseudo inodes have
+ * no valid mapping->host pointer.  mark_buffer_dirty() on their buffers
+ * would call __mark_inode_dirty(NULL) through __set_page_dirty() and
+ * dereference NULL, so the old style mark_buffer_dirty() is used instead.
+ */
+void nilfs_mark_buffer_dirty(struct buffer_head *bh)
+{
+	if (buffer_dirty(bh) || test_set_buffer_dirty(bh))
+		return;
+	__set_page_dirty_nobuffers(bh->b_page);
+}
+
+struct buffer_head *nilfs_grab_buffer(struct inode *inode,
+				      struct address_space *mapping,
+				      unsigned long blkoff,
+				      unsigned long b_state)
+{
+	int blkbits = inode->i_blkbits;
+	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
+	struct page *page, *opage;	/* opage: page of the original cache */
+	struct buffer_head *bh, *obh;	/* obh: buffer on opage */
+
+	page = grab_cache_page(mapping, index);
+	if (unlikely(!page))
+		return NULL;
+
+	bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
+	if (unlikely(!bh)) {	/* NOTE(review): helper looks never-NULL */
+		unlock_page(page);
+		page_cache_release(page);
+		return NULL;
+	}
+	if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
+		/*
+		 * A shadow page cache uses assoc_mapping to point to its
+		 * original page cache.  The following code tries the original
+		 * cache if the given cache is a shadow and it didn't hit.
+		 */
+		opage = find_lock_page(mapping->assoc_mapping, index);
+		if (!opage)
+			return bh;
+
+		obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
+					     b_state);
+		if (buffer_uptodate(obh)) {
+			nilfs_copy_buffer(bh, obh);
+			if (buffer_dirty(obh)) {
+				nilfs_mark_buffer_dirty(bh);
+				if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
+					nilfs_mdt_mark_dirty(inode);
+			}
+		}
+		brelse(obh);
+		unlock_page(opage);
+		page_cache_release(opage);
+	}
+	return bh;	/* page is neither unlocked nor released here */
+}
+
+/**
+ * nilfs_forget_buffer - discard dirty state
+ * @bh: buffer head of the buffer to be discarded; it is invalidated and
+ *      the reference held on it is dropped with brelse() before returning
+ */
+void nilfs_forget_buffer(struct buffer_head *bh)
+{
+	struct page *page = bh->b_page;
+
+	lock_buffer(bh);
+	clear_buffer_nilfs_volatile(bh);
+	if (test_clear_buffer_dirty(bh) && nilfs_page_buffers_clean(page))
+		__nilfs_clear_page_dirty(page);	/* last dirty buffer is gone */
+
+	clear_buffer_uptodate(bh);
+	clear_buffer_mapped(bh);
+	bh->b_blocknr = -1;
+	ClearPageUptodate(page);
+	ClearPageMappedToDisk(page);
+	unlock_buffer(bh);
+	brelse(bh);
+}
+
+/**
+ * nilfs_copy_buffer -- copy buffer data and flags
+ * @dbh: destination buffer
+ * @sbh: source buffer
+ */
+void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
+{
+	struct page *spage = sbh->b_page, *dpage = dbh->b_page;
+	struct buffer_head *cur;
+	void *src, *dst;
+	unsigned long bits;
+
+	src = kmap_atomic(spage, KM_USER0);
+	dst = kmap_atomic(dpage, KM_USER1);
+	memcpy(dst + bh_offset(dbh), src + bh_offset(sbh), sbh->b_size);
+	kunmap_atomic(dst, KM_USER1);
+	kunmap_atomic(src, KM_USER0);
+
+	dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
+	dbh->b_blocknr = sbh->b_blocknr;
+	dbh->b_bdev = sbh->b_bdev;
+
+	/* a page flag is set only if the bit is set in all of its buffers */
+	bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
+	for (cur = dbh->b_this_page; cur != dbh; cur = cur->b_this_page) {
+		lock_buffer(cur);
+		bits &= cur->b_state;
+		unlock_buffer(cur);
+	}
+	if (bits & (1UL << BH_Uptodate))
+		SetPageUptodate(dpage);
+	else
+		ClearPageUptodate(dpage);
+	if (bits & (1UL << BH_Mapped))
+		SetPageMappedToDisk(dpage);
+	else
+		ClearPageMappedToDisk(dpage);
+}
+
+/**
+ * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
+ * @page: page to be checked
+ *
+ * Returns zero as soon as a dirty buffer is found on the page, and a
+ * non-zero value when none of the page's buffers is dirty.
+ */
+int nilfs_page_buffers_clean(struct page *page)
+{
+	struct buffer_head *head = page_buffers(page);
+	struct buffer_head *bh = head;
+
+	do {
+		if (buffer_dirty(bh))
+			return 0;
+	} while ((bh = bh->b_this_page) != head);
+	/* walked the whole buffer ring without meeting a dirty buffer */
+	return 1;
+}
+
+void nilfs_page_bug(struct page *page)
+{
+	struct address_space *m;
+	unsigned long ino = 0;	/* stays 0 unless an inode is found */
+
+	if (unlikely(!page)) {
+		printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
+		return;
+	}
+
+	m = page->mapping;
+	if (m) {
+		struct inode *inode = NILFS_AS_I(m);
+		if (inode != NULL)
+			ino = inode->i_ino;
+	}
+	printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
+	       "mapping=%p ino=%lu\n",
+	       page, atomic_read(&page->_count),
+	       (unsigned long long)page->index, page->flags, m, ino);
+
+	if (page_has_buffers(page)) {	/* dump each buffer head of the page */
+		struct buffer_head *bh, *head;
+		int i = 0;
+
+		bh = head = page_buffers(page);
+		do {
+			printk(KERN_CRIT
+			       " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
+			       i++, bh, atomic_read(&bh->b_count),
+			       (unsigned long long)bh->b_blocknr, bh->b_state);
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+}
+
+/**
+ * nilfs_alloc_private_page - allocate a private page with buffer heads
+ *
+ * Return Value: On success, a pointer to the allocated page is returned;
+ * the lock taken with lock_page() is still held.  On error, NULL is returned.
+ */
+struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
+				      unsigned long state)
+{
+	struct buffer_head *bh, *head, *tail;
+	struct page *page;
+
+	page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
+	if (unlikely(!page))
+		return NULL;
+
+	lock_page(page);
+	head = alloc_page_buffers(page, size, 0);
+	if (unlikely(!head)) {
+		unlock_page(page);
+		__free_page(page);
+		return NULL;
+	}
+
+	bh = head;
+	do {
+		bh->b_state = (1UL << BH_NILFS_Allocated) | state;
+		tail = bh;	/* remember the last buffer visited */
+		bh->b_bdev = bdev;
+		bh = bh->b_this_page;
+	} while (bh);
+
+	tail->b_this_page = head;	/* close the list into a ring */
+	attach_page_buffers(page, head);
+
+	return page;
+}
+
+void nilfs_free_private_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));	/* caller must hold the page lock */
+	BUG_ON(page->mapping);		/* page must not be in a page cache */
+
+	if (page_has_buffers(page) && !try_to_free_buffers(page))
+		NILFS_PAGE_BUG(page, "failed to free page");
+
+	unlock_page(page);
+	__free_page(page);
+}
+
+/**
+ * nilfs_copy_page -- copy the page with buffers
+ * @dst: destination page
+ * @src: source page
+ * @copy_dirty: flag whether to copy dirty states on the page's buffer heads.
+ *
+ * This function is for both data pages and btnode pages.  The dirty flag
+ * of the page itself should be treated by the caller.  The page must not be
+ * under i/o.  Both the src and dst pages must be locked.
+ */
+static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
+{
+	struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
+	unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
+
+	BUG_ON(PageWriteback(dst));
+
+	sbh = sbufs = page_buffers(src);
+	if (!page_has_buffers(dst))
+		create_empty_buffers(dst, sbh->b_size, 0);
+
+	if (copy_dirty)
+		mask |= (1UL << BH_Dirty);
+
+	dbh = dbufs = page_buffers(dst);
+	do {	/* lock every buffer pair and copy its state */
+		lock_buffer(sbh);
+		lock_buffer(dbh);
+		dbh->b_state = sbh->b_state & mask;
+		dbh->b_blocknr = sbh->b_blocknr;
+		dbh->b_bdev = sbh->b_bdev;
+		sbh = sbh->b_this_page;
+		dbh = dbh->b_this_page;
+	} while (dbh != dbufs);
+
+	copy_highpage(dst, src);	/* buffers are still locked here */
+
+	if (PageUptodate(src) && !PageUptodate(dst))
+		SetPageUptodate(dst);
+	else if (!PageUptodate(src) && PageUptodate(dst))
+		ClearPageUptodate(dst);
+	if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
+		SetPageMappedToDisk(dst);
+	else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
+		ClearPageMappedToDisk(dst);
+
+	do {	/* second pass: release the buffer locks taken above */
+		unlock_buffer(sbh);
+		unlock_buffer(dbh);
+		sbh = sbh->b_this_page;
+		dbh = dbh->b_this_page;
+	} while (dbh != dbufs);
+}
+
+int nilfs_copy_dirty_pages(struct address_space *dmap,
+			   struct address_space *smap)
+{
+	struct pagevec pvec;
+	unsigned int i;
+	pgoff_t index = 0;
+	int err = 0;
+
+	pagevec_init(&pvec, 0);
+repeat:
+	if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
+				PAGEVEC_SIZE))
+		return 0;	/* lookup found no further dirty-tagged page */
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct page *page = pvec.pages[i], *dpage;
+
+		lock_page(page);
+		if (unlikely(!PageDirty(page)))
+			NILFS_PAGE_BUG(page, "inconsistent dirty state");
+
+		dpage = grab_cache_page(dmap, page->index);
+		if (unlikely(!dpage)) {
+			/* No empty page is added to the page cache */
+			err = -ENOMEM;
+			unlock_page(page);
+			break;
+		}
+		if (unlikely(!page_has_buffers(page)))
+			NILFS_PAGE_BUG(page,
+				       "found empty page in dat page cache");
+
+		nilfs_copy_page(dpage, page, 1); /* 1: copy BH_Dirty as well */
+		__set_page_dirty_nobuffers(dpage);
+
+		unlock_page(dpage);
+		page_cache_release(dpage);
+		unlock_page(page);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+
+	if (likely(!err))
+		goto repeat;
+	return err;	/* -ENOMEM from the failed grab_cache_page() */
+}
+
+/**
+ * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
+ * @dmap: destination page cache
+ * @smap: source page cache
+ *
+ * No pages must be added to the cache during this process.
+ * This must be ensured by the caller.
+ */
+void nilfs_copy_back_pages(struct address_space *dmap,
+			   struct address_space *smap)
+{
+	struct pagevec pvec;
+	unsigned int i, n;
+	pgoff_t index = 0;
+	int err;
+
+	pagevec_init(&pvec, 0);
+repeat:
+	n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
+	if (!n)
+		return;
+	index = pvec.pages[n - 1]->index + 1;	/* next lookup start */
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct page *page = pvec.pages[i], *dpage;
+		pgoff_t offset = page->index;
+
+		lock_page(page);
+		dpage = find_lock_page(dmap, offset);
+		if (dpage) {
+			/* override existing page on the destination cache */
+			WARN_ON(PageDirty(dpage));
+			nilfs_copy_page(dpage, page, 0);
+			unlock_page(dpage);
+			page_cache_release(dpage);
+		} else {
+			struct page *page2;
+
+			/* move the page to the destination cache */
+			spin_lock_irq(&smap->tree_lock);
+			page2 = radix_tree_delete(&smap->page_tree, offset);
+			WARN_ON(page2 != page);
+
+			smap->nrpages--;
+			spin_unlock_irq(&smap->tree_lock);
+
+			spin_lock_irq(&dmap->tree_lock);
+			err = radix_tree_insert(&dmap->page_tree, offset, page);
+			if (unlikely(err < 0)) {
+				WARN_ON(err == -EEXIST);
+				page->mapping = NULL;	/* in neither cache now */
+				page_cache_release(page); /* for cache */
+			} else {
+				page->mapping = dmap;
+				dmap->nrpages++;
+				if (PageDirty(page))
+					radix_tree_tag_set(&dmap->page_tree,
+							   offset,
+							   PAGECACHE_TAG_DIRTY);
+			}
+			spin_unlock_irq(&dmap->tree_lock);
+		}
+		unlock_page(page);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+
+	goto repeat;	/* the loop ends only via the !n return above */
+}
+
+void nilfs_clear_dirty_pages(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	unsigned int i;
+	pgoff_t index = 0;
+
+	pagevec_init(&pvec, 0);
+
+	while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+				  PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			struct buffer_head *bh, *head;
+
+			lock_page(page);
+			ClearPageUptodate(page);
+			ClearPageMappedToDisk(page);
+			bh = head = page_buffers(page);
+			do {	/* reset the state bits of every buffer */
+				lock_buffer(bh);
+				clear_buffer_dirty(bh);
+				clear_buffer_nilfs_volatile(bh);
+				clear_buffer_uptodate(bh);
+				clear_buffer_mapped(bh);
+				unlock_buffer(bh);
+				bh = bh->b_this_page;
+			} while (bh != head);
+
+			__nilfs_clear_page_dirty(page);	/* page flag and tag */
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+unsigned nilfs_page_count_clean_buffers(struct page *page,
+					unsigned from, unsigned to)
+{
+	struct buffer_head *head = page_buffers(page), *bh = head;
+	unsigned start = 0, end, nclean = 0;
+
+	do {
+		end = start + bh->b_size;
+		/* count clean buffers that overlap the bytes [from, to) */
+		if (end > from && start < to && !buffer_dirty(bh))
+			nclean++;
+		start = end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+	return nclean;
+}
+
+/*
+ * NILFS2 needs clear_page_dirty() in the following two cases:
+ *
+ * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
+ *    page dirty flags when it copies back pages from the shadow cache
+ *    (gcdat->{i_mapping,i_btnode_cache}) to its original cache
+ *    (dat->{i_mapping,i_btnode_cache}).
+ *
+ * 2) Some B-tree operations like insertion or deletion may dispose of
+ *    dirty buffers, and the dirty state of their pages must be cancelled.
+ */
+int __nilfs_clear_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (!mapping)
+		return TestClearPageDirty(page);
+
+	spin_lock_irq(&mapping->tree_lock);
+	if (!test_bit(PG_dirty, &page->flags)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return 0;
+	}
+	/* drop the radix-tree dirty tag, then the page flag itself */
+	radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+			     PAGECACHE_TAG_DIRTY);
+	spin_unlock_irq(&mapping->tree_lock);
+	return clear_page_dirty_for_io(page);
+}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
new file mode 100644
index 00000000000..8abca4d1c1f
--- /dev/null
+++ b/fs/nilfs2/page.h
@@ -0,0 +1,76 @@
+/*
+ * page.h - buffer/page management specific to NILFS
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>,
+ *            Seiji Kihara <kihara@osrg.net>.
+ */
+
+#ifndef _NILFS_PAGE_H
+#define _NILFS_PAGE_H
+
+#include <linux/buffer_head.h>
+#include "nilfs.h"
+
+/*
+ * Extended buffer state bits
+ */
+enum {
+	BH_NILFS_Allocated = BH_PrivateStart,
+	BH_NILFS_Node,
+	BH_NILFS_Volatile,
+};
+
+BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
+BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
+BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+
+
+void nilfs_mark_buffer_dirty(struct buffer_head *bh);
+int __nilfs_clear_page_dirty(struct page *);
+
+struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
+				      unsigned long, unsigned long);
+void nilfs_forget_buffer(struct buffer_head *);
+void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
+int nilfs_page_buffers_clean(struct page *);
+void nilfs_page_bug(struct page *);
+struct page *nilfs_alloc_private_page(struct block_device *, int,
+				      unsigned long);
+void nilfs_free_private_page(struct page *);
+
+int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
+void nilfs_copy_back_pages(struct address_space *, struct address_space *);
+void nilfs_clear_dirty_pages(struct address_space *);
+unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
+
+#define NILFS_PAGE_BUG(page, m, a...)	/* NOTE: m and a are not printed */ \
+	do { nilfs_page_bug(page); BUG(); } while (0)
+
+static inline struct buffer_head *
+nilfs_page_get_nth_block(struct page *page, unsigned int count)
+{
+	struct buffer_head *bh;
+
+	for (bh = page_buffers(page); count > 0; count--)
+		bh = bh->b_this_page;
+	get_bh(bh);	/* the buffer is returned with a reference taken */
+	return bh;
+}
+
+#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
new file mode 100644
index 00000000000..4fc081e47d7
--- /dev/null
+++ b/fs/nilfs2/recovery.c
@@ -0,0 +1,917 @@
+/*
+ * recovery.c - NILFS recovery logic
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/crc32.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "sufile.h"
+#include "page.h"
+#include "seglist.h"
+#include "segbuf.h"
+
+/*
+ * Segment check results; nilfs_warn_segment_error() maps them to errnos
+ */
+enum {
+	NILFS_SEG_VALID,
+	NILFS_SEG_NO_SUPER_ROOT,
+	NILFS_SEG_FAIL_IO,
+	NILFS_SEG_FAIL_MAGIC,
+	NILFS_SEG_FAIL_SEQ,
+	NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
+	NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
+	NILFS_SEG_FAIL_CHECKSUM_FULL,
+	NILFS_SEG_FAIL_CONSISTENCY,
+};
+
+/* work structure for recovery: describes one data block to be salvaged */
+struct nilfs_recovery_block {
+	ino_t ino;		/* Inode number of the file that this block
+				   belongs to */
+	sector_t blocknr;	/* disk block number the data is read from */
+	__u64 vblocknr;		/* virtual block number */
+	unsigned long blkoff;	/* File offset of the data block (per block) */
+	struct list_head list;	/* link in the list of collected blocks */
+};
+
+/* Warn about a NILFS_SEG_* code; returns -EIO for FAIL_IO, else -EINVAL */
+static int nilfs_warn_segment_error(int err)
+{
+	switch (err) {
+	case NILFS_SEG_FAIL_IO:
+		printk(KERN_WARNING
+		       "NILFS warning: I/O error on loading last segment\n");
+		return -EIO;
+	case NILFS_SEG_FAIL_MAGIC:
+		printk(KERN_WARNING
+		       "NILFS warning: Segment magic number invalid\n");
+		break;
+	case NILFS_SEG_FAIL_SEQ:
+		printk(KERN_WARNING
+		       "NILFS warning: Sequence number mismatch\n");
+		break;
+	case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
+		printk(KERN_WARNING
+		       "NILFS warning: Checksum error in segment summary\n");
+		break;
+	case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
+		printk(KERN_WARNING
+		       "NILFS warning: Checksum error in super root\n");
+		break;
+	case NILFS_SEG_FAIL_CHECKSUM_FULL:
+		printk(KERN_WARNING
+		       "NILFS warning: Checksum error in segment payload\n");
+		break;
+	case NILFS_SEG_FAIL_CONSISTENCY:
+		printk(KERN_WARNING
+		       "NILFS warning: Inconsistent segment\n");
+		break;
+	case NILFS_SEG_NO_SUPER_ROOT:
+		printk(KERN_WARNING
+		       "NILFS warning: No super root in the last segment\n");
+		break;
+	}
+	return -EINVAL;
+}
+
+static void store_segsum_info(struct nilfs_segsum_info *ssi,
+			      struct nilfs_segment_summary *sum,
+			      unsigned int blocksize)
+{
+	ssi->flags = le16_to_cpu(sum->ss_flags);
+	ssi->seg_seq = le64_to_cpu(sum->ss_seq);
+	ssi->ctime = le64_to_cpu(sum->ss_create);
+	ssi->next = le64_to_cpu(sum->ss_next);
+	ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
+	ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
+	ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
+	/* derived counts: summary blocks, then file blocks (less super root) */
+	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
+	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
+}
+
+/**
+ * calc_crc_cont - calculate the CRC of consecutive blocks
+ * @sbi: nilfs_sb_info
+ * @bhs: buffer head of start block
+ * @sum: place to store the CRC; returns 0 on success or -EIO on read failure
+ * @offset: offset bytes in the first block
+ * @check_bytes: byte count from the block start (the @offset bytes included)
+ * @start: DBN of start block
+ * @nblock: number of blocks to be checked; callers must pass at least 1
+ */
+static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
+			 u32 *sum, unsigned long offset, u64 check_bytes,
+			 sector_t start, unsigned long nblock)
+{
+	unsigned long blocksize = sbi->s_super->s_blocksize;
+	unsigned long size;
+	u32 crc;
+
+	BUG_ON(offset >= blocksize);
+	check_bytes -= offset;
+	size = min_t(u64, check_bytes, blocksize - offset);
+	crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
+		       (unsigned char *)bhs->b_data + offset, size);
+	if (--nblock > 0) {
+		do {
+			struct buffer_head *bh
+				= sb_bread(sbi->s_super, ++start);
+			if (!bh)
+				return -EIO;
+			check_bytes -= size;
+			size = min_t(u64, check_bytes, blocksize);
+			crc = crc32_le(crc, bh->b_data, size);
+			brelse(bh);
+		} while (--nblock > 0);
+	}
+	*sum = crc;
+	return 0;
+}
+
+/**
+ * nilfs_read_super_root_block - read super root block
+ * @sb: super_block
+ * @sr_block: disk block number of the super root block
+ * @pbh: where the super root buffer is returned (set to NULL on failure)
+ * @check: CRC check flag (nonzero: validate sr_bytes and the checksum)
+ */
+int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
+				struct buffer_head **pbh, int check)
+{
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *sr;
+	u32 crc;
+	int ret;
+
+	*pbh = NULL;
+	bh_sr = sb_bread(sb, sr_block);
+	if (unlikely(!bh_sr)) {
+		ret = NILFS_SEG_FAIL_IO;
+		goto failed;
+	}
+
+	sr = (struct nilfs_super_root *)bh_sr->b_data;
+	if (check) {
+		unsigned bytes = le16_to_cpu(sr->sr_bytes);
+
+		if (bytes == 0 || bytes > sb->s_blocksize) {
+			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
+			goto failed_bh;
+		}
+		if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
+				  sizeof(sr->sr_sum), bytes, sr_block, 1)) {
+			ret = NILFS_SEG_FAIL_IO;
+			goto failed_bh;
+		}
+		if (crc != le32_to_cpu(sr->sr_sum)) {
+			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
+			goto failed_bh;
+		}
+	}
+	*pbh = bh_sr;
+	return 0;
+
+ failed_bh:
+	brelse(bh_sr);
+
+ failed:
+	return nilfs_warn_segment_error(ret);
+}
+
+/**
+ * load_segment_summary - read and verify the summary of a partial segment
+ * @sbi: nilfs_sb_info
+ * @pseg_start: start disk block number of partial segment
+ * @seg_seq: sequence number the summary is required to carry
+ * @ssi: nilfs_segsum_info to fill in; 0 or a NILFS_SEG_* code is returned
+ * @full_check: full check flag
+ *              (0: only checks segment summary CRC, 1: data CRC)
+ */
+static int
+load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
+		     u64 seg_seq, struct nilfs_segsum_info *ssi,
+		     int full_check)
+{
+	struct buffer_head *bh_sum;
+	struct nilfs_segment_summary *sum;
+	unsigned long offset, nblock;
+	u64 check_bytes;
+	u32 crc, crc_sum;
+	int ret = NILFS_SEG_FAIL_IO;
+
+	bh_sum = sb_bread(sbi->s_super, pseg_start);
+	if (!bh_sum)
+		goto out;
+
+	sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+
+	/* Check consistency of segment summary */
+	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
+		ret = NILFS_SEG_FAIL_MAGIC;
+		goto failed;
+	}
+	store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
+	if (seg_seq != ssi->seg_seq) {
+		ret = NILFS_SEG_FAIL_SEQ;
+		goto failed;
+	}
+	if (full_check) {
+		offset = sizeof(sum->ss_datasum);
+		check_bytes =
+			((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
+		nblock = ssi->nblocks;
+		crc_sum = le32_to_cpu(sum->ss_datasum);
+		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
+	} else { /* only checks segment summary */
+		offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
+		check_bytes = ssi->sumbytes;
+		nblock = ssi->nsumblk;
+		crc_sum = le32_to_cpu(sum->ss_sumsum);
+		ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
+	}
+
+	if (unlikely(nblock == 0 ||
+		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
+		/* This limits the number of blocks read in the CRC check */
+		ret = NILFS_SEG_FAIL_CONSISTENCY;
+		goto failed;
+	}
+	if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
+			  pseg_start, nblock)) {
+		ret = NILFS_SEG_FAIL_IO;
+		goto failed;
+	}
+	if (crc == crc_sum)
+		ret = 0;
+ failed:
+	brelse(bh_sum);
+ out:
+	return ret;
+}
+
+static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
+			unsigned int *offset, unsigned int bytes)
+{
+	struct buffer_head *bh = *pbh;
+	sector_t next;
+
+	BUG_ON(bh->b_size < *offset);
+	if (bh->b_size - *offset < bytes) {
+		next = bh->b_blocknr + 1;
+		brelse(bh);
+		*pbh = bh = sb_bread(sb, next);
+		if (unlikely(!bh))
+			return NULL;
+		*offset = 0;
+	}
+	*offset += bytes;
+	/* the item starts where the offset stood before the advance */
+	return bh->b_data + *offset - bytes;
+}
+
+static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
+			unsigned int *offset, unsigned int bytes,
+			unsigned long count)
+{
+	struct buffer_head *bh = *pbh;
+	unsigned int per_block = bh->b_size / bytes;	/* items per block */
+	unsigned int room = (bh->b_size - *offset) / bytes;
+	sector_t blocknr = bh->b_blocknr;
+	unsigned int nblk;
+
+	if (count <= room) {
+		*offset += bytes * count;
+		return;
+	}
+	/* use up the current block, then step over whole blocks */
+	count -= room;
+	nblk = DIV_ROUND_UP(count, per_block);
+	*offset = bytes * (count - (nblk - 1) * per_block);
+
+	brelse(bh);
+	*pbh = sb_bread(sb, blocknr + nblk);
+}
+
+static int
+collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
+			   struct nilfs_segsum_info *ssi,
+			   struct list_head *head)
+{
+	struct buffer_head *bh;
+	unsigned int offset;
+	unsigned long nfinfo = ssi->nfinfo;
+	sector_t blocknr = sum_blocknr + ssi->nsumblk;
+	ino_t ino;
+	int err = -EIO;
+
+	if (!nfinfo)
+		return 0;
+
+	bh = sb_bread(sbi->s_super, sum_blocknr);
+	if (unlikely(!bh))
+		goto out;
+
+	offset = le16_to_cpu(
+		((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
+	for (;;) {
+		unsigned long nblocks, ndatablk, nnodeblk;
+		struct nilfs_finfo *finfo;
+
+		finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
+		if (unlikely(!finfo))
+			goto out;
+
+		ino = le64_to_cpu(finfo->fi_ino);
+		nblocks = le32_to_cpu(finfo->fi_nblocks);
+		ndatablk = le32_to_cpu(finfo->fi_ndatablk);
+		nnodeblk = nblocks - ndatablk;
+
+		while (ndatablk-- > 0) {
+			struct nilfs_recovery_block *rb;
+			struct nilfs_binfo_v *binfo;
+
+			binfo = segsum_get(sbi->s_super, &bh, &offset,
+					   sizeof(*binfo));
+			if (unlikely(!binfo))
+				goto out;
+
+			rb = kmalloc(sizeof(*rb), GFP_NOFS);
+			if (unlikely(!rb)) {
+				err = -ENOMEM;
+				goto out;
+			}
+			rb->ino = ino;
+			rb->blocknr = blocknr++;
+			rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
+			rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
+			/* rb->list needs no init: list_add_tail() sets it */
+			list_add_tail(&rb->list, head);
+		}
+		if (--nfinfo == 0)
+			break;
+		blocknr += nnodeblk; /* always 0 for the data sync segments */
+		segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
+			    nnodeblk);
+		if (unlikely(!bh))
+			goto out;
+	}
+	err = 0;
+ out:
+	brelse(bh);   /* brelse(NULL) is just ignored */
+	return err;
+}
+
+static void dispose_recovery_list(struct list_head *head)
+{
+	struct nilfs_recovery_block *rb, *next;
+
+	/* unlink and free every entry; @head is left empty */
+	list_for_each_entry_safe(rb, next, head, list) {
+		list_del(&rb->list);
+		kfree(rb);
+	}
+}
+
+void nilfs_dispose_segment_list(struct list_head *head)
+{
+	struct nilfs_segment_entry *ent, *next;
+
+	/* detach each entry before releasing it; @head ends up empty */
+	list_for_each_entry_safe(ent, next, head, list) {
+		list_del(&ent->list);
+		nilfs_free_segment_entry(ent);
+	}
+}
+
+static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
+					      struct nilfs_recovery_info *ri)
+{
+	struct list_head *head = &ri->ri_used_segments;
+	struct nilfs_segment_entry *ent, *n;
+	struct inode *sufile = nilfs->ns_sufile;
+	__u64 segnum[4];
+	int err;
+	int i;
+
+	segnum[0] = nilfs->ns_segnum;
+	segnum[1] = nilfs->ns_nextnum;
+	segnum[2] = ri->ri_segnum;
+	segnum[3] = ri->ri_nextnum;
+
+	/*
+	 * Releasing the next segment of the latest super root.
+	 * The next segment is invalidated by this recovery.
+	 */
+	err = nilfs_sufile_free(sufile, segnum[1]);
+	if (unlikely(err))
+		goto failed;
+
+	err = -ENOMEM;
+	for (i = 1; i < 4; i++) {
+		ent = nilfs_alloc_segment_entry(segnum[i]);
+		if (unlikely(!ent))
+			goto failed;
+		list_add_tail(&ent->list, head);
+	}
+
+	/*
+	 * Collecting segments written after the latest super root.
+	 * These are marked dirty to avoid being reallocated in the next write.
+	 */
+	list_for_each_entry_safe(ent, n, head, list) {
+		if (ent->segnum != segnum[0]) {
+			err = nilfs_sufile_scrap(sufile, ent->segnum);
+			if (unlikely(err))
+				goto failed;
+		}
+		list_del(&ent->list);
+		nilfs_free_segment_entry(ent);
+	}
+
+	/* Allocate one new segment; it becomes ns_segnum and ns_nextnum */
+	err = nilfs_sufile_alloc(sufile, &segnum[0]);
+	if (unlikely(err))
+		goto failed;
+
+	nilfs->ns_pseg_offset = 0;
+	nilfs->ns_seg_seq = ri->ri_seq + 2;
+	nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
+	return 0;
+
+ failed:
+	/* No need to recover sufile because it will be destroyed on error */
+	return err;
+}
+
+static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
+				     struct nilfs_recovery_block *rb,
+				     struct page *page)
+{
+	struct buffer_head *src = sb_bread(sbi->s_super, rb->blocknr);
+	void *dst;
+
+	if (unlikely(!src))
+		return -EIO;
+
+	/* copy the block data into @page at offset bh_offset(src) */
+	dst = kmap_atomic(page, KM_USER0);
+	memcpy(dst + bh_offset(src), src->b_data, src->b_size);
+	kunmap_atomic(dst, KM_USER0);
+	brelse(src);
+	return 0;
+}
+
+static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
+				struct list_head *head,
+				unsigned long *nr_salvaged_blocks)
+{
+	struct inode *inode;
+	struct nilfs_recovery_block *rb, *n;
+	unsigned blocksize = sbi->s_super->s_blocksize;
+	struct page *page;
+	loff_t pos;
+	int err = 0, err2 = 0;
+
+	list_for_each_entry_safe(rb, n, head, list) {
+		inode = nilfs_iget(sbi->s_super, rb->ino);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			inode = NULL;
+			goto failed_inode;
+		}
+		/* cast first: shifting the unsigned long may drop high bits */
+		pos = (loff_t)rb->blkoff << inode->i_blkbits;
+		page = NULL;
+		err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
+					0, &page, NULL, nilfs_get_block);
+		if (unlikely(err))
+			goto failed_inode;
+
+		err = nilfs_recovery_copy_block(sbi, rb, page);
+		if (unlikely(err))
+			goto failed_page;
+
+		err = nilfs_set_file_dirty(sbi, inode, 1);
+		if (unlikely(err))
+			goto failed_page;
+
+		block_write_end(NULL, inode->i_mapping, pos, blocksize,
+				blocksize, page, NULL);
+
+		unlock_page(page);
+		page_cache_release(page);
+
+		(*nr_salvaged_blocks)++;
+		goto next;
+
+ failed_page:
+		unlock_page(page);
+		page_cache_release(page);
+
+ failed_inode:
+		printk(KERN_WARNING
+		       "NILFS warning: error recovering data block "
+		       "(err=%d, ino=%lu, block-offset=%llu)\n",
+		       err, rb->ino, (unsigned long long)rb->blkoff);
+		if (!err2)
+			err2 = err;	/* report only the first failure */
+ next:
+		iput(inode); /* iput(NULL) is just ignored */
+		list_del_init(&rb->list);
+		kfree(rb);
+	}
+	return err2;
+}
+
+/**
+ * nilfs_do_roll_forward - salvage logical segments newer than the latest
+ * checkpoint
+ * @nilfs: the_nilfs
+ * @sbi: nilfs_sb_info
+ * @ri: pointer to a nilfs_recovery_info
+ */
+static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
+				 struct nilfs_sb_info *sbi,
+				 struct nilfs_recovery_info *ri)
+{
+	struct nilfs_segsum_info ssi;
+	sector_t pseg_start;
+	sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
+	unsigned long nsalvaged_blocks = 0;
+	u64 seg_seq;
+	__u64 segnum, nextnum = 0;
+	int empty_seg = 0;
+	int err = 0, ret;
+	LIST_HEAD(dsync_blocks);  /* list of data blocks to be recovered */
+	enum {
+		RF_INIT_ST,
+		RF_DSYNC_ST,   /* scanning data-sync segments */
+	};
+	int state = RF_INIT_ST;
+
+	nilfs_attach_writer(nilfs, sbi);
+	pseg_start = ri->ri_lsegs_start;
+	seg_seq = ri->ri_lsegs_start_seq;
+	segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
+	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+
+	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
+
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		if (ret) {
+			if (ret == NILFS_SEG_FAIL_IO) {
+				err = -EIO;
+				goto failed;
+			}
+			goto strayed;
+		}
+		if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
+			goto confused;
+
+		/* Found a valid partial segment; do recovery actions */
+		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		empty_seg = 0;
+		nilfs->ns_ctime = ssi.ctime;
+		if (!(ssi.flags & NILFS_SS_GC))
+			nilfs->ns_nongc_ctime = ssi.ctime;
+
+		switch (state) {
+		case RF_INIT_ST:
+			if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
+				goto try_next_pseg;
+			state = RF_DSYNC_ST;
+			/* Fall through */
+		case RF_DSYNC_ST:
+			if (!NILFS_SEG_DSYNC(&ssi))
+				goto confused;
+
+			err = collect_blocks_from_segsum(
+				sbi, pseg_start, &ssi, &dsync_blocks);
+			if (unlikely(err))
+				goto failed;
+			if (NILFS_SEG_LOGEND(&ssi)) {
+				err = recover_dsync_blocks(
+					sbi, &dsync_blocks, &nsalvaged_blocks);
+				if (unlikely(err))
+					goto failed;
+				state = RF_INIT_ST;
+			}
+			break; /* Fall through to try_next_pseg */
+		}
+
+ try_next_pseg:
+		if (pseg_start == ri->ri_lsegs_end)
+			break;
+		pseg_start += ssi.nblocks;
+		if (pseg_start < seg_end)
+			continue;
+		goto feed_segment;
+
+ strayed:
+		if (pseg_start == ri->ri_lsegs_end)
+			break;
+
+ feed_segment:
+		/* Looking to the next full segment */
+		if (empty_seg++)
+			break;
+		seg_seq++;
+		segnum = nextnum;
+		nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+		pseg_start = seg_start;
+	}
+
+	if (nsalvaged_blocks) {
+		printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
+		       sbi->s_super->s_id, nsalvaged_blocks);
+		ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
+	}
+ out:
+	dispose_recovery_list(&dsync_blocks);
+	nilfs_detach_writer(sbi->s_nilfs, sbi);
+	return err;
+
+ confused:
+	err = -EINVAL;
+ failed:
+	/* terminate the record with a newline so later messages stay apart */
+	printk(KERN_ERR
+	       "NILFS (device %s): Error roll-forwarding "
+	       "(err=%d, pseg block=%llu).\n",
+	       sbi->s_super->s_id, err, (unsigned long long)pseg_start);
+	goto out;
+}
+
+static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
+				      struct nilfs_sb_info *sbi,
+				      struct nilfs_recovery_info *ri)
+{
+	struct buffer_head *bh;
+	int err;
+
+	if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
+	    nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
+		return;
+	/* zero-fill block ri_lsegs_start and write it out synchronously */
+	bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
+	BUG_ON(!bh);
+	memset(bh->b_data, 0, bh->b_size);
+	set_buffer_dirty(bh);
+	err = sync_dirty_buffer(bh);
+	if (unlikely(err))
+		printk(KERN_WARNING
+		       "NILFS warning: buffer sync write failed during "
+		       "post-cleaning of recovery.\n");
+	brelse(bh);
+}
+
+/**
+ * nilfs_recover_logical_segments - salvage logical segments written after
+ * the latest super root
+ * @nilfs: the_nilfs
+ * @sbi: nilfs_sb_info
+ * @ri: pointer to a nilfs_recovery_info struct holding the search results.
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the following
+ * negative error code is returned.
+ *
+ * %-EINVAL - Inconsistent filesystem state.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
+				   struct nilfs_sb_info *sbi,
+				   struct nilfs_recovery_info *ri)
+{
+	int err;
+
+	if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
+		return 0;
+
+	err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
+	if (unlikely(err)) {
+		printk(KERN_ERR
+		       "NILFS: error loading the latest checkpoint.\n");
+		return err;
+	}
+
+	err = nilfs_do_roll_forward(nilfs, sbi, ri);
+	if (unlikely(err))
+		goto failed;
+
+	if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
+		err = nilfs_prepare_segment_for_recovery(nilfs, ri);
+		if (unlikely(err)) {
+			printk(KERN_ERR "NILFS: Error preparing segments for "
+			       "recovery.\n");
+			goto failed;
+		}
+
+		err = nilfs_attach_segment_constructor(sbi);
+		if (unlikely(err))
+			goto failed;
+
+		set_nilfs_discontinued(nilfs);
+		err = nilfs_construct_segment(sbi->s_super);
+		nilfs_detach_segment_constructor(sbi);
+
+		if (unlikely(err)) {
+			printk(KERN_ERR "NILFS: Oops! recovery failed. "
+			       "(err=%d)\n", err);
+			goto failed;
+		}
+
+		nilfs_finish_roll_forward(nilfs, sbi, ri);
+	}
+
+	nilfs_detach_checkpoint(sbi);
+	return 0;
+
+ failed:
+	nilfs_detach_checkpoint(sbi);
+	nilfs_mdt_clear(nilfs->ns_cpfile);
+	nilfs_mdt_clear(nilfs->ns_sufile);
+	nilfs_mdt_clear(nilfs->ns_dat);
+	return err;
+}
+
+/**
+ * nilfs_search_super_root - search the latest valid super root
+ * @nilfs: the_nilfs
+ * @sbi: nilfs_sb_info
+ * @ri: pointer to a nilfs_recovery_info struct to store search results.
+ *
+ * nilfs_search_super_root() looks for the latest super-root from a partial
+ * segment pointed by the superblock.  It sets up struct the_nilfs through
+ * this search. It fills nilfs_recovery_info (ri) required for recovery.
+ *
+ * Return Value: On success, 0 is returned.  On error, one of the following
+ * negative error code is returned.
+ *
+ * %-EINVAL - No valid segment found
+ *
+ * %-EIO - I/O error (%-ENOMEM is returned if memory runs short)
+ */
+int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
+			    struct nilfs_recovery_info *ri)
+{
+	struct nilfs_segsum_info ssi;
+	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
+	sector_t seg_start, seg_end; /* range of full segment (block number) */
+	u64 seg_seq;
+	__u64 segnum, nextnum = 0;
+	__u64 cno;
+	struct nilfs_segment_entry *ent;
+	LIST_HEAD(segments);
+	int empty_seg = 0, scan_newer = 0;
+	int ret;
+
+	pseg_start = nilfs->ns_last_pseg;
+	seg_seq = nilfs->ns_last_seq;
+	cno = nilfs->ns_last_cno;
+	segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
+
+	/* Calculate range of segment */
+	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+
+	for (;;) {
+		/* Load segment summary */
+		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
+		if (ret) {
+			if (ret == NILFS_SEG_FAIL_IO)
+				goto failed;
+			goto strayed;
+		}
+		pseg_end = pseg_start + ssi.nblocks - 1;
+		if (unlikely(pseg_end > seg_end)) {
+			ret = NILFS_SEG_FAIL_CONSISTENCY;
+			goto strayed;
+		}
+
+		/* A valid partial segment */
+		ri->ri_pseg_start = pseg_start;
+		ri->ri_seq = seg_seq;
+		ri->ri_segnum = segnum;
+		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		ri->ri_nextnum = nextnum;
+		empty_seg = 0;
+
+		if (!NILFS_SEG_HAS_SR(&ssi)) {
+			if (!scan_newer) {
+				/* This will never happen because a superblock
+				   (last_segment) always points to a pseg
+				   having a super root. */
+				ret = NILFS_SEG_FAIL_CONSISTENCY;
+				goto failed;
+			}
+			if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
+				ri->ri_lsegs_start = pseg_start;
+				ri->ri_lsegs_start_seq = seg_seq;
+			}
+			if (NILFS_SEG_LOGEND(&ssi))
+				ri->ri_lsegs_end = pseg_start;
+			goto try_next_pseg;
+		}
+
+		/* A valid super root was found. */
+		ri->ri_cno = cno++;
+		ri->ri_super_root = pseg_end;
+		ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
+
+		nilfs_dispose_segment_list(&segments);
+		nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
+			+ ssi.nblocks - seg_start;
+		nilfs->ns_seg_seq = seg_seq;
+		nilfs->ns_segnum = segnum;
+		nilfs->ns_cno = cno;  /* nilfs->ns_cno = ri->ri_cno + 1 */
+		nilfs->ns_ctime = ssi.ctime;
+		nilfs->ns_nextnum = nextnum;
+
+		if (scan_newer)
+			ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
+		else {
+			if (nilfs->ns_mount_state & NILFS_VALID_FS)
+				goto super_root_found;
+			scan_newer = 1;
+		}
+
+		/* reset region for roll-forward */
+		pseg_start += ssi.nblocks;
+		if (pseg_start < seg_end)
+			continue;
+		goto feed_segment;
+
+ try_next_pseg:
+		/* Standing on a course, or met an inconsistent state */
+		pseg_start += ssi.nblocks;
+		if (pseg_start < seg_end)
+			continue;
+		goto feed_segment;
+
+ strayed:
+		/* Off the trail */
+		if (!scan_newer)
+			/*
+			 * This can happen if a checkpoint was written without
+			 * barriers, or as a result of an I/O failure.
+			 */
+			goto failed;
+
+ feed_segment:
+		/* Looking to the next full segment */
+		if (empty_seg++)
+			goto super_root_found; /* found a valid super root */
+
+		ent = nilfs_alloc_segment_entry(segnum);
+		if (unlikely(!ent)) {
+			ret = -ENOMEM;
+			goto failed;
+		}
+		list_add_tail(&ent->list, &segments);
+
+		seg_seq++;
+		segnum = nextnum;
+		nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
+		pseg_start = seg_start;
+	}
+
+ super_root_found:
+	/* Updating pointers relating to the latest checkpoint */
+	list_splice(&segments, ri->ri_used_segments.prev);
+	nilfs->ns_last_pseg = sr_pseg_start;
+	nilfs->ns_last_seq = nilfs->ns_seg_seq;
+	nilfs->ns_last_cno = ri->ri_cno;
+	return 0;
+
+ failed:
+	nilfs_dispose_segment_list(&segments);
+	return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
+}
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
new file mode 100644
index 00000000000..adccd4fc654
--- /dev/null
+++ b/fs/nilfs2/sb.h
@@ -0,0 +1,102 @@
+/*
+ * sb.h - NILFS on-memory super block structure.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#ifndef _NILFS_SB
+#define _NILFS_SB
+
+#include <linux/types.h>
+#include <linux/fs.h>
+
+/*
+ * Mount options
+ */
+struct nilfs_mount_options {
+	unsigned long mount_opt;
+	__u64 snapshot_cno;	/* presumably a checkpoint number - confirm */
+};
+
+struct the_nilfs;
+struct nilfs_sc_info;
+
+/*
+ * NILFS super-block data in memory (obtained from a super_block by NILFS_SB())
+ */
+struct nilfs_sb_info {
+	/* Snapshot status */
+	__u64 s_snapshot_cno;		/* Checkpoint number */
+	atomic_t s_inodes_count;
+	atomic_t s_blocks_count;	/* Reserved (might be deleted) */
+
+	/* Mount options */
+	unsigned long s_mount_opt;	/* NILFS_MOUNT_* option bits */
+	uid_t s_resuid;
+	gid_t s_resgid;
+
+	unsigned long s_interval;	/* construction interval */
+	unsigned long s_watermark;	/* threshold of data amount
+					   for the segment construction */
+
+	/* Fundamental members */
+	struct super_block *s_super;	/* reverse pointer to super_block */
+	struct the_nilfs *s_nilfs;
+	struct list_head s_list;	/* list head for nilfs->ns_supers */
+
+	/* Segment constructor */
+	struct list_head s_dirty_files;	/* dirty files list */
+	struct nilfs_sc_info *s_sc_info; /* segment constructor info */
+	spinlock_t s_inode_lock;	/* Lock for the nilfs inode.
+					   It covers s_dirty_files list */
+
+	/* Metadata files */
+	struct inode *s_ifile;		/* index file inode */
+
+	/* Inode allocator */
+	spinlock_t s_next_gen_lock;
+	u32 s_next_generation;
+};
+
+static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;	/* s_fs_info holds the nilfs_sb_info */
+}
+
+static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
+{
+	return sbi->s_sc_info;	/* segment constructor info */
+}
+
+/*
+ * Bit operations for the mount option (opt/mask: a NILFS_MOUNT_ suffix)
+ */
+#define nilfs_clear_opt(sbi, opt)  \
+	do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
+#define nilfs_set_opt(sbi, opt)  \
+	do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
+#define nilfs_test_opt(sbi, opt)   ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
+#define nilfs_write_opt(sbi, mask, opt)					\
+	do { (sbi)->s_mount_opt =					\
+		(((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) |		\
+		 NILFS_MOUNT_##opt);					\
+	} while (0)
+
+#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
new file mode 100644
index 00000000000..1e68821b4a9
--- /dev/null
+++ b/fs/nilfs2/segbuf.c
@@ -0,0 +1,439 @@
+/*
+ * segbuf.c - NILFS segment buffer
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/crc32.h>
+#include "page.h"
+#include "segbuf.h"
+#include "seglist.h"
+
+
+static struct kmem_cache *nilfs_segbuf_cachep;
+
+static void nilfs_segbuf_init_once(void *obj)
+{
+	memset(obj, 0, sizeof(struct nilfs_segment_buffer));
+}
+
+int __init nilfs_init_segbuf_cache(void)
+{
+	nilfs_segbuf_cachep =
+		kmem_cache_create("nilfs2_segbuf_cache",
+				  sizeof(struct nilfs_segment_buffer),
+				  0, SLAB_RECLAIM_ACCOUNT,
+				  nilfs_segbuf_init_once);
+
+	return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
+}
+
+void nilfs_destroy_segbuf_cache(void)
+{
+	kmem_cache_destroy(nilfs_segbuf_cachep);
+}
+
+struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
+	if (unlikely(!segbuf))
+		return NULL;
+
+	segbuf->sb_super = sb;
+	INIT_LIST_HEAD(&segbuf->sb_list);
+	INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
+	INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
+	return segbuf;
+}
+
+void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
+{
+	kmem_cache_free(nilfs_segbuf_cachep, segbuf);
+}
+
+void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
+		     unsigned long offset, struct the_nilfs *nilfs)
+{
+	segbuf->sb_segnum = segnum;
+	nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
+				&segbuf->sb_fseg_end);
+
+	segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
+	segbuf->sb_rest_blocks =
+		segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
+}
+
+void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
+				  __u64 nextnum, struct the_nilfs *nilfs)
+{
+	segbuf->sb_nextnum = nextnum;
+	segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
+}
+
+int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
+{
+	struct buffer_head *bh;
+
+	bh = sb_getblk(segbuf->sb_super,
+		       segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
+	if (unlikely(!bh))
+		return -ENOMEM;
+
+	nilfs_segbuf_add_segsum_buffer(segbuf, bh);
+	return 0;
+}
+
+int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
+				struct buffer_head **bhp)
+{
+	/* The new block follows the blocks already counted in sb_sum.nblocks */
+	sector_t blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nblocks;
+	struct buffer_head *new_bh = sb_getblk(segbuf->sb_super, blocknr);
+
+	if (unlikely(new_bh == NULL))
+		return -ENOMEM;
+
+	nilfs_segbuf_add_payload_buffer(segbuf, new_bh);
+	*bhp = new_bh;
+	return 0;
+}
+
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
+		       time_t ctime)
+{
+	int err;
+
+	segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
+	err = nilfs_segbuf_extend_segsum(segbuf);
+	if (unlikely(err))
+		return err;
+
+	segbuf->sb_sum.flags = flags;
+	segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
+	segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
+	segbuf->sb_sum.ctime = ctime;
+
+	segbuf->sb_io_error = 0;
+	return 0;
+}
+
+/*
+ * Fill the on-disk summary header (first summary buffer) from sb_sum
+ */
+void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
+{
+	struct nilfs_segment_summary *raw_sum;
+	struct buffer_head *bh_sum;
+
+	bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
+			    struct buffer_head, b_assoc_buffers);
+	raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+
+	raw_sum->ss_magic    = cpu_to_le32(NILFS_SEGSUM_MAGIC);
+	raw_sum->ss_bytes    = cpu_to_le16(sizeof(*raw_sum));
+	raw_sum->ss_flags    = cpu_to_le16(segbuf->sb_sum.flags);
+	raw_sum->ss_seq      = cpu_to_le64(segbuf->sb_sum.seg_seq);
+	raw_sum->ss_create   = cpu_to_le64(segbuf->sb_sum.ctime);
+	raw_sum->ss_next     = cpu_to_le64(segbuf->sb_sum.next);
+	raw_sum->ss_nblocks  = cpu_to_le32(segbuf->sb_sum.nblocks);
+	raw_sum->ss_nfinfo   = cpu_to_le32(segbuf->sb_sum.nfinfo);
+	raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
+	raw_sum->ss_pad      = 0;
+}
+
+/*
+ * CRC calculation routines
+ */
+void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
+				     u32 seed)
+{
+	struct buffer_head *bh;
+	struct nilfs_segment_summary *raw_sum;
+	unsigned long size, bytes = segbuf->sb_sum.sumbytes;
+	u32 crc;
+
+	bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
+			b_assoc_buffers);
+
+	raw_sum = (struct nilfs_segment_summary *)bh->b_data;
+	size = min_t(unsigned long, bytes, bh->b_size);
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sum +
+		       sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
+		       size - (sizeof(raw_sum->ss_datasum) +
+			       sizeof(raw_sum->ss_sumsum)));
+
+	list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
+				     b_assoc_buffers) {
+		bytes -= size;
+		size = min_t(unsigned long, bytes, bh->b_size);
+		crc = crc32_le(crc, bh->b_data, size);
+	}
+	raw_sum->ss_sumsum = cpu_to_le32(crc);
+}
+
+void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
+				   u32 seed)
+{
+	struct buffer_head *bh;
+	struct nilfs_segment_summary *raw_sum;
+	void *kaddr;
+	u32 crc;
+
+	bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
+			b_assoc_buffers);
+	raw_sum = (struct nilfs_segment_summary *)bh->b_data;
+	crc = crc32_le(seed,
+		       (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
+		       bh->b_size - sizeof(raw_sum->ss_datasum));
+
+	list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
+				     b_assoc_buffers) {
+		crc = crc32_le(crc, bh->b_data, bh->b_size);
+	}
+	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
+		kaddr = kmap_atomic(bh->b_page, KM_USER0);
+		crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+	raw_sum->ss_datasum = cpu_to_le32(crc);
+}
+
+void nilfs_release_buffers(struct list_head *list)
+{
+	struct buffer_head *bh, *n;
+
+	list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
+		list_del_init(&bh->b_assoc_buffers);
+		if (buffer_nilfs_allocated(bh)) {
+			struct page *clone_page = bh->b_page;
+
+			/* remove clone page */
+			brelse(bh);
+			page_cache_release(clone_page); /* for each bh */
+			if (page_count(clone_page) <= 2) {
+				lock_page(clone_page);
+				nilfs_free_private_page(clone_page);
+			}
+			continue;
+		}
+		brelse(bh);
+	}
+}
+
+/*
+ * BIO operations
+ */
+/* End-of-I/O handler for segment-write BIOs; bi_private is the write info */
+static void nilfs_end_bio_write(struct bio *bio, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct nilfs_write_info *wi = bio->bi_private;
+
+	/*
+	 * Only flag the bio; nilfs_submit_seg_bio() detects the flag and drops
+	 * its own reference.  An extra bio_put() here would free the bio early.
+	 */
+	if (err == -EOPNOTSUPP)
+		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+	if (!uptodate)
+		atomic_inc(&wi->err);
+	bio_put(bio);	/* drop the one reference this handler owns */
+	complete(&wi->bio_event);
+}
+
+static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
+{
+	struct bio *bio = wi->bio;
+	int err;
+
+	if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
+		wait_for_completion(&wi->bio_event);
+		wi->nbio--;
+		if (unlikely(atomic_read(&wi->err))) {
+			bio_put(bio);
+			err = -EIO;
+			goto failed;
+		}
+	}
+
+	bio->bi_end_io = nilfs_end_bio_write;
+	bio->bi_private = wi;
+	bio_get(bio);
+	submit_bio(mode, bio);
+	if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
+		bio_put(bio);
+		err = -EOPNOTSUPP;
+		goto failed;
+	}
+	wi->nbio++;
+	bio_put(bio);
+
+	wi->bio = NULL;
+	wi->rest_blocks -= wi->end - wi->start;
+	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
+	wi->start = wi->end;
+	return 0;
+
+ failed:
+	wi->bio = NULL;
+	return err;
+}
+
+/**
+ * nilfs_alloc_seg_bio - allocate a bio for writing segment.
+ * @sb: super block
+ * @start: beginning disk block number of this BIO.
+ * @nr_vecs: request size of page vector.
+ *
+ * nilfs_alloc_seg_bio() allocates a new BIO structure and initializes it.
+ *
+ * Return Value: On success, pointer to the struct bio is returned.
+ * On error, NULL is returned.
+ */
+static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
+				       int nr_vecs)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+	if (bio == NULL) {
+		while (!bio && (nr_vecs >>= 1))
+			bio = bio_alloc(GFP_NOWAIT, nr_vecs);
+	}
+	if (likely(bio)) {
+		bio->bi_bdev = sb->s_bdev;
+		bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
+	}
+	return bio;
+}
+
+void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
+				struct nilfs_write_info *wi)
+{
+	wi->bio = NULL;
+	wi->rest_blocks = segbuf->sb_sum.nblocks;
+	wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
+	wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
+	wi->start = wi->end = 0;
+	wi->nbio = 0;
+	wi->blocknr = segbuf->sb_pseg_start;
+
+	atomic_set(&wi->err, 0);
+	init_completion(&wi->bio_event);
+}
+
+static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
+			   int mode)
+{
+	int len, err;
+
+	BUG_ON(wi->nr_vecs <= 0);
+ repeat:
+	if (!wi->bio) {
+		wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
+					      wi->nr_vecs);
+		if (unlikely(!wi->bio))
+			return -ENOMEM;
+	}
+
+	len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
+	if (len == bh->b_size) {
+		wi->end++;
+		return 0;
+	}
+	/* bio is FULL */
+	err = nilfs_submit_seg_bio(wi, mode);
+	/* never submit current bh */
+	if (likely(!err))
+		goto repeat;
+	return err;
+}
+
+int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
+		       struct nilfs_write_info *wi)
+{
+	struct buffer_head *bh;
+	int res, rw = WRITE;
+
+	list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
+		res = nilfs_submit_bh(wi, bh, rw);
+		if (unlikely(res))
+			goto failed_bio;
+	}
+
+	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
+		res = nilfs_submit_bh(wi, bh, rw);
+		if (unlikely(res))
+			goto failed_bio;
+	}
+
+	if (wi->bio) {
+		/*
+		 * Last BIO is always sent through the following
+		 * submission.
+		 */
+		rw |= (1 << BIO_RW_SYNCIO);
+		res = nilfs_submit_seg_bio(wi, rw);
+		if (unlikely(res))
+			goto failed_bio;
+	}
+
+	res = 0;
+ out:
+	return res;
+
+ failed_bio:
+	atomic_inc(&wi->err);
+	goto out;
+}
+
+/**
+ * nilfs_segbuf_wait - wait for completion of requested BIOs
+ * @segbuf: segment buffer whose sb_io_error is set on failure
+ * @wi: nilfs_write_info tracking the submitted BIOs
+ *
+ * Return Value: On success, 0 is returned. On error, the following
+ * negative error code is returned.
+ *
+ * %-EIO - I/O error
+ */
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
+		      struct nilfs_write_info *wi)
+{
+	if (!wi->nbio)
+		return 0;
+
+	/* Consume one completion per BIO still accounted in wi->nbio */
+	do {
+		wait_for_completion(&wi->bio_event);
+	} while (--wi->nbio > 0);
+
+	if (likely(atomic_read(&wi->err) <= 0))
+		return 0;
+
+	printk(KERN_ERR "NILFS: IO error writing segment\n");
+	segbuf->sb_io_error = 1;
+	return -EIO;
+}
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
new file mode 100644
index 00000000000..0c3076f4e59
--- /dev/null
+++ b/fs/nilfs2/segbuf.h
@@ -0,0 +1,201 @@
+/*
+ * segbuf.h - NILFS Segment buffer prototypes and definitions
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+#ifndef _NILFS_SEGBUF_H
+#define _NILFS_SEGBUF_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/completion.h>
+#include <linux/backing-dev.h>
+
+/**
+ * struct nilfs_segsum_info - On-memory segment summary
+ * @flags: Flags
+ * @nfinfo: Number of file information structures
+ * @nblocks: Number of blocks included in the partial segment
+ * @nsumblk: Number of summary blocks
+ * @sumbytes: Byte count of segment summary
+ * @nfileblk: Total number of file blocks
+ * @seg_seq: Segment sequence number
+ * @ctime: Creation time
+ * @next: Block number of the next full segment
+ */
+struct nilfs_segsum_info {
+	unsigned int		flags;
+	unsigned long		nfinfo;
+	unsigned long		nblocks;
+	unsigned long		nsumblk;
+	unsigned long		sumbytes;
+	unsigned long		nfileblk;
+	u64			seg_seq;
+	time_t			ctime;
+	sector_t		next;
+};
+
+/* macro for the flags */
+#define NILFS_SEG_HAS_SR(sum)    ((sum)->flags & NILFS_SS_SR)
+#define NILFS_SEG_LOGBGN(sum)    ((sum)->flags & NILFS_SS_LOGBGN)
+#define NILFS_SEG_LOGEND(sum)    ((sum)->flags & NILFS_SS_LOGEND)
+#define NILFS_SEG_DSYNC(sum)     ((sum)->flags & NILFS_SS_SYNDT)
+#define NILFS_SEG_SIMPLEX(sum) \
+	(((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
+	 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
+
+#define NILFS_SEG_EMPTY(sum)	((sum)->nblocks == (sum)->nsumblk)
+
+/**
+ * struct nilfs_segment_buffer - Segment buffer
+ * @sb_super: back pointer to a superblock struct
+ * @sb_list: List head to chain this structure
+ * @sb_sum: On-memory segment summary
+ * @sb_segnum: Index number of the full segment
+ * @sb_nextnum: Index number of the next full segment
+ * @sb_fseg_start: Start block number of the full segment
+ * @sb_fseg_end: End block number of the full segment
+ * @sb_pseg_start: Disk block number of partial segment
+ * @sb_rest_blocks: Number of residual blocks in the current segment
+ * @sb_segsum_buffers: List of buffers for segment summaries
+ * @sb_payload_buffers: List of buffers for segment payload
+ * @sb_io_error: I/O error status
+ */
+struct nilfs_segment_buffer {
+	struct super_block     *sb_super;
+	struct list_head	sb_list;
+
+	/* Segment information */
+	struct nilfs_segsum_info sb_sum;
+	__u64			sb_segnum;
+	__u64			sb_nextnum;
+	sector_t		sb_fseg_start, sb_fseg_end;
+	sector_t		sb_pseg_start;
+	unsigned		sb_rest_blocks;
+
+	/* Buffers */
+	struct list_head	sb_segsum_buffers;
+	struct list_head	sb_payload_buffers; /* including super root */
+
+	/* io status */
+	int			sb_io_error;
+};
+
+#define NILFS_LIST_SEGBUF(head)  \
+	list_entry((head), struct nilfs_segment_buffer, sb_list)
+#define NILFS_NEXT_SEGBUF(segbuf)  NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
+#define NILFS_PREV_SEGBUF(segbuf)  NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
+#define NILFS_LAST_SEGBUF(head)    NILFS_LIST_SEGBUF((head)->prev)
+#define NILFS_FIRST_SEGBUF(head)   NILFS_LIST_SEGBUF((head)->next)
+#define NILFS_SEGBUF_IS_LAST(segbuf, head)  ((segbuf)->sb_list.next == (head))
+
+#define nilfs_for_each_segbuf_before(s, t, h) \
+	for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
+	     (s) = NILFS_NEXT_SEGBUF(s))
+
+/* Walk buffer heads chained via b_assoc_buffers; @head is the list head */
+#define NILFS_SEGBUF_FIRST_BH(head)  \
+	(list_entry((head)->next, struct buffer_head, b_assoc_buffers))
+#define NILFS_SEGBUF_NEXT_BH(bh)  \
+	(list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
+		    b_assoc_buffers))
+#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == (head))
+
+int __init nilfs_init_segbuf_cache(void);
+void nilfs_destroy_segbuf_cache(void);
+struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
+void nilfs_segbuf_free(struct nilfs_segment_buffer *);
+void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
+		      struct the_nilfs *);
+void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
+				  struct the_nilfs *);
+int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
+int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
+int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
+				struct buffer_head **);
+void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
+void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
+void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
+
+static inline void
+nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
+			       struct buffer_head *bh)
+{
+	list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
+	segbuf->sb_sum.nblocks++;
+	segbuf->sb_sum.nsumblk++;
+}
+
+static inline void
+nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
+				struct buffer_head *bh)
+{
+	list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
+	segbuf->sb_sum.nblocks++;
+}
+
+static inline void
+nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
+			     struct buffer_head *bh)
+{
+	get_bh(bh);
+	nilfs_segbuf_add_payload_buffer(segbuf, bh);
+	segbuf->sb_sum.nfileblk++;
+}
+
+void nilfs_release_buffers(struct list_head *);
+
+static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
+{
+	nilfs_release_buffers(&segbuf->sb_segsum_buffers);
+	nilfs_release_buffers(&segbuf->sb_payload_buffers);
+}
+
+struct nilfs_write_info {
+	struct bio	       *bio;	/* BIO under construction, or NULL */
+	int 			start, end; /* The region to be submitted */
+	int			rest_blocks; /* blocks not yet submitted */
+	int			max_pages;  /* from bio_get_nr_vecs() */
+	int			nr_vecs;    /* vector count for next BIO */
+	sector_t		blocknr;    /* copy of sb_pseg_start */
+
+	int			nbio;	    /* submitted BIOs not yet waited */
+	atomic_t		err;	    /* count of detected errors */
+	struct completion	bio_event;
+				/* completion event of segment write */
+
+	/*
+	 * The following fields must be set explicitly
+	 */
+	struct super_block     *sb;
+	struct backing_dev_info *bdi; /* backing dev info */
+	struct buffer_head     *bh_sr;
+};
+
+
+void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
+				struct nilfs_write_info *);
+int nilfs_segbuf_write(struct nilfs_segment_buffer *,
+		       struct nilfs_write_info *);
+int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
+		      struct nilfs_write_info *);
+
+#endif /* _NILFS_SEGBUF_H */
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
new file mode 100644
index 00000000000..d39df9144e9
--- /dev/null
+++ b/fs/nilfs2/seglist.h
@@ -0,0 +1,85 @@
+/*
+ * seglist.h - expediential structure and routines to handle list of segments
+ *             (would be removed in a future release)
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+#ifndef _NILFS_SEGLIST_H
+#define _NILFS_SEGLIST_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "sufile.h"
+
+struct nilfs_segment_entry {
+	__u64			segnum;	/* segment number of this entry */
+
+#define NILFS_SLH_FREED		0x0001	/* The segment was freed provisionally.
+					   It must be cancelled if
+					   construction aborted */
+
+	unsigned		flags;	/* NILFS_SLH_* flags */
+	struct list_head	list;
+	struct buffer_head     *bh_su;	/* via nilfs_open_segment_entry() */
+	struct nilfs_segment_usage *raw_su; /* ditto; both NULL after close */
+};
+
+
+void nilfs_dispose_segment_list(struct list_head *);
+
+static inline struct nilfs_segment_entry *
+nilfs_alloc_segment_entry(__u64 segnum)
+{
+	struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
+
+	if (likely(ent)) {
+		ent->segnum = segnum;
+		ent->flags = 0;
+		ent->bh_su = NULL;
+		ent->raw_su = NULL;
+		INIT_LIST_HEAD(&ent->list);
+	}
+	return ent;
+}
+
+static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
+					   struct inode *sufile)
+{
+	return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
+					      &ent->raw_su, &ent->bh_su);
+}
+
+static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
+					     struct inode *sufile)
+{
+	if (!ent->bh_su)
+		return;
+	nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
+	ent->bh_su = NULL;
+	ent->raw_su = NULL;
+}
+
+static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
+{
+	kfree(ent);
+}
+
+#endif /* _NILFS_SEGLIST_H */
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
new file mode 100644
index 00000000000..fb70ec3be20
--- /dev/null
+++ b/fs/nilfs2/segment.c
@@ -0,0 +1,2977 @@
+/*
+ * segment.c - NILFS segment constructor.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/bio.h>
+#include <linux/completion.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/crc32.h>
+#include <linux/pagevec.h>
+#include "nilfs.h"
+#include "btnode.h"
+#include "page.h"
+#include "segment.h"
+#include "sufile.h"
+#include "cpfile.h"
+#include "ifile.h"
+#include "seglist.h"
+#include "segbuf.h"
+
+
+/*
+ * Segment constructor
+ */
+#define SC_N_INODEVEC	16   /* Size of locally allocated inode vector */
+
+#define SC_MAX_SEGDELTA 64   /* Upper limit of the number of segments
+				appended in collection retry loop */
+
+/* Construction mode */
+enum {
+	SC_LSEG_SR = 1,	/* Make a logical segment having a super root */
+	SC_LSEG_DSYNC,	/* Flush data blocks of a given file and make
+			   a logical segment without a super root */
+	SC_FLUSH_FILE,	/* Flush data files, leads to segment writes without
+			   creating a checkpoint */
+	SC_FLUSH_DAT,	/* Flush DAT file. This also creates segments without
+			   a checkpoint */
+};
+
+/* Stage numbers of dirty block collection */
+enum {
+	NILFS_ST_INIT = 0,
+	NILFS_ST_GC,		/* Collecting dirty blocks for GC */
+	NILFS_ST_FILE,
+	NILFS_ST_IFILE,
+	NILFS_ST_CPFILE,
+	NILFS_ST_SUFILE,
+	NILFS_ST_DAT,
+	NILFS_ST_SR,		/* Super root */
+	NILFS_ST_DSYNC,		/* Data sync blocks */
+	NILFS_ST_DONE,
+};
+
+/* State flags of collection */
+#define NILFS_CF_NODE		0x0001	/* Collecting node blocks */
+#define NILFS_CF_IFILE_STARTED	0x0002	/* IFILE stage has started */
+#define NILFS_CF_HISTORY_MASK	(NILFS_CF_IFILE_STARTED)
+
+/* Operations depending on the construction mode and file type */
+struct nilfs_sc_operations {
+	int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
+			    struct inode *);
+	int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
+			    struct inode *);
+	int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
+			    struct inode *);
+	void (*write_data_binfo)(struct nilfs_sc_info *,
+				 struct nilfs_segsum_pointer *,
+				 union nilfs_binfo *);
+	void (*write_node_binfo)(struct nilfs_sc_info *,
+				 struct nilfs_segsum_pointer *,
+				 union nilfs_binfo *);
+};
+
+/*
+ * Other definitions
+ */
+static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
+static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
+static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
+static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
+			       int);
+
+#define nilfs_cnt32_gt(a, b)   \
+	(typecheck(__u32, a) && typecheck(__u32, b) && \
+	 ((__s32)(b) - (__s32)(a) < 0))
+#define nilfs_cnt32_ge(a, b)   \
+	(typecheck(__u32, a) && typecheck(__u32, b) && \
+	 ((__s32)(a) - (__s32)(b) >= 0))
+#define nilfs_cnt32_lt(a, b)  nilfs_cnt32_gt(b, a)
+#define nilfs_cnt32_le(a, b)  nilfs_cnt32_ge(b, a)
+
+/*
+ * Transaction
+ */
+static struct kmem_cache *nilfs_transaction_cachep;
+
+/**
+ * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
+ *
+ * nilfs_init_transaction_cache() creates a slab cache for the struct
+ * nilfs_transaction_info.
+ *
+ * Return Value: On success, it returns 0. On error, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_init_transaction_cache(void)
+{
+	nilfs_transaction_cachep =
+		kmem_cache_create("nilfs2_transaction_cache",
+				  sizeof(struct nilfs_transaction_info),
+				  0, SLAB_RECLAIM_ACCOUNT, NULL);
+	return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
+}
+
+/**
+ * nilfs_destroy_transaction_cache - destroy the cache for transaction info
+ *
+ * nilfs_destroy_transaction_cache() frees the slab cache for the struct
+ * nilfs_transaction_info.
+ */
+void nilfs_destroy_transaction_cache(void)
+{
+	kmem_cache_destroy(nilfs_transaction_cachep);
+}
+
+static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
+{
+	struct nilfs_transaction_info *cur_ti = current->journal_info;
+	void *save = NULL;
+
+	if (cur_ti) {
+		if (cur_ti->ti_magic == NILFS_TI_MAGIC)
+			return ++cur_ti->ti_count;
+		else {
+			/*
+			 * If journal_info field is occupied by other FS,
+			 * it is saved and will be restored on
+			 * nilfs_transaction_commit().
+			 */
+			printk(KERN_WARNING
+			       "NILFS warning: journal info from a different "
+			       "FS\n");
+			save = current->journal_info;
+		}
+	}
+	if (!ti) {
+		ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
+		if (!ti)
+			return -ENOMEM;
+		ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
+	} else {
+		ti->ti_flags = 0;
+	}
+	ti->ti_count = 0;
+	ti->ti_save = save;
+	ti->ti_magic = NILFS_TI_MAGIC;
+	current->journal_info = ti;
+	return 0;
+}
+
+/**
+ * nilfs_transaction_begin - start indivisible file operations.
+ * @sb: super block
+ * @ti: nilfs_transaction_info
+ * @vacancy_check: flags for vacancy rate checks
+ *
+ * nilfs_transaction_begin() acquires a reader/writer semaphore, called
+ * the segment semaphore, to make a segment construction and write tasks
+ * exclusive.  The function is used with nilfs_transaction_commit() in pairs.
+ * The region enclosed by these two functions can be nested.  To avoid a
+ * deadlock, the semaphore is only acquired or released in the outermost call.
+ *
+ * This function allocates a nilfs_transaction_info struct to keep context
+ * information on it.  It is initialized and hooked onto the current task in
+ * the outermost call.  If a pre-allocated struct is given to @ti, it is used
+ * instead; otherwise a new struct is assigned from a slab.
+ *
+ * When @vacancy_check flag is set, this function will check the amount of
+ * free space, and will wait for the GC to reclaim disk space if low capacity.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-ENOSPC - No space left on device
+ */
+int nilfs_transaction_begin(struct super_block *sb,
+			    struct nilfs_transaction_info *ti,
+			    int vacancy_check)
+{
+	struct nilfs_sb_info *sbi;
+	struct the_nilfs *nilfs;
+	int ret = nilfs_prepare_segment_lock(ti);
+
+	if (unlikely(ret < 0))
+		return ret;
+	if (ret > 0)
+		return 0;
+
+	sbi = NILFS_SB(sb);
+	nilfs = sbi->s_nilfs;
+	down_read(&nilfs->ns_segctor_sem);
+	if (vacancy_check && nilfs_near_disk_full(nilfs)) {
+		up_read(&nilfs->ns_segctor_sem);
+		ret = -ENOSPC;
+		goto failed;
+	}
+	return 0;
+
+ failed:
+	ti = current->journal_info;
+	current->journal_info = ti->ti_save;
+	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
+		kmem_cache_free(nilfs_transaction_cachep, ti);
+	return ret;
+}
+
+/**
+ * nilfs_transaction_commit - commit indivisible file operations.
+ * @sb: super block
+ *
+ * nilfs_transaction_commit() releases the read semaphore which is
+ * acquired by nilfs_transaction_begin(). This is only performed
+ * in outermost call of this function.  If a commit flag is set,
+ * nilfs_transaction_commit() sets a timer to start the segment
+ * constructor.  If a sync flag is set, it starts construction
+ * directly.
+ */
+int nilfs_transaction_commit(struct super_block *sb)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+	struct nilfs_sb_info *sbi;
+	struct nilfs_sc_info *sci;
+	int err = 0;
+
+	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
+	ti->ti_flags |= NILFS_TI_COMMIT;
+	if (ti->ti_count > 0) {
+		ti->ti_count--;
+		return 0;
+	}
+	sbi = NILFS_SB(sb);
+	sci = NILFS_SC(sbi);
+	if (sci != NULL) {
+		if (ti->ti_flags & NILFS_TI_COMMIT)
+			nilfs_segctor_start_timer(sci);
+		if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
+		    sci->sc_watermark)
+			nilfs_segctor_do_flush(sci, 0);
+	}
+	up_read(&sbi->s_nilfs->ns_segctor_sem);
+	current->journal_info = ti->ti_save;
+
+	if (ti->ti_flags & NILFS_TI_SYNC)
+		err = nilfs_construct_segment(sb);
+	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
+		kmem_cache_free(nilfs_transaction_cachep, ti);
+	return err;
+}
+
+void nilfs_transaction_abort(struct super_block *sb)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+
+	BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
+	if (ti->ti_count > 0) {
+		ti->ti_count--;
+		return;
+	}
+	up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
+
+	current->journal_info = ti->ti_save;
+	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
+		kmem_cache_free(nilfs_transaction_cachep, ti);
+}
+
+void nilfs_relax_pressure_in_lock(struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+
+	if (!sci || !sci->sc_flush_request)
+		return;
+
+	set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
+	up_read(&nilfs->ns_segctor_sem);
+
+	down_write(&nilfs->ns_segctor_sem);
+	if (sci->sc_flush_request &&
+	    test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
+		struct nilfs_transaction_info *ti = current->journal_info;
+
+		ti->ti_flags |= NILFS_TI_WRITER;
+		nilfs_segctor_do_immediate_flush(sci);
+		ti->ti_flags &= ~NILFS_TI_WRITER;
+	}
+	downgrade_write(&nilfs->ns_segctor_sem);
+}
+
+/*
+ * nilfs_transaction_lock - enter a writer transaction
+ * @sbi: nilfs super block info
+ * @ti: caller-provided transaction info to initialize and install
+ * @gcflag: when non-zero, NILFS_TI_GC is added to the transaction flags
+ *
+ * Initializes @ti as a writer transaction, installs it in
+ * current->journal_info and returns with ns_segctor_sem held for writing.
+ * While NILFS_SC_PRIOR_FLUSH is set, an immediate flush is performed and
+ * the lock is released and retaken.
+ */
+static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
+				   struct nilfs_transaction_info *ti,
+				   int gcflag)
+{
+	struct nilfs_transaction_info *cur_ti = current->journal_info;
+
+	/* A transaction already installed for this task is unexpected here */
+	WARN_ON(cur_ti);
+	ti->ti_flags = NILFS_TI_WRITER;
+	ti->ti_count = 0;
+	ti->ti_save = cur_ti;
+	ti->ti_magic = NILFS_TI_MAGIC;
+	INIT_LIST_HEAD(&ti->ti_garbage);
+	current->journal_info = ti;
+
+	for (;;) {
+		down_write(&sbi->s_nilfs->ns_segctor_sem);
+		/* Leave the loop holding the write lock once no prior flush is pending */
+		if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
+			break;
+
+		nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
+
+		/* Release the lock and yield the CPU before trying again */
+		up_write(&sbi->s_nilfs->ns_segctor_sem);
+		yield();
+	}
+	if (gcflag)
+		ti->ti_flags |= NILFS_TI_GC;
+}
+
+/*
+ * nilfs_transaction_unlock - leave a writer transaction
+ * @sbi: nilfs super block info
+ *
+ * Drops the write hold on ns_segctor_sem, restores the saved journal_info
+ * and disposes of any inodes queued on the transaction's garbage list.
+ */
+static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
+{
+	struct nilfs_transaction_info *tinfo = current->journal_info;
+	struct list_head *garbage;
+
+	BUG_ON(!tinfo || tinfo->ti_magic != NILFS_TI_MAGIC);
+	BUG_ON(tinfo->ti_count > 0);
+
+	up_write(&sbi->s_nilfs->ns_segctor_sem);
+	current->journal_info = tinfo->ti_save;
+
+	garbage = &tinfo->ti_garbage;
+	if (list_empty(garbage))
+		return;
+	nilfs_dispose_list(sbi, garbage, 0);
+}
+
+/*
+ * nilfs_segctor_map_segsum_entry - reserve room for a segment summary entry
+ * @sci: segment constructor
+ * @ssp: summary cursor (buffer head + byte offset) to advance
+ * @bytes: size of the entry
+ *
+ * Returns the address of the entry inside the summary buffer and moves
+ * @ssp past it.  An entry that would cross the block boundary is placed at
+ * the start of the next summary buffer, which must exist.
+ */
+static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
+					    struct nilfs_segsum_pointer *ssp,
+					    unsigned bytes)
+{
+	struct nilfs_segment_buffer *cur = sci->sc_curseg;
+	unsigned blksz = sci->sc_super->s_blocksize;
+	void *entry;
+
+	if (unlikely(ssp->offset + bytes > blksz)) {
+		/* Does not fit: continue at the top of the next summary block */
+		ssp->offset = 0;
+		BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
+					       &cur->sb_segsum_buffers));
+		ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
+	}
+
+	entry = ssp->bh->b_data + ssp->offset;
+	ssp->offset += bytes;
+
+	return entry;
+}
+
+/**
+ * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
+ * @sci: nilfs_sc_info
+ *
+ * Resets sci->sc_curseg (flagged NILFS_SS_GC while garbage collection is
+ * running), points both summary cursors just behind the bytes already used
+ * in the first summary block and zeroes the per-file block counters.
+ *
+ * Return: 0 on success, or the error returned by nilfs_segbuf_reset().
+ */
+static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *cur = sci->sc_curseg;
+	struct buffer_head *first_bh;
+	unsigned used;
+	unsigned ss_flags = 0;
+	int ret;
+
+	if (nilfs_doing_gc())
+		ss_flags = NILFS_SS_GC;
+
+	ret = nilfs_segbuf_reset(cur, ss_flags, sci->sc_seg_ctime);
+	if (unlikely(ret))
+		return ret;
+
+	first_bh = NILFS_SEGBUF_FIRST_BH(&cur->sb_segsum_buffers);
+	used = cur->sb_sum.sumbytes;
+
+	sci->sc_finfo_ptr.bh = first_bh;
+	sci->sc_finfo_ptr.offset = used;
+	sci->sc_binfo_ptr.bh = first_bh;
+	sci->sc_binfo_ptr.offset = used;
+
+	sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
+	return 0;
+}
+
+/*
+ * nilfs_segctor_feed_segment - advance to the next segment buffer
+ * @sci: segment constructor
+ *
+ * Accounts the blocks of the current segment buffer to sc_nblk_this_inc,
+ * then switches to and resets the following buffer.  Returns -E2BIG (used
+ * as an internal code) when the current buffer is the last one, otherwise
+ * the result of nilfs_segctor_reset_segment_buffer().
+ */
+static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
+{
+	sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
+
+	if (!NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) {
+		sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
+		return nilfs_segctor_reset_segment_buffer(sci);
+	}
+
+	/* The current segment is filled up (internal code) */
+	return -E2BIG;
+}
+
+/*
+ * nilfs_segctor_add_super_root - append a super root block to the payload
+ * @sci: segment constructor
+ *
+ * Moves on to the next segment buffer first when the current one has no
+ * block left.  On success the buffer is flagged NILFS_SS_SR.  Returns 0 or
+ * the error from nilfs_segctor_feed_segment() /
+ * nilfs_segbuf_extend_payload().
+ */
+static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *cur = sci->sc_curseg;
+	int ret;
+
+	if (cur->sb_sum.nblocks >= cur->sb_rest_blocks) {
+		ret = nilfs_segctor_feed_segment(sci);
+		if (ret)
+			return ret;
+		cur = sci->sc_curseg;
+	}
+
+	ret = nilfs_segbuf_extend_payload(cur, &sci->sc_super_root);
+	if (unlikely(ret))
+		return ret;
+
+	cur->sb_sum.flags |= NILFS_SS_SR;
+	return ret;
+}
+
+/*
+ * Functions for making segment summary and payloads
+ */
+/*
+ * nilfs_segctor_segsum_block_required - does the next entry overflow?
+ * @sci: segment constructor
+ * @ssp: summary cursor the entry would be appended at
+ * @binfo_size: size of the binfo to append
+ *
+ * Returns non-zero when the binfo, plus a finfo if no block has been
+ * counted for the current file yet, does not fit in the rest of the block
+ * @ssp points into.
+ */
+static int nilfs_segctor_segsum_block_required(
+	struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
+	unsigned binfo_size)
+{
+	unsigned blocksize = sci->sc_super->s_blocksize;
+	size_t needed = ssp->offset + binfo_size;
+
+	/* finfo and binfo are small compared with the block size */
+	if (!sci->sc_blk_cnt)
+		needed += sizeof(struct nilfs_finfo);
+
+	return needed > blocksize;
+}
+
+/*
+ * nilfs_segctor_begin_finfo - open a new finfo in the segment summary
+ * @sci: segment constructor
+ * @inode: file the finfo is for
+ *
+ * Counts the finfo, positions the binfo cursor right behind the space the
+ * finfo will occupy (the finfo itself is filled in later) and sets
+ * NILFS_SC_HAVE_DELTA when @inode has a super block attached.
+ */
+static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
+				      struct inode *inode)
+{
+	sci->sc_curseg->sb_sum.nfinfo++;
+
+	/* Skip over the finfo slot; binfo entries start after it */
+	sci->sc_binfo_ptr = sci->sc_finfo_ptr;
+	nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr,
+				       sizeof(struct nilfs_finfo));
+
+	if (!inode->i_sb)
+		return;
+	if (!test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
+		set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
+}
+
+/*
+ * nilfs_segctor_end_finfo - close the finfo of the current file
+ * @sci: segment constructor
+ * @inode: file the finfo belongs to
+ *
+ * No-op when no block was added for the file.  Otherwise the finfo slot is
+ * filled in, the summary size of the current segment buffer is updated,
+ * the finfo cursor catches up with the binfo cursor and the per-file block
+ * counters are reset.
+ */
+static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
+				    struct inode *inode)
+{
+	struct nilfs_segment_buffer *cur;
+	struct nilfs_finfo *fi;
+
+	if (!sci->sc_blk_cnt)
+		return;
+
+	fi = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
+					    sizeof(struct nilfs_finfo));
+	fi->fi_ino = cpu_to_le64(inode->i_ino);
+	fi->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
+	fi->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
+	fi->fi_cno = cpu_to_le64(NILFS_I(inode)->i_cno);
+
+	/* Summary bytes = offset in the last block + all preceding blocks */
+	cur = sci->sc_curseg;
+	cur->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
+		sci->sc_super->s_blocksize * (cur->sb_sum.nsumblk - 1);
+
+	sci->sc_finfo_ptr = sci->sc_binfo_ptr;
+	sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
+}
+
+/*
+ * nilfs_segctor_add_file_block - register one buffer in the current segment
+ * @sci: segment constructor
+ * @bh: buffer to add to the payload
+ * @inode: file the buffer belongs to
+ * @binfo_size: bytes to reserve for the block's binfo in the segment summary
+ *
+ * When the current segment buffer cannot take the block (plus one more
+ * summary block if one is required), the current finfo is closed and the
+ * next segment buffer is tried.  Returns 0 on success, the error from
+ * nilfs_segctor_feed_segment() (-E2BIG when no segment buffer is left), or
+ * the error from nilfs_segbuf_extend_segsum().
+ */
+static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
+					struct buffer_head *bh,
+					struct inode *inode,
+					unsigned binfo_size)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int required, err = 0;
+
+ retry:
+	segbuf = sci->sc_curseg;
+	/* 1 if the new summary entries need an additional summary block */
+	required = nilfs_segctor_segsum_block_required(
+		sci, &sci->sc_binfo_ptr, binfo_size);
+	/* Room is needed for the payload block and the optional summary block */
+	if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
+		nilfs_segctor_end_finfo(sci, inode);
+		err = nilfs_segctor_feed_segment(sci);
+		if (err)
+			return err;
+		goto retry;
+	}
+	if (unlikely(required)) {
+		err = nilfs_segbuf_extend_segsum(segbuf);
+		if (unlikely(err))
+			goto failed;
+	}
+	/* First block of this file in the segment: open its finfo */
+	if (sci->sc_blk_cnt == 0)
+		nilfs_segctor_begin_finfo(sci, inode);
+
+	/* Only reserve the binfo slot here; its content is written later */
+	nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
+	/* Substitution to vblocknr is delayed until update_blocknr() */
+	nilfs_segbuf_add_file_buffer(segbuf, bh);
+	sci->sc_blk_cnt++;
+ failed:
+	return err;
+}
+
+/*
+ * nilfs_handle_bmap_error - translate an error returned by a bmap operation
+ * @err: error code to examine
+ * @fname: name of the calling function, passed on to nilfs_error()
+ * @inode: inode whose bmap was operated on
+ * @sb: super block
+ *
+ * -EINVAL is reported through nilfs_error() as a broken bmap and turned
+ * into -EIO; every other value is returned unchanged.
+ */
+static int nilfs_handle_bmap_error(int err, const char *fname,
+				   struct inode *inode, struct super_block *sb)
+{
+	if (err != -EINVAL)
+		return err;
+
+	nilfs_error(sb, fname, "broken bmap (inode=%lu)\n", inode->i_ino);
+	return -EIO;
+}
+
+/*
+ * Callback functions that enumerate, mark, and collect dirty blocks
+ */
+/*
+ * nilfs_collect_file_data - collect a dirty data block of a file
+ *
+ * Propagates the dirty state through the inode's bmap, then adds @bh to
+ * the current segment with a struct nilfs_binfo_v sized summary entry and
+ * counts it as a data block.  Returns 0 or a negative error code.
+ */
+static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
+				   struct buffer_head *bh, struct inode *inode)
+{
+	int ret = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
+
+	if (unlikely(ret < 0))
+		return nilfs_handle_bmap_error(ret, __func__, inode,
+					       sci->sc_super);
+
+	ret = nilfs_segctor_add_file_block(sci, bh, inode,
+					   sizeof(struct nilfs_binfo_v));
+	if (ret)
+		return ret;
+
+	sci->sc_datablk_cnt++;
+	return 0;
+}
+
+/*
+ * nilfs_collect_file_node - handle a dirty node block of a file
+ *
+ * Only propagates the dirty state through the inode's bmap; the buffer is
+ * not added to the segment here.  Returns 0 or a negative error code.
+ */
+static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
+				   struct buffer_head *bh,
+				   struct inode *inode)
+{
+	int ret = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
+
+	if (unlikely(ret < 0))
+		ret = nilfs_handle_bmap_error(ret, __func__, inode,
+					      sci->sc_super);
+	else
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * nilfs_collect_file_bmap - collect a dirty bmap block of a file
+ *
+ * Adds @bh to the current segment with a __le64 sized summary entry.
+ * Warns if the buffer is not dirty.
+ */
+static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
+				   struct buffer_head *bh,
+				   struct inode *inode)
+{
+	const unsigned entry_size = sizeof(__le64);
+
+	WARN_ON(!buffer_dirty(bh));
+
+	return nilfs_segctor_add_file_block(sci, bh, inode, entry_size);
+}
+
+/*
+ * nilfs_write_file_data_binfo - store the binfo of a file data block
+ *
+ * Copies the bi_v member of @binfo into the next summary slot at @ssp.
+ */
+static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
+					struct nilfs_segsum_pointer *ssp,
+					union nilfs_binfo *binfo)
+{
+	struct nilfs_binfo_v *slot;
+
+	slot = nilfs_segctor_map_segsum_entry(sci, ssp,
+					      sizeof(struct nilfs_binfo_v));
+	*slot = binfo->bi_v;
+}
+
+/*
+ * nilfs_write_file_node_binfo - store the binfo of a file node block
+ *
+ * Only the virtual block number (bi_v.bi_vblocknr) is written to the next
+ * summary slot at @ssp.
+ */
+static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
+					struct nilfs_segsum_pointer *ssp,
+					union nilfs_binfo *binfo)
+{
+	__le64 *slot;
+
+	slot = nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(__le64));
+	*slot = binfo->bi_v.bi_vblocknr;
+}
+
+/*
+ * Callback set built from the nilfs_*_file_* helpers: collects data, node
+ * and bmap blocks and writes binfo entries keyed by virtual block number.
+ */
+struct nilfs_sc_operations nilfs_sc_file_ops = {
+	.collect_data = nilfs_collect_file_data,
+	.collect_node = nilfs_collect_file_node,
+	.collect_bmap = nilfs_collect_file_bmap,
+	.write_data_binfo = nilfs_write_file_data_binfo,
+	.write_node_binfo = nilfs_write_file_node_binfo,
+};
+
+/*
+ * nilfs_collect_dat_data - collect a dirty data block (DAT variant)
+ *
+ * Same flow as nilfs_collect_file_data(), but the summary entry reserved
+ * for the block is a single __le64.  Returns 0 or a negative error code.
+ */
+static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
+				  struct buffer_head *bh, struct inode *inode)
+{
+	int ret = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
+
+	if (unlikely(ret < 0))
+		return nilfs_handle_bmap_error(ret, __func__, inode,
+					       sci->sc_super);
+
+	ret = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
+	if (ret)
+		return ret;
+
+	sci->sc_datablk_cnt++;
+	return 0;
+}
+
+/*
+ * nilfs_collect_dat_bmap - collect a dirty bmap block (DAT variant)
+ *
+ * Adds @bh to the current segment with a struct nilfs_binfo_dat sized
+ * summary entry.  Warns if the buffer is not dirty.
+ */
+static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
+				  struct buffer_head *bh, struct inode *inode)
+{
+	const unsigned entry_size = sizeof(struct nilfs_binfo_dat);
+
+	WARN_ON(!buffer_dirty(bh));
+
+	return nilfs_segctor_add_file_block(sci, bh, inode, entry_size);
+}
+
+/*
+ * nilfs_write_dat_data_binfo - store the binfo of a data block (DAT variant)
+ *
+ * Only the block offset (bi_dat.bi_blkoff) is written to the next summary
+ * slot at @ssp.
+ */
+static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
+				       struct nilfs_segsum_pointer *ssp,
+				       union nilfs_binfo *binfo)
+{
+	__le64 *slot;
+
+	slot = nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(__le64));
+	*slot = binfo->bi_dat.bi_blkoff;
+}
+
+/*
+ * nilfs_write_dat_node_binfo - store the binfo of a node block (DAT variant)
+ *
+ * Copies the whole bi_dat member of @binfo into the next summary slot at
+ * @ssp.
+ */
+static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
+				       struct nilfs_segsum_pointer *ssp,
+				       union nilfs_binfo *binfo)
+{
+	struct nilfs_binfo_dat *slot;
+
+	slot = nilfs_segctor_map_segsum_entry(sci, ssp,
+					      sizeof(struct nilfs_binfo_dat));
+	*slot = binfo->bi_dat;
+}
+
+/*
+ * Callback set using the nilfs_*_dat_* helpers for data/bmap collection
+ * and for binfo writing; node collection is shared with the file variant.
+ */
+struct nilfs_sc_operations nilfs_sc_dat_ops = {
+	.collect_data = nilfs_collect_dat_data,
+	.collect_node = nilfs_collect_file_node,
+	.collect_bmap = nilfs_collect_dat_bmap,
+	.write_data_binfo = nilfs_write_dat_data_binfo,
+	.write_node_binfo = nilfs_write_dat_node_binfo,
+};
+
+/*
+ * Data-only callback set: file data blocks are collected and their binfo
+ * written, while the node and bmap callbacks are left NULL.
+ */
+struct nilfs_sc_operations nilfs_sc_dsync_ops = {
+	.collect_data = nilfs_collect_file_data,
+	.collect_node = NULL,
+	.collect_bmap = NULL,
+	.write_data_binfo = nilfs_write_file_data_binfo,
+	.write_node_binfo = NULL,
+};
+
+/*
+ * nilfs_lookup_dirty_data_buffers - gather dirty data buffers of an inode
+ * @inode: inode whose page cache (inode->i_mapping) is scanned
+ * @listp: list the buffers are appended to, linked via b_assoc_buffers
+ * @nlimit: stop as soon as this many buffers have been gathered
+ * @start: first byte of the range to scan
+ * @end: last byte of the range to scan
+ *
+ * Walks the pages tagged dirty and takes a reference (get_bh) on every
+ * dirty buffer found.  With @start == 0 and @end == LLONG_MAX the whole
+ * mapping is scanned.  Returns the number of buffers added to @listp.
+ */
+static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
+					      struct list_head *listp,
+					      size_t nlimit,
+					      loff_t start, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	pgoff_t index = 0, last = ULONG_MAX;
+	size_t ndirties = 0;
+	int i;
+
+	if (unlikely(start != 0 || end != LLONG_MAX)) {
+		/*
+		 * A valid range is given for sync-ing data pages. The
+		 * range is rounded to per-page; extra dirty buffers
+		 * may be included if blocksize < pagesize.
+		 */
+		index = start >> PAGE_SHIFT;
+		last = end >> PAGE_SHIFT;
+	}
+	pagevec_init(&pvec, 0);
+ repeat:
+	/*
+	 * Look up at most PAGEVEC_SIZE pages, but no more than remain in
+	 * [index, last]; the "- 1 ... + 1" form keeps the count from
+	 * wrapping to zero when last - index is ULONG_MAX.
+	 */
+	if (unlikely(index > last) ||
+	    !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+				min_t(pgoff_t, last - index,
+				      PAGEVEC_SIZE - 1) + 1))
+		return ndirties;
+
+	for (i = 0; i < pagevec_count(&pvec); i++) {
+		struct buffer_head *bh, *head;
+		struct page *page = pvec.pages[i];
+
+		if (unlikely(page->index > last))
+			break;
+
+		/* Make sure the page has buffer heads before walking them */
+		if (mapping->host) {
+			lock_page(page);
+			if (!page_has_buffers(page))
+				create_empty_buffers(page,
+						     1 << inode->i_blkbits, 0);
+			unlock_page(page);
+		}
+
+		bh = head = page_buffers(page);
+		do {
+			/* "continue" jumps to the loop condition, which advances bh */
+			if (!buffer_dirty(bh))
+				continue;
+			get_bh(bh);
+			list_add_tail(&bh->b_assoc_buffers, listp);
+			ndirties++;
+			if (unlikely(ndirties >= nlimit)) {
+				pagevec_release(&pvec);
+				cond_resched();
+				return ndirties;
+			}
+		} while (bh = bh->b_this_page, bh != head);
+	}
+	pagevec_release(&pvec);
+	cond_resched();
+	goto repeat;
+}
+
+/*
+ * nilfs_lookup_dirty_node_buffers - gather dirty node buffers of an inode
+ * @inode: inode whose B-tree node cache (i_btnode_cache) is scanned
+ * @listp: list the buffers are appended to, linked via b_assoc_buffers
+ *
+ * Walks every page tagged dirty in the node cache and takes a reference
+ * (get_bh) on each dirty buffer before queueing it.  There is no limit on
+ * the number of buffers gathered.
+ */
+static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
+					    struct list_head *listp)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+	struct address_space *mapping = &ii->i_btnode_cache;
+	struct pagevec pvec;
+	struct buffer_head *bh, *head;
+	unsigned int i;
+	pgoff_t index = 0;
+
+	pagevec_init(&pvec, 0);
+
+	while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
+				  PAGEVEC_SIZE)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			/* Visit each buffer on the page's circular buffer ring */
+			bh = head = page_buffers(pvec.pages[i]);
+			do {
+				if (buffer_dirty(bh)) {
+					get_bh(bh);
+					list_add_tail(&bh->b_assoc_buffers,
+						      listp);
+				}
+				bh = bh->b_this_page;
+			} while (bh != head);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+/*
+ * nilfs_dispose_list - empty a list of inodes and drop their references
+ * @sbi: nilfs super block info (provides s_inode_lock and s_dirty_files)
+ * @head: list of nilfs_inode_info linked through i_dirty
+ * @force: when non-zero, release any attached i_bh and never requeue
+ *
+ * Without @force, inodes that still have NILFS_I_DIRTY set are moved to
+ * sbi->s_dirty_files (marked NILFS_I_QUEUED) instead of being released.
+ * All other inodes are collected in batches of up to SC_N_INODEVEC under
+ * s_inode_lock and iput() is called on them after the lock is dropped.
+ */
+static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
+			       struct list_head *head, int force)
+{
+	struct nilfs_inode_info *ii, *n;
+	struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
+	unsigned nv = 0;
+
+	while (!list_empty(head)) {
+		spin_lock(&sbi->s_inode_lock);
+		list_for_each_entry_safe(ii, n, head, i_dirty) {
+			list_del_init(&ii->i_dirty);
+			if (force) {
+				if (unlikely(ii->i_bh)) {
+					brelse(ii->i_bh);
+					ii->i_bh = NULL;
+				}
+			} else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
+				/* Still dirty: hand it back to the dirty file list */
+				set_bit(NILFS_I_QUEUED, &ii->i_state);
+				list_add_tail(&ii->i_dirty,
+					      &sbi->s_dirty_files);
+				continue;
+			}
+			ivec[nv++] = ii;
+			/* Batch full: release it before scanning further */
+			if (nv == SC_N_INODEVEC)
+				break;
+		}
+		spin_unlock(&sbi->s_inode_lock);
+
+		/* iput() is called only after s_inode_lock has been dropped */
+		for (pii = ivec; nv > 0; pii++, nv--)
+			iput(&(*pii)->vfs_inode);
+	}
+}
+
+/*
+ * nilfs_test_metadata_dirty - count dirty metadata files
+ * @sbi: nilfs super block info
+ *
+ * Checks ifile, cpfile and sufile; the DAT inode is checked only when one
+ * of those is dirty or garbage collection is running.  Returns the number
+ * of files found dirty (0 means none).
+ */
+static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int ndirty = 0;
+
+	ndirty += !!nilfs_mdt_fetch_dirty(sbi->s_ifile);
+	ndirty += !!nilfs_mdt_fetch_dirty(nilfs->ns_cpfile);
+	ndirty += !!nilfs_mdt_fetch_dirty(nilfs->ns_sufile);
+
+	if ((ndirty || nilfs_doing_gc()) &&
+	    nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
+		ndirty++;
+
+	return ndirty;
+}
+
+/*
+ * nilfs_segctor_clean - test whether the constructor has nothing to write
+ * @sci: segment constructor
+ *
+ * Returns 1 when there are no dirty files, NILFS_SC_DIRTY is clear, no
+ * segments are queued for cleaning and (while GC is running) no GC inodes
+ * are queued; 0 otherwise.
+ */
+static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
+{
+	if (!list_empty(&sci->sc_dirty_files))
+		return 0;
+	if (test_bit(NILFS_SC_DIRTY, &sci->sc_flags))
+		return 0;
+	if (!list_empty(&sci->sc_cleaning_segments))
+		return 0;
+	if (nilfs_doing_gc() && !list_empty(&sci->sc_gc_inodes))
+		return 0;
+	return 1;
+}
+
+/*
+ * nilfs_segctor_confirm - confirm that nothing is left to construct
+ * @sci: segment constructor
+ *
+ * Sets NILFS_SC_DIRTY if any metadata file is dirty, then checks under
+ * s_inode_lock that sbi->s_dirty_files is empty and the constructor is
+ * clean.  Returns 1 in that case, 0 otherwise.
+ */
+static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
+{
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	int clean;
+
+	if (nilfs_test_metadata_dirty(sbi))
+		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
+
+	spin_lock(&sbi->s_inode_lock);
+	clean = list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci);
+	spin_unlock(&sbi->s_inode_lock);
+
+	return clean ? 1 : 0;
+}
+
+/*
+ * nilfs_segctor_clear_metadata_dirty - clear dirty state of metadata files
+ * @sci: segment constructor
+ *
+ * Clears the dirty state of ifile, cpfile, sufile and the DAT inode, in
+ * that order.
+ */
+static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
+{
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+
+	nilfs_mdt_clear_dirty(sci->sc_sbi->s_ifile);
+	nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
+	nilfs_mdt_clear_dirty(nilfs->ns_sufile);
+	nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
+}
+
+/*
+ * nilfs_segctor_create_checkpoint - get (creating if needed) the checkpoint
+ * @sci: segment constructor
+ *
+ * Obtains the checkpoint entry for nilfs->ns_cno with the create flag set
+ * and marks its buffer and the cpfile dirty.  Returns 0 or the error from
+ * nilfs_cpfile_get_checkpoint(); -EINVAL and -ENOENT additionally trigger
+ * a warning.
+ */
+static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
+{
+	struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
+	struct nilfs_checkpoint *cp;
+	struct buffer_head *cp_bh;
+	int ret;
+
+	/* XXX: this interface will be changed */
+	ret = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
+					  &cp, &cp_bh);
+	if (unlikely(ret)) {
+		WARN_ON(ret == -EINVAL || ret == -ENOENT);
+		return ret;
+	}
+
+	/*
+	 * This duplicates what cpfile does, but is needed so that the
+	 * checkpoint gets collected even if it was not newly created.
+	 */
+	nilfs_mdt_mark_buffer_dirty(cp_bh);
+	nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
+	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, cp_bh);
+
+	return ret;
+}
+
+/*
+ * nilfs_segctor_fill_in_checkpoint - write the current checkpoint entry
+ * @sci: segment constructor
+ *
+ * Looks up the existing checkpoint for nilfs->ns_cno (no creation) and
+ * fills in its counters, creation time, checkpoint number, minor flag and
+ * the ifile inode image.  Returns 0 or the error from
+ * nilfs_cpfile_get_checkpoint(); -EINVAL and -ENOENT additionally trigger
+ * a warning.
+ */
+static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
+{
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_checkpoint *cp;
+	struct buffer_head *cp_bh;
+	int ret;
+
+	ret = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
+					  &cp, &cp_bh);
+	if (unlikely(ret)) {
+		WARN_ON(ret == -EINVAL || ret == -ENOENT);
+		return ret;
+	}
+
+	cp->cp_snapshot_list.ssl_next = 0;
+	cp->cp_snapshot_list.ssl_prev = 0;
+	cp->cp_inodes_count = cpu_to_le64(atomic_read(&sbi->s_inodes_count));
+	cp->cp_blocks_count = cpu_to_le64(atomic_read(&sbi->s_blocks_count));
+	cp->cp_nblk_inc = cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
+	cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
+	cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
+
+	/* The "minor" flag is set only when NILFS_SC_HAVE_DELTA is clear */
+	if (!test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
+		nilfs_checkpoint_set_minor(cp);
+	else
+		nilfs_checkpoint_clear_minor(cp);
+
+	nilfs_write_inode_common(sbi->s_ifile, &cp->cp_ifile_inode, 1);
+	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, cp_bh);
+	return 0;
+}
+
+/*
+ * nilfs_fill_in_file_bmap - store an inode's bmap in its on-disk inode
+ * @ifile: inode file
+ * @ii: nilfs inode info
+ *
+ * Does nothing unless NILFS_I_BMAP is set in @ii.  The inode's buffer
+ * (ii->i_bh) must be attached.
+ */
+static void nilfs_fill_in_file_bmap(struct inode *ifile,
+				    struct nilfs_inode_info *ii)
+
+{
+	struct nilfs_inode *raw;
+	struct buffer_head *bh;
+
+	if (!test_bit(NILFS_I_BMAP, &ii->i_state))
+		return;
+
+	bh = ii->i_bh;
+	BUG_ON(!bh);
+
+	raw = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, bh);
+	nilfs_bmap_write(ii->i_bmap, raw);
+	nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, bh);
+}
+
+/*
+ * nilfs_segctor_fill_in_file_bmap - write back bmaps of all dirty files
+ * @sci: segment constructor
+ * @ifile: inode file
+ *
+ * Every inode on sc_dirty_files gets its bmap stored and is marked
+ * NILFS_I_COLLECTED.
+ */
+static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
+					    struct inode *ifile)
+{
+	struct list_head *dirty = &sci->sc_dirty_files;
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, dirty, i_dirty) {
+		nilfs_fill_in_file_bmap(ifile, ii);
+		set_bit(NILFS_I_COLLECTED, &ii->i_state);
+	}
+}
+
+/*
+ * CRC calculation routines
+ */
+/*
+ * nilfs_fill_in_super_root_crc - checksum a super root block
+ * @bh_sr: buffer holding the super root
+ * @seed: CRC seed
+ *
+ * The CRC covers NILFS_SR_BYTES of the block minus the leading sr_sum
+ * field, and the result is stored little-endian in sr_sum.
+ */
+static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
+{
+	struct nilfs_super_root *sr = (struct nilfs_super_root *)bh_sr->b_data;
+	unsigned char *payload = (unsigned char *)sr + sizeof(sr->sr_sum);
+	u32 sum;
+
+	sum = crc32_le(seed, payload, NILFS_SR_BYTES - sizeof(sr->sr_sum));
+	sr->sr_sum = cpu_to_le32(sum);
+}
+
+/*
+ * nilfs_segctor_fill_in_checksums - compute all CRCs of the segments
+ * @sci: segment constructor
+ * @seed: CRC seed
+ *
+ * Checksums the super root, if one is attached, and then the segment
+ * summary and data of every segment buffer on sc_segbufs.
+ */
+static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
+					    u32 seed)
+{
+	struct buffer_head *sr_bh = sci->sc_super_root;
+	struct nilfs_segment_buffer *sb;
+
+	if (sr_bh)
+		nilfs_fill_in_super_root_crc(sr_bh, seed);
+
+	list_for_each_entry(sb, &sci->sc_segbufs, sb_list) {
+		nilfs_segbuf_fill_in_segsum_crc(sb, seed);
+		nilfs_segbuf_fill_in_data_crc(sb, seed);
+	}
+}
+
+/*
+ * nilfs_segctor_fill_in_super_root - fill in the super root block
+ * @sci: segment constructor (sc_super_root must be set)
+ * @nilfs: nilfs object
+ *
+ * Writes the header fields and the inode images of the DAT, cpfile and
+ * sufile into the super root buffer.  While GC is running the non-GC
+ * creation time recorded in @nilfs is kept; otherwise the creation time of
+ * the segment under construction is used.
+ */
+static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
+					     struct the_nilfs *nilfs)
+{
+	struct buffer_head *sr_bh = sci->sc_super_root;
+	struct nilfs_super_root *sr = (struct nilfs_super_root *)sr_bh->b_data;
+	unsigned inode_size = nilfs->ns_inode_size;
+
+	sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
+	sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ?
+					 nilfs->ns_nongc_ctime :
+					 sci->sc_seg_ctime);
+	sr->sr_flags = 0;
+
+	nilfs_mdt_write_inode_direct(nilfs_dat_inode(nilfs), sr_bh,
+				     NILFS_SR_DAT_OFFSET(inode_size));
+	nilfs_mdt_write_inode_direct(nilfs->ns_cpfile, sr_bh,
+				     NILFS_SR_CPFILE_OFFSET(inode_size));
+	nilfs_mdt_write_inode_direct(nilfs->ns_sufile, sr_bh,
+				     NILFS_SR_SUFILE_OFFSET(inode_size));
+}
+
+/*
+ * nilfs_redirty_inodes - undo the "collected" mark on a list of inodes
+ * @head: list of nilfs_inode_info linked through i_dirty
+ *
+ * Clears NILFS_I_COLLECTED on every inode that has it set.
+ */
+static void nilfs_redirty_inodes(struct list_head *head)
+{
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, head, i_dirty) {
+		if (!test_bit(NILFS_I_COLLECTED, &ii->i_state))
+			continue;
+		clear_bit(NILFS_I_COLLECTED, &ii->i_state);
+	}
+}
+
+/*
+ * nilfs_drop_collected_inodes - finish off inodes that were collected
+ * @head: list of nilfs_inode_info linked through i_dirty
+ *
+ * For every inode that had NILFS_I_COLLECTED set, the flag is cleared,
+ * NILFS_I_INODE_DIRTY is cleared and NILFS_I_UPDATED is set.
+ */
+static void nilfs_drop_collected_inodes(struct list_head *head)
+{
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, head, i_dirty) {
+		if (test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) {
+			clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+			set_bit(NILFS_I_UPDATED, &ii->i_state);
+		}
+	}
+}
+
+/*
+ * nilfs_segctor_cancel_free_segments - roll back freed cleaning segments
+ * @sci: segment constructor
+ * @sufile: segment usage file
+ *
+ * Walks sc_cleaning_segments from the front and cancels the free operation
+ * of each entry flagged NILFS_SLH_FREED, stopping at the first entry that
+ * does not carry the flag.
+ */
+static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
+					       struct inode *sufile)
+
+{
+	struct nilfs_segment_entry *ent;
+	int ret;
+
+	list_for_each_entry(ent, &sci->sc_cleaning_segments, list) {
+		if ((ent->flags & NILFS_SLH_FREED) == 0)
+			break;
+
+		ret = nilfs_sufile_cancel_free(sufile, ent->segnum);
+		WARN_ON(ret); /* do not happen */
+		ent->flags &= ~NILFS_SLH_FREED;
+	}
+}
+
+/*
+ * nilfs_segctor_prepare_free_segments - free the segments being cleaned
+ * @sci: segment constructor
+ * @sufile: segment usage file
+ *
+ * Frees every segment on sc_cleaning_segments in the sufile and flags its
+ * entry NILFS_SLH_FREED.  Stops at the first failure and returns that
+ * error; returns 0 when all entries were processed.
+ */
+static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
+					       struct inode *sufile)
+{
+	struct nilfs_segment_entry *ent;
+	int ret;
+
+	list_for_each_entry(ent, &sci->sc_cleaning_segments, list) {
+		ret = nilfs_sufile_free(sufile, ent->segnum);
+		if (unlikely(ret))
+			return ret;
+
+		ent->flags |= NILFS_SLH_FREED;
+	}
+
+	return 0;
+}
+
+/*
+ * nilfs_segctor_commit_free_segments - drop the cleaning segment list
+ * @sci: segment constructor
+ */
+static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
+{
+	struct list_head *cleaning = &sci->sc_cleaning_segments;
+
+	nilfs_dispose_segment_list(cleaning);
+}
+
+/*
+ * nilfs_segctor_apply_buffers - feed gathered buffers to a collector
+ * @sci: segment constructor
+ * @inode: inode the buffers belong to
+ * @listp: list of buffer heads linked through b_assoc_buffers
+ * @collect: callback applied to each buffer, or NULL to just drop the list
+ *
+ * Each buffer is unlinked, passed to @collect and released with brelse().
+ * If @collect fails, or if it is NULL, all buffers still on the list are
+ * unlinked and released.  Returns 0, or the first error from @collect.
+ */
+static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
+				       struct inode *inode,
+				       struct list_head *listp,
+				       int (*collect)(struct nilfs_sc_info *,
+						      struct buffer_head *,
+						      struct inode *))
+{
+	struct buffer_head *bh, *n;
+	int err = 0;
+
+	if (collect) {
+		list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
+			list_del_init(&bh->b_assoc_buffers);
+			err = collect(sci, bh, inode);
+			brelse(bh);
+			if (unlikely(err))
+				goto dispose_buffers;
+		}
+		return 0;
+	}
+
+	/* Reached directly when collect is NULL (err is still 0) */
+ dispose_buffers:
+	while (!list_empty(listp)) {
+		bh = list_entry(listp->next, struct buffer_head,
+				b_assoc_buffers);
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh);
+	}
+	return err;
+}
+
+/*
+ * nilfs_segctor_buffer_rest - blocks still available in the segment buffers
+ * @sci: segment constructor
+ *
+ * Subtracts the blocks accounted so far (sc_nblk_this_inc plus the block
+ * count of the current segment buffer) from sc_segbuf_nblocks.
+ */
+static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
+{
+	/* Remaining number of blocks within segment buffer */
+	return sci->sc_segbuf_nblocks -
+		(sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
+}
+
+/*
+ * nilfs_segctor_scan_file - collect the dirty blocks of one file
+ * @sci: segment constructor
+ * @inode: file to scan
+ * @sc_ops: callbacks used to collect data, node and bmap blocks
+ *
+ * Data blocks are collected first, then node blocks, then the bmap blocks
+ * reported by nilfs_bmap_lookup_dirty_buffers().  NILFS_CF_NODE in
+ * sc_stage.flags records that the data blocks of this file are already
+ * done, so a later call skips the data pass.  Returns 0 when the file was
+ * completely collected, otherwise the error from a collector (the comment
+ * in the body names -E2BIG for the out-of-space case).
+ */
+static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
+				   struct inode *inode,
+				   struct nilfs_sc_operations *sc_ops)
+{
+	LIST_HEAD(data_buffers);
+	LIST_HEAD(node_buffers);
+	int err;
+
+	if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
+		size_t n, rest = nilfs_segctor_buffer_rest(sci);
+
+		/* Ask for one buffer more than fits to detect overflow */
+		n = nilfs_lookup_dirty_data_buffers(
+			inode, &data_buffers, rest + 1, 0, LLONG_MAX);
+		if (n > rest) {
+			/* Too many data blocks: collect what fits, then bail out */
+			err = nilfs_segctor_apply_buffers(
+				sci, inode, &data_buffers,
+				sc_ops->collect_data);
+			BUG_ON(!err); /* always receive -E2BIG or true error */
+			goto break_or_fail;
+		}
+	}
+	nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
+
+	if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
+		err = nilfs_segctor_apply_buffers(
+			sci, inode, &data_buffers, sc_ops->collect_data);
+		if (unlikely(err)) {
+			/* dispose node list */
+			nilfs_segctor_apply_buffers(
+				sci, inode, &node_buffers, NULL);
+			goto break_or_fail;
+		}
+		/* Data pass finished; remember it in case of a later retry */
+		sci->sc_stage.flags |= NILFS_CF_NODE;
+	}
+	/* Collect node */
+	err = nilfs_segctor_apply_buffers(
+		sci, inode, &node_buffers, sc_ops->collect_node);
+	if (unlikely(err))
+		goto break_or_fail;
+
+	/* node_buffers is empty again here and is reused for the bmap blocks */
+	nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
+	err = nilfs_segctor_apply_buffers(
+		sci, inode, &node_buffers, sc_ops->collect_bmap);
+	if (unlikely(err))
+		goto break_or_fail;
+
+	nilfs_segctor_end_finfo(sci, inode);
+	sci->sc_stage.flags &= ~NILFS_CF_NODE;
+
+ break_or_fail:
+	return err;
+}
+
+/*
+ * nilfs_segctor_scan_file_dsync - collect data blocks for a data sync
+ * @sci: segment constructor (sc_dsync_start/sc_dsync_end give the range)
+ * @inode: file to scan
+ *
+ * Only dirty data buffers within the requested byte range are gathered and
+ * collected with nilfs_collect_file_data(); node and bmap blocks are not
+ * touched.  Returns 0 on success or the error from the collector.
+ */
+static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
+					 struct inode *inode)
+{
+	LIST_HEAD(data_buffers);
+	size_t n, rest = nilfs_segctor_buffer_rest(sci);
+	int err;
+
+	/* Ask for one buffer more than fits to detect overflow */
+	n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
+					    sci->sc_dsync_start,
+					    sci->sc_dsync_end);
+
+	err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
+					  nilfs_collect_file_data);
+	if (!err) {
+		nilfs_segctor_end_finfo(sci, inode);
+		BUG_ON(n > rest);
+		/* always receive -E2BIG or true error if n > rest */
+	}
+	return err;
+}
+
+/*
+ * nilfs_segctor_collect_blocks - staged collection of dirty blocks
+ * @sci: segment constructor
+ * @mode: SC_LSEG_SR, SC_LSEG_DSYNC, SC_FLUSH_FILE or SC_FLUSH_DAT are the
+ *        values tested here
+ *
+ * Runs a stage machine driven by sci->sc_stage.scnt.  The stages fall
+ * through in the order INIT, GC, FILE, IFILE, CPFILE, SUFILE, DAT, SR;
+ * DSYNC is a separate stage entered directly from INIT.  When a stage
+ * fails the function returns the error while sc_stage keeps the stage
+ * number (and, for the GC and FILE stages, the list position), so a later
+ * call continues from there.  Returns 0 once the stage reaches
+ * NILFS_ST_DONE.
+ */
+static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct list_head *head;
+	struct nilfs_inode_info *ii;
+	int err = 0;
+
+	switch (sci->sc_stage.scnt) {
+	case NILFS_ST_INIT:
+		/* Pre-processes */
+		sci->sc_stage.flags = 0;
+
+		/* Not continuing an unclosed logical segment: begin a new one */
+		if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
+			sci->sc_nblk_inc = 0;
+			sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
+			if (mode == SC_LSEG_DSYNC) {
+				sci->sc_stage.scnt = NILFS_ST_DSYNC;
+				goto dsync_mode;
+			}
+		}
+
+		sci->sc_stage.dirty_file_ptr = NULL;
+		sci->sc_stage.gc_inode_ptr = NULL;
+		if (mode == SC_FLUSH_DAT) {
+			sci->sc_stage.scnt = NILFS_ST_DAT;
+			goto dat_stage;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_GC:
+		if (nilfs_doing_gc()) {
+			head = &sci->sc_gc_inodes;
+			/* Continue after the last fully collected GC inode, if any */
+			ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
+						head, i_dirty);
+			list_for_each_entry_continue(ii, head, i_dirty) {
+				err = nilfs_segctor_scan_file(
+					sci, &ii->vfs_inode,
+					&nilfs_sc_file_ops);
+				if (unlikely(err)) {
+					/*
+					 * Remember the predecessor so the
+					 * next call restarts at this inode.
+					 */
+					sci->sc_stage.gc_inode_ptr = list_entry(
+						ii->i_dirty.prev,
+						struct nilfs_inode_info,
+						i_dirty);
+					goto break_or_fail;
+				}
+				set_bit(NILFS_I_COLLECTED, &ii->i_state);
+			}
+			sci->sc_stage.gc_inode_ptr = NULL;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_FILE:
+		head = &sci->sc_dirty_files;
+		/* Continue after the last fully collected dirty file, if any */
+		ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
+					i_dirty);
+		list_for_each_entry_continue(ii, head, i_dirty) {
+			clear_bit(NILFS_I_DIRTY, &ii->i_state);
+
+			err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
+						      &nilfs_sc_file_ops);
+			if (unlikely(err)) {
+				/* Same restart scheme as in the GC stage */
+				sci->sc_stage.dirty_file_ptr =
+					list_entry(ii->i_dirty.prev,
+						   struct nilfs_inode_info,
+						   i_dirty);
+				goto break_or_fail;
+			}
+			/* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
+			/* XXX: required ? */
+		}
+		sci->sc_stage.dirty_file_ptr = NULL;
+		/* A file flush ends here, without touching the metadata files */
+		if (mode == SC_FLUSH_FILE) {
+			sci->sc_stage.scnt = NILFS_ST_DONE;
+			return 0;
+		}
+		sci->sc_stage.scnt++;
+		sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
+		/* Fall through */
+	case NILFS_ST_IFILE:
+		err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;
+		/* Creating a checkpoint */
+		err = nilfs_segctor_create_checkpoint(sci);
+		if (unlikely(err))
+			break;
+		/* Fall through */
+	case NILFS_ST_CPFILE:
+		err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_SUFILE:
+		/* Free the segments queued for cleaning before scanning sufile */
+		err = nilfs_segctor_prepare_free_segments(sci,
+							  nilfs->ns_sufile);
+		if (unlikely(err))
+			break;
+		err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_DAT:
+ dat_stage:
+		err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
+					      &nilfs_sc_dat_ops);
+		if (unlikely(err))
+			break;
+		/* A DAT flush ends here, without super root or LOGEND flag */
+		if (mode == SC_FLUSH_DAT) {
+			sci->sc_stage.scnt = NILFS_ST_DONE;
+			return 0;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_SR:
+		if (mode == SC_LSEG_SR) {
+			/* Appending a super root */
+			err = nilfs_segctor_add_super_root(sci);
+			if (unlikely(err))
+				break;
+		}
+		/* End of a logical segment */
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
+		sci->sc_stage.scnt = NILFS_ST_DONE;
+		return 0;
+	case NILFS_ST_DSYNC:
+ dsync_mode:
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
+		ii = sci->sc_dsync_inode;
+		/* Without NILFS_I_BUSY nothing is collected; err is still 0 here */
+		if (!test_bit(NILFS_I_BUSY, &ii->i_state))
+			break;
+
+		err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
+		if (unlikely(err))
+			break;
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
+		sci->sc_stage.scnt = NILFS_ST_DONE;
+		return 0;
+	case NILFS_ST_DONE:
+		return 0;
+	default:
+		BUG();
+	}
+
+ break_or_fail:
+	return err;
+}
+
+/*
+ * nilfs_touch_segusage - dirty the segment usage entry of a segment
+ * @sufile: segment usage file
+ * @segnum: segment number
+ *
+ * Looks up the usage entry of @segnum, marks its buffer and the sufile
+ * dirty and releases the entry again.  Returns 0 or the error from
+ * nilfs_sufile_get_segment_usage().
+ */
+static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
+{
+	struct nilfs_segment_usage *su;
+	struct buffer_head *su_bh;
+	int ret;
+
+	ret = nilfs_sufile_get_segment_usage(sufile, segnum, &su, &su_bh);
+	if (likely(!ret)) {
+		nilfs_mdt_mark_buffer_dirty(su_bh);
+		nilfs_mdt_mark_dirty(sufile);
+		nilfs_sufile_put_segment_usage(sufile, segnum, su_bh);
+	}
+	return ret;
+}
+
+/*
+ * nilfs_segctor_begin_construction - set up the first segment buffer
+ * @sci: segment constructor
+ * @nilfs: nilfs object
+ *
+ * Reuses the first segment buffer on sc_segbufs, or allocates one when the
+ * list is empty, and maps it to the current write position.  If fewer than
+ * NILFS_PSEG_MIN_BLOCKS blocks remain there, writing moves to the start of
+ * the next segment.  Also determines the segment that follows and frees
+ * every segment buffer beyond the first.  Returns 0, -ENOMEM, or the error
+ * from nilfs_touch_segusage() / nilfs_sufile_alloc().
+ */
+static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
+					    struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf, *n;
+	__u64 nextnum;
+	int err;
+
+	if (list_empty(&sci->sc_segbufs)) {
+		segbuf = nilfs_segbuf_new(sci->sc_super);
+		if (unlikely(!segbuf))
+			return -ENOMEM;
+		list_add(&segbuf->sb_list, &sci->sc_segbufs);
+	} else
+		segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+
+	nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
+			 nilfs);
+
+	/* Too little room left in this segment: move on to the next one */
+	if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+		nilfs_shift_to_next_segment(nilfs);
+		nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+	}
+	sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
+
+	err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
+	if (unlikely(err))
+		return err;
+
+	if (nilfs->ns_segnum == nilfs->ns_nextnum) {
+		/* Start from the head of a new full segment */
+		err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
+		if (unlikely(err))
+			return err;
+	} else
+		nextnum = nilfs->ns_nextnum;
+
+	segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
+	nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
+
+	/* truncating segment buffers */
+	list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
+					  sb_list) {
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_free(segbuf);
+	}
+	return 0;
+}
+
+/*
+ * nilfs_segctor_extend_segments - append segment buffers for more segments
+ * @sci: segment constructor
+ * @nilfs: nilfs object
+ * @nadd: number of segment buffers to add
+ *
+ * Each new buffer is mapped to the "next" segment of its predecessor and
+ * gets a freshly allocated next segment of its own.  The buffers are built
+ * on a private list and spliced onto sc_segbufs only when all @nadd were
+ * set up.  On failure the segments allocated for the buffers on the
+ * private list are freed again and the buffers are released.  Note that
+ * sc_segbuf_nblocks is increased per buffer and is not rolled back in the
+ * failure path.  Returns 0, -ENOMEM, or the error from
+ * nilfs_touch_segusage() / nilfs_sufile_alloc().
+ */
+static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
+					 struct the_nilfs *nilfs, int nadd)
+{
+	struct nilfs_segment_buffer *segbuf, *prev, *n;
+	struct inode *sufile = nilfs->ns_sufile;
+	__u64 nextnextnum;
+	LIST_HEAD(list);
+	int err, ret, i;
+
+	prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+	/*
+	 * Since the segment specified with nextnum might be allocated during
+	 * the previous construction, the buffer including its segusage may
+	 * not be dirty.  The following call ensures that the buffer is dirty
+	 * and will pin the buffer on memory until the sufile is written.
+	 */
+	err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
+	if (unlikely(err))
+		return err;
+
+	for (i = 0; i < nadd; i++) {
+		/* extend segment info */
+		err = -ENOMEM;
+		segbuf = nilfs_segbuf_new(sci->sc_super);
+		if (unlikely(!segbuf))
+			goto failed;
+
+		/* map this buffer to region of segment on-disk */
+		nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
+		sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
+
+		/* allocate the next next full segment */
+		err = nilfs_sufile_alloc(sufile, &nextnextnum);
+		if (unlikely(err))
+			goto failed_segbuf;
+
+		segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
+		nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
+
+		list_add_tail(&segbuf->sb_list, &list);
+		prev = segbuf;
+	}
+	/* Append the new buffers behind the current last segment buffer */
+	list_splice(&list, sci->sc_segbufs.prev);
+	return 0;
+
+ failed_segbuf:
+	/* This buffer is not on the private list and owns no allocated segment */
+	nilfs_segbuf_free(segbuf);
+ failed:
+	list_for_each_entry_safe(segbuf, n, &list, sb_list) {
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_free(segbuf);
+	}
+	return err;
+}
+
+/*
+ * nilfs_segctor_free_incomplete_segments - undo segment allocation on error
+ * @sci: segment constructor
+ * @nilfs: nilfs object
+ *
+ * Frees the "next" segment reserved by each segment buffer (for the first
+ * buffer only if it differs from nilfs->ns_nextnum) and reacts to the
+ * first buffer that reported an I/O error (sb_io_error); later failures
+ * are ignored once one has been handled.
+ */
+static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
+						   struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int ret, done = 0;
+
+	segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
+		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+	}
+	if (segbuf->sb_io_error) {
+		/* Case 1: The first segment failed */
+		if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
+			/* Case 1a:  Partial segment appended into an existing
+			   segment */
+			nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
+						segbuf->sb_fseg_end);
+		else /* Case 1b:  New full segment */
+			set_nilfs_discontinued(nilfs);
+		done++;
+	}
+
+	/* Remaining (extended) segment buffers, starting after the first one */
+	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
+		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+		if (!done && segbuf->sb_io_error) {
+			if (segbuf->sb_segnum != nilfs->ns_nextnum)
+				/* Case 2: extended segment (!= next) failed */
+				nilfs_sufile_set_error(nilfs->ns_sufile,
+						       segbuf->sb_segnum);
+			done++;
+		}
+	}
+}
+
+/*
+ * nilfs_segctor_clear_segment_buffers - clear every segment buffer
+ * @sci: segment constructor
+ *
+ * Calls nilfs_segbuf_clear() on each buffer of sc_segbufs and forgets the
+ * super root buffer pointer.
+ */
+static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
+{
+	struct list_head *bufs = &sci->sc_segbufs;
+	struct nilfs_segment_buffer *sb;
+
+	list_for_each_entry(sb, bufs, sb_list) {
+		nilfs_segbuf_clear(sb);
+	}
+
+	sci->sc_super_root = NULL;
+}
+
+/*
+ * nilfs_segctor_destroy_segment_buffers - free all segment buffers
+ * @sci: segment constructor
+ *
+ * Unlinks and frees the buffers of sc_segbufs one by one, front first.
+ * sci->sc_curseg is not reset here.
+ */
+static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *first;
+
+	for (;;) {
+		if (list_empty(&sci->sc_segbufs))
+			break;
+
+		first = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+		list_del_init(&first->sb_list);
+		nilfs_segbuf_free(first);
+	}
+}
+
+/*
+ * Finish one construction pass.  When @err is non-zero the reserved
+ * segments are released and the pending segment frees are cancelled;
+ * the segment buffers are cleared in every case.
+ */
+static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
+					   struct the_nilfs *nilfs, int err)
+{
+	struct inode *sufile = nilfs->ns_sufile;
+
+	if (unlikely(err != 0)) {
+		nilfs_segctor_free_incomplete_segments(sci, nilfs);
+		nilfs_segctor_cancel_free_segments(sci, sufile);
+	}
+
+	nilfs_segctor_clear_segment_buffers(sci);
+}
+
+/*
+ * Store the modification time (sci->sc_seg_ctime) and the block count of
+ * every segment buffer on sci->sc_segbufs into its segment usage entry.
+ * The block count is the size of this log plus the blocks that precede it
+ * inside the full segment.
+ */
+static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
+					  struct inode *sufile)
+{
+	struct nilfs_segment_buffer *sb;
+
+	list_for_each_entry(sb, &sci->sc_segbufs, sb_list) {
+		struct nilfs_segment_usage *su;
+		struct buffer_head *su_bh;
+		unsigned long nlive;
+		int rc;
+
+		rc = nilfs_sufile_get_segment_usage(sufile, sb->sb_segnum,
+						    &su, &su_bh);
+		/* not expected to fail: the usage block is already dirty */
+		WARN_ON(rc);
+
+		nlive = sb->sb_sum.nblocks +
+			(sb->sb_pseg_start - sb->sb_fseg_start);
+		su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
+		su->su_nblocks = cpu_to_le32(nlive);
+
+		nilfs_sufile_put_segment_usage(sufile, sb->sb_segnum, su_bh);
+	}
+}
+
+/*
+ * Revert the block counts written by nilfs_segctor_update_segusage().
+ * The first segment keeps only the blocks located before this log; every
+ * following segment gets a block count of zero.
+ */
+static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
+					  struct inode *sufile)
+{
+	struct nilfs_segment_buffer *sb = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	struct nilfs_segment_usage *su;
+	struct buffer_head *su_bh;
+	int rc;
+
+	/* First segment buffer */
+	rc = nilfs_sufile_get_segment_usage(sufile, sb->sb_segnum,
+					    &su, &su_bh);
+	/* not expected to fail: the usage block is already dirty */
+	WARN_ON(rc);
+	su->su_nblocks = cpu_to_le32(sb->sb_pseg_start - sb->sb_fseg_start);
+	nilfs_sufile_put_segment_usage(sufile, sb->sb_segnum, su_bh);
+
+	/* All segment buffers after the first one */
+	list_for_each_entry_continue(sb, &sci->sc_segbufs, sb_list) {
+		rc = nilfs_sufile_get_segment_usage(sufile, sb->sb_segnum,
+						    &su, &su_bh);
+		WARN_ON(rc); /* not expected to fail */
+		su->su_nblocks = 0;
+		nilfs_sufile_put_segment_usage(sufile, sb->sb_segnum, su_bh);
+	}
+}
+
+/*
+ * Drop every segment buffer that follows @last on sci->sc_segbufs: unlink
+ * it, subtract its sb_rest_blocks from sci->sc_segbuf_nblocks, free the
+ * segment it reserved as "next" in @sufile and free the buffer itself.
+ */
+static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
+					    struct nilfs_segment_buffer *last,
+					    struct inode *sufile)
+{
+	struct nilfs_segment_buffer *sb = last, *tmp;
+
+	list_for_each_entry_safe_continue(sb, tmp, &sci->sc_segbufs, sb_list) {
+		int rc;
+
+		list_del_init(&sb->sb_list);
+		sci->sc_segbuf_nblocks -= sb->sb_rest_blocks;
+
+		rc = nilfs_sufile_free(sufile, sb->sb_nextnum);
+		WARN_ON(rc);
+
+		nilfs_segbuf_free(sb);
+	}
+}
+
+
+/*
+ * nilfs_segctor_collect - collect blocks into the prepared segment buffers
+ * @sci: segment constructor
+ * @nilfs: nilfs object (its ns_sufile is used for segment bookkeeping)
+ * @mode: construction mode, passed on to nilfs_segctor_collect_blocks()
+ *
+ * Runs nilfs_segctor_collect_blocks().  If it reports -E2BIG while @mode is
+ * SC_LSEG_SR and the stage has reached NILFS_ST_CPFILE or later, more
+ * segment buffers are added and the collection restarts from the stage
+ * saved on entry.  After the loop, the segment buffers following
+ * sci->sc_curseg are released.
+ *
+ * Return: 0 on success, otherwise the error returned by the failing helper.
+ */
+static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
+				 struct the_nilfs *nilfs, int mode)
+{
+	struct nilfs_cstage prev_stage = sci->sc_stage;
+	int err, nadd = 1; /* nadd: number of segments to add on a retry */
+
+	/* Collection retry loop */
+	for (;;) {
+		sci->sc_super_root = NULL;
+		sci->sc_nblk_this_inc = 0;
+		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+
+		err = nilfs_segctor_reset_segment_buffer(sci);
+		if (unlikely(err))
+			goto failed;
+
+		err = nilfs_segctor_collect_blocks(sci, mode);
+		sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
+		if (!err)
+			break;
+
+		if (unlikely(err != -E2BIG))
+			goto failed;
+
+		/* The current segment is filled up */
+		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+			break;
+
+		/* Undo this attempt before retrying with more segments */
+		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		nilfs_segctor_clear_segment_buffers(sci);
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
+		/* Double the increment each time, capped at SC_MAX_SEGDELTA */
+		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
+		sci->sc_stage = prev_stage;
+	}
+	nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
+	return 0;
+
+ failed:
+	return err;
+}
+
+/*
+ * Put @new_bh at the position @old_bh occupies on its b_assoc_buffers list
+ * and reinitialize the list head of @old_bh.  @new_bh must not be linked on
+ * any list.  Releasing @old_bh is left to the caller.
+ */
+static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
+				      struct buffer_head *new_bh)
+{
+	struct list_head *from = &old_bh->b_assoc_buffers;
+	struct list_head *to = &new_bh->b_assoc_buffers;
+
+	BUG_ON(!list_empty(to));
+	list_replace_init(from, to);
+}
+
+/*
+ * nilfs_segctor_update_payload_blocknr - assign disk block numbers to the
+ * payload buffers of one segment buffer
+ * @sci: segment constructor
+ * @segbuf: segment buffer whose payload is processed
+ * @mode: construction mode; SC_LSEG_DSYNC selects nilfs_sc_dsync_ops
+ *
+ * Walks the payload buffers in list order, starting at the block that
+ * follows the summary blocks of the log, calls nilfs_bmap_assign() for each
+ * buffer and writes the resulting binfo into the segment summary.  The walk
+ * stops at the super root buffer or when all finfo entries are consumed.
+ *
+ * Return: 0 on success, otherwise the value produced by
+ * nilfs_handle_bmap_error() for the failing nilfs_bmap_assign() call.
+ */
+static int
+nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
+				     struct nilfs_segment_buffer *segbuf,
+				     int mode)
+{
+	struct inode *inode = NULL;
+	sector_t blocknr;
+	unsigned long nfinfo = segbuf->sb_sum.nfinfo;
+	unsigned long nblocks = 0, ndatablk = 0;
+	struct nilfs_sc_operations *sc_op = NULL;
+	struct nilfs_segsum_pointer ssp;
+	struct nilfs_finfo *finfo = NULL;
+	union nilfs_binfo binfo;
+	struct buffer_head *bh, *bh_org;
+	ino_t ino = 0;
+	int err = 0;
+
+	if (!nfinfo)
+		goto out;
+
+	blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
+	ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
+	ssp.offset = sizeof(struct nilfs_segment_summary);
+
+	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
+		if (bh == sci->sc_super_root)
+			break;
+		if (!finfo) {
+			/* Start of a new file: load its finfo counters */
+			finfo =	nilfs_segctor_map_segsum_entry(
+				sci, &ssp, sizeof(*finfo));
+			ino = le64_to_cpu(finfo->fi_ino);
+			nblocks = le32_to_cpu(finfo->fi_nblocks);
+			ndatablk = le32_to_cpu(finfo->fi_ndatablk);
+
+			if (buffer_nilfs_node(bh))
+				inode = NILFS_BTNC_I(bh->b_page->mapping);
+			else
+				inode = NILFS_AS_I(bh->b_page->mapping);
+
+			if (mode == SC_LSEG_DSYNC)
+				sc_op = &nilfs_sc_dsync_ops;
+			else if (ino == NILFS_DAT_INO)
+				sc_op = &nilfs_sc_dat_ops;
+			else /* file blocks */
+				sc_op = &nilfs_sc_file_ops;
+		}
+		/*
+		 * nilfs_bmap_assign() may hand back a different buffer in bh;
+		 * hold the original so it can be swapped on the list and
+		 * released afterwards.
+		 */
+		bh_org = bh;
+		get_bh(bh_org);
+		err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
+					&binfo);
+		if (bh != bh_org)
+			nilfs_list_replace_buffer(bh_org, bh);
+		brelse(bh_org);
+		if (unlikely(err))
+			goto failed_bmap;
+
+		/* While data blocks remain they come first, then node blocks */
+		if (ndatablk > 0)
+			sc_op->write_data_binfo(sci, &ssp, &binfo);
+		else
+			sc_op->write_node_binfo(sci, &ssp, &binfo);
+
+		blocknr++;
+		if (--nblocks == 0) {
+			finfo = NULL;
+			if (--nfinfo == 0)
+				break;
+		} else if (ndatablk > 0)
+			ndatablk--;
+	}
+ out:
+	return 0;
+
+ failed_bmap:
+	err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
+	return err;
+}
+
+/*
+ * Assign disk block numbers to the payload of every segment buffer on
+ * sci->sc_segbufs and fill in each segment summary.  Stops at the first
+ * failure and returns its error code; returns 0 otherwise.
+ */
+static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_segment_buffer *sb;
+	int err = 0;
+
+	list_for_each_entry(sb, &sci->sc_segbufs, sb_list) {
+		err = nilfs_segctor_update_payload_blocknr(sci, sb, mode);
+		if (unlikely(err))
+			break;
+
+		nilfs_segbuf_fill_in_segsum(sb);
+	}
+
+	return err;
+}
+
+/*
+ * nilfs_copy_replace_page_buffers - substitute copies for the buffers of a
+ * page that are queued for writing
+ * @page: page whose buffers are copied
+ * @out: list that receives the original buffer heads
+ *
+ * Allocates a private clone page.  For every buffer of @page that is linked
+ * on a b_assoc_buffers list, the data and block number are copied to the
+ * buffer at the same position in the clone, the clone buffer takes the
+ * original's place on that list, and the original is queued on @out.
+ * Finally the clone page is flagged as under writeback and unlocked.
+ *
+ * Return: 0 on success, -ENOMEM if the clone page cannot be allocated.
+ */
+static int
+nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
+{
+	struct page *clone_page;
+	struct buffer_head *bh, *head, *bh2;
+	void *kaddr;
+
+	bh = head = page_buffers(page);
+
+	clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
+	if (unlikely(!clone_page))
+		return -ENOMEM;
+
+	bh2 = page_buffers(clone_page);
+	kaddr = kmap_atomic(page, KM_USER0);
+	do {
+		/* Skip buffers that are not queued on any list */
+		if (list_empty(&bh->b_assoc_buffers))
+			continue;
+		get_bh(bh2);
+		page_cache_get(clone_page); /* for each bh */
+		memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
+		bh2->b_blocknr = bh->b_blocknr;
+		list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
+		list_add_tail(&bh->b_assoc_buffers, out);
+	} while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	/* Account the clone page as under writeback only on the 0->1 edge */
+	if (!TestSetPageWriteback(clone_page))
+		inc_zone_page_state(clone_page, NR_WRITEBACK);
+	unlock_page(clone_page);
+
+	return 0;
+}
+
+/*
+ * Tell whether @page has to be copied before it is written.  Pages without
+ * a mapping or host inode, and pages of directories, never are.  A mapped
+ * page always is (its Checked flag is cleared on the way); otherwise the
+ * Checked flag decides.
+ */
+static int nilfs_test_page_to_be_frozen(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (!mapping)
+		return 0;
+	if (!mapping->host)
+		return 0;
+	if (S_ISDIR(mapping->host->i_mode))
+		return 0;
+
+	if (!page_mapped(page))
+		return PageChecked(page);
+
+	ClearPageChecked(page);
+	return 1;
+}
+
+/*
+ * Move @page into the writeback state and, if it has to be frozen, replace
+ * its queued buffers by copies (the originals are collected on @out).
+ * Returns 0 on success or the error of nilfs_copy_replace_page_buffers().
+ */
+static int nilfs_begin_page_io(struct page *page, struct list_head *out)
+{
+	/*
+	 * This function may be called twice for split b-tree node pages.
+	 * The writeback test makes the second and later calls no-ops.
+	 */
+	if (!page || PageWriteback(page))
+		return 0;
+
+	lock_page(page);
+	clear_page_dirty_for_io(page);
+	set_page_writeback(page);
+	unlock_page(page);
+
+	if (!nilfs_test_page_to_be_frozen(page))
+		return 0;
+
+	return nilfs_copy_replace_page_buffers(page, out);
+}
+
+/*
+ * nilfs_segctor_prepare_write - put the pages of all segment buffers into
+ * the writeback state
+ * @sci: segment constructor
+ * @failed_page: set to the page for which nilfs_begin_page_io() failed,
+ *	or to NULL
+ *
+ * Pages holding segment summary buffers or the super root buffer are
+ * tracked in bd_page and switched to writeback directly.  Pages of the
+ * other payload buffers are tracked in fs_page and handed to
+ * nilfs_begin_page_io().  A page is processed when the walk moves on to a
+ * buffer of a different page; the last page of each kind is processed after
+ * the loops.
+ *
+ * Return: 0 on success or the error returned by nilfs_begin_page_io().
+ */
+static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
+				       struct page **failed_page)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+	struct list_head *list = &sci->sc_copied_buffers;
+	int err;
+
+	*failed_page = NULL;
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			if (bh->b_page != bd_page) {
+				if (bd_page) {
+					lock_page(bd_page);
+					clear_page_dirty_for_io(bd_page);
+					set_page_writeback(bd_page);
+					unlock_page(bd_page);
+				}
+				bd_page = bh->b_page;
+			}
+		}
+
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			if (bh == sci->sc_super_root) {
+				/* The super root ends the payload walk */
+				if (bh->b_page != bd_page) {
+					lock_page(bd_page);
+					clear_page_dirty_for_io(bd_page);
+					set_page_writeback(bd_page);
+					unlock_page(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				/* fs_page is NULL on the first pass; the
+				   callee ignores NULL pages */
+				err = nilfs_begin_page_io(fs_page, list);
+				if (unlikely(err)) {
+					*failed_page = fs_page;
+					goto out;
+				}
+				fs_page = bh->b_page;
+			}
+		}
+	}
+	/* Handle the last page of each kind */
+	if (bd_page) {
+		lock_page(bd_page);
+		clear_page_dirty_for_io(bd_page);
+		set_page_writeback(bd_page);
+		unlock_page(bd_page);
+	}
+	err = nilfs_begin_page_io(fs_page, list);
+	if (unlikely(err))
+		*failed_page = fs_page;
+ out:
+	return err;
+}
+
+/*
+ * nilfs_segctor_write - write out every segment buffer and wait for it
+ * @sci: segment constructor whose sc_segbufs list is written
+ * @bdi: backing device info handed to the segment buffer writer
+ *
+ * Each segment buffer is submitted and then waited for before the next one
+ * is started.  The wait is performed regardless of the submission result.
+ *
+ * Return: 0 on success; otherwise the error code of nilfs_segbuf_write()
+ * or, if the submission succeeded, the error code of nilfs_segbuf_wait().
+ */
+static int nilfs_segctor_write(struct nilfs_sc_info *sci,
+			       struct backing_dev_info *bdi)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct nilfs_write_info wi;
+	int err, res;
+
+	wi.sb = sci->sc_super;
+	wi.bh_sr = sci->sc_super_root;
+	wi.bdi = bdi;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		nilfs_segbuf_prepare_write(segbuf, &wi);
+		err = nilfs_segbuf_write(segbuf, &wi);
+
+		res = nilfs_segbuf_wait(segbuf, &wi);
+		/*
+		 * Prefer the submission error over the wait result.  The
+		 * operand must not be wrapped in unlikely(): that macro
+		 * normalizes its argument to 0 or 1, so
+		 * "unlikely(err) ? : res" would return 1 instead of the
+		 * error code.
+		 */
+		if (!err)
+			err = res;
+		if (unlikely(err))
+			return err;
+	}
+	return 0;
+}
+
+/*
+ * Return 1 if @page owns at least one buffer that is dirty and still linked
+ * on a b_assoc_buffers list, 0 otherwise.
+ */
+static int nilfs_page_has_uncleared_buffer(struct page *page)
+{
+	struct buffer_head *first = page_buffers(page);
+	struct buffer_head *cur = first;
+
+	do {
+		if (buffer_dirty(cur) && !list_empty(&cur->b_assoc_buffers))
+			return 1;
+	} while ((cur = cur->b_this_page) != first);
+
+	return 0;
+}
+
+/*
+ * Finish I/O on @page.  On failure the page is redirtied and flagged with
+ * an error; on success the error flag is cleared and the page is redirtied
+ * only if some of its buffers are not clean.  Writeback is then ended,
+ * using manual accounting for pages whose buffers are marked as
+ * nilfs-allocated.
+ */
+static void __nilfs_end_page_io(struct page *page, int err)
+{
+	if (err) {
+		__set_page_dirty_nobuffers(page);
+		SetPageError(page);
+	} else {
+		if (!nilfs_page_buffers_clean(page))
+			__set_page_dirty_nobuffers(page);
+		ClearPageError(page);
+	}
+
+	if (!buffer_nilfs_allocated(page_buffers(page))) {
+		end_page_writeback(page);
+		return;
+	}
+
+	if (TestClearPageWriteback(page))
+		dec_zone_page_state(page, NR_WRITEBACK);
+}
+
+/*
+ * Finish I/O on @page unless it is NULL or is a b-tree node page that
+ * still has buffers waiting to be cleaned up.
+ */
+static void nilfs_end_page_io(struct page *page, int err)
+{
+	int pending;
+
+	if (!page)
+		return;
+
+	/*
+	 * B-tree node pages might be split in a segment, so this function
+	 * may be called two or more times for them.  Completion is deferred
+	 * until cleanup has been done for all buffers of the page.
+	 */
+	pending = buffer_nilfs_node(page_buffers(page)) &&
+		  nilfs_page_has_uncleared_buffer(page);
+	if (!pending)
+		__nilfs_end_page_io(page, err);
+}
+
+/*
+ * nilfs_clear_copied_buffers - release the buffers queued on @list
+ * @list: list of buffer heads linked through b_assoc_buffers
+ * @err: result of the write; 0 means success
+ *
+ * Works page by page: takes the page of the first queued buffer, unlinks
+ * every buffer of that page that is on a list, updates the buffer state
+ * when @err is 0, drops one reference per unlinked buffer and then finishes
+ * I/O on the page.  Repeats until @list is empty.
+ */
+static void nilfs_clear_copied_buffers(struct list_head *list, int err)
+{
+	struct buffer_head *bh, *head;
+	struct page *page;
+
+	while (!list_empty(list)) {
+		bh = list_entry(list->next, struct buffer_head,
+				b_assoc_buffers);
+		page = bh->b_page;
+		/* Hold the page while its buffers are being released */
+		page_cache_get(page);
+		head = bh = page_buffers(page);
+		do {
+			if (!list_empty(&bh->b_assoc_buffers)) {
+				list_del_init(&bh->b_assoc_buffers);
+				if (!err) {
+					set_buffer_uptodate(bh);
+					clear_buffer_dirty(bh);
+					clear_buffer_nilfs_volatile(bh);
+				}
+				brelse(bh); /* for b_assoc_buffers */
+			}
+		} while ((bh = bh->b_this_page) != head);
+
+		__nilfs_end_page_io(page, err);
+		page_cache_release(page);
+	}
+}
+
+/*
+ * nilfs_segctor_abort_write - end writeback on the pages of a failed write
+ * @sci: segment constructor
+ * @failed_page: page at which nilfs_segctor_prepare_write() failed, or NULL
+ * @err: error code passed on to the page completion helpers
+ *
+ * Walks the buffers in the same order as nilfs_segctor_prepare_write().
+ * Writeback is ended on the summary/super root pages (bd_page) and
+ * nilfs_end_page_io() is called for the other payload pages (fs_page).
+ * The walk stops right after @failed_page has been processed.  The copied
+ * buffers are released in every case.
+ */
+static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
+				      struct page *failed_page, int err)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			if (bh->b_page != bd_page) {
+				if (bd_page)
+					end_page_writeback(bd_page);
+				bd_page = bh->b_page;
+			}
+		}
+
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			if (bh == sci->sc_super_root) {
+				if (bh->b_page != bd_page) {
+					end_page_writeback(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				nilfs_end_page_io(fs_page, err);
+				/* Skips the final bd_page/fs_page handling */
+				if (unlikely(fs_page == failed_page))
+					goto done;
+				fs_page = bh->b_page;
+			}
+		}
+	}
+	if (bd_page)
+		end_page_writeback(bd_page);
+
+	nilfs_end_page_io(fs_page, err);
+ done:
+	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
+}
+
+/*
+ * Copy the write position described by @segbuf into @nilfs: current and
+ * next segment numbers, the offset just past this log inside the full
+ * segment, the segment sequence number and the creation time.
+ */
+static void nilfs_set_next_segment(struct the_nilfs *nilfs,
+				   struct nilfs_segment_buffer *segbuf)
+{
+	/* Segment numbers */
+	nilfs->ns_segnum = segbuf->sb_segnum;
+	nilfs->ns_nextnum = segbuf->sb_nextnum;
+
+	/* Offset of the next log, relative to the start of the segment */
+	nilfs->ns_pseg_offset =
+		segbuf->sb_pseg_start - segbuf->sb_fseg_start +
+		segbuf->sb_sum.nblocks;
+
+	/* Values taken from the summary information of this log */
+	nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
+	nilfs->ns_ctime = segbuf->sb_sum.ctime;
+}
+
+/*
+ * nilfs_segctor_complete_write - post-processing of a successful write
+ * @sci: segment constructor
+ *
+ * Marks all written buffers up to date and clean, ends writeback on their
+ * pages, releases the copied buffers and the collected inodes, and advances
+ * the write position kept in the nilfs object.  If a super root was part
+ * of this write, the last-segment information and the checkpoint number
+ * are updated as well and the super block is flagged dirty.
+ */
+static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	/* Sampled now: sc_super_root is compared against buffers below */
+	int update_sr = (sci->sc_super_root != NULL);
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			set_buffer_uptodate(bh);
+			clear_buffer_dirty(bh);
+			if (bh->b_page != bd_page) {
+				if (bd_page)
+					end_page_writeback(bd_page);
+				bd_page = bh->b_page;
+			}
+		}
+		/*
+		 * We assume that the buffers which belong to the same page
+		 * continue over the buffer list.
+		 * Under this assumption, the last BHs of pages is
+		 * identifiable by the discontinuity of bh->b_page
+		 * (page != fs_page).
+		 *
+		 * For B-tree node blocks, however, this assumption is not
+		 * guaranteed.  The cleanup code of B-tree node pages needs
+		 * special care.
+		 */
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			set_buffer_uptodate(bh);
+			clear_buffer_dirty(bh);
+			clear_buffer_nilfs_volatile(bh);
+			if (bh == sci->sc_super_root) {
+				if (bh->b_page != bd_page) {
+					end_page_writeback(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				nilfs_end_page_io(fs_page, 0);
+				fs_page = bh->b_page;
+			}
+		}
+
+		/* Track whether a multi-part logical segment is still open */
+		if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
+			if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
+				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
+				sci->sc_lseg_stime = jiffies;
+			}
+			if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
+				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
+		}
+	}
+	/*
+	 * Since pages may continue over multiple segment buffers,
+	 * end of the last page must be checked outside of the loop.
+	 */
+	if (bd_page)
+		end_page_writeback(bd_page);
+
+	nilfs_end_page_io(fs_page, 0);
+
+	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
+
+	nilfs_drop_collected_inodes(&sci->sc_dirty_files);
+
+	if (nilfs_doing_gc()) {
+		nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
+		if (update_sr)
+			nilfs_commit_gcdat_inode(nilfs);
+	} else
+		nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
+
+	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
+
+	/* The last segment buffer defines where the next log goes */
+	segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+	nilfs_set_next_segment(nilfs, segbuf);
+
+	if (update_sr) {
+		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
+				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
+		sbi->s_super->s_dirt = 1;
+
+		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
+		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
+		set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+	} else
+		clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+}
+
+/*
+ * nilfs_segctor_check_in_files - take over the dirty inodes of a mount
+ * @sci: segment constructor receiving the inodes (sc_dirty_files)
+ * @sbi: nilfs super block info owning s_dirty_files and s_inode_lock
+ *
+ * Moves every inode from sbi->s_dirty_files to sci->sc_dirty_files, marking
+ * it busy and stamping it with the current checkpoint number.  An inode
+ * without an attached inode block buffer gets one first; s_inode_lock is
+ * dropped around that lookup and the list scan is restarted afterwards.
+ *
+ * Return: 0 on success or the error of nilfs_ifile_get_inode_block().
+ * s_inode_lock is not held on return in either case.
+ */
+static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
+					struct nilfs_sb_info *sbi)
+{
+	struct nilfs_inode_info *ii, *n;
+	__u64 cno = sbi->s_nilfs->ns_cno;
+
+	spin_lock(&sbi->s_inode_lock);
+ retry:
+	list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
+		if (!ii->i_bh) {
+			struct buffer_head *ibh;
+			int err;
+
+			spin_unlock(&sbi->s_inode_lock);
+			err = nilfs_ifile_get_inode_block(
+				sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
+			if (unlikely(err)) {
+				nilfs_warning(sbi->s_super, __func__,
+					      "failed to get inode block.\n");
+				return err;
+			}
+			nilfs_mdt_mark_buffer_dirty(ibh);
+			nilfs_mdt_mark_dirty(sbi->s_ifile);
+			spin_lock(&sbi->s_inode_lock);
+			/* i_bh may have been set while the lock was dropped */
+			if (likely(!ii->i_bh))
+				ii->i_bh = ibh;
+			else
+				brelse(ibh);
+			goto retry;
+		}
+		ii->i_cno = cno;
+
+		clear_bit(NILFS_I_QUEUED, &ii->i_state);
+		set_bit(NILFS_I_BUSY, &ii->i_state);
+		list_del(&ii->i_dirty);
+		list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
+	}
+	spin_unlock(&sbi->s_inode_lock);
+
+	NILFS_I(sbi->s_ifile)->i_cno = cno;
+
+	return 0;
+}
+
+/*
+ * Walk sci->sc_dirty_files after a construction.  Inodes that were written
+ * (NILFS_I_UPDATED set) and have not been dirtied again lose their busy
+ * state and inode block buffer and are moved to the garbage list of the
+ * current transaction.  All other inodes stay queued and only get their
+ * checkpoint number refreshed.
+ */
+static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
+					  struct nilfs_sb_info *sbi)
+{
+	struct nilfs_transaction_info *ti = current->journal_info;
+	struct nilfs_inode_info *ii, *next;
+	__u64 cno = sbi->s_nilfs->ns_cno;
+
+	spin_lock(&sbi->s_inode_lock);
+	list_for_each_entry_safe(ii, next, &sci->sc_dirty_files, i_dirty) {
+		int written;
+
+		written = test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state);
+		if (written && !test_bit(NILFS_I_DIRTY, &ii->i_state)) {
+			clear_bit(NILFS_I_BUSY, &ii->i_state);
+			brelse(ii->i_bh);
+			ii->i_bh = NULL;
+			list_del(&ii->i_dirty);
+			list_add_tail(&ii->i_dirty, &ti->ti_garbage);
+			continue;
+		}
+
+		/*
+		 * The current checkpoint number (=nilfs->ns_cno) changes
+		 * between check-in and check-out only if the super root is
+		 * written out, so i_cno can be updated for the inodes that
+		 * remain in the dirty list.
+		 */
+		ii->i_cno = cno;
+	}
+	spin_unlock(&sbi->s_inode_lock);
+}
+
+/*
+ * Main procedure of segment constructor
+ *
+ * nilfs_segctor_do_construct - build and write logs until the stage
+ * counter reaches NILFS_ST_DONE
+ * @sci: segment constructor
+ * @mode: construction mode, passed on to the collect and assign steps
+ *
+ * One pass of the loop collects blocks, assigns block numbers, fills in
+ * the metadata (checkpoint and super root when one is included), updates
+ * segment usage, writes the segment buffers and completes the write.
+ * Nothing is constructed if nilfs_segctor_clean() reports a clean state.
+ * The segment buffers are destroyed and the dirty files are checked out on
+ * every exit path.
+ *
+ * Return: 0 on success or the error code of the step that failed.
+ */
+static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct page *failed_page;
+	int err, has_sr = 0;
+
+	sci->sc_stage.scnt = NILFS_ST_INIT;
+
+	err = nilfs_segctor_check_in_files(sci, sbi);
+	if (unlikely(err))
+		goto out;
+
+	if (nilfs_test_metadata_dirty(sbi))
+		set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
+
+	if (nilfs_segctor_clean(sci))
+		goto out;
+
+	do {
+		sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
+
+		err = nilfs_segctor_begin_construction(sci, nilfs);
+		if (unlikely(err))
+			goto out;
+
+		/* Update time stamp */
+		sci->sc_seg_ctime = get_seconds();
+
+		err = nilfs_segctor_collect(sci, nilfs, mode);
+		if (unlikely(err))
+			goto failed;
+
+		has_sr = (sci->sc_super_root != NULL);
+
+		/* Avoid empty segment */
+		if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+		    NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
+			/* Non-zero argument requests the rollback path;
+			   err itself is still 0 here */
+			nilfs_segctor_end_construction(sci, nilfs, 1);
+			goto out;
+		}
+
+		err = nilfs_segctor_assign(sci, mode);
+		if (unlikely(err))
+			goto failed;
+
+		if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
+			nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
+
+		if (has_sr) {
+			err = nilfs_segctor_fill_in_checkpoint(sci);
+			if (unlikely(err))
+				goto failed_to_make_up;
+
+			nilfs_segctor_fill_in_super_root(sci, nilfs);
+		}
+		nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
+
+		/* Write partial segments */
+		err = nilfs_segctor_prepare_write(sci, &failed_page);
+		if (unlikely(err))
+			goto failed_to_write;
+
+		nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
+
+		err = nilfs_segctor_write(sci, nilfs->ns_bdi);
+		if (unlikely(err))
+			goto failed_to_write;
+
+		nilfs_segctor_complete_write(sci);
+
+		/* Commit segments */
+		if (has_sr) {
+			nilfs_segctor_commit_free_segments(sci);
+			nilfs_segctor_clear_metadata_dirty(sci);
+		}
+
+		nilfs_segctor_end_construction(sci, nilfs, 0);
+
+	} while (sci->sc_stage.scnt != NILFS_ST_DONE);
+
+ out:
+	nilfs_segctor_destroy_segment_buffers(sci);
+	nilfs_segctor_check_out_files(sci, sbi);
+	return err;
+
+	/* Error unwinding: each label falls through to the next one */
+ failed_to_write:
+	nilfs_segctor_abort_write(sci, failed_page, err);
+	nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
+
+ failed_to_make_up:
+	if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
+		nilfs_redirty_inodes(&sci->sc_dirty_files);
+
+ failed:
+	if (nilfs_doing_gc())
+		nilfs_redirty_inodes(&sci->sc_gc_inodes);
+	nilfs_segctor_end_construction(sci, nilfs, err);
+	goto out;
+}
+
+/**
+ * nilfs_segctor_start_timer - set timer of background write
+ * @sci: nilfs_sc_info
+ *
+ * Arms sci->sc_timer to expire sc_interval jiffies from now and sets the
+ * NILFS_SEGCTOR_COMMIT state.  The request is ignored if there is no timer
+ * or if the timer has already been set.
+ * This function MUST be called within a section locking the segment
+ * semaphore.
+ */
+static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
+{
+	spin_lock(&sci->sc_state_lock);
+
+	if (!sci->sc_timer || (sci->sc_state & NILFS_SEGCTOR_COMMIT))
+		goto out_unlock;
+
+	sci->sc_timer->expires = jiffies + sci->sc_interval;
+	add_timer(sci->sc_timer);
+	sci->sc_state |= NILFS_SEGCTOR_COMMIT;
+
+ out_unlock:
+	spin_unlock(&sci->sc_state_lock);
+}
+
+/*
+ * nilfs_segctor_do_flush - post a flush request to the segment constructor
+ * @sci: segment constructor
+ * @bn: number of the bit to set in sci->sc_flush_request
+ *
+ * Sets bit @bn in the flush request word if it is not set yet, and wakes up
+ * the waiters on sc_wait_daemon when the request word was empty before.
+ */
+static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
+{
+	/*
+	 * Build the mask in the width of the request word.  "1 << bn" is a
+	 * signed int shift, which is undefined for bn == 31 and would be
+	 * sign-extended when combined with an unsigned long.
+	 */
+	unsigned long req_bit = 1UL << bn;
+
+	spin_lock(&sci->sc_state_lock);
+	if (!(sci->sc_flush_request & req_bit)) {
+		unsigned long prev_req = sci->sc_flush_request;
+
+		sci->sc_flush_request |= req_bit;
+		if (!prev_req)
+			wake_up(&sci->sc_wait_daemon);
+	}
+	spin_unlock(&sci->sc_state_lock);
+}
+
+/**
+ * nilfs_flush_segment - trigger a segment construction for resource control
+ * @sb: super block
+ * @ino: inode number of the file to be flushed out.
+ *
+ * Does nothing if the mount has no segment constructor or if the caller is
+ * itself in the middle of a construction.
+ */
+void nilfs_flush_segment(struct super_block *sb, ino_t ino)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	int bn;
+
+	if (!sci || nilfs_doing_construction())
+		return;
+
+	/* Metadata files use their inode number; bit 0 is for data files */
+	bn = NILFS_MDT_INODE(sb, ino) ? ino : 0;
+	nilfs_segctor_do_flush(sci, bn);
+}
+
+/*
+ * nilfs_segctor_add_segments_to_be_freed - queue segments for cleaning
+ * @sci: segment constructor
+ * @segnum: array of segment numbers
+ * @nsegs: number of elements in @segnum
+ *
+ * Allocates a segment entry per segment number, opens and closes it once
+ * against the segment usage file (warning about segments that are not
+ * marked dirty), and appends all entries to sci->sc_cleaning_segments.
+ * Nothing is appended if any step fails.
+ *
+ * Return: 0 on success, -ENOMEM if an entry cannot be allocated, or the
+ * error of nilfs_open_segment_entry().
+ */
+int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
+					   __u64 *segnum, size_t nsegs)
+{
+	struct inode *sufile = sci->sc_sbi->s_nilfs->ns_sufile;
+	struct nilfs_segment_entry *ent;
+	LIST_HEAD(pending);
+	size_t idx;
+	int err;
+
+	for (idx = 0; idx < nsegs; idx++) {
+		ent = nilfs_alloc_segment_entry(segnum[idx]);
+		if (unlikely(!ent)) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		/* Queue first so that the failure path disposes of it */
+		list_add_tail(&ent->list, &pending);
+
+		err = nilfs_open_segment_entry(ent, sufile);
+		if (unlikely(err))
+			goto failed;
+
+		if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
+			printk(KERN_WARNING "NILFS: unused segment is "
+			       "requested to be cleaned (segnum=%llu)\n",
+			       (unsigned long long)ent->segnum);
+		nilfs_close_segment_entry(ent, sufile);
+	}
+
+	list_splice(&pending, sci->sc_cleaning_segments.prev);
+	return 0;
+
+ failed:
+	nilfs_dispose_segment_list(&pending);
+	return err;
+}
+
+/*
+ * nilfs_segctor_clear_segments_to_be_freed - drop the cleaning request list
+ * @sci: segment constructor
+ *
+ * Disposes of every entry queued on sci->sc_cleaning_segments.
+ */
+void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
+{
+	struct list_head *cleaning = &sci->sc_cleaning_segments;
+
+	nilfs_dispose_segment_list(cleaning);
+}
+
+/*
+ * Request of a task waiting in nilfs_segctor_sync() for the completion of
+ * a segment construction.
+ */
+struct nilfs_segctor_wait_request {
+	wait_queue_t	wq;	/* entry on sci->sc_wait_request */
+	__u32		seq;	/* sequence number taken from sc_seq_request */
+	int		err;	/* result stored by nilfs_segctor_wakeup() */
+	atomic_t	done;	/* set to 1 once the request is completed */
+};
+
+/*
+ * nilfs_segctor_sync - request a segment construction and wait for it
+ * @sci: segment constructor
+ *
+ * Registers a wait request carrying a new sequence number, wakes up the
+ * waiters on sc_wait_daemon and sleeps until nilfs_segctor_wakeup() marks
+ * the request as done or a signal becomes pending.
+ *
+ * Return: the error code stored in the request by the waker (0 on
+ * success), or -ERESTARTSYS if a signal interrupted the wait.
+ */
+static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segctor_wait_request wait_req;
+	int err = 0;
+
+	init_wait(&wait_req.wq);
+	wait_req.err = 0;
+	atomic_set(&wait_req.done, 0);
+	init_waitqueue_entry(&wait_req.wq, current);
+
+	/*
+	 * Publish the new sequence number and queue the wait entry within
+	 * one sc_state_lock section.  nilfs_segctor_notify() updates
+	 * sc_seq_done and scans the wait queue under the same lock, so it
+	 * cannot complete this sequence number before the entry is queued
+	 * and leave the caller sleeping without a pending notification.
+	 */
+	spin_lock(&sci->sc_state_lock);
+	wait_req.seq = ++sci->sc_seq_request;
+	add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
+	spin_unlock(&sci->sc_state_lock);
+
+	wake_up(&sci->sc_wait_daemon);
+
+	for (;;) {
+		/*
+		 * Re-arm the sleeping state on every pass.  schedule()
+		 * returns in TASK_RUNNING; without this, a wakeup that did
+		 * not complete the request would turn the loop into a busy
+		 * wait.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (atomic_read(&wait_req.done)) {
+			err = wait_req.err;
+			break;
+		}
+		if (!signal_pending(current)) {
+			schedule();
+			continue;
+		}
+		err = -ERESTARTSYS;
+		break;
+	}
+	finish_wait(&sci->sc_wait_request, &wait_req.wq);
+	return err;
+}
+
+/*
+ * Complete the wait requests on sci->sc_wait_request whose sequence number
+ * is covered by sci->sc_seq_done: store @err in them, mark them done and
+ * call their wake function.  Requests that were already done are woken
+ * again; requests that are not covered yet are left alone.
+ */
+static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
+{
+	struct nilfs_segctor_wait_request *wrq, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
+	list_for_each_entry_safe(wrq, next, &sci->sc_wait_request.task_list,
+				 wq.task_list) {
+		if (!atomic_read(&wrq->done)) {
+			if (!nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq))
+				continue;
+
+			wrq->err = err;
+			atomic_set(&wrq->done, 1);
+		}
+
+		wrq->wq.func(&wrq->wq,
+			     TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+			     0, NULL);
+	}
+	spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
+}
+
+/**
+ * nilfs_construct_segment - construct a logical segment
+ * @sb: super block
+ *
+ * Must not be called from inside a nilfs transaction.
+ *
+ * Return Value: On success, 0 is returned. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-EROFS - Read only filesystem.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_construct_segment(struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	struct nilfs_transaction_info *ti;
+
+	if (!sci)
+		return -EROFS;
+
+	/* A call inside transactions causes a deadlock. */
+	ti = current->journal_info;
+	BUG_ON(ti && ti->ti_magic == NILFS_TI_MAGIC);
+
+	return nilfs_segctor_sync(sci);
+}
+
+/**
+ * nilfs_construct_dsync_segment - construct a data-only logical segment
+ * @sb: super block
+ * @inode: inode whose data blocks should be written out
+ * @start: start byte offset
+ * @end: end byte offset (inclusive)
+ *
+ * Falls back to a full synchronous construction when the inode itself is
+ * dirty, the STRICT_ORDER mount option is set, a logical segment is still
+ * unclosed, or the log is marked discontinued.  Returns immediately when
+ * the inode is neither queued nor busy.
+ *
+ * Return Value: On success, 0 is returned. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-EROFS - Read only filesystem.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
+				  loff_t start, loff_t end)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	struct nilfs_inode_info *ii;
+	struct nilfs_transaction_info ti;
+	int has_work;
+	int err = 0;
+
+	if (!sci)
+		return -EROFS;
+
+	nilfs_transaction_lock(sbi, &ti, 0);
+
+	ii = NILFS_I(inode);
+	if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
+	    nilfs_test_opt(sbi, STRICT_ORDER) ||
+	    test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
+	    nilfs_discontinued(sbi->s_nilfs)) {
+		/* A data-only log is not sufficient; do a full sync */
+		nilfs_transaction_unlock(sbi);
+		return nilfs_segctor_sync(sci);
+	}
+
+	spin_lock(&sbi->s_inode_lock);
+	has_work = test_bit(NILFS_I_QUEUED, &ii->i_state) ||
+		   test_bit(NILFS_I_BUSY, &ii->i_state);
+	spin_unlock(&sbi->s_inode_lock);
+
+	if (has_work) {
+		sci->sc_dsync_inode = ii;
+		sci->sc_dsync_start = start;
+		sci->sc_dsync_end = end;
+
+		err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
+	}
+
+	nilfs_transaction_unlock(sbi);
+	return err;
+}
+
+/*
+ * Parameters and results of one construction request, handed through
+ * nilfs_segctor_accept(), nilfs_segctor_construct() and
+ * nilfs_segctor_notify().
+ */
+struct nilfs_segctor_req {
+	int mode;		/* construction mode (SC_LSEG_SR, SC_FLUSH_*) */
+	__u32 seq_accepted;	/* sc_seq_request sampled at accept time */
+	int sc_err;  /* construction failure */
+	int sb_err;  /* super block writeback failure */
+};
+
+/* Bits of sci->sc_flush_request that select a partial flush mode */
+#define FLUSH_FILE_BIT	(0x1) /* data file only */
+#define FLUSH_DAT_BIT	(1 << NILFS_DAT_INO) /* DAT only */
+
+/*
+ * Start serving a construction request: reset the result fields of @req,
+ * remember the latest requested sequence number and stop the background
+ * write timer if there is one.
+ */
+static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
+				 struct nilfs_segctor_req *req)
+{
+	req->sc_err = 0;
+	req->sb_err = 0;
+
+	spin_lock(&sci->sc_state_lock);
+	req->seq_accepted = sci->sc_seq_request;
+	spin_unlock(&sci->sc_state_lock);
+
+	if (!sci->sc_timer)
+		return;
+
+	del_timer_sync(sci->sc_timer);
+}
+
+/*
+ * Report the outcome of a construction request.  The COMMIT state is
+ * cleared in every case.  For SC_LSEG_SR the accepted sequence number
+ * becomes the completed one, the waiters are woken with the construction
+ * error (or, if there is none, the super block error) and all flush
+ * requests are dropped.  For the flush modes only the matching flush bit
+ * is cleared.
+ */
+static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
+				 struct nilfs_segctor_req *req)
+{
+	spin_lock(&sci->sc_state_lock);
+
+	/* Requests are cleared even when the construction failed */
+	sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
+
+	switch (req->mode) {
+	case SC_LSEG_SR:
+		sci->sc_seq_done = req->seq_accepted;
+		nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
+		sci->sc_flush_request = 0;
+		break;
+	case SC_FLUSH_FILE:
+		sci->sc_flush_request &= ~FLUSH_FILE_BIT;
+		break;
+	case SC_FLUSH_DAT:
+		sci->sc_flush_request &= ~FLUSH_DAT_BIT;
+		break;
+	default:
+		break;
+	}
+
+	spin_unlock(&sci->sc_state_lock);
+}
+
+/*
+ * Serve one construction request.  The mode is forced to SC_LSEG_SR while
+ * the log is marked discontinued.  A construction is run unless
+ * nilfs_segctor_confirm() returns non-zero; its result is stored in
+ * req->sc_err.  On success the dirty block counter is reset (except for
+ * SC_FLUSH_DAT) and, if a super root was written while the log is
+ * discontinued, the super block is committed with the result stored in
+ * req->sb_err.
+ *
+ * Returns the construction error, or 0.
+ */
+static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
+				   struct nilfs_segctor_req *req)
+{
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int err = 0;
+
+	if (nilfs_discontinued(nilfs))
+		req->mode = SC_LSEG_SR;
+
+	if (!nilfs_segctor_confirm(sci)) {
+		err = nilfs_segctor_do_construct(sci, req->mode);
+		req->sc_err = err;
+	}
+	if (unlikely(err))
+		return err;
+
+	if (req->mode != SC_FLUSH_DAT)
+		atomic_set(&nilfs->ns_ndirtyblks, 0);
+
+	if (!test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) ||
+	    !nilfs_discontinued(nilfs))
+		return 0;
+
+	down_write(&nilfs->ns_sem);
+	req->sb_err = nilfs_commit_super(sbi, 0);
+	up_write(&nilfs->ns_sem);
+
+	return 0;
+}
+
+/*
+ * Timer callback: @data carries the task_struct pointer of the task to
+ * wake up.
+ */
+static void nilfs_construction_timeout(unsigned long data)
+{
+	wake_up_process((struct task_struct *)data);
+}
+
+/*
+ * Remove from @head every GC inode that has NILFS_I_UPDATED set: unhash
+ * it, unlink it from the list and clear it with nilfs_clear_gcinode().
+ * Inodes without that bit stay on the list.
+ */
+static void
+nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
+{
+	struct nilfs_inode_info *ii, *next;
+
+	list_for_each_entry_safe(ii, next, head, i_dirty) {
+		if (test_bit(NILFS_I_UPDATED, &ii->i_state)) {
+			hlist_del_init(&ii->vfs_inode.i_hash);
+			list_del_init(&ii->i_dirty);
+			nilfs_clear_gcinode(&ii->vfs_inode);
+		}
+	}
+}
+
+/*
+ * nilfs_clean_segments - write out the blocks selected for cleaning
+ * @sb: super block
+ * @argp: user-space argument forwarded to
+ *	nilfs_ioctl_prepare_clean_segments()
+ *
+ * Prepares the GC DAT inode and the cleaning request, hands the GC inodes
+ * over to the segment constructor and repeats a full (SC_LSEG_SR)
+ * construction until it succeeds, sleeping sc_interval between attempts.
+ *
+ * Return: 0 on success, -EROFS without a segment constructor, or the error
+ * of one of the preparation steps.
+ */
+int nilfs_clean_segments(struct super_block *sb, void __user *argp)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_sc_info *sci = NILFS_SC(sbi);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_transaction_info ti;
+	struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
+	int err;
+
+	if (unlikely(!sci))
+		return -EROFS;
+
+	nilfs_transaction_lock(sbi, &ti, 1);
+
+	err = nilfs_init_gcdat_inode(nilfs);
+	if (unlikely(err))
+		goto out_unlock;
+
+	err = nilfs_ioctl_prepare_clean_segments(nilfs, argp);
+	if (unlikely(err))
+		goto out_unlock;
+
+	list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
+
+	do {
+		nilfs_segctor_accept(sci, &req);
+		err = nilfs_segctor_construct(sci, &req);
+		nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
+		nilfs_segctor_notify(sci, &req);
+
+		if (unlikely(err)) {
+			/* Report, wait one interval and try again */
+			nilfs_warning(sb, __func__,
+				      "segment construction failed. (err=%d)",
+				      err);
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(sci->sc_interval);
+		}
+	} while (unlikely(err));
+
+ out_unlock:
+	nilfs_clear_gcdat_inode(nilfs);
+	nilfs_transaction_unlock(sbi);
+	return err;
+}
+
+/*
+ * Run one construction in @mode on behalf of the segment constructor
+ * thread, inside a transaction lock section, and re-arm the background
+ * timer if the logical segment was left unclosed.
+ */
+static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_segctor_req req = { .mode = mode };
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct nilfs_transaction_info ti;
+	int unclosed;
+
+	nilfs_transaction_lock(sbi, &ti, 0);
+
+	nilfs_segctor_accept(sci, &req);
+	nilfs_segctor_construct(sci, &req);
+	nilfs_segctor_notify(sci, &req);
+
+	/*
+	 * An unclosed segment has to be retried.  The retry is driven by
+	 * sc_timer: its expiry invokes a complete construction, which
+	 * closes the current logical segment.
+	 */
+	unclosed = test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
+	if (unclosed)
+		nilfs_segctor_start_timer(sci);
+
+	nilfs_transaction_unlock(sbi);
+}
+
+/*
+ * Run a flush construction right away: SC_FLUSH_DAT if the DAT flush bit
+ * is requested, SC_FLUSH_FILE otherwise.  The matching request bit is
+ * cleared afterwards whatever the construction returned, and the
+ * NILFS_SC_PRIOR_FLUSH flag is cleared in every case.
+ */
+static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
+{
+	int mode;
+	int err;
+
+	spin_lock(&sci->sc_state_lock);
+	if (sci->sc_flush_request & FLUSH_DAT_BIT)
+		mode = SC_FLUSH_DAT;
+	else
+		mode = SC_FLUSH_FILE;
+	spin_unlock(&sci->sc_state_lock);
+
+	if (mode) {
+		/* The result is stored but not used any further */
+		err = nilfs_segctor_do_construct(sci, mode);
+
+		spin_lock(&sci->sc_state_lock);
+		if (mode == SC_FLUSH_FILE)
+			sci->sc_flush_request &= ~FLUSH_FILE_BIT;
+		else
+			sci->sc_flush_request &= ~FLUSH_DAT_BIT;
+		spin_unlock(&sci->sc_state_lock);
+	}
+
+	clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
+}
+
+static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
+{
+	if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
+	    time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
+		if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
+			return SC_FLUSH_FILE;
+		else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
+			return SC_FLUSH_DAT;
+	}
+	return SC_LSEG_SR;
+}
+
+/**
+ * nilfs_segctor_thread - main loop of the segment constructor thread.
+ * @arg: pointer to a struct nilfs_sc_info.
+ *
+ * nilfs_segctor_thread() sets up an on-stack timer and loops as a daemon
+ * executing segment constructions until NILFS_SEGCTOR_QUIT is set; returns 0.
+ */
+static int nilfs_segctor_thread(void *arg)
+{
+	struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
+	struct timer_list timer;
+	int timeout = 0;
+
+	init_timer(&timer);
+	timer.data = (unsigned long)current;
+	timer.function = nilfs_construction_timeout;
+	sci->sc_timer = &timer;
+
+	/* start sync. */
+	sci->sc_task = current;
+	wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
+	printk(KERN_INFO
+	       "segctord starting. Construction interval = %lu seconds, "
+	       "CP frequency < %lu seconds\n",
+	       sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+
+	spin_lock(&sci->sc_state_lock);
+ loop:
+	for (;;) {
+		int mode;
+
+		if (sci->sc_state & NILFS_SEGCTOR_QUIT)
+			goto end_thread;
+
+		if (timeout || sci->sc_seq_request != sci->sc_seq_done)
+			mode = SC_LSEG_SR;
+		else if (!sci->sc_flush_request)
+			break;
+		else
+			mode = nilfs_segctor_flush_mode(sci);
+
+		spin_unlock(&sci->sc_state_lock);
+		nilfs_segctor_thread_construct(sci, mode);
+		spin_lock(&sci->sc_state_lock);
+		timeout = 0;
+	}
+
+	/* Nothing left to construct: freeze if asked to, otherwise wait. */
+	if (freezing(current)) {
+		spin_unlock(&sci->sc_state_lock);
+		refrigerator();
+		spin_lock(&sci->sc_state_lock);
+	} else {
+		DEFINE_WAIT(wait);
+		int should_sleep = 1;
+
+		prepare_to_wait(&sci->sc_wait_daemon, &wait,
+				TASK_INTERRUPTIBLE);
+
+		if (sci->sc_seq_request != sci->sc_seq_done)
+			should_sleep = 0;
+		else if (sci->sc_flush_request)
+			should_sleep = 0;
+		else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
+			should_sleep = time_before(jiffies,
+						   sci->sc_timer->expires);
+
+		if (should_sleep) {
+			spin_unlock(&sci->sc_state_lock);
+			schedule();
+			spin_lock(&sci->sc_state_lock);
+		}
+		finish_wait(&sci->sc_wait_daemon, &wait);
+		timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
+			   time_after_eq(jiffies, sci->sc_timer->expires));
+	}
+	goto loop;
+
+ end_thread:
+	spin_unlock(&sci->sc_state_lock);
+	del_timer_sync(sci->sc_timer);
+	sci->sc_timer = NULL;
+
+	/* end sync. */
+	sci->sc_task = NULL;
+	wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
+	return 0;
+}
+
+static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
+{
+	struct task_struct *t;
+
+	t = kthread_run(nilfs_segctor_thread, sci, "segctord");
+	if (IS_ERR(t)) {
+		int err = PTR_ERR(t);
+
+		printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
+		       err);
+		return err;
+	}
+	wait_event(sci->sc_wait_task, sci->sc_task != NULL);
+	return 0;
+}
+
+static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
+{
+	sci->sc_state |= NILFS_SEGCTOR_QUIT;
+
+	while (sci->sc_task) {
+		wake_up(&sci->sc_wait_daemon);
+		spin_unlock(&sci->sc_state_lock);
+		wait_event(sci->sc_wait_task, sci->sc_task == NULL);
+		spin_lock(&sci->sc_state_lock);
+	}
+}
+
+static int nilfs_segctor_init(struct nilfs_sc_info *sci)
+{
+	sci->sc_seq_done = sci->sc_seq_request;
+
+	return nilfs_segctor_start_thread(sci);
+}
+
+/*
+ * Setup & clean-up functions
+ */
+static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
+{
+	struct nilfs_sc_info *sci;
+
+	sci = kzalloc(sizeof(*sci), GFP_KERNEL);
+	if (!sci)
+		return NULL;
+
+	sci->sc_sbi = sbi;
+	sci->sc_super = sbi->s_super;
+
+	init_waitqueue_head(&sci->sc_wait_request);
+	init_waitqueue_head(&sci->sc_wait_daemon);
+	init_waitqueue_head(&sci->sc_wait_task);
+	spin_lock_init(&sci->sc_state_lock);
+	INIT_LIST_HEAD(&sci->sc_dirty_files);
+	INIT_LIST_HEAD(&sci->sc_segbufs);
+	INIT_LIST_HEAD(&sci->sc_gc_inodes);
+	INIT_LIST_HEAD(&sci->sc_cleaning_segments);
+	INIT_LIST_HEAD(&sci->sc_copied_buffers);
+
+	sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
+	sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
+	sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
+
+	if (sbi->s_interval)
+		sci->sc_interval = sbi->s_interval;
+	if (sbi->s_watermark)
+		sci->sc_watermark = sbi->s_watermark;
+	return sci;
+}
+
+static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
+{
+	int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
+
+	/* The segctord thread was stopped and its timer was removed.
+	   But some tasks remain. */
+	do {
+		struct nilfs_sb_info *sbi = sci->sc_sbi;
+		struct nilfs_transaction_info ti;
+		struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
+
+		nilfs_transaction_lock(sbi, &ti, 0);
+		nilfs_segctor_accept(sci, &req);
+		ret = nilfs_segctor_construct(sci, &req);
+		nilfs_segctor_notify(sci, &req);
+		nilfs_transaction_unlock(sbi);
+
+	} while (ret && retrycount-- > 0);
+}
+
+/**
+ * nilfs_segctor_destroy - destroy the segment constructor.
+ * @sci: nilfs_sc_info
+ *
+ * nilfs_segctor_destroy() kills the segctord thread, runs a final write-out
+ * if work is still pending, and frees the nilfs_sc_info struct.  Caller must
+ * hold ns_segctor_sem for writing; it is dropped here and re-taken on exit.
+ */
+static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
+{
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	int flag;
+
+	up_write(&sbi->s_nilfs->ns_segctor_sem);
+
+	spin_lock(&sci->sc_state_lock);
+	nilfs_segctor_kill_thread(sci);
+	flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
+		|| sci->sc_seq_request != sci->sc_seq_done);
+	spin_unlock(&sci->sc_state_lock);
+
+	if (flag || nilfs_segctor_confirm(sci))
+		nilfs_segctor_write_out(sci);
+
+	WARN_ON(!list_empty(&sci->sc_copied_buffers));
+
+	if (!list_empty(&sci->sc_dirty_files)) {
+		nilfs_warning(sbi->s_super, __func__,
+			      "dirty file(s) after the final construction\n");
+		nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
+	}
+
+	if (!list_empty(&sci->sc_cleaning_segments))
+		nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
+
+	WARN_ON(!list_empty(&sci->sc_segbufs));
+
+	down_write(&sbi->s_nilfs->ns_segctor_sem);
+
+	kfree(sci);
+}
+
+/**
+ * nilfs_attach_segment_constructor - attach a segment constructor
+ * @sbi: nilfs_sb_info
+ *
+ * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
+ * initializes it, and starts the segment constructor.
+ *
+ * Return Value: On success, 0 is returned. On error, the code below or the
+ * error returned by nilfs_segctor_init() is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int err;
+
+	/* Each field of nilfs_segctor is cleared through the initialization
+	   of super-block info */
+	sbi->s_sc_info = nilfs_segctor_new(sbi);
+	if (!sbi->s_sc_info)
+		return -ENOMEM;
+
+	nilfs_attach_writer(nilfs, sbi);
+	err = nilfs_segctor_init(NILFS_SC(sbi));
+	if (err) {
+		nilfs_detach_writer(nilfs, sbi);
+		kfree(sbi->s_sc_info);
+		sbi->s_sc_info = NULL;
+	}
+	return err;
+}
+
+/**
+ * nilfs_detach_segment_constructor - destroy the segment constructor
+ * @sbi: nilfs_sb_info
+ *
+ * nilfs_detach_segment_constructor() kills the segment constructor daemon,
+ * frees the struct nilfs_sc_info, and destroys the dirty file list.
+ */
+void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	LIST_HEAD(garbage_list);
+
+	down_write(&nilfs->ns_segctor_sem);
+	if (NILFS_SC(sbi)) {
+		nilfs_segctor_destroy(NILFS_SC(sbi));
+		sbi->s_sc_info = NULL;
+	}
+
+	/* Force to free the list of dirty files */
+	spin_lock(&sbi->s_inode_lock);
+	if (!list_empty(&sbi->s_dirty_files)) {
+		list_splice_init(&sbi->s_dirty_files, &garbage_list);
+		nilfs_warning(sbi->s_super, __func__,
+			      "Non empty dirty list after the last "
+			      "segment construction\n");
+	}
+	spin_unlock(&sbi->s_inode_lock);
+	up_write(&nilfs->ns_segctor_sem);
+
+	nilfs_dispose_list(sbi, &garbage_list, 1);
+	nilfs_detach_writer(nilfs, sbi);
+}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
new file mode 100644
index 00000000000..a98fc1ed0bb
--- /dev/null
+++ b/fs/nilfs2/segment.h
@@ -0,0 +1,243 @@
+/*
+ * segment.h - NILFS Segment constructor prototypes and definitions
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+#ifndef _NILFS_SEGMENT_H
+#define _NILFS_SEGMENT_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "sb.h"
+
+/**
+ * struct nilfs_recovery_info - Recovery information
+ * @ri_need_recovery: Recovery status
+ * @ri_super_root: Block number of the last super root
+ * @ri_cno: Number of the last checkpoint
+ * @ri_lsegs_start: Region for roll-forwarding (start block number)
+ * @ri_lsegs_end: Region for roll-forwarding (end block number)
+ * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
+ * @ri_used_segments: List of segments to be marked active
+ * @ri_pseg_start: Block number of the last partial segment
+ * @ri_seq: Sequence number on the last partial segment
+ * @ri_segnum: Segment number on the last partial segment
+ * @ri_nextnum: Next segment number on the last partial segment
+ */
+struct nilfs_recovery_info {
+	int			ri_need_recovery;
+	sector_t		ri_super_root;
+	__u64			ri_cno;
+
+	sector_t		ri_lsegs_start;
+	sector_t		ri_lsegs_end;
+	u64			ri_lsegs_start_seq;
+	struct list_head	ri_used_segments;
+	sector_t		ri_pseg_start;
+	u64			ri_seq;
+	__u64			ri_segnum;
+	__u64			ri_nextnum;
+};
+
+/* ri_need_recovery */
+#define NILFS_RECOVERY_SR_UPDATED	 1  /* The super root was updated */
+#define NILFS_RECOVERY_ROLLFORWARD_DONE	 2  /* Rollforward was carried out */
+
+/**
+ * struct nilfs_cstage - Context of collection stage
+ * @scnt: Stage count
+ * @flags: State flags
+ * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
+ * @gc_inode_ptr: Pointer on the list of gc-inodes
+ */
+struct nilfs_cstage {
+	int			scnt;
+	unsigned 		flags;
+	struct nilfs_inode_info *dirty_file_ptr;
+	struct nilfs_inode_info *gc_inode_ptr;
+};
+
+struct nilfs_segment_buffer;
+
+struct nilfs_segsum_pointer {
+	struct buffer_head     *bh;
+	unsigned		offset; /* offset in bytes */
+};
+
+/**
+ * struct nilfs_sc_info - Segment constructor information
+ * @sc_super: Back pointer to super_block struct
+ * @sc_sbi: Back pointer to nilfs_sb_info struct
+ * @sc_nblk_inc: Block count of current generation
+ * @sc_dirty_files: List of files to be written
+ * @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_cleaning_segments: List of segments to be freed through construction
+ * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
+ * @sc_dsync_inode: inode whose data pages are written for a sync operation
+ * @sc_dsync_start: start byte offset of data pages
+ * @sc_dsync_end: end byte offset of data pages (inclusive)
+ * @sc_segbufs: List of segment buffers
+ * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
+ * @sc_curseg: Current segment buffer
+ * @sc_super_root: Pointer to the super root buffer
+ * @sc_stage: Collection stage
+ * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
+ * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
+ * @sc_blk_cnt:	Block count of a file
+ * @sc_datablk_cnt: Data block count of a file
+ * @sc_nblk_this_inc: Number of blocks included in the current logical segment
+ * @sc_seg_ctime: Creation time
+ * @sc_flags: Internal flags
+ * @sc_state_lock: spinlock for sc_state and so on
+ * @sc_state: Segctord state flags
+ * @sc_flush_request: inode bitmap of metadata files to be flushed
+ * @sc_wait_request: Client request queue
+ * @sc_wait_daemon: Daemon wait queue
+ * @sc_wait_task: Start/end wait queue to control segctord task
+ * @sc_seq_request: Request counter
+ * @sc_seq_done: Completion counter
+ * @sc_sync: Request of explicit sync operation
+ * @sc_interval: Timeout value of background construction
+ * @sc_mjcp_freq: Frequency of creating checkpoints
+ * @sc_lseg_stime: Start time of the latest logical segment
+ * @sc_watermark: Watermark for the number of dirty buffers
+ * @sc_timer: Timer for segctord
+ * @sc_task: current thread of segctord
+ */
+struct nilfs_sc_info {
+	struct super_block     *sc_super;
+	struct nilfs_sb_info   *sc_sbi;
+
+	unsigned long		sc_nblk_inc;
+
+	struct list_head	sc_dirty_files;
+	struct list_head	sc_gc_inodes;
+	struct list_head	sc_cleaning_segments;
+	struct list_head	sc_copied_buffers;
+
+	struct nilfs_inode_info *sc_dsync_inode;
+	loff_t			sc_dsync_start;
+	loff_t			sc_dsync_end;
+
+	/* Segment buffers */
+	struct list_head	sc_segbufs;
+	unsigned long		sc_segbuf_nblocks;
+	struct nilfs_segment_buffer *sc_curseg;
+	struct buffer_head     *sc_super_root;
+
+	struct nilfs_cstage	sc_stage;
+
+	struct nilfs_segsum_pointer sc_finfo_ptr;
+	struct nilfs_segsum_pointer sc_binfo_ptr;
+	unsigned long		sc_blk_cnt;
+	unsigned long		sc_datablk_cnt;
+	unsigned long		sc_nblk_this_inc;
+	time_t			sc_seg_ctime;
+
+	unsigned long		sc_flags;
+
+	spinlock_t		sc_state_lock;
+	unsigned long		sc_state;
+	unsigned long		sc_flush_request;
+
+	wait_queue_head_t	sc_wait_request;
+	wait_queue_head_t	sc_wait_daemon;
+	wait_queue_head_t	sc_wait_task;
+
+	__u32			sc_seq_request;
+	__u32			sc_seq_done;
+
+	int			sc_sync;
+	unsigned long		sc_interval;
+	unsigned long		sc_mjcp_freq;
+	unsigned long		sc_lseg_stime;	/* in 1/HZ seconds */
+	unsigned long		sc_watermark;
+
+	struct timer_list      *sc_timer;
+	struct task_struct     *sc_task;
+};
+
+/* sc_flags */
+enum {
+	NILFS_SC_DIRTY,		/* One or more dirty meta-data blocks exist */
+	NILFS_SC_UNCLOSED,	/* Logical segment is not closed */
+	NILFS_SC_SUPER_ROOT,	/* The latest segment has a super root */
+	NILFS_SC_PRIOR_FLUSH,	/* Requesting immediate flush without making a
+				   checkpoint */
+	NILFS_SC_HAVE_DELTA,	/* Next checkpoint will have update of files
+				   other than DAT, cpfile, sufile, or files
+				   moved by GC */
+};
+
+/* sc_state */
+#define NILFS_SEGCTOR_QUIT	    0x0001  /* segctord is being destroyed */
+#define NILFS_SEGCTOR_COMMIT	    0x0004  /* committed transaction exists */
+
+/*
+ * Constant parameters
+ */
+#define NILFS_SC_CLEANUP_RETRY	    3  /* Retry count of construction when
+					  destroying segctord */
+
+/*
+ * Default values of timeout, in seconds.
+ */
+#define NILFS_SC_DEFAULT_TIMEOUT    5   /* Timeout value of dirty blocks.
+					   It triggers construction of a
+					   logical segment with a super root */
+#define NILFS_SC_DEFAULT_SR_FREQ    30  /* Maximum frequency of super root
+					   creation */
+
+/*
+ * The default threshold amount of data, in block counts.
+ */
+#define NILFS_SC_DEFAULT_WATERMARK  3600
+
+
+/* segment.c */
+extern int nilfs_init_transaction_cache(void);
+extern void nilfs_destroy_transaction_cache(void);
+extern void nilfs_relax_pressure_in_lock(struct super_block *);
+
+extern int nilfs_construct_segment(struct super_block *);
+extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
+					 loff_t, loff_t);
+extern void nilfs_flush_segment(struct super_block *, ino_t);
+extern int nilfs_clean_segments(struct super_block *, void __user *);
+
+extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
+						  __u64 *, size_t);
+extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
+
+extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
+extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
+
+/* recovery.c */
+extern int nilfs_read_super_root_block(struct super_block *, sector_t,
+				       struct buffer_head **, int);
+extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
+				   struct nilfs_recovery_info *);
+extern int nilfs_recover_logical_segments(struct the_nilfs *,
+					  struct nilfs_sb_info *,
+					  struct nilfs_recovery_info *);
+
+#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
new file mode 100644
index 00000000000..98e68677f04
--- /dev/null
+++ b/fs/nilfs2/sufile.c
@@ -0,0 +1,558 @@
+/*
+ * sufile.c - NILFS segment usage file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/errno.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+#include "sufile.h"
+
+
+static inline unsigned long
+nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
+{
+	return NILFS_MDT(sufile)->mi_entries_per_block;
+}
+
+static unsigned long
+nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
+{
+	__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+	do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
+	return (unsigned long)t;
+}
+
+static unsigned long
+nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
+{
+	__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
+	return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
+}
+
+static unsigned long
+nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
+				     __u64 max)
+{
+	return min_t(unsigned long,
+		     nilfs_sufile_segment_usages_per_block(sufile) -
+		     nilfs_sufile_get_offset(sufile, curr),
+		     max - curr + 1);
+}
+
+static inline struct nilfs_sufile_header *
+nilfs_sufile_block_get_header(const struct inode *sufile,
+			      struct buffer_head *bh,
+			      void *kaddr)
+{
+	return kaddr + bh_offset(bh);
+}
+
+static struct nilfs_segment_usage *
+nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
+				     struct buffer_head *bh, void *kaddr)
+{
+	return kaddr + bh_offset(bh) +
+		nilfs_sufile_get_offset(sufile, segnum) *
+		NILFS_MDT(sufile)->mi_entry_size;
+}
+
+static inline int nilfs_sufile_get_header_block(struct inode *sufile,
+						struct buffer_head **bhp)
+{
+	return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
+}
+
+static inline int
+nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
+				     int create, struct buffer_head **bhp)
+{
+	return nilfs_mdt_get_block(sufile,
+				   nilfs_sufile_get_blkoff(sufile, segnum),
+				   create, NULL, bhp);
+}
+
+static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
+				     u64 ncleanadd, u64 ndirtyadd)
+{
+	struct nilfs_sufile_header *header;
+	void *kaddr;
+
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = kaddr + bh_offset(header_bh);
+	le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
+	le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_mdt_mark_buffer_dirty(header_bh);
+}
+
+int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
+			void (*dofunc)(struct inode *, __u64,
+				       struct buffer_head *,
+				       struct buffer_head *))
+{
+	struct buffer_head *header_bh, *bh;
+	int ret;
+
+	if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
+		printk(KERN_WARNING "%s: invalid segment number: %llu\n",
+		       __func__, (unsigned long long)segnum);
+		return -EINVAL;
+	}
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh);
+	if (!ret) {
+		dofunc(sufile, segnum, header_bh, bh);
+		brelse(bh);
+	}
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_alloc - allocate a segment
+ * @sufile: inode of segment usage file
+ * @segnump: pointer to segment number
+ *
+ * Description: nilfs_sufile_alloc() allocates a clean segment.
+ *
+ * Return Value: On success, 0 is returned and the segment number of the
+ * allocated segment is stored in the place pointed by @segnump. On error, one
+ * of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - No clean segment left.
+ */
+int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
+{
+	struct buffer_head *header_bh, *su_bh;
+	struct nilfs_sufile_header *header;
+	struct nilfs_segment_usage *su;
+	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+	__u64 segnum, maxsegnum, last_alloc;
+	void *kaddr;
+	unsigned long nsegments, nsus, i, j;
+	int ret;
+
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+	/* The scan starts just after the segment allocated last time. */
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+	last_alloc = le64_to_cpu(header->sh_last_alloc);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nsegments = nilfs_sufile_get_nsegments(sufile);
+	segnum = last_alloc + 1;
+	maxsegnum = nsegments - 1;
+	for (i = 0; i < nsegments; i += nsus) {
+		if (segnum >= nsegments) {
+			/* wrap around */
+			segnum = 0;
+			maxsegnum = last_alloc;
+		}
+		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
+							   &su_bh);
+		if (ret < 0)
+			goto out_header;
+		kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+		su = nilfs_sufile_block_get_segment_usage(
+			sufile, segnum, su_bh, kaddr);
+
+		nsus = nilfs_sufile_segment_usages_in_block(
+			sufile, segnum, maxsegnum);
+		for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
+			if (!nilfs_segment_usage_clean(su))
+				continue;
+			/* found a clean segment */
+			nilfs_segment_usage_set_dirty(su);
+			kunmap_atomic(kaddr, KM_USER0);
+
+			kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+			header = nilfs_sufile_block_get_header(
+				sufile, header_bh, kaddr);
+			le64_add_cpu(&header->sh_ncleansegs, -1);
+			le64_add_cpu(&header->sh_ndirtysegs, 1);
+			header->sh_last_alloc = cpu_to_le64(segnum);
+			kunmap_atomic(kaddr, KM_USER0);
+
+			nilfs_mdt_mark_buffer_dirty(header_bh);
+			nilfs_mdt_mark_buffer_dirty(su_bh);
+			nilfs_mdt_mark_dirty(sufile);
+			brelse(su_bh);
+			*segnump = segnum;
+			goto out_header;
+		}
+
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(su_bh);
+	}
+
+	/* no segments left */
+	ret = -ENOSPC;
+
+ out_header:
+	brelse(header_bh);
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
+				 struct buffer_head *header_bh,
+				 struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+
+	kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (unlikely(!nilfs_segment_usage_clean(su))) {
+		printk(KERN_WARNING "%s: segment %llu must be clean\n",
+		       __func__, (unsigned long long)segnum);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+	}
+	nilfs_segment_usage_set_dirty(su);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_sufile_mod_counter(header_bh, -1, 1);
+	nilfs_mdt_mark_buffer_dirty(su_bh);
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
+			   struct buffer_head *header_bh,
+			   struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int clean, dirty;
+
+	kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
+	    su->su_nblocks == cpu_to_le32(0)) {
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+	}
+	clean = nilfs_segment_usage_clean(su);
+	dirty = nilfs_segment_usage_dirty(su);
+
+	/* make the segment garbage */
+	su->su_lastmod = cpu_to_le64(0);
+	su->su_nblocks = cpu_to_le32(0);
+	su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
+	nilfs_mdt_mark_buffer_dirty(su_bh);
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
+			  struct buffer_head *header_bh,
+			  struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int sudirty;
+
+	kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (nilfs_segment_usage_clean(su)) {
+		printk(KERN_WARNING "%s: segment %llu is already clean\n",
+		       __func__, (unsigned long long)segnum);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+	}
+	WARN_ON(nilfs_segment_usage_error(su));
+	WARN_ON(!nilfs_segment_usage_dirty(su));
+
+	sudirty = nilfs_segment_usage_dirty(su);
+	nilfs_segment_usage_set_clean(su);
+	kunmap_atomic(kaddr, KM_USER0);
+	nilfs_mdt_mark_buffer_dirty(su_bh);
+
+	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+/**
+ * nilfs_sufile_get_segment_usage - get a segment usage
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ * @sup: pointer to segment usage
+ * @bhp: pointer to buffer head
+ *
+ * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
+ * specified by @segnum.  Release it with nilfs_sufile_put_segment_usage().
+ *
+ * Return Value: On success, 0 is returned, and the segment usage and the
+ * buffer head of the buffer on which the segment usage is located are stored
+ * in the place pointed by @sup and @bhp, respectively. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid @segnum, or nilfs_segment_usage_error() is true for it.
+ */
+int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
+				   struct nilfs_segment_usage **sup,
+				   struct buffer_head **bhp)
+{
+	struct buffer_head *bh;
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int ret;
+
+	/* segnum is 0 origin */
+	if (segnum >= nilfs_sufile_get_nsegments(sufile))
+		return -EINVAL;
+	down_write(&NILFS_MDT(sufile)->mi_sem);
+	ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
+	if (ret < 0)
+		goto out_sem;
+	kaddr = kmap(bh->b_page); /* stays mapped on success */
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+	if (nilfs_segment_usage_error(su)) {
+		kunmap(bh->b_page);
+		brelse(bh);
+		ret = -EINVAL;
+		goto out_sem;
+	}
+
+	if (sup != NULL)
+		*sup = su;
+	*bhp = bh; /* unlike @sup, @bhp is stored unconditionally */
+
+ out_sem:
+	up_write(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_put_segment_usage - put a segment usage
+ * @sufile: inode of segment usage file
+ * @segnum: segment number (not used by this function)
+ * @bh: buffer head
+ *
+ * Description: nilfs_sufile_put_segment_usage() releases the segment usage
+ * specified by @segnum. @bh must be the buffer head which has been returned
+ * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
+ */
+void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
+				    struct buffer_head *bh)
+{
+	kunmap(bh->b_page);
+	brelse(bh);
+}
+
+/**
+ * nilfs_sufile_get_stat - get segment usage statistics
+ * @sufile: inode of segment usage file
+ * @sustat: pointer to a structure of segment usage statistics
+ *
+ * Description: nilfs_sufile_get_stat() returns information about segment
+ * usage.
+ *
+ * Return Value: On success, 0 is returned, and segment usage information is
+ * stored in the place pointed by @sustat. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
+{
+	struct buffer_head *header_bh;
+	struct nilfs_sufile_header *header;
+	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	void *kaddr;
+	int ret;
+
+	down_read(&NILFS_MDT(sufile)->mi_sem);
+
+	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+	if (ret < 0)
+		goto out_sem;
+
+	kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+	header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
+	sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
+	sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
+	sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
+	sustat->ss_ctime = nilfs->ns_ctime;
+	sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
+	spin_lock(&nilfs->ns_last_segment_lock);
+	sustat->ss_prot_seq = nilfs->ns_prot_seq;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+	kunmap_atomic(kaddr, KM_USER0);
+	brelse(header_bh);
+
+ out_sem:
+	up_read(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_sufile_get_ncleansegs - get the number of clean segments
+ * @sufile: inode of segment usage file
+ * @nsegsp: pointer to the number of clean segments
+ *
+ * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
+ * segments.
+ *
+ * Return Value: On success, 0 is returned and the number of clean segments is
+ * stored in the place pointed by @nsegsp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
+{
+	struct nilfs_sustat sustat;
+	int ret;
+
+	ret = nilfs_sufile_get_stat(sufile, &sustat);
+	if (ret == 0)
+		*nsegsp = sustat.ss_ncleansegs;
+	return ret;
+}
+
+void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
+			       struct buffer_head *header_bh,
+			       struct buffer_head *su_bh)
+{
+	struct nilfs_segment_usage *su;
+	void *kaddr;
+	int suclean;
+
+	kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
+	if (nilfs_segment_usage_error(su)) {
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+	}
+	suclean = nilfs_segment_usage_clean(su);
+	nilfs_segment_usage_set_error(su);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	if (suclean)
+		nilfs_sufile_mod_counter(header_bh, -1, 0);
+	nilfs_mdt_mark_buffer_dirty(su_bh);
+	nilfs_mdt_mark_dirty(sufile);
+}
+
+/**
+ * nilfs_sufile_get_suinfo - get segment usage information
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to start looking
+ * @si: array of suinfo
+ * @nsi: size of suinfo array
+ *
+ * Description: Fills @si with usage info of up to @nsi segments from @segnum.
+ *
+ * Return Value: On success, the number of entries stored in @si is returned.
+ * On error, one of the following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
+				struct nilfs_suinfo *si, size_t nsi)
+{
+	struct buffer_head *su_bh;
+	struct nilfs_segment_usage *su;
+	size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+	struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
+	void *kaddr;
+	unsigned long nsegs, segusages_per_block;
+	unsigned long i, j;
+	ssize_t n, ret;
+
+	down_read(&NILFS_MDT(sufile)->mi_sem);
+
+	segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+	nsegs = min_t(unsigned long,
+		      nilfs_sufile_get_nsegments(sufile) - segnum,
+		      nsi);
+	for (i = 0; i < nsegs; i += n, segnum += n) {
+		n = min_t(unsigned long,
+			  segusages_per_block -
+				  nilfs_sufile_get_offset(sufile, segnum),
+			  nsegs - i);
+		ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+							   &su_bh);
+		if (ret < 0) {
+			if (ret != -ENOENT)
+				goto out;
+			/* hole */
+			memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
+			continue;
+		}
+
+		kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+		su = nilfs_sufile_block_get_segment_usage(
+			sufile, segnum, su_bh, kaddr);
+		for (j = 0; j < n; j++, su = (void *)su + susz) {
+			si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
+			si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
+			si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
+				~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+			if (nilfs_segment_is_active(nilfs, segnum + j))
+				si[i + j].sui_flags |=
+					(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+		}
+		kunmap_atomic(kaddr, KM_USER0);
+		brelse(su_bh);
+	}
+	ret = nsegs;
+
+ out:
+	up_read(&NILFS_MDT(sufile)->mi_sem);
+	return ret;
+}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
new file mode 100644
index 00000000000..a2e2efd4ade
--- /dev/null
+++ b/fs/nilfs2/sufile.h
@@ -0,0 +1,125 @@
+/*
+ * sufile.h - NILFS segment usage file.
+ *
+ * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Koji Sato <koji@osrg.net>.
+ */
+
+#ifndef _NILFS_SUFILE_H
+#define _NILFS_SUFILE_H
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/nilfs2_fs.h>
+#include "mdt.h"
+
+#define NILFS_SUFILE_GFP	NILFS_MDT_GFP
+
+static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
+{
+	return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
+}
+
+int nilfs_sufile_alloc(struct inode *, __u64 *);
+int nilfs_sufile_get_segment_usage(struct inode *, __u64,
+				   struct nilfs_segment_usage **,
+				   struct buffer_head **);
+void nilfs_sufile_put_segment_usage(struct inode *, __u64,
+				    struct buffer_head *);
+int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
+int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
+ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
+				size_t);
+
+int nilfs_sufile_update(struct inode *, __u64, int,
+			void (*dofunc)(struct inode *, __u64,
+				       struct buffer_head *,
+				       struct buffer_head *));
+void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
+				 struct buffer_head *);
+void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *,
+			   struct buffer_head *);
+void nilfs_sufile_do_free(struct inode *, __u64, struct buffer_head *,
+			  struct buffer_head *);
+void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
+			       struct buffer_head *);
+
+/**
+ * nilfs_sufile_cancel_free - run nilfs_sufile_do_cancel_free() on a segment
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ *
+ * Description: calls nilfs_sufile_update() with nilfs_sufile_do_cancel_free().
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ */
+static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
+{
+	return nilfs_sufile_update(sufile, segnum, 0,
+				   nilfs_sufile_do_cancel_free);
+}
+
+/**
+ * nilfs_sufile_scrap - make a segment garbage
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to be made garbage
+ */
+static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum)
+{
+	return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap);
+}
+
+/**
+ * nilfs_sufile_free - free segment
+ * @sufile: inode of segment usage file
+ * @segnum: segment number to be freed
+ */
+static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
+{
+	return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free);
+}
+
+/**
+ * nilfs_sufile_set_error - mark a segment as erroneous
+ * @sufile: inode of segment usage file
+ * @segnum: segment number
+ *
+ * Description: nilfs_sufile_set_error() marks the segment specified by
+ * @segnum as erroneous. The error segment will never be used again.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid segment usage number.
+ */
+static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
+{
+	return nilfs_sufile_update(sufile, segnum, 0,
+				   nilfs_sufile_do_set_error);
+}
+
+#endif	/* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
new file mode 100644
index 00000000000..6989b03e97a
--- /dev/null
+++ b/fs/nilfs2/super.c
@@ -0,0 +1,1326 @@
+/*
+ * super.c - NILFS module and super block management.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ */
+/*
+ *  linux/fs/ext2/super.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/random.h>
+#include <linux/crc32.h>
+#include <linux/smp_lock.h>
+#include <linux/vfs.h>
+#include <linux/writeback.h>
+#include <linux/kobject.h>
+#include <linux/exportfs.h>
+#include "nilfs.h"
+#include "mdt.h"
+#include "alloc.h"
+#include "page.h"
+#include "cpfile.h"
+#include "ifile.h"
+#include "dat.h"
+#include "segment.h"
+#include "segbuf.h"
+
+MODULE_AUTHOR("NTT Corp.");
+MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
+		   "(NILFS)");
+MODULE_LICENSE("GPL");
+
+static int nilfs_remount(struct super_block *sb, int *flags, char *data);
+static int test_exclusive_mount(struct file_system_type *fs_type,
+				struct block_device *bdev, int flags);
+
+/**
+ * nilfs_error() - report failure condition on a filesystem
+ *
+ * nilfs_error() sets an ERROR_FS flag on the superblock as well as
+ * reporting an error message.  It should be called when NILFS detects
+ * incoherences or defects of meta data on disk.  As for sustainable
+ * errors such as a single-shot I/O error, nilfs_warning() or the printk()
+ * function should be used instead.
+ *
+ * The segment constructor must not call this function because it can
+ * kill itself.
+ */
+void nilfs_error(struct super_block *sb, const char *function,
+		 const char *fmt, ...)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	va_list args;
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		struct the_nilfs *nilfs = sbi->s_nilfs;
+
+		if (!nilfs_test_opt(sbi, ERRORS_CONT))
+			nilfs_detach_segment_constructor(sbi);
+
+		down_write(&nilfs->ns_sem);
+		if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
+			nilfs->ns_mount_state |= NILFS_ERROR_FS;
+			nilfs->ns_sbp[0]->s_state |=
+				cpu_to_le16(NILFS_ERROR_FS);
+			nilfs_commit_super(sbi, 1);
+		}
+		up_write(&nilfs->ns_sem);
+
+		if (nilfs_test_opt(sbi, ERRORS_RO)) {
+			printk(KERN_CRIT "Remounting filesystem read-only\n");
+			sb->s_flags |= MS_RDONLY;
+		}
+	}
+
+	if (nilfs_test_opt(sbi, ERRORS_PANIC))
+		panic("NILFS (device %s): panic forced after error\n",
+		      sb->s_id);
+}
+
+void nilfs_warning(struct super_block *sb, const char *function,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk(KERN_WARNING "NILFS warning (device %s): %s: ",
+	       sb->s_id, function);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+
+static struct kmem_cache *nilfs_inode_cachep;
+
+struct inode *nilfs_alloc_inode(struct super_block *sb)
+{
+	struct nilfs_inode_info *ii;
+
+	ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
+	if (!ii)
+		return NULL;
+	ii->i_bh = NULL;
+	ii->i_state = 0;
+	ii->vfs_inode.i_version = 1;
+	nilfs_btnode_cache_init(&ii->i_btnode_cache);
+	return &ii->vfs_inode;
+}
+
+void nilfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
+}
+
+static void init_once(void *obj)
+{
+	struct nilfs_inode_info *ii = obj;
+
+	INIT_LIST_HEAD(&ii->i_dirty);
+#ifdef CONFIG_NILFS_XATTR
+	init_rwsem(&ii->xattr_sem);
+#endif
+	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
+	inode_init_once(&ii->vfs_inode);
+}
+
+static int nilfs_init_inode_cache(void)
+{
+	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
+					       sizeof(struct nilfs_inode_info),
+					       0, SLAB_RECLAIM_ACCOUNT,
+					       init_once);
+
+	return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
+}
+
+static inline void nilfs_destroy_inode_cache(void)
+{
+	kmem_cache_destroy(nilfs_inode_cachep);
+}
+
+static void nilfs_clear_inode(struct inode *inode)
+{
+	struct nilfs_inode_info *ii = NILFS_I(inode);
+
+#ifdef CONFIG_NILFS_POSIX_ACL
+	if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
+		posix_acl_release(ii->i_acl);
+		ii->i_acl = NILFS_ACL_NOT_CACHED;
+	}
+	if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
+		posix_acl_release(ii->i_default_acl);
+		ii->i_default_acl = NILFS_ACL_NOT_CACHED;
+	}
+#endif
+	/*
+	 * Free resources allocated in nilfs_read_inode(), here.
+	 */
+	BUG_ON(!list_empty(&ii->i_dirty));
+	brelse(ii->i_bh);
+	ii->i_bh = NULL;
+
+	if (test_bit(NILFS_I_BMAP, &ii->i_state))
+		nilfs_bmap_clear(ii->i_bmap);
+
+	nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+}
+
+static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int err;
+	int barrier_done = 0;
+
+	if (nilfs_test_opt(sbi, BARRIER)) {
+		set_buffer_ordered(nilfs->ns_sbh[0]);
+		barrier_done = 1;
+	}
+ retry:
+	set_buffer_dirty(nilfs->ns_sbh[0]);
+	err = sync_dirty_buffer(nilfs->ns_sbh[0]);
+	if (err == -EOPNOTSUPP && barrier_done) {
+		nilfs_warning(sbi->s_super, __func__,
+			      "barrier-based sync failed. "
+			      "disabling barriers\n");
+		nilfs_clear_opt(sbi, BARRIER);
+		barrier_done = 0;
+		clear_buffer_ordered(nilfs->ns_sbh[0]);
+		goto retry;
+	}
+	if (unlikely(err)) {
+		printk(KERN_ERR
+		       "NILFS: unable to write superblock (err=%d)\n", err);
+		if (err == -EIO && nilfs->ns_sbh[1]) {
+			nilfs_fall_back_super_block(nilfs);
+			goto retry;
+		}
+	} else {
+		struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
+
+		/*
+		 * The latest segment becomes trailable from the position
+		 * written in superblock.
+		 */
+		clear_nilfs_discontinued(nilfs);
+
+		/* update GC protection for recent segments */
+		if (nilfs->ns_sbh[1]) {
+			sbp = NULL;
+			if (dupsb) {
+				set_buffer_dirty(nilfs->ns_sbh[1]);
+				if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
+					sbp = nilfs->ns_sbp[1];
+			}
+		}
+		if (sbp) {
+			spin_lock(&nilfs->ns_last_segment_lock);
+			nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
+			spin_unlock(&nilfs->ns_last_segment_lock);
+		}
+	}
+
+	return err;
+}
+
+/* Refresh ns_sbp[0] (log position, free blocks, wtime, CRC), mirror it to
+ * ns_sbp[1] if @dupsb, and write it out; returns 0 or a negative errno.
+ * Caller must hold nilfs->ns_sem.  s_magic is little-endian on disk. */
+int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	sector_t nfreeblocks;
+	time_t t;
+	int err;
+
+	if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+		if (!sbp[1] ||
+		    sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
+			printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
+			       sbi->s_super->s_id);
+			return -EIO;
+		}
+		nilfs_swap_super_block(nilfs);	/* only the spare is intact */
+	}
+	err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
+	if (unlikely(err)) {
+		printk(KERN_ERR "NILFS: failed to count free blocks\n");
+		return err;
+	}
+	spin_lock(&nilfs->ns_last_segment_lock);
+	sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
+	sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
+	sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
+	spin_unlock(&nilfs->ns_last_segment_lock);
+	t = get_seconds();
+	nilfs->ns_sbwtime[0] = t;
+	sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
+	sbp[0]->s_wtime = cpu_to_le64(t);
+	sbp[0]->s_sum = 0;	/* the CRC covers s_sum as zero */
+	sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
+				(unsigned char *)sbp[0], nilfs->ns_sbsize));
+	if (dupsb && sbp[1]) {
+		memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+		nilfs->ns_sbwtime[1] = t;
+	}
+	sbi->s_super->s_dirt = 0;
+	return nilfs_sync_super(sbi, dupsb);
+}
+
+static void nilfs_put_super(struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+
+	nilfs_detach_segment_constructor(sbi);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		down_write(&nilfs->ns_sem);
+		nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
+		nilfs_commit_super(sbi, 1);
+		up_write(&nilfs->ns_sem);
+	}
+
+	nilfs_detach_checkpoint(sbi);
+	put_nilfs(sbi->s_nilfs);
+	sbi->s_super = NULL;
+	sb->s_fs_info = NULL;
+	kfree(sbi);
+}
+
+/**
+ * nilfs_write_super - write super block(s) of NILFS
+ * @sb: super_block
+ *
+ * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
+ * clears s_dirt.  This function is called in the section protected by
+ * lock_super().
+ *
+ * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
+ * of the struct the_nilfs.  Lock order must be as follows:
+ *
+ *   1. lock_super()
+ *   2.    down_write(&nilfs->ns_sem)
+ *
+ * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
+ * of the super block (nilfs->ns_sbp[]).
+ *
+ * In most cases, VFS functions call lock_super() before calling these
+ * methods.  So we must be careful not to bring on deadlocks when using
+ * lock_super();  see generic_shutdown_super(), write_super(), and so on.
+ *
+ * Note that order of lock_kernel() and lock_super() depends on contexts
+ * of VFS.  We should also note that lock_kernel() can be used in its
+ * protective section and only the outermost one has an effect.
+ */
+static void nilfs_write_super(struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	u64 now;
+	int dupsb;
+
+	down_write(&nilfs->ns_sem);
+	if (sb->s_flags & MS_RDONLY)
+		goto clean;	/* read-only: nothing to commit */
+	now = get_seconds();
+	if (!nilfs_discontinued(nilfs) && now >= nilfs->ns_sbwtime[0] &&
+	    now < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ)
+		goto unlock;	/* written recently; s_dirt is left alone */
+	dupsb = nilfs->ns_sbp[1] &&
+		now > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
+	nilfs_commit_super(sbi, dupsb);
+ clean:
+	sb->s_dirt = 0;
+ unlock:
+	up_write(&nilfs->ns_sem);
+}
+
+/* sync_fs method: construct a segment when @wait is set, else do nothing. */
+static int nilfs_sync_fs(struct super_block *sb, int wait)
+{
+	/* This function is called when super block should be written back */
+	if (!wait)
+		return 0;
+
+	return nilfs_construct_segment(sb);
+}
+
+/* Link @sbi on nilfs->ns_supers and load the ifile of checkpoint @cno into
+ * sbi->s_ifile.  Returns 0 on success; on any failure @sbi is unlinked again
+ * and a negative error code is returned. */
+int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_checkpoint *raw_cp;
+	struct buffer_head *bh_cp;
+	int err = -ENOMEM;	/* result if nilfs_mdt_new() fails */
+
+	down_write(&nilfs->ns_sem);
+	list_add(&sbi->s_list, &nilfs->ns_supers);
+	up_write(&nilfs->ns_sem);
+
+	sbi->s_ifile = nilfs_mdt_new(
+		nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
+	if (!sbi->s_ifile)
+		goto failed_list;	/* no ifile to destroy, but unlink sbi */
+
+	err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
+	if (unlikely(err))
+		goto failed;
+	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
+					  &bh_cp);
+	if (unlikely(err)) {
+		if (err == -ENOENT || err == -EINVAL) {
+			printk(KERN_ERR "NILFS: Invalid checkpoint "
+			       "(checkpoint number=%llu)\n",
+			       (unsigned long long)cno);
+			err = -EINVAL;
+		}
+		goto failed;
+	}
+	err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
+	if (unlikely(err))
+		goto failed_bh;
+	atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
+	atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
+
+	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
+	return 0;
+
+ failed_bh:
+	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
+ failed:
+	nilfs_mdt_destroy(sbi->s_ifile);
+	sbi->s_ifile = NULL;
+ failed_list:
+	down_write(&nilfs->ns_sem);
+	list_del_init(&sbi->s_list);
+	up_write(&nilfs->ns_sem);
+	return err;
+}
+
+void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+
+	nilfs_mdt_clear(sbi->s_ifile);
+	nilfs_mdt_destroy(sbi->s_ifile);
+	sbi->s_ifile = NULL;
+	down_write(&nilfs->ns_sem);
+	list_del_init(&sbi->s_list);
+	up_write(&nilfs->ns_sem);
+}
+
+static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int err = 0;
+
+	down_write(&nilfs->ns_sem);
+	if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
+		nilfs->ns_mount_state |= NILFS_VALID_FS;
+		err = nilfs_commit_super(sbi, 1);
+		if (likely(!err))
+			printk(KERN_INFO "NILFS: recovery complete.\n");
+	}
+	up_write(&nilfs->ns_sem);
+	return err;
+}
+
+static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	unsigned long long blocks;
+	unsigned long overhead;
+	unsigned long nrsvblocks;
+	sector_t nfreeblocks;
+	int err;
+
+	/*
+	 * Compute all of the segment blocks
+	 *
+	 * The blocks before first segment and after last segment
+	 * are excluded.
+	 */
+	blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
+		- nilfs->ns_first_data_block;
+	nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
+
+	/*
+	 * Compute the overhead
+	 *
+	 * When distributing meta data blocks outside segment structure,
+	 * we must count them as the overhead.
+	 */
+	overhead = 0;
+
+	err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
+	if (unlikely(err))
+		return err;
+
+	buf->f_type = NILFS_SUPER_MAGIC;
+	buf->f_bsize = sb->s_blocksize;
+	buf->f_blocks = blocks - overhead;
+	buf->f_bfree = nfreeblocks;
+	buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
+		(buf->f_bfree - nrsvblocks) : 0;
+	buf->f_files = atomic_read(&sbi->s_inodes_count);
+	buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
+	buf->f_namelen = NILFS_NAME_LEN;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+
+	return 0;
+}
+
+static struct super_operations nilfs_sops = {
+	.alloc_inode    = nilfs_alloc_inode,
+	.destroy_inode  = nilfs_destroy_inode,
+	.dirty_inode    = nilfs_dirty_inode,
+	/* .write_inode    = nilfs_write_inode, */
+	/* .put_inode      = nilfs_put_inode, */
+	/* .drop_inode	  = nilfs_drop_inode, */
+	.delete_inode   = nilfs_delete_inode,
+	.put_super      = nilfs_put_super,
+	.write_super    = nilfs_write_super,
+	.sync_fs        = nilfs_sync_fs,
+	/* .write_super_lockfs */
+	/* .unlockfs */
+	.statfs         = nilfs_statfs,
+	.remount_fs     = nilfs_remount,
+	.clear_inode    = nilfs_clear_inode,
+	/* .umount_begin */
+	/* .show_options */
+};
+
+static struct inode *
+nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
+	    ino != NILFS_SKETCH_INO)
+		return ERR_PTR(-ESTALE);
+
+	inode = nilfs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+static struct dentry *
+nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
+		   int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    nilfs_nfs_get_inode);
+}
+
+static struct dentry *
+nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
+		   int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    nilfs_nfs_get_inode);
+}
+
+static struct export_operations nilfs_export_ops = {
+	.fh_to_dentry = nilfs_fh_to_dentry,
+	.fh_to_parent = nilfs_fh_to_parent,
+	.get_parent = nilfs_get_parent,
+};
+
+enum {
+	Opt_err_cont, Opt_err_panic, Opt_err_ro,
+	Opt_barrier, Opt_snapshot, Opt_order,
+	Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_barrier, "barrier=%s"},
+	{Opt_snapshot, "cp=%u"},
+	{Opt_order, "order=%s"},
+	{Opt_err, NULL}
+};
+
+/* Parse exactly "on"/"off" from @s into *@result (1/0); returns 0 on success,
+ * 1 otherwise.  Lengths must match, so "", "o" and "of" are rejected. */
+static int match_bool(substring_t *s, int *result)
+{
+	int len = s->to - s->from;
+	int on = (len == 2 && strncmp(s->from, "on", 2) == 0);
+
+	if (!on && !(len == 3 && strncmp(s->from, "off", 3) == 0))
+		return 1;
+	*result = on;
+	return 0;
+}
+
+static int parse_options(char *options, struct super_block *sb)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_barrier:
+			if (match_bool(&args[0], &option))
+				return 0;
+			if (option)
+				nilfs_set_opt(sbi, BARRIER);
+			else
+				nilfs_clear_opt(sbi, BARRIER);
+			break;
+		case Opt_order:
+			if (strcmp(args[0].from, "relaxed") == 0)
+				/* Ordered data semantics */
+				nilfs_clear_opt(sbi, STRICT_ORDER);
+			else if (strcmp(args[0].from, "strict") == 0)
+				/* Strict in-order semantics */
+				nilfs_set_opt(sbi, STRICT_ORDER);
+			else
+				return 0;
+			break;
+		case Opt_err_panic:
+			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
+			break;
+		case Opt_err_ro:
+			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
+			break;
+		case Opt_err_cont:
+			nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
+			break;
+		case Opt_snapshot:
+			if (match_int(&args[0], &option) || option <= 0)
+				return 0;
+			if (!(sb->s_flags & MS_RDONLY))
+				return 0;
+			sbi->s_snapshot_cno = option;
+			nilfs_set_opt(sbi, SNAPSHOT);
+			break;
+		default:
+			printk(KERN_ERR
+			       "NILFS: Unrecognized mount option \"%s\"\n", p);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static inline void
+nilfs_set_default_options(struct nilfs_sb_info *sbi,
+			  struct nilfs_super_block *sbp)
+{
+	sbi->s_mount_opt =
+		NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
+}
+
+static int nilfs_setup_super(struct nilfs_sb_info *sbi)
+{
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
+	int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
+	int mnt_count = le16_to_cpu(sbp->s_mnt_count);
+
+	/* nilfs->ns_sem must be locked by the caller. */
+	if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
+		printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+	} else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
+		printk(KERN_WARNING
+		       "NILFS warning: mounting fs with errors\n");
+#if 0
+	} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
+		printk(KERN_WARNING
+		       "NILFS warning: maximal mount count reached\n");
+#endif
+	}
+	if (!max_mnt_count)
+		sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
+
+	sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
+	sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
+	sbp->s_mtime = cpu_to_le64(get_seconds());
+	return nilfs_commit_super(sbi, 1);
+}
+
+struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
+						 u64 pos, int blocksize,
+						 struct buffer_head **pbh)
+{
+	unsigned long long sb_index = pos;
+	unsigned long offset;
+
+	offset = do_div(sb_index, blocksize);
+	*pbh = sb_bread(sb, sb_index);
+	if (!*pbh)
+		return NULL;
+	return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
+}
+
+int nilfs_store_magic_and_option(struct super_block *sb,
+				 struct nilfs_super_block *sbp,
+				 char *data)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+
+	sb->s_magic = le16_to_cpu(sbp->s_magic);
+
+	/* FS independent flags */
+#ifdef NILFS_ATIME_DISABLE
+	sb->s_flags |= MS_NOATIME;
+#endif
+
+	nilfs_set_default_options(sbi, sbp);
+
+	sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
+	sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
+	sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
+	sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
+
+	return !parse_options(data, sb) ? -EINVAL : 0 ;
+}
+
+/**
+ * nilfs_fill_super() - initialize a super block instance
+ * @sb: super_block
+ * @data: mount options
+ * @silent: silent mode flag
+ * @nilfs: the_nilfs struct
+ *
+ * This function is called exclusively by bd_mount_mutex.
+ * So, the recovery process is protected from other simultaneous mounts.
+ */
+static int
+nilfs_fill_super(struct super_block *sb, void *data, int silent,
+		 struct the_nilfs *nilfs)
+{
+	struct nilfs_sb_info *sbi;
+	struct inode *root;
+	__u64 cno;
+	int err;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+
+	get_nilfs(nilfs);
+	sbi->s_nilfs = nilfs;
+	sbi->s_super = sb;
+
+	err = init_nilfs(nilfs, sbi, (char *)data);
+	if (err)
+		goto failed_sbi;
+
+	spin_lock_init(&sbi->s_inode_lock);
+	INIT_LIST_HEAD(&sbi->s_dirty_files);
+	INIT_LIST_HEAD(&sbi->s_list);
+
+	/*
+	 * Following initialization is overlapped because
+	 * nilfs_sb_info structure has been cleared at the beginning.
+	 * But we reserve them to keep our interest and make ready
+	 * for the future change.
+	 */
+	get_random_bytes(&sbi->s_next_generation,
+			 sizeof(sbi->s_next_generation));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	sb->s_op = &nilfs_sops;
+	sb->s_export_op = &nilfs_export_ops;
+	sb->s_root = NULL;
+	sb->s_time_gran = 1;
+
+	if (!nilfs_loaded(nilfs)) {
+		err = load_nilfs(nilfs, sbi);
+		if (err)
+			goto failed_sbi;
+	}
+	cno = nilfs_last_cno(nilfs);
+
+	if (sb->s_flags & MS_RDONLY) {
+		if (nilfs_test_opt(sbi, SNAPSHOT)) {
+			err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
+						       sbi->s_snapshot_cno);
+			if (err < 0)
+				goto failed_sbi;
+			if (!err) {
+				printk(KERN_ERR
+				       "NILFS: The specified checkpoint is "
+				       "not a snapshot "
+				       "(checkpoint number=%llu).\n",
+				       (unsigned long long)sbi->s_snapshot_cno);
+				err = -EINVAL;
+				goto failed_sbi;
+			}
+			cno = sbi->s_snapshot_cno;
+		} else
+			/* Read-only mount */
+			sbi->s_snapshot_cno = cno;
+	}
+
+	err = nilfs_attach_checkpoint(sbi, cno);
+	if (err) {
+		printk(KERN_ERR "NILFS: error loading a checkpoint"
+		       " (checkpoint number=%llu).\n", (unsigned long long)cno);
+		goto failed_sbi;
+	}
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		err = nilfs_attach_segment_constructor(sbi);
+		if (err)
+			goto failed_checkpoint;
+	}
+
+	root = nilfs_iget(sb, NILFS_ROOT_INO);
+	if (IS_ERR(root)) {
+		printk(KERN_ERR "NILFS: get root inode failed\n");
+		err = PTR_ERR(root);
+		goto failed_segctor;
+	}
+	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+		iput(root);
+		printk(KERN_ERR "NILFS: corrupt root inode.\n");
+		err = -EINVAL;
+		goto failed_segctor;
+	}
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		iput(root);
+		printk(KERN_ERR "NILFS: get root dentry failed\n");
+		err = -ENOMEM;
+		goto failed_segctor;
+	}
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		down_write(&nilfs->ns_sem);
+		nilfs_setup_super(sbi);
+		up_write(&nilfs->ns_sem);
+	}
+
+	err = nilfs_mark_recovery_complete(sbi);
+	if (unlikely(err)) {
+		printk(KERN_ERR "NILFS: recovery failed.\n");
+		goto failed_root;
+	}
+
+	return 0;
+
+ failed_root:
+	dput(sb->s_root);
+	sb->s_root = NULL;
+
+ failed_segctor:
+	nilfs_detach_segment_constructor(sbi);
+
+ failed_checkpoint:
+	nilfs_detach_checkpoint(sbi);
+
+ failed_sbi:
+	put_nilfs(nilfs);
+	sb->s_fs_info = NULL;
+	kfree(sbi);
+	return err;
+}
+
+/* remount_fs method: re-parse @data and switch RO<->RW as *@flags requests.
+ * Returns 0, or a negative errno after restoring flags/options/snapshot. */
+static int nilfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct nilfs_super_block *sbp;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	unsigned long old_sb_flags;
+	struct nilfs_mount_options old_opts;
+	int err;
+
+	old_sb_flags = sb->s_flags;
+	old_opts.mount_opt = sbi->s_mount_opt;
+	old_opts.snapshot_cno = sbi->s_snapshot_cno;
+
+	if (!parse_options(data, sb)) {
+		err = -EINVAL;
+		goto restore_opts;
+	}
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
+
+	if ((*flags & MS_RDONLY) &&
+	    sbi->s_snapshot_cno != old_opts.snapshot_cno) {
+		printk(KERN_WARNING "NILFS (device %s): couldn't "
+		       "remount to a different snapshot. \n",
+		       sb->s_id);
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		goto out;
+	if (*flags & MS_RDONLY) {
+		/* Shutting down the segment constructor */
+		nilfs_detach_segment_constructor(sbi);
+		sb->s_flags |= MS_RDONLY;
+
+		sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
+
+		/*
+		 * Going read-only: put the valid flag back into the on-disk
+		 * state if only ns_mount_state still has it, then commit.
+		 */
+		down_write(&nilfs->ns_sem);
+		sbp = nilfs->ns_sbp[0];
+		if (!(sbp->s_state & cpu_to_le16(NILFS_VALID_FS)) &&
+		    (nilfs->ns_mount_state & NILFS_VALID_FS))
+			sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
+		sbp->s_mtime = cpu_to_le64(get_seconds());
+		nilfs_commit_super(sbi, 1);
+		up_write(&nilfs->ns_sem);
+	} else {
+		/*
+		 * Going read-write: under bd_mount_sem, refuse if another
+		 * RW mount exists or this mount is not the latest checkpoint.
+		 */
+		down(&sb->s_bdev->bd_mount_sem);
+		/* Check existing RW-mount */
+		if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
+			printk(KERN_WARNING "NILFS (device %s): couldn't "
+			       "remount because a RW-mount exists.\n",
+			       sb->s_id);
+			err = -EBUSY;
+			goto rw_remount_failed;
+		}
+		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
+			printk(KERN_WARNING "NILFS (device %s): couldn't "
+			       "remount because the current RO-mount is not "
+			       "the latest one.\n",
+			       sb->s_id);
+			err = -EINVAL;
+			goto rw_remount_failed;
+		}
+		sb->s_flags &= ~MS_RDONLY;
+		nilfs_clear_opt(sbi, SNAPSHOT);
+		sbi->s_snapshot_cno = 0;
+
+		err = nilfs_attach_segment_constructor(sbi);
+		if (err)
+			goto rw_remount_failed;
+
+		down_write(&nilfs->ns_sem);
+		nilfs_setup_super(sbi);
+		up_write(&nilfs->ns_sem);
+
+		up(&sb->s_bdev->bd_mount_sem);
+	}
+ out:
+	return 0;
+
+ rw_remount_failed:
+	up(&sb->s_bdev->bd_mount_sem);
+ restore_opts:
+	sb->s_flags = old_sb_flags;
+	sbi->s_mount_opt = old_opts.mount_opt;
+	sbi->s_snapshot_cno = old_opts.snapshot_cno;
+	return err;
+}
+
+struct nilfs_super_data {
+	struct block_device *bdev;
+	__u64 cno;
+	int flags;
+};
+
+/**
+ * nilfs_identify - pre-read mount options needed to identify mount instance
+ * @data: mount options
+ * @sd: nilfs_super_data
+ */
+static int nilfs_identify(char *data, struct nilfs_super_data *sd)
+{
+	char *p, *options = data;
+	substring_t args[MAX_OPT_ARGS];
+	int option, token;
+	int ret = 0;
+
+	do {
+		p = strsep(&options, ",");
+		if (p != NULL && *p) {
+			token = match_token(p, tokens, args);
+			if (token == Opt_snapshot) {
+				if (!(sd->flags & MS_RDONLY))
+					ret++;
+				else {
+					ret = match_int(&args[0], &option);
+					if (!ret) {
+						if (option > 0)
+							sd->cno = option;
+						else
+							ret++;
+					}
+				}
+			}
+			if (ret)
+				printk(KERN_ERR
+				       "NILFS: invalid mount option: %s\n", p);
+		}
+		if (!options)
+			break;
+		BUG_ON(options == data);
+		*(options - 1) = ',';
+	} while (!ret);
+	return ret;
+}
+
+static int nilfs_set_bdev_super(struct super_block *s, void *data)
+{
+	struct nilfs_super_data *sd = data;
+
+	s->s_bdev = sd->bdev;
+	s->s_dev = s->s_bdev->bd_dev;
+	return 0;
+}
+
+static int nilfs_test_bdev_super(struct super_block *s, void *data)
+{
+	struct nilfs_super_data *sd = data;
+
+	return s->s_bdev == sd->bdev;
+}
+
+static int nilfs_test_bdev_super2(struct super_block *s, void *data)
+{
+	struct nilfs_super_data *sd = data;
+	int ret;
+
+	if (s->s_bdev != sd->bdev)
+		return 0;
+
+	if (!((s->s_flags | sd->flags) & MS_RDONLY))
+		return 1; /* Reuse an old R/W-mode super_block */
+
+	if (s->s_flags & sd->flags & MS_RDONLY) {
+		if (down_read_trylock(&s->s_umount)) {
+			ret = s->s_root &&
+				(sd->cno == NILFS_SB(s)->s_snapshot_cno);
+			up_read(&s->s_umount);
+			/*
+			 * This path is locked with sb_lock by sget().
+			 * So, drop_super() causes deadlock.
+			 */
+			return ret;
+		}
+	}
+	return 0;
+}
+
+static int
+nilfs_get_sb(struct file_system_type *fs_type, int flags,
+	     const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	struct nilfs_super_data sd;
+	struct super_block *s, *s2;
+	struct the_nilfs *nilfs = NULL;
+	int err, need_to_close = 1;
+
+	sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
+	if (IS_ERR(sd.bdev))
+		return PTR_ERR(sd.bdev);
+
+	/*
+	 * To get mount instance using sget() vfs-routine, NILFS needs
+	 * much more information than normal filesystems to identify mount
+	 * instance.  For snapshot mounts, not only a mount type (ro-mount
+	 * or rw-mount) but also a checkpoint number is required.
+	 * The results are passed in sget() using nilfs_super_data.
+	 */
+	sd.cno = 0;
+	sd.flags = flags;
+	if (nilfs_identify((char *)data, &sd)) {
+		err = -EINVAL;
+		goto failed;
+	}
+
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	down(&sd.bdev->bd_mount_sem);
+	if (!sd.cno &&
+	    (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
+		err = (err < 0) ? err : -EBUSY;
+		goto failed_unlock;
+	}
+
+	/*
+	 * Phase-1: search any existent instance and get the_nilfs
+	 */
+	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
+	if (IS_ERR(s))
+		goto error_s;
+
+	if (!s->s_root) {
+		err = -ENOMEM;
+		nilfs = alloc_nilfs(sd.bdev);
+		if (!nilfs)
+			goto cancel_new;
+	} else {
+		struct nilfs_sb_info *sbi = NILFS_SB(s);
+
+		/*
+		 * s_umount protects super_block from unmount process;
+		 * It covers pointers of nilfs_sb_info and the_nilfs.
+		 */
+		nilfs = sbi->s_nilfs;
+		get_nilfs(nilfs);
+		up_write(&s->s_umount);
+
+		/*
+		 * Phase-2: search specified snapshot or R/W mode super_block
+		 */
+		if (!sd.cno)
+			/* trying to get the latest checkpoint.  */
+			sd.cno = nilfs_last_cno(nilfs);
+
+		s2 = sget(fs_type, nilfs_test_bdev_super2,
+			  nilfs_set_bdev_super, &sd);
+		deactivate_super(s);
+		/*
+		 * deactivate_super() invokes close_bdev_exclusive() through
+		 * kill_block_super(), but here s is an existent mount; we
+		 * need one more close_bdev_exclusive() call (need_to_close).
+		 */
+		s = s2;
+		if (IS_ERR(s))
+			goto error_s;
+	}
+
+	if (!s->s_root) {
+		char b[BDEVNAME_SIZE];
+
+		s->s_flags = flags;
+		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
+		sb_set_blocksize(s, block_size(sd.bdev));
+
+		err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
+		if (err)
+			goto cancel_new;
+
+		s->s_flags |= MS_ACTIVE;
+		need_to_close = 0;
+	} else if (!(s->s_flags & MS_RDONLY)) {
+		err = -EBUSY;
+	}
+
+	up(&sd.bdev->bd_mount_sem);
+	put_nilfs(nilfs);
+	if (need_to_close)
+		close_bdev_exclusive(sd.bdev, flags);
+	simple_set_mnt(mnt, s);
+	return 0;
+
+ error_s:
+	up(&sd.bdev->bd_mount_sem);
+	if (nilfs)
+		put_nilfs(nilfs);
+	close_bdev_exclusive(sd.bdev, flags);
+	return PTR_ERR(s);
+
+ failed_unlock:
+	up(&sd.bdev->bd_mount_sem);
+ failed:
+	close_bdev_exclusive(sd.bdev, flags);
+
+	return err;
+
+ cancel_new:
+	/* Abandoning the newly allocated superblock */
+	up(&sd.bdev->bd_mount_sem);
+	if (nilfs)
+		put_nilfs(nilfs);
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	/*
+	 * deactivate_super() invokes close_bdev_exclusive().
+	 * We must finish all post-cleaning before this call;
+	 * put_nilfs() and unlocking bd_mount_sem need the block device.
+	 */
+	return err;
+}
+
+static int nilfs_test_bdev_super3(struct super_block *s, void *data)
+{
+	struct nilfs_super_data *sd = data;
+	int ret;
+
+	if (s->s_bdev != sd->bdev)
+		return 0;
+	if (down_read_trylock(&s->s_umount)) {
+		ret = (s->s_flags & MS_RDONLY) && s->s_root &&
+			nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
+		up_read(&s->s_umount);
+		if (ret)
+			return 0; /* ignore snapshot mounts */
+	}
+	return !((sd->flags ^ s->s_flags) & MS_RDONLY);
+}
+
+static int __false_bdev_super(struct super_block *s, void *data)
+{
+#if 0 /* XXX: workaround for lock debug. This is not good idea */
+	up_write(&s->s_umount);
+#endif
+	return -EFAULT;
+}
+
+/**
+ * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
+ * @fs_type: filesystem type
+ * @bdev: block device
+ * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
+ *
+ * Must be called with bd_mount_sem held.  Returns 1 if such a mount exists,
+ * 0 if it does not, or a negative error code from sget().
+ */
+static int test_exclusive_mount(struct file_system_type *fs_type,
+				struct block_device *bdev, int flags)
+{
+	struct super_block *s;
+	struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
+
+	s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
+	if (IS_ERR(s)) {
+		if (PTR_ERR(s) != -EFAULT)
+			return PTR_ERR(s);
+		return 0;  /* Not found */
+	}
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	return 1;  /* Found */
+}
+
+struct file_system_type nilfs_fs_type = {
+	.owner    = THIS_MODULE,
+	.name     = "nilfs2",
+	.get_sb   = nilfs_get_sb,
+	.kill_sb  = kill_block_super,
+	.fs_flags = FS_REQUIRES_DEV,
+};
+
+static int __init init_nilfs_fs(void)
+{
+	int err;
+
+	err = nilfs_init_inode_cache();
+	if (err)
+		goto failed;
+
+	err = nilfs_init_transaction_cache();
+	if (err)
+		goto failed_inode_cache;
+
+	err = nilfs_init_segbuf_cache();
+	if (err)
+		goto failed_transaction_cache;
+
+	err = nilfs_btree_path_cache_init();
+	if (err)
+		goto failed_segbuf_cache;
+
+	err = register_filesystem(&nilfs_fs_type);
+	if (err)
+		goto failed_btree_path_cache;
+
+	return 0;
+
+ failed_btree_path_cache:
+	nilfs_btree_path_cache_destroy();
+
+ failed_segbuf_cache:
+	nilfs_destroy_segbuf_cache();
+
+ failed_transaction_cache:
+	nilfs_destroy_transaction_cache();
+
+ failed_inode_cache:
+	nilfs_destroy_inode_cache();
+
+ failed:
+	return err;
+}
+
+static void __exit exit_nilfs_fs(void)
+{
+	nilfs_destroy_segbuf_cache();
+	nilfs_destroy_transaction_cache();
+	nilfs_destroy_inode_cache();
+	nilfs_btree_path_cache_destroy();
+	unregister_filesystem(&nilfs_fs_type);
+}
+
+module_init(init_nilfs_fs)
+module_exit(exit_nilfs_fs)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
new file mode 100644
index 00000000000..7f65b3be4aa
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,641 @@
+/*
+ * the_nilfs.c - the_nilfs shared structure.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/crc32.h>
+#include "nilfs.h"
+#include "segment.h"
+#include "alloc.h"
+#include "cpfile.h"
+#include "sufile.h"
+#include "dat.h"
+#include "seglist.h"
+#include "segbuf.h"
+
+void nilfs_set_last_segment(struct the_nilfs *nilfs,
+			    sector_t start_blocknr, u64 seq, __u64 cno)
+{
+	spin_lock(&nilfs->ns_last_segment_lock);
+	nilfs->ns_last_pseg = start_blocknr;
+	nilfs->ns_last_seq = seq;
+	nilfs->ns_last_cno = cno;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+}
+
+/**
+ * alloc_nilfs - allocate the_nilfs structure
+ * @bdev: block device to which the_nilfs is related
+ *
+ * alloc_nilfs() allocates memory for the_nilfs and
+ * initializes its reference count and locks.
+ *
+ * Return Value: On success, pointer to the_nilfs is returned.
+ * On error, NULL is returned.
+ */
+struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+{
+	struct the_nilfs *nilfs;
+
+	nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
+	if (!nilfs)
+		return NULL;
+
+	nilfs->ns_bdev = bdev;
+	atomic_set(&nilfs->ns_count, 1);
+	atomic_set(&nilfs->ns_writer_refcount, -1);
+	atomic_set(&nilfs->ns_ndirtyblks, 0);
+	init_rwsem(&nilfs->ns_sem);
+	mutex_init(&nilfs->ns_writer_mutex);
+	INIT_LIST_HEAD(&nilfs->ns_supers);
+	spin_lock_init(&nilfs->ns_last_segment_lock);
+	nilfs->ns_gc_inodes_h = NULL;
+	init_rwsem(&nilfs->ns_segctor_sem);
+
+	return nilfs;
+}
+
+/**
+ * put_nilfs - release a reference to the_nilfs
+ * @nilfs: the_nilfs structure to be released
+ *
+ * put_nilfs() decrements a reference counter of the_nilfs.
+ * If the reference count reaches zero, the_nilfs is freed.
+ */
+void put_nilfs(struct the_nilfs *nilfs)
+{
+	if (!atomic_dec_and_test(&nilfs->ns_count))
+		return;
+	/*
+	 * Increment of ns_count never occur below because the caller
+	 * of get_nilfs() holds at least one reference to the_nilfs.
+	 * Thus its exclusion control is not required here.
+	 */
+	might_sleep();
+	if (nilfs_loaded(nilfs)) {
+		nilfs_mdt_clear(nilfs->ns_sufile);
+		nilfs_mdt_destroy(nilfs->ns_sufile);
+		nilfs_mdt_clear(nilfs->ns_cpfile);
+		nilfs_mdt_destroy(nilfs->ns_cpfile);
+		nilfs_mdt_clear(nilfs->ns_dat);
+		nilfs_mdt_destroy(nilfs->ns_dat);
+		/* XXX: how and when to clear nilfs->ns_gc_dat? */
+		nilfs_mdt_destroy(nilfs->ns_gc_dat);
+	}
+	if (nilfs_init(nilfs)) {
+		nilfs_destroy_gccache(nilfs);
+		brelse(nilfs->ns_sbh[0]);
+		brelse(nilfs->ns_sbh[1]);
+	}
+	kfree(nilfs);
+}
+
+static int nilfs_load_super_root(struct the_nilfs *nilfs,
+				 struct nilfs_sb_info *sbi, sector_t sr_block)
+{
+	static struct lock_class_key dat_lock_key;
+	struct buffer_head *bh_sr;
+	struct nilfs_super_root *raw_sr;
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	unsigned dat_entry_size, segment_usage_size, checkpoint_size;
+	unsigned inode_size;
+	int err;
+
+	err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
+	if (unlikely(err))
+		return err;
+
+	down_read(&nilfs->ns_sem);
+	dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
+	checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
+	segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
+	up_read(&nilfs->ns_sem);
+
+	inode_size = nilfs->ns_inode_size;
+
+	err = -ENOMEM;
+	nilfs->ns_dat = nilfs_mdt_new(
+		nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+	if (unlikely(!nilfs->ns_dat))
+		goto failed;
+
+	nilfs->ns_gc_dat = nilfs_mdt_new(
+		nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
+	if (unlikely(!nilfs->ns_gc_dat))
+		goto failed_dat;
+
+	nilfs->ns_cpfile = nilfs_mdt_new(
+		nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
+	if (unlikely(!nilfs->ns_cpfile))
+		goto failed_gc_dat;
+
+	nilfs->ns_sufile = nilfs_mdt_new(
+		nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
+	if (unlikely(!nilfs->ns_sufile))
+		goto failed_cpfile;
+
+	err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
+	if (unlikely(err))
+		goto failed_sufile;
+
+	err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
+	if (unlikely(err))
+		goto failed_sufile;
+
+	lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key);
+	lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key);
+
+	nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
+	nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
+				 sizeof(struct nilfs_cpfile_header));
+	nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
+				 sizeof(struct nilfs_sufile_header));
+
+	err = nilfs_mdt_read_inode_direct(
+		nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
+	if (unlikely(err))
+		goto failed_sufile;
+
+	err = nilfs_mdt_read_inode_direct(
+		nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
+	if (unlikely(err))
+		goto failed_sufile;
+
+	err = nilfs_mdt_read_inode_direct(
+		nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
+	if (unlikely(err))
+		goto failed_sufile;
+
+	raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+	nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
+
+ failed:
+	brelse(bh_sr);
+	return err;
+
+ failed_sufile:
+	nilfs_mdt_destroy(nilfs->ns_sufile);
+
+ failed_cpfile:
+	nilfs_mdt_destroy(nilfs->ns_cpfile);
+
+ failed_gc_dat:
+	nilfs_mdt_destroy(nilfs->ns_gc_dat);
+
+ failed_dat:
+	nilfs_mdt_destroy(nilfs->ns_dat);
+	goto failed;
+}
+
+static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
+{
+	memset(ri, 0, sizeof(*ri));
+	INIT_LIST_HEAD(&ri->ri_used_segments);
+}
+
+static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
+{
+	nilfs_dispose_segment_list(&ri->ri_used_segments);
+}
+
+/**
+ * load_nilfs - load and recover the nilfs
+ * @nilfs: the_nilfs structure to be loaded
+ * @sbi: nilfs_sb_info used to recover past segment
+ *
+ * load_nilfs() searches for and loads the latest super root,
+ * attaches the last segment, and does recovery if needed.
+ * The caller must call this exclusively for simultaneous mounts.
+ */
+int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
+{
+	struct nilfs_recovery_info ri;
+	unsigned int s_flags = sbi->s_super->s_flags;
+	int really_read_only = bdev_read_only(nilfs->ns_bdev);
+	unsigned valid_fs;
+	int err = 0;
+
+	nilfs_init_recovery_info(&ri);
+	down_write(&nilfs->ns_sem);
+	valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
+	up_write(&nilfs->ns_sem);
+
+	if (!valid_fs && (s_flags & MS_RDONLY)) {
+		printk(KERN_INFO "NILFS: INFO: recovery "
+		       "required for readonly filesystem.\n");
+		if (really_read_only) {
+			printk(KERN_ERR "NILFS: write access "
+			       "unavailable, cannot proceed.\n");
+			err = -EROFS;
+			goto failed;
+		}
+		printk(KERN_INFO "NILFS: write access will "
+		       "be enabled during recovery.\n");
+		sbi->s_super->s_flags &= ~MS_RDONLY;
+	}
+
+	err = nilfs_search_super_root(nilfs, sbi, &ri);
+	if (unlikely(err)) {
+		printk(KERN_ERR "NILFS: error searching super root.\n");
+		goto failed;
+	}
+
+	err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
+	if (unlikely(err)) {
+		printk(KERN_ERR "NILFS: error loading super root.\n");
+		goto failed;
+	}
+
+	if (!valid_fs) {
+		err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
+		if (unlikely(err)) {
+			nilfs_mdt_destroy(nilfs->ns_cpfile);
+			nilfs_mdt_destroy(nilfs->ns_sufile);
+			nilfs_mdt_destroy(nilfs->ns_dat);
+			nilfs_mdt_destroy(nilfs->ns_gc_dat); /* else leaked */
+			goto failed;
+		}
+		if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
+			sbi->s_super->s_dirt = 1;
+	}
+
+	set_nilfs_loaded(nilfs);
+
+ failed:
+	nilfs_clear_recovery_info(&ri);
+	sbi->s_super->s_flags = s_flags;
+	return err;
+}
+
+static unsigned long long nilfs_max_size(unsigned int blkbits)
+{
+	unsigned int max_bits;
+	unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
+
+	max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
+	if (max_bits < 64)
+		res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
+	return res;
+}
+
+static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
+				   struct nilfs_super_block *sbp)
+{
+	if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
+		printk(KERN_ERR "NILFS: revision mismatch "
+		       "(superblock rev.=%d.%d, current rev.=%d.%d). "
+		       "Please check the version of mkfs.nilfs.\n",
+		       le32_to_cpu(sbp->s_rev_level),
+		       le16_to_cpu(sbp->s_minor_rev_level),
+		       NILFS_CURRENT_REV, NILFS_MINOR_REV);
+		return -EINVAL;
+	}
+	nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
+	if (nilfs->ns_sbsize > BLOCK_SIZE)
+		return -EINVAL;
+
+	nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
+	nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
+
+	nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
+	if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
+		printk(KERN_ERR "NILFS: too short segment. \n");
+		return -EINVAL;
+	}
+
+	nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
+	nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
+	nilfs->ns_r_segments_percentage =
+		le32_to_cpu(sbp->s_r_segments_percentage);
+	nilfs->ns_nrsvsegs =
+		max_t(unsigned long, NILFS_MIN_NRSVSEGS,
+		      DIV_ROUND_UP(nilfs->ns_nsegments *
+				   nilfs->ns_r_segments_percentage, 100));
+	nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
+	return 0;
+}
+
+static int nilfs_valid_sb(struct nilfs_super_block *sbp)
+{
+	static unsigned char sum[4]; /* zeroes fed to the CRC in place of s_sum */
+	const int sumoff = offsetof(struct nilfs_super_block, s_sum);
+	size_t bytes;
+	u32 crc;
+
+	if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
+		return 0;
+	bytes = le16_to_cpu(sbp->s_bytes); /* on-disk value: check both bounds */
+	if (bytes < sumoff + 4 || bytes > BLOCK_SIZE)
+		return 0;
+	crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
+		       sumoff);
+	crc = crc32_le(crc, sum, 4);
+	crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
+		       bytes - sumoff - 4);
+	return crc == le32_to_cpu(sbp->s_sum);
+}
+
+static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
+{
+	return offset < ((le64_to_cpu(sbp->s_nsegments) *
+			  le32_to_cpu(sbp->s_blocks_per_segment)) <<
+			 (le32_to_cpu(sbp->s_log_block_size) + 10));
+}
+
+static void nilfs_release_super_block(struct the_nilfs *nilfs)
+{
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		if (nilfs->ns_sbp[i]) {
+			brelse(nilfs->ns_sbh[i]);
+			nilfs->ns_sbh[i] = NULL;
+			nilfs->ns_sbp[i] = NULL;
+		}
+	}
+}
+
+void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
+{
+	brelse(nilfs->ns_sbh[0]);
+	nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
+	nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
+	nilfs->ns_sbh[1] = NULL;
+	nilfs->ns_sbp[1] = NULL;
+}
+
+void nilfs_swap_super_block(struct the_nilfs *nilfs)
+{
+	struct buffer_head *tsbh = nilfs->ns_sbh[0];
+	struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
+
+	nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
+	nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
+	nilfs->ns_sbh[1] = tsbh;
+	nilfs->ns_sbp[1] = tsbp;
+}
+
+static int nilfs_load_super_block(struct the_nilfs *nilfs,
+				  struct super_block *sb, int blocksize,
+				  struct nilfs_super_block **sbpp)
+{
+	struct nilfs_super_block **sbp = nilfs->ns_sbp;
+	struct buffer_head **sbh = nilfs->ns_sbh;
+	u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+	int valid[2], swp = 0;
+
+	sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
+					&sbh[0]);
+	sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
+
+	if (!sbp[0]) {
+		if (!sbp[1]) {
+			printk(KERN_ERR "NILFS: unable to read superblock\n");
+			return -EIO;
+		}
+		printk(KERN_WARNING
+		       "NILFS warning: unable to read primary superblock\n");
+	} else if (!sbp[1])
+		printk(KERN_WARNING
+		       "NILFS warning: unable to read secondary superblock\n");
+
+	valid[0] = nilfs_valid_sb(sbp[0]);
+	valid[1] = nilfs_valid_sb(sbp[1]);
+	swp = valid[1] &&
+		(!valid[0] ||
+		 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
+
+	if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
+		brelse(sbh[1]);
+		sbh[1] = NULL;
+		sbp[1] = NULL;
+		valid[1] = swp = 0; /* sbp[1] is gone; never deref it below */
+	}
+	if (!valid[swp]) {
+		nilfs_release_super_block(nilfs);
+		printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
+		       sb->s_id);
+		return -EINVAL;
+	}
+
+	if (swp) {
+		printk(KERN_WARNING "NILFS warning: broken superblock. "
+		       "using spare superblock.\n");
+		nilfs_swap_super_block(nilfs);
+	}
+
+	nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
+	nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
+	nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
+	*sbpp = sbp[0];
+	return 0;
+}
+
+/**
+ * init_nilfs - initialize a NILFS instance.
+ * @nilfs: the_nilfs structure
+ * @sbi: nilfs_sb_info
+ * @sb: super block
+ * @data: mount options
+ *
+ * init_nilfs() performs common initialization per block device (e.g.
+ * reading the super block, getting disk layout information, initializing
+ * shared fields in the_nilfs). It takes on some portion of the jobs
+ * typically done by a fill_super() routine. This division arises from
+ * the nature that multiple NILFS instances may be simultaneously
+ * mounted on a device.
+ * For multiple mounts on the same device, only the first mount
+ * invokes these tasks.
+ *
+ * Return Value: On success, 0 is returned. On error, a negative error
+ * code is returned.
+ */
+int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
+{
+	struct super_block *sb = sbi->s_super;
+	struct nilfs_super_block *sbp;
+	struct backing_dev_info *bdi;
+	int blocksize;
+	int err;
+
+	down_write(&nilfs->ns_sem);
+	if (nilfs_init(nilfs)) {
+		/* Load values from existing the_nilfs */
+		sbp = nilfs->ns_sbp[0];
+		err = nilfs_store_magic_and_option(sb, sbp, data);
+		if (err)
+			goto out;
+
+		blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+		if (sb->s_blocksize != blocksize &&
+		    !sb_set_blocksize(sb, blocksize)) {
+			printk(KERN_ERR "NILFS: blocksize %d unfit to device\n",
+			       blocksize);
+			err = -EINVAL;
+		}
+		sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
+		goto out;
+	}
+
+	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
+	if (!blocksize) {
+		printk(KERN_ERR "NILFS: unable to set blocksize\n");
+		err = -EINVAL;
+		goto out;
+	}
+	err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
+	if (err)
+		goto out;
+
+	err = nilfs_store_magic_and_option(sb, sbp, data);
+	if (err)
+		goto failed_sbh;
+
+	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
+	if (sb->s_blocksize != blocksize) {
+		int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
+
+		if (blocksize < hw_blocksize) {
+			printk(KERN_ERR
+			       "NILFS: blocksize %d too small for device "
+			       "(sector-size = %d).\n",
+			       blocksize, hw_blocksize);
+			err = -EINVAL;
+			goto failed_sbh;
+		}
+		nilfs_release_super_block(nilfs);
+		sb_set_blocksize(sb, blocksize);
+
+		err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
+		if (err)
+			goto out;
+			/* not failed_sbh; sbh is released automatically
+			   when reloading fails. */
+	}
+	nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
+
+	err = nilfs_store_disk_layout(nilfs, sbp);
+	if (err)
+		goto failed_sbh;
+
+	sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
+
+	nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
+
+	bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
+	if (!bdi)
+		bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
+	nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
+
+	/* Finding last segment */
+	nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
+	nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
+	nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
+
+	nilfs->ns_seg_seq = nilfs->ns_last_seq;
+	nilfs->ns_segnum =
+		nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
+	nilfs->ns_cno = nilfs->ns_last_cno + 1;
+	if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
+		printk(KERN_ERR "NILFS invalid last segment number.\n");
+		err = -EINVAL;
+		goto failed_sbh;
+	}
+	/* Dummy values  */
+	nilfs->ns_free_segments_count =
+		nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
+
+	/* Initialize gcinode cache */
+	err = nilfs_init_gccache(nilfs);
+	if (err)
+		goto failed_sbh;
+
+	set_nilfs_init(nilfs);
+	err = 0;
+ out:
+	up_write(&nilfs->ns_sem);
+	return err;
+
+ failed_sbh:
+	nilfs_release_super_block(nilfs);
+	goto out;
+}
+
+int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
+{
+	struct inode *dat = nilfs_dat_inode(nilfs);
+	unsigned long ncleansegs;
+	int err;
+
+	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	if (likely(!err))
+		*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
+	return err;
+}
+
+int nilfs_near_disk_full(struct the_nilfs *nilfs)
+{
+	struct inode *sufile = nilfs->ns_sufile;
+	unsigned long ncleansegs, nincsegs;
+	int ret;
+
+	ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
+	if (likely(!ret)) {
+		nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
+			nilfs->ns_blocks_per_segment + 1;
+		if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
+			ret++;
+	}
+	return ret;
+}
+
+int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
+				int snapshot_mount)
+{
+	struct nilfs_sb_info *sbi;
+	int ret = 0;
+
+	down_read(&nilfs->ns_sem);
+	if (cno == 0 || cno > nilfs->ns_cno)
+		goto out_unlock;
+
+	list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
+		if (sbi->s_snapshot_cno == cno &&
+		    (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
+					/* exclude read-only mounts */
+			ret++;
+			break;
+		}
+	}
+	/* for protecting recent checkpoints */
+	if (cno >= nilfs_last_cno(nilfs))
+		ret++;
+
+ out_unlock:
+	up_read(&nilfs->ns_sem);
+	return ret;
+}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
new file mode 100644
index 00000000000..30fe58778d0
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,298 @@
+/*
+ * the_nilfs.h - the_nilfs shared structure.
+ *
+ * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ * Written by Ryusuke Konishi <ryusuke@osrg.net>
+ *
+ */
+
+#ifndef _THE_NILFS_H
+#define _THE_NILFS_H
+
+#include <linux/types.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include "sb.h"
+
+/* the_nilfs struct */
+enum {
+	THE_NILFS_INIT = 0,     /* Information from super_block is set */
+	THE_NILFS_LOADED,       /* Roll-back/roll-forward has done and
+				   the latest checkpoint was loaded */
+	THE_NILFS_DISCONTINUED,	/* 'next' pointer chain has broken */
+};
+
+/**
+ * struct the_nilfs - struct to supervise multiple nilfs mount points
+ * @ns_flags: flags
+ * @ns_count: reference count
+ * @ns_bdev: block device
+ * @ns_bdi: backing dev info
+ * @ns_writer: back pointer to writable nilfs_sb_info
+ * @ns_sem: semaphore for shared states
+ * @ns_writer_mutex: mutex protecting ns_writer attach/detach
+ * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_sbh: buffer heads of on-disk super blocks
+ * @ns_sbp: pointers to super block data
+ * @ns_sbwtime: previous write time of super blocks
+ * @ns_sbsize: size of valid data in super block
+ * @ns_supers: list of nilfs super block structs
+ * @ns_seg_seq: segment sequence counter
+ * @ns_segnum: index number of the latest full segment.
+ * @ns_nextnum: index number of the full segment index to be used next
+ * @ns_pseg_offset: offset of next partial segment in the current full segment
+ * @ns_cno: next checkpoint number
+ * @ns_ctime: write time of the last segment
+ * @ns_nongc_ctime: write time of the last segment not for cleaner operation
+ * @ns_ndirtyblks: Number of dirty data blocks
+ * @ns_last_segment_lock: lock protecting fields for the latest segment
+ * @ns_last_pseg: start block number of the latest segment
+ * @ns_last_seq: sequence value of the latest segment
+ * @ns_last_cno: checkpoint number of the latest segment
+ * @ns_prot_seq: least sequence number of segments which must not be reclaimed
+ * @ns_free_segments_count: counter of free segments
+ * @ns_segctor_sem: segment constructor semaphore
+ * @ns_dat: DAT file inode
+ * @ns_cpfile: checkpoint file inode
+ * @ns_sufile: segusage file inode
+ * @ns_gc_dat: shadow inode of the DAT file inode for GC
+ * @ns_gc_inodes: dummy inodes to keep live blocks
+ * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
+ * @ns_blocksize_bits: bit length of block size
+ * @ns_nsegments: number of segments in filesystem
+ * @ns_blocks_per_segment: number of blocks per segment
+ * @ns_r_segments_percentage: reserved segments percentage
+ * @ns_nrsvsegs: number of reserved segments
+ * @ns_first_data_block: block number of first data block
+ * @ns_inode_size: size of on-disk inode
+ * @ns_first_ino: first not-special inode number
+ * @ns_crc_seed: seed value of CRC32 calculation
+ */
+struct the_nilfs {
+	unsigned long		ns_flags;
+	atomic_t		ns_count;
+
+	struct block_device    *ns_bdev;
+	struct backing_dev_info *ns_bdi;
+	struct nilfs_sb_info   *ns_writer;
+	struct rw_semaphore	ns_sem;
+	struct mutex		ns_writer_mutex;
+	atomic_t		ns_writer_refcount;
+
+	/*
+	 * used for
+	 * - loading the latest checkpoint exclusively.
+	 * - allocating a new full segment.
+	 * - protecting s_dirt in the super_block struct
+	 *   (see nilfs_write_super) and the following fields.
+	 */
+	struct buffer_head     *ns_sbh[2];
+	struct nilfs_super_block *ns_sbp[2];
+	time_t			ns_sbwtime[2];
+	unsigned		ns_sbsize;
+	unsigned		ns_mount_state;
+	struct list_head	ns_supers;
+
+	/*
+	 * Following fields are dedicated to a writable FS-instance.
+	 * Except for the period seeking checkpoint, code outside the segment
+	 * constructor must lock a segment semaphore while accessing these
+	 * fields.
+	 * The writable FS-instance is sole during a lifetime of the_nilfs.
+	 */
+	u64			ns_seg_seq;
+	__u64			ns_segnum;
+	__u64			ns_nextnum;
+	unsigned long		ns_pseg_offset;
+	__u64			ns_cno;
+	time_t			ns_ctime;
+	time_t			ns_nongc_ctime;
+	atomic_t		ns_ndirtyblks;
+
+	/*
+	 * The following fields hold information on the latest partial segment
+	 * written to disk with a super root.  These fields are protected by
+	 * ns_last_segment_lock.
+	 */
+	spinlock_t		ns_last_segment_lock;
+	sector_t		ns_last_pseg;
+	u64			ns_last_seq;
+	__u64			ns_last_cno;
+	u64			ns_prot_seq;
+	unsigned long		ns_free_segments_count;
+
+	struct rw_semaphore	ns_segctor_sem;
+
+	/*
+	 * Following fields are lock free except for the period before
+	 * the_nilfs is initialized.
+	 */
+	struct inode	       *ns_dat;
+	struct inode	       *ns_cpfile;
+	struct inode	       *ns_sufile;
+	struct inode	       *ns_gc_dat;
+
+	/* GC inode list and hash table head */
+	struct list_head	ns_gc_inodes;
+	struct hlist_head      *ns_gc_inodes_h;
+
+	/* Disk layout information (static) */
+	unsigned int		ns_blocksize_bits;
+	unsigned long		ns_nsegments;
+	unsigned long		ns_blocks_per_segment;
+	unsigned long		ns_r_segments_percentage;
+	unsigned long		ns_nrsvsegs;
+	unsigned long		ns_first_data_block;
+	int			ns_inode_size;
+	int			ns_first_ino;
+	u32			ns_crc_seed;
+};
+
+#define NILFS_GCINODE_HASH_BITS		8
+#define NILFS_GCINODE_HASH_SIZE		(1<<NILFS_GCINODE_HASH_BITS)
+
+#define THE_NILFS_FNS(bit, name)					\
+static inline void set_nilfs_##name(struct the_nilfs *nilfs)		\
+{									\
+	set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
+}									\
+static inline void clear_nilfs_##name(struct the_nilfs *nilfs)		\
+{									\
+	clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);			\
+}									\
+static inline int nilfs_##name(struct the_nilfs *nilfs)			\
+{									\
+	return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags);		\
+}
+
+THE_NILFS_FNS(INIT, init)
+THE_NILFS_FNS(LOADED, loaded)
+THE_NILFS_FNS(DISCONTINUED, discontinued)
+
+/* Minimum interval of periodical update of superblocks (in seconds) */
+#define NILFS_SB_FREQ		10
+#define NILFS_ALTSB_FREQ	60  /* spare superblock */
+
+void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
+struct the_nilfs *alloc_nilfs(struct block_device *);
+void put_nilfs(struct the_nilfs *);
+int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
+int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
+int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
+int nilfs_near_disk_full(struct the_nilfs *);
+void nilfs_fall_back_super_block(struct the_nilfs *);
+void nilfs_swap_super_block(struct the_nilfs *);
+
+
+static inline void get_nilfs(struct the_nilfs *nilfs)
+{
+	/* Caller must have at least one reference of the_nilfs. */
+	atomic_inc(&nilfs->ns_count);
+}
+
+static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
+{
+	if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
+		mutex_lock(&nilfs->ns_writer_mutex);
+	return nilfs->ns_writer;
+}
+
+static inline void nilfs_put_writer(struct the_nilfs *nilfs)
+{
+	if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
+		mutex_unlock(&nilfs->ns_writer_mutex);
+}
+
+static inline void
+nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
+{
+	mutex_lock(&nilfs->ns_writer_mutex);
+	nilfs->ns_writer = sbi;
+	mutex_unlock(&nilfs->ns_writer_mutex);
+}
+
+static inline void
+nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
+{
+	mutex_lock(&nilfs->ns_writer_mutex);
+	if (sbi == nilfs->ns_writer)
+		nilfs->ns_writer = NULL;
+	mutex_unlock(&nilfs->ns_writer_mutex);
+}
+
+static inline void
+nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
+			sector_t *seg_start, sector_t *seg_end)
+{
+	*seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
+	*seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
+	if (segnum == 0)
+		*seg_start = nilfs->ns_first_data_block;
+}
+
+static inline sector_t
+nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
+{
+	return (segnum == 0) ? nilfs->ns_first_data_block :
+		(sector_t)nilfs->ns_blocks_per_segment * segnum;
+}
+
+static inline __u64
+nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
+{
+	sector_t segnum = blocknr;
+
+	sector_div(segnum, nilfs->ns_blocks_per_segment);
+	return segnum;
+}
+
+static inline void
+nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
+			sector_t seg_end)
+{
+	/* terminate the current full segment (used in case of I/O-error) */
+	nilfs->ns_pseg_offset = seg_end - seg_start + 1;
+}
+
+static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
+{
+	/* move forward with a full segment */
+	nilfs->ns_segnum = nilfs->ns_nextnum;
+	nilfs->ns_pseg_offset = 0;
+	nilfs->ns_seg_seq++;
+}
+
+static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
+{
+	__u64 cno;
+
+	spin_lock(&nilfs->ns_last_segment_lock);
+	cno = nilfs->ns_last_cno;
+	spin_unlock(&nilfs->ns_last_segment_lock);
+	return cno;
+}
+
+static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
+{
+	return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
+}
+
+#endif /* _THE_NILFS_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5887df2cd8..8672b953603 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1926,7 +1926,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		   out->f_path.dentry->d_name.len,
 		   out->f_path.dentry->d_name.name);
 
-	inode_double_lock(inode, pipe->inode);
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
 
 	ret = ocfs2_rw_lock(inode, 1);
 	if (ret < 0) {
@@ -1941,12 +1941,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		goto out_unlock;
 	}
 
+	if (pipe->inode)
+		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
 	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
 
 out_unlock:
 	ocfs2_rw_unlock(inode, 1);
 out:
-	inode_double_unlock(inode, pipe->inode);
+	mutex_unlock(&inode->i_mutex);
 
 	mlog_exit(ret);
 	return ret;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b0ae0be4801..39e4ad4f59f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 	struct file *file = vma->vm_file;
 	int flags = vma->vm_flags;
 	unsigned long ino = 0;
+	unsigned long long pgoff = 0;
 	dev_t dev = 0;
 	int len;
 
@@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
+		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
 	}
 
 	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
@@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 			flags & VM_WRITE ? 'w' : '-',
 			flags & VM_EXEC ? 'x' : '-',
 			flags & VM_MAYSHARE ? 's' : 'p',
-			((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
+			pgoff,
 			MAJOR(dev), MINOR(dev), ino, &len);
 
 	/*
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 863464d5519..64a72e2e765 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -126,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 	struct file *file;
 	dev_t dev = 0;
 	int flags, len;
+	unsigned long long pgoff = 0;
 
 	flags = vma->vm_flags;
 	file = vma->vm_file;
@@ -134,6 +135,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
+		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
 	}
 
 	seq_printf(m,
@@ -144,7 +146,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-		   (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,
+		   pgoff,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a404fb88e45..3a6b193d844 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	save_mount_options(sb, data);
 
 	fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+	sb->s_fs_info = fsi;
 	if (!fsi) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	sb->s_fs_info = fsi;
 
 	err = ramfs_parse_options(data, &fsi->mount_opts);
 	if (err)
 		goto fail;
 
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = RAMFS_MAGIC;
-	sb->s_op = &ramfs_ops;
-	sb->s_time_gran = 1;
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_blocksize		= PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_magic		= RAMFS_MAGIC;
+	sb->s_op		= &ramfs_ops;
+	sb->s_time_gran		= 1;
+
 	inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
 	if (!inode) {
 		err = -ENOMEM;
@@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 	}
 
 	root = d_alloc_root(inode);
+	sb->s_root = root;
 	if (!root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	sb->s_root = root;
+
 	return 0;
 fail:
 	kfree(fsi);
+	sb->s_fs_info = NULL;
 	iput(inode);
 	return err;
 }
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9fa..ce2d6bcc626 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
 config ROMFS_FS
 	tristate "ROM file system support"
-	depends on BLOCK
+	depends on BLOCK || MTD
 	---help---
 	  This is a very small read-only file system mainly intended for
 	  initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
 
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
+
+#
+# Select the backing stores to be supported
+#
+choice
+	prompt "RomFS backing stores"
+	depends on ROMFS_FS
+	default ROMFS_BACKED_BY_BLOCK
+	help
+	  Select the backing stores to be supported.
+
+config ROMFS_BACKED_BY_BLOCK
+	bool "Block device-backed ROM file system support"
+	depends on BLOCK
+	help
+	  This permits ROMFS to use block devices buffered through the page
+	  cache as the medium from which to retrieve data.  It does not allow
+	  direct mapping of the medium.
+
+	  If unsure, answer Y.
+
+config ROMFS_BACKED_BY_MTD
+	bool "MTD-backed ROM file system support"
+	depends on MTD=y || (ROMFS_FS=m && MTD)
+	help
+	  This permits ROMFS to use MTD based devices directly, without the
+	  intercession of the block layer (which may have been disabled).  It
+	  also allows direct mapping of MTD devices through romfs files under
+	  NOMMU conditions if the underlying device is directly addressable by
+	  the CPU.
+
+	  If unsure, answer Y.
+
+config ROMFS_BACKED_BY_BOTH
+	bool "Both the above"
+	depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
+endchoice
+
+
+config ROMFS_ON_BLOCK
+	bool
+	default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
+
+config ROMFS_ON_MTD
+	bool
+	default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a..420beb7d495 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
 #
-# Makefile for the linux romfs filesystem routines.
+# Makefile for the linux RomFS filesystem routines.
 #
 
 obj-$(CONFIG_ROMFS_FS) += romfs.o
 
-romfs-objs := inode.o
+romfs-y := storage.o super.o
+
+ifneq ($(CONFIG_MMU),y)
+romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
+endif
+
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196..00000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
-/*
- * ROMFS file system, Linux implementation
- *
- * Copyright (C) 1997-1999  Janos Farkas <chexum@shadow.banki.hu>
- *
- * Using parts of the minix filesystem
- * Copyright (C) 1991, 1992  Linus Torvalds
- *
- * and parts of the affs filesystem additionally
- * Copyright (C) 1993  Ray Burr
- * Copyright (C) 1996  Hans-Joachim Widmaier
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Changes
- *					Changed for 2.1.19 modules
- *	Jan 1997			Initial release
- *	Jun 1997			2.1.43+ changes
- *					Proper page locking in readpage
- *					Changed to work with 2.1.45+ fs
- *	Jul 1997			Fixed follow_link
- *			2.1.47
- *					lookup shouldn't return -ENOENT
- *					from Horst von Brand:
- *					  fail on wrong checksum
- *					  double unlock_super was possible
- *					  correct namelen for statfs
- *					spotted by Bill Hawes:
- *					  readlink shouldn't iput()
- *	Jun 1998	2.1.106		from Avery Pennarun: glibc scandir()
- *					  exposed a problem in readdir
- *			2.1.107		code-freeze spellchecker run
- *	Aug 1998			2.1.118+ VFS changes
- *	Sep 1998	2.1.122		another VFS change (follow_link)
- *	Apr 1999	2.2.7		no more EBADF checking in
- *					  lookup/readdir, use ERR_PTR
- *	Jun 1999	2.3.6		d_alloc_root use changed
- *			2.3.9		clean up usage of ENOENT/negative
- *					  dentries in lookup
- *					clean up page flags setting
- *					  (error, uptodate, locking) in
- *					  in readpage
- *					use init_special_inode for
- *					  fifos/sockets (and streamline) in
- *					  read_inode, fix _ops table order
- *	Aug 1999	2.3.16		__initfunc() => __init change
- *	Oct 1999	2.3.24		page->owner hack obsoleted
- *	Nov 1999	2.3.27		2.3.25+ page->offset => index change
- */
-
-/* todo:
- *	- see Documentation/filesystems/romfs.txt
- *	- use allocated, not stack memory for file names?
- *	- considering write access...
- *	- network (tftp) files?
- *	- merge back some _op tables
- */
-
-/*
- * Sorry about some optimizations and for some goto's.  I just wanted
- * to squeeze some more bytes out of this code.. :)
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/romfs_fs.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-
-#include <asm/uaccess.h>
-
-struct romfs_inode_info {
-	unsigned long i_metasize;	/* size of non-data area */
-	unsigned long i_dataoffset;	/* from the start of fs */
-	struct inode vfs_inode;
-};
-
-static struct inode *romfs_iget(struct super_block *, unsigned long);
-
-/* instead of private superblock data */
-static inline unsigned long romfs_maxsize(struct super_block *sb)
-{
-	return (unsigned long)sb->s_fs_info;
-}
-
-static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
-{
-	return container_of(inode, struct romfs_inode_info, vfs_inode);
-}
-
-static __u32
-romfs_checksum(void *data, int size)
-{
-	__u32 sum;
-	__be32 *ptr;
-
-	sum = 0; ptr = data;
-	size>>=2;
-	while (size>0) {
-		sum += be32_to_cpu(*ptr++);
-		size--;
-	}
-	return sum;
-}
-
-static const struct super_operations romfs_ops;
-
-static int romfs_fill_super(struct super_block *s, void *data, int silent)
-{
-	struct buffer_head *bh;
-	struct romfs_super_block *rsb;
-	struct inode *root;
-	int sz, ret = -EINVAL;
-
-	/* I would parse the options here, but there are none.. :) */
-
-	sb_set_blocksize(s, ROMBSIZE);
-	s->s_maxbytes = 0xFFFFFFFF;
-
-	bh = sb_bread(s, 0);
-	if (!bh) {
-		/* XXX merge with other printk? */
-                printk ("romfs: unable to read superblock\n");
-		goto outnobh;
-	}
-
-	rsb = (struct romfs_super_block *)bh->b_data;
-	sz = be32_to_cpu(rsb->size);
-	if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
-	   || sz < ROMFH_SIZE) {
-		if (!silent)
-			printk ("VFS: Can't find a romfs filesystem on dev "
-				"%s.\n", s->s_id);
-		goto out;
-	}
-	if (romfs_checksum(rsb, min_t(int, sz, 512))) {
-		printk ("romfs: bad initial checksum on dev "
-			"%s.\n", s->s_id);
-		goto out;
-	}
-
-	s->s_magic = ROMFS_MAGIC;
-	s->s_fs_info = (void *)(long)sz;
-
-	s->s_flags |= MS_RDONLY;
-
-	/* Find the start of the fs */
-	sz = (ROMFH_SIZE +
-	      strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
-	     & ROMFH_MASK;
-
-	s->s_op	= &romfs_ops;
-	root = romfs_iget(s, sz);
-	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		goto out;
-	}
-
-	ret = -ENOMEM;
-	s->s_root = d_alloc_root(root);
-	if (!s->s_root)
-		goto outiput;
-
-	brelse(bh);
-	return 0;
-
-outiput:
-	iput(root);
-out:
-	brelse(bh);
-outnobh:
-	return ret;
-}
-
-/* That's simple too. */
-
-static int
-romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	buf->f_type = ROMFS_MAGIC;
-	buf->f_bsize = ROMBSIZE;
-	buf->f_bfree = buf->f_bavail = buf->f_ffree;
-	buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
-	buf->f_namelen = ROMFS_MAXFN;
-	return 0;
-}
-
-/* some helper routines */
-
-static int
-romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
-{
-	struct buffer_head *bh;
-	unsigned long avail, maxsize, res;
-
-	maxsize = romfs_maxsize(i->i_sb);
-	if (offset >= maxsize)
-		return -1;
-
-	/* strnlen is almost always valid */
-	if (count > maxsize || offset+count > maxsize)
-		count = maxsize-offset;
-
-	bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-	if (!bh)
-		return -1;		/* error */
-
-	avail = ROMBSIZE - (offset & ROMBMASK);
-	maxsize = min_t(unsigned long, count, avail);
-	res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
-	brelse(bh);
-
-	if (res < maxsize)
-		return res;		/* found all of it */
-
-	while (res < count) {
-		offset += maxsize;
-
-		bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-		if (!bh)
-			return -1;
-		maxsize = min_t(unsigned long, count - res, ROMBSIZE);
-		avail = strnlen(bh->b_data, maxsize);
-		res += avail;
-		brelse(bh);
-		if (avail < maxsize)
-			return res;
-	}
-	return res;
-}
-
-static int
-romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
-{
-	struct buffer_head *bh;
-	unsigned long avail, maxsize, res;
-
-	maxsize = romfs_maxsize(i->i_sb);
-	if (offset >= maxsize || count > maxsize || offset+count>maxsize)
-		return -1;
-
-	bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-	if (!bh)
-		return -1;		/* error */
-
-	avail = ROMBSIZE - (offset & ROMBMASK);
-	maxsize = min_t(unsigned long, count, avail);
-	memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
-	brelse(bh);
-
-	res = maxsize;			/* all of it */
-
-	while (res < count) {
-		offset += maxsize;
-		dest += maxsize;
-
-		bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
-		if (!bh)
-			return -1;
-		maxsize = min_t(unsigned long, count - res, ROMBSIZE);
-		memcpy(dest, bh->b_data, maxsize);
-		brelse(bh);
-		res += maxsize;
-	}
-	return res;
-}
-
-static unsigned char romfs_dtype_table[] = {
-	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
-};
-
-static int
-romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct inode *i = filp->f_path.dentry->d_inode;
-	struct romfs_inode ri;
-	unsigned long offset, maxoff;
-	int j, ino, nextfh;
-	int stored = 0;
-	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
-
-	lock_kernel();
-
-	maxoff = romfs_maxsize(i->i_sb);
-
-	offset = filp->f_pos;
-	if (!offset) {
-		offset = i->i_ino & ROMFH_MASK;
-		if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
-			goto out;
-		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-	}
-
-	/* Not really failsafe, but we are read-only... */
-	for(;;) {
-		if (!offset || offset >= maxoff) {
-			offset = maxoff;
-			filp->f_pos = offset;
-			goto out;
-		}
-		filp->f_pos = offset;
-
-		/* Fetch inode info */
-		if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
-			goto out;
-
-		j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
-		if (j < 0)
-			goto out;
-
-		fsname[j]=0;
-		romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
-
-		ino = offset;
-		nextfh = be32_to_cpu(ri.next);
-		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
-			ino = be32_to_cpu(ri.spec);
-		if (filldir(dirent, fsname, j, offset, ino,
-			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
-			goto out;
-		}
-		stored++;
-		offset = nextfh & ROMFH_MASK;
-	}
-out:
-	unlock_kernel();
-	return stored;
-}
-
-static struct dentry *
-romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
-{
-	unsigned long offset, maxoff;
-	long res;
-	int fslen;
-	struct inode *inode = NULL;
-	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
-	struct romfs_inode ri;
-	const char *name;		/* got from dentry */
-	int len;
-
-	res = -EACCES;			/* placeholder for "no data here" */
-	offset = dir->i_ino & ROMFH_MASK;
-	lock_kernel();
-	if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
-		goto error;
-
-	maxoff = romfs_maxsize(dir->i_sb);
-	offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
-	/* OK, now find the file whose name is in "dentry" in the
-	 * directory specified by "dir".  */
-
-	name = dentry->d_name.name;
-	len = dentry->d_name.len;
-
-	for(;;) {
-		if (!offset || offset >= maxoff)
-			goto success; /* negative success */
-		if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
-			goto error;
-
-		/* try to match the first 16 bytes of name */
-		fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
-		if (len < ROMFH_SIZE) {
-			if (len == fslen) {
-				/* both are shorter, and same size */
-				romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
-				if (strncmp (name, fsname, len) == 0)
-					break;
-			}
-		} else if (fslen >= ROMFH_SIZE) {
-			/* both are longer; XXX optimize max size */
-			fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
-			if (len == fslen) {
-				romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
-				if (strncmp(name, fsname, len) == 0)
-					break;
-			}
-		}
-		/* next entry */
-		offset = be32_to_cpu(ri.next) & ROMFH_MASK;
-	}
-
-	/* Hard link handling */
-	if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
-		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
-
-	inode = romfs_iget(dir->i_sb, offset);
-	if (IS_ERR(inode)) {
-		res = PTR_ERR(inode);
-		goto error;
-	}
-
-success:
-	d_add(dentry, inode);
-	res = 0;
-error:
-	unlock_kernel();
-	return ERR_PTR(res);
-}
-
-/*
- * Ok, we do readpage, to be able to execute programs.  Unfortunately,
- * we can't use bmap, since we may have looser alignments.
- */
-
-static int
-romfs_readpage(struct file *file, struct page * page)
-{
-	struct inode *inode = page->mapping->host;
-	loff_t offset, size;
-	unsigned long filled;
-	void *buf;
-	int result = -EIO;
-
-	page_cache_get(page);
-	lock_kernel();
-	buf = kmap(page);
-	if (!buf)
-		goto err_out;
-
-	/* 32 bit warning -- but not for us :) */
-	offset = page_offset(page);
-	size = i_size_read(inode);
-	filled = 0;
-	result = 0;
-	if (offset < size) {
-		unsigned long readlen;
-
-		size -= offset;
-		readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
-
-		filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
-
-		if (filled != readlen) {
-			SetPageError(page);
-			filled = 0;
-			result = -EIO;
-		}
-	}
-
-	if (filled < PAGE_SIZE)
-		memset(buf + filled, 0, PAGE_SIZE-filled);
-
-	if (!result)
-		SetPageUptodate(page);
-	flush_dcache_page(page);
-
-	unlock_page(page);
-
-	kunmap(page);
-err_out:
-	page_cache_release(page);
-	unlock_kernel();
-
-	return result;
-}
-
-/* Mapping from our types to the kernel */
-
-static const struct address_space_operations romfs_aops = {
-	.readpage = romfs_readpage
-};
-
-static const struct file_operations romfs_dir_operations = {
-	.read		= generic_read_dir,
-	.readdir	= romfs_readdir,
-};
-
-static const struct inode_operations romfs_dir_inode_operations = {
-	.lookup		= romfs_lookup,
-};
-
-static mode_t romfs_modemap[] =
-{
-	0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
-	S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
-};
-
-static struct inode *
-romfs_iget(struct super_block *sb, unsigned long ino)
-{
-	int nextfh, ret;
-	struct romfs_inode ri;
-	struct inode *i;
-
-	ino &= ROMFH_MASK;
-	i = iget_locked(sb, ino);
-	if (!i)
-		return ERR_PTR(-ENOMEM);
-	if (!(i->i_state & I_NEW))
-		return i;
-
-	i->i_mode = 0;
-
-	/* Loop for finding the real hard link */
-	for(;;) {
-		if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
-			printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
-				ino);
-			iget_failed(i);
-			return ERR_PTR(-EIO);
-		}
-		/* XXX: do romfs_checksum here too (with name) */
-
-		nextfh = be32_to_cpu(ri.next);
-		if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
-			break;
-
-		ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
-	}
-
-	i->i_nlink = 1;		/* Hard to decide.. */
-	i->i_size = be32_to_cpu(ri.size);
-	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
-	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-
-        /* Precalculate the data offset */
-	ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
-	if (ret >= 0)
-		ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
-	else
-		ino = 0;
-
-        ROMFS_I(i)->i_metasize = ino;
-        ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
-
-        /* Compute permissions */
-        ino = romfs_modemap[nextfh & ROMFH_TYPE];
-	/* only "normal" files have ops */
-	switch (nextfh & ROMFH_TYPE) {
-		case 1:
-			i->i_size = ROMFS_I(i)->i_metasize;
-			i->i_op = &romfs_dir_inode_operations;
-			i->i_fop = &romfs_dir_operations;
-			if (nextfh & ROMFH_EXEC)
-				ino |= S_IXUGO;
-			i->i_mode = ino;
-			break;
-		case 2:
-			i->i_fop = &generic_ro_fops;
-			i->i_data.a_ops = &romfs_aops;
-			if (nextfh & ROMFH_EXEC)
-				ino |= S_IXUGO;
-			i->i_mode = ino;
-			break;
-		case 3:
-			i->i_op = &page_symlink_inode_operations;
-			i->i_data.a_ops = &romfs_aops;
-			i->i_mode = ino | S_IRWXUGO;
-			break;
-		default:
-			/* depending on MBZ for sock/fifos */
-			nextfh = be32_to_cpu(ri.spec);
-			init_special_inode(i, ino,
-					MKDEV(nextfh>>16,nextfh&0xffff));
-	}
-	unlock_new_inode(i);
-	return i;
-}
-
-static struct kmem_cache * romfs_inode_cachep;
-
-static struct inode *romfs_alloc_inode(struct super_block *sb)
-{
-	struct romfs_inode_info *ei;
-	ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
-	if (!ei)
-		return NULL;
-	return &ei->vfs_inode;
-}
-
-static void romfs_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
-}
-
-static void init_once(void *foo)
-{
-	struct romfs_inode_info *ei = foo;
-
-	inode_init_once(&ei->vfs_inode);
-}
-
-static int init_inodecache(void)
-{
-	romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
-					     sizeof(struct romfs_inode_info),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     init_once);
-	if (romfs_inode_cachep == NULL)
-		return -ENOMEM;
-	return 0;
-}
-
-static void destroy_inodecache(void)
-{
-	kmem_cache_destroy(romfs_inode_cachep);
-}
-
-static int romfs_remount(struct super_block *sb, int *flags, char *data)
-{
-	*flags |= MS_RDONLY;
-	return 0;
-}
-
-static const struct super_operations romfs_ops = {
-	.alloc_inode	= romfs_alloc_inode,
-	.destroy_inode	= romfs_destroy_inode,
-	.statfs		= romfs_statfs,
-	.remount_fs	= romfs_remount,
-};
-
-static int romfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
-			   mnt);
-}
-
-static struct file_system_type romfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "romfs",
-	.get_sb		= romfs_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-
-static int __init init_romfs_fs(void)
-{
-	int err = init_inodecache();
-	if (err)
-		goto out1;
-        err = register_filesystem(&romfs_fs_type);
-	if (err)
-		goto out;
-	return 0;
-out:
-	destroy_inodecache();
-out1:
-	return err;
-}
-
-static void __exit exit_romfs_fs(void)
-{
-	unregister_filesystem(&romfs_fs_type);
-	destroy_inodecache();
-}
-
-/* Yes, works even as a module... :) */
-
-module_init(init_romfs_fs)
-module_exit(exit_romfs_fs)
-MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 00000000000..06044a9dc62
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
+/* RomFS internal definitions
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/romfs_fs.h>
+
+struct romfs_inode_info {
+	struct inode	vfs_inode;
+	unsigned long	i_metasize;	/* size of non-data area */
+	unsigned long	i_dataoffset;	/* from the start of fs */
+};
+
+static inline size_t romfs_maxsize(struct super_block *sb)
+{
+	return (size_t) (unsigned long) sb->s_fs_info;
+}
+
+static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
+{
+	return container_of(inode, struct romfs_inode_info, vfs_inode);
+}
+
+/*
+ * mmap-nommu.c
+ */
+#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
+extern const struct file_operations romfs_ro_fops;
+#else
+#define romfs_ro_fops	generic_ro_fops
+#endif
+
+/*
+ * storage.c
+ */
+extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen);
+extern ssize_t romfs_dev_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t maxlen);
+extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
+			     const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 00000000000..f0511e81696
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
+/* NOMMU mmap support for RomFS on MTD devices
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/mtd/super.h>
+#include "internal.h"
+
+/*
+ * try to determine where a shared mapping can be made
+ * - NOMMU only at the moment (MMU doesn't copy private mappings)
+ * - attempts to map through to the underlying MTD device
+ */
+static unsigned long romfs_get_unmapped_area(struct file *file,
+					     unsigned long addr,
+					     unsigned long len,
+					     unsigned long pgoff,
+					     unsigned long flags)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct mtd_info *mtd = inode->i_sb->s_mtd;
+	unsigned long isize, offset;
+
+	if (!mtd)
+		goto cant_map_directly;
+
+	/* len may be page-rounded, so check against the page-rounded EOF */
+	isize = PAGE_ALIGN(i_size_read(inode));
+	offset = pgoff << PAGE_SHIFT;
+	if (offset > isize || len > isize || offset > isize - len)
+		return (unsigned long) -EINVAL;
+
+	/* we need to call down to the MTD layer to do the actual mapping */
+	if (mtd->get_unmapped_area) {
+		if (addr != 0)
+			return (unsigned long) -EINVAL;
+		if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
+			return (unsigned long) -EINVAL;
+		/* clip a rounded-up tail that overhangs the device */
+		offset += ROMFS_I(inode)->i_dataoffset;
+		if (offset >= mtd->size)
+			return (unsigned long) -EINVAL;
+		if (len > mtd->size - offset)
+			len = mtd->size - offset;
+		return mtd->get_unmapped_area(mtd, len, offset, flags);
+	}
+
+cant_map_directly:
+	return (unsigned long) -ENOSYS;
+}
+
+/*
+ * permit a R/O mapping to be made directly through onto an MTD device if
+ * possible
+ */
+static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
+}
+
+const struct file_operations romfs_ro_fops = {
+	.llseek			= generic_file_llseek,
+	.read			= do_sync_read,
+	.aio_read		= generic_file_aio_read,
+	.splice_read		= generic_file_splice_read,
+	.mmap			= romfs_mmap,
+	.get_unmapped_area	= romfs_get_unmapped_area,
+};
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 00000000000..7e3e1e12a08
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
+/* RomFS storage access routines
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/mtd/super.h>
+#include <linux/buffer_head.h>
+#include "internal.h"
+
+#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
+#error no ROMFS backing store interface configured
+#endif
+
+#ifdef CONFIG_ROMFS_ON_MTD
+#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
+
+/*
+ * read data from a romfs image on an MTD device
+ */
+static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	size_t rlen;
+	int ret;
+
+	ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
+	return (ret < 0 || rlen != buflen) ? -EIO : 0;
+}
+
+/*
+ * determine the length of a string in a romfs image on an MTD device
+ */
+static ssize_t romfs_mtd_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t maxlen)
+{
+	ssize_t n = 0;
+	size_t segment;
+	u_char buf[16], *p;
+	size_t len;
+	int ret;
+
+	/* scan the string up to 16 bytes at a time */
+	while (maxlen > 0) {
+		segment = min_t(size_t, maxlen, 16);
+		ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+		if (ret < 0)
+			return ret;
+		p = memchr(buf, 0, len);
+		if (p)
+			return n + (p - buf);
+		maxlen -= len;
+		pos += len;
+		n += len;
+	}
+
+	return n;
+}
+
+/*
+ * compare a string to one in a romfs image on MTD
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
+			     const char *str, size_t size)
+{
+	u_char buf[16];
+	size_t len, segment;
+	int ret;
+
+	/* scan the string up to 16 bytes at a time */
+	while (size > 0) {
+		segment = min_t(size_t, size, 16);
+		ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
+		if (ret < 0)
+			return ret;
+		if (memcmp(buf, str, len) != 0)
+			return 0;
+		size -= len;
+		pos += len;
+		str += len;
+	}
+
+	return 1;
+}
+#endif /* CONFIG_ROMFS_ON_MTD */
+
+#ifdef CONFIG_ROMFS_ON_BLOCK
+/*
+ * read data from a romfs image on a block device; 0 on success, -EIO on error
+ */
+static int romfs_blk_read(struct super_block *sb, unsigned long pos,
+			  void *buf, size_t buflen)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	size_t segment;
+
+	/* copy up to one block at a time, advancing buf past each piece */
+	while (buflen > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, buflen, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		memcpy(buf, bh->b_data + offset, segment);
+		brelse(bh);
+		buf += segment;
+		buflen -= segment;
+		pos += segment;
+	}
+	return 0;
+}
+
+/*
+ * determine the length of a string in romfs on a block device
+ */
+static ssize_t romfs_blk_strnlen(struct super_block *sb,
+				 unsigned long pos, size_t limit)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	ssize_t n = 0;
+	size_t segment;
+	u_char *buf, *p;
+
+	/* scan the string up to blocksize bytes at a time */
+	while (limit > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, limit, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		buf = bh->b_data + offset;
+		p = memchr(buf, 0, segment);
+		brelse(bh);
+		if (p)
+			return n + (p - buf);
+		limit -= segment;
+		pos += segment;
+		n += segment;
+	}
+
+	return n;
+}
+
+/*
+ * compare a string to one in a romfs image on a block device
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
+			     const char *str, size_t size)
+{
+	struct buffer_head *bh;
+	unsigned long offset;
+	size_t segment;
+	bool x;
+
+	/* scan the string up to blocksize bytes at a time */
+	while (size > 0) {
+		offset = pos & (ROMBSIZE - 1);
+		segment = min_t(size_t, size, ROMBSIZE - offset);
+		bh = sb_bread(sb, pos >> ROMBSBITS);
+		if (!bh)
+			return -EIO;
+		x = (memcmp(bh->b_data + offset, str, segment) != 0);
+		brelse(bh);
+		if (x)
+			return 0;
+		size -= segment;
+		pos += segment;
+		str += segment;
+	}
+
+	return 1;
+}
+#endif /* CONFIG_ROMFS_ON_BLOCK */
+
+/*
+ * read data from the romfs image; returns 0 on success or -EIO on failure
+ */
+int romfs_dev_read(struct super_block *sb, unsigned long pos,
+		   void *buf, size_t buflen)
+{
+	size_t limit;
+
+	/* a request running past the image end must fail outright: trimming
+	 * it would return success with the tail of buf left unwritten */
+	limit = romfs_maxsize(sb);
+	if (pos >= limit || buflen > limit - pos)
+		return -EIO;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_read(sb, pos, buf, buflen);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_read(sb, pos, buf, buflen);
+#endif
+	return -EIO;
+}
+
+/*
+ * determine the length of a string in romfs, scanning at most maxlen bytes
+ */
+ssize_t romfs_dev_strnlen(struct super_block *sb,
+			  unsigned long pos, size_t maxlen)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (maxlen > limit - pos)
+		maxlen = limit - pos;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_strnlen(sb, pos, maxlen);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_strnlen(sb, pos, maxlen);
+#endif
+	return -EIO;
+}
+
+/*
+ * compare a string to one in romfs
+ * - return 1 if matched, 0 if differ, -ve if error
+ */
+int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
+		      const char *str, size_t size)
+{
+	size_t limit;
+
+	limit = romfs_maxsize(sb);
+	if (pos >= limit)
+		return -EIO;
+	if (size > ROMFS_MAXFN)
+		return -ENAMETOOLONG;
+	if (size > limit - pos)
+		return -EIO;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd)
+		return romfs_mtd_strncmp(sb, pos, str, size);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev)
+		return romfs_blk_strncmp(sb, pos, str, size);
+#endif
+	return -EIO;
+}
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 00000000000..10ca7d984a8
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,653 @@
+/* Block- or MTD-based romfs
+ *
+ * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * Derived from: ROMFS file system, Linux implementation
+ *
+ * Copyright © 1997-1999  Janos Farkas <chexum@shadow.banki.hu>
+ *
+ * Using parts of the minix filesystem
+ * Copyright © 1991, 1992  Linus Torvalds
+ *
+ * and parts of the affs filesystem additionally
+ * Copyright © 1993  Ray Burr
+ * Copyright © 1996  Hans-Joachim Widmaier
+ *
+ * Changes
+ *					Changed for 2.1.19 modules
+ *	Jan 1997			Initial release
+ *	Jun 1997			2.1.43+ changes
+ *					Proper page locking in readpage
+ *					Changed to work with 2.1.45+ fs
+ *	Jul 1997			Fixed follow_link
+ *			2.1.47
+ *					lookup shouldn't return -ENOENT
+ *					from Horst von Brand:
+ *					  fail on wrong checksum
+ *					  double unlock_super was possible
+ *					  correct namelen for statfs
+ *					spotted by Bill Hawes:
+ *					  readlink shouldn't iput()
+ *	Jun 1998	2.1.106		from Avery Pennarun: glibc scandir()
+ *					  exposed a problem in readdir
+ *			2.1.107		code-freeze spellchecker run
+ *	Aug 1998			2.1.118+ VFS changes
+ *	Sep 1998	2.1.122		another VFS change (follow_link)
+ *	Apr 1999	2.2.7		no more EBADF checking in
+ *					  lookup/readdir, use ERR_PTR
+ *	Jun 1999	2.3.6		d_alloc_root use changed
+ *			2.3.9		clean up usage of ENOENT/negative
+ *					  dentries in lookup
+ *					clean up page flags setting
+ *					  (error, uptodate, locking) in
+ *					  in readpage
+ *					use init_special_inode for
+ *					  fifos/sockets (and streamline) in
+ *					  read_inode, fix _ops table order
+ *	Aug 1999	2.3.16		__initfunc() => __init change
+ *	Oct 1999	2.3.24		page->owner hack obsoleted
+ *	Nov 1999	2.3.27		2.3.25+ page->offset => index change
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/statfs.h>
+#include <linux/mtd/super.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uaccess.h>
+#include "internal.h"
+
+static struct kmem_cache *romfs_inode_cachep;
+
+static const umode_t romfs_modemap[8] = {
+	0,			/* hard link */
+	S_IFDIR  | 0644,	/* directory */
+	S_IFREG  | 0644,	/* regular file */
+	S_IFLNK  | 0777,	/* symlink */
+	S_IFBLK  | 0600,	/* blockdev */
+	S_IFCHR  | 0600,	/* chardev */
+	S_IFSOCK | 0644,	/* socket */
+	S_IFIFO  | 0644		/* FIFO */
+};
+
+static const unsigned char romfs_dtype_table[] = {
+	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
+};
+
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
+
+/*
+ * read a page worth of data from the image
+ */
+static int romfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t offset, size;
+	unsigned long fillsize, pos;
+	void *buf;
+	int ret;
+
+	buf = kmap(page);
+	if (!buf)
+		return -ENOMEM;
+
+	/* 32 bit warning -- but not for us :) */
+	offset = page_offset(page);
+	size = i_size_read(inode);
+	fillsize = 0;
+	ret = 0;
+	if (offset < size) {
+		size -= offset;
+		fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+		pos = ROMFS_I(inode)->i_dataoffset + offset;
+
+		ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
+		if (ret < 0) {
+			SetPageError(page);
+			fillsize = 0;
+			ret = -EIO;
+		}
+	}
+
+	if (fillsize < PAGE_SIZE)
+		memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
+	if (ret == 0)
+		SetPageUptodate(page);
+
+	flush_dcache_page(page);
+	kunmap(page);
+	unlock_page(page);
+	return ret;
+}
+
+static const struct address_space_operations romfs_aops = {
+	.readpage	= romfs_readpage
+};
+
+/*
+ * read the entries from a directory
+ */
+static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *i = filp->f_dentry->d_inode;
+	struct romfs_inode ri;
+	unsigned long offset, maxoff;
+	int j, ino, nextfh;
+	int stored = 0;
+	char fsname[ROMFS_MAXFN];	/* XXX dynamic? */
+	int ret;
+
+	maxoff = romfs_maxsize(i->i_sb);
+
+	offset = filp->f_pos;
+	if (!offset) {
+		offset = i->i_ino & ROMFH_MASK;
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* Not really failsafe, but we are read-only... */
+	for (;;) {
+		if (!offset || offset >= maxoff) {
+			offset = maxoff;
+			filp->f_pos = offset;
+			goto out;
+		}
+		filp->f_pos = offset;
+
+		/* Fetch inode info */
+		ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+		if (ret < 0)
+			goto out;
+
+		j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
+				      sizeof(fsname) - 1);
+		if (j < 0)
+			goto out;
+
+		ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
+		if (ret < 0)
+			goto out;
+		fsname[j] = '\0';
+
+		ino = offset;
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
+			ino = be32_to_cpu(ri.spec);
+		if (filldir(dirent, fsname, j, offset, ino,
+			    romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
+			goto out;
+
+		stored++;
+		offset = nextfh & ROMFH_MASK;
+	}
+
+out:
+	return stored;
+}
+
+/*
+ * look up an entry in a directory
+ */
+static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	unsigned long offset, maxoff;
+	struct inode *inode;
+	struct romfs_inode ri;
+	const char *name;		/* got from dentry */
+	int len, ret;
+
+	offset = dir->i_ino & ROMFH_MASK;
+	ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
+	if (ret < 0)
+		goto error;
+
+	/* search all the file entries in the list starting from the one
+	 * pointed to by the directory's special data */
+	maxoff = romfs_maxsize(dir->i_sb);
+	offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	name = dentry->d_name.name;
+	len = dentry->d_name.len;
+
+	for (;;) {
+		if (!offset || offset >= maxoff)
+			goto out0;
+
+		ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* compare the looked-up name with this entry's filename */
+		ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
+					len);
+		if (ret < 0)
+			goto error;
+		if (ret == 1)
+			break;
+
+		/* next entry */
+		offset = be32_to_cpu(ri.next) & ROMFH_MASK;
+	}
+
+	/* Hard link handling */
+	if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
+		offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
+
+	inode = romfs_iget(dir->i_sb, offset);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto error;
+	}
+	goto outi;
+
+	/*
+	 * it's a bit funky, _lookup needs to return an error code
+	 * (negative) or a NULL, both as a dentry.  ENOENT should not
+	 * be returned, instead we need to create a negative dentry by
+	 * d_add(dentry, NULL); and return 0 as no error.
+	 * (Although as I see, it only matters on writable file
+	 * systems).
+	 */
+out0:
+	inode = NULL;
+outi:
+	d_add(dentry, inode);
+	ret = 0;
+error:
+	return ERR_PTR(ret);
+}
+
+static const struct file_operations romfs_dir_operations = {
+	.read		= generic_read_dir,
+	.readdir	= romfs_readdir,
+};
+
+static const struct inode_operations romfs_dir_inode_operations = {
+	.lookup		= romfs_lookup,
+};
+
+/*
+ * get a romfs inode based on its position in the image (which doubles as the
+ * inode number); returns the inode or an ERR_PTR()-encoded negative errno
+ */
+static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
+{
+	struct romfs_inode_info *inode;
+	struct romfs_inode ri;
+	struct inode *i;
+	unsigned long nlen;
+	unsigned nextfh;
+	int ret;	/* signed: carries negative errnos */
+	umode_t mode;
+
+	/* follow any chain of "hard link" file entries to the actual file */
+	for (;;) {
+		ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
+		if (ret < 0)
+			goto error;
+
+		/* XXX: do romfs_checksum here too (with name) */
+
+		nextfh = be32_to_cpu(ri.next);
+		if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
+			break;
+
+		pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
+	}
+
+	/* determine the length of the filename */
+	nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
+	if (IS_ERR_VALUE(nlen))
+		goto eio;
+
+	/* get an inode for this image position */
+	i = iget_locked(sb, pos);
+	if (!i)
+		return ERR_PTR(-ENOMEM);
+
+	if (!(i->i_state & I_NEW))
+		return i;
+
+	/* precalculate the data offset */
+	inode = ROMFS_I(i);
+	inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
+	inode->i_dataoffset = pos + inode->i_metasize;
+
+	i->i_nlink = 1;		/* Hard to decide.. */
+	i->i_size = be32_to_cpu(ri.size);
+	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
+	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
+
+	/* set up mode and ops */
+	mode = romfs_modemap[nextfh & ROMFH_TYPE];
+
+	switch (nextfh & ROMFH_TYPE) {
+	case ROMFH_DIR:
+		i->i_size = ROMFS_I(i)->i_metasize;
+		i->i_op = &romfs_dir_inode_operations;
+		i->i_fop = &romfs_dir_operations;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_REG:
+		i->i_fop = &romfs_ro_fops;
+		i->i_data.a_ops = &romfs_aops;
+		if (i->i_sb->s_mtd)
+			i->i_data.backing_dev_info =
+				i->i_sb->s_mtd->backing_dev_info;
+		if (nextfh & ROMFH_EXEC)
+			mode |= S_IXUGO;
+		break;
+	case ROMFH_SYM:
+		i->i_op = &page_symlink_inode_operations;
+		i->i_data.a_ops = &romfs_aops;
+		mode |= S_IRWXUGO;
+		break;
+	default:
+		/* depending on MBZ for sock/fifos */
+		nextfh = be32_to_cpu(ri.spec);
+		init_special_inode(i, mode, MKDEV(nextfh >> 16,
+						  nextfh & 0xffff));
+		break;
+	}
+
+	i->i_mode = mode;
+
+	unlock_new_inode(i);
+	return i;
+
+eio:
+	ret = -EIO;
+error:
+	printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
+	return ERR_PTR(ret);
+}
+
+/*
+ * allocate a new inode
+ */
+static struct inode *romfs_alloc_inode(struct super_block *sb)
+{
+	struct romfs_inode_info *inode;
+	inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
+	return inode ? &inode->vfs_inode : NULL;
+}
+
+/*
+ * return a spent inode to the slab cache
+ */
+static void romfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
+}
+
+/*
+ * get filesystem statistics; MTD mounts have no s_bdev so fsid uses s_dev
+ */
+static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev ? sb->s_bdev->bd_dev : sb->s_dev);
+
+	buf->f_type = ROMFS_MAGIC;
+	buf->f_namelen = ROMFS_MAXFN;
+	buf->f_bsize = ROMBSIZE;
+	buf->f_bfree = buf->f_bavail = buf->f_ffree;
+	buf->f_blocks =
+		(romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	return 0;
+}
+
+/*
+ * remounting must involve read-only
+ */
+static int romfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	*flags |= MS_RDONLY;
+	return 0;
+}
+
+static const struct super_operations romfs_super_ops = {
+	.alloc_inode	= romfs_alloc_inode,
+	.destroy_inode	= romfs_destroy_inode,
+	.statfs		= romfs_statfs,
+	.remount_fs	= romfs_remount,
+};
+
+/*
+ * sum the big-endian 32-bit words of a region of a romfs image
+ */
+static __u32 romfs_checksum(const void *data, int size)
+{
+	const __be32 *word = data;
+	__u32 total = 0;
+	int nwords;
+
+	/* only complete words are summed; a trailing partial word (size not
+	 * a multiple of 4) is ignored */
+	for (nwords = size >> 2; nwords > 0; nwords--)
+		total += be32_to_cpu(*word++);
+
+	return total;
+}
+
+/*
+ * fill in the superblock; returns 0 on success or a negative errno
+ */
+static int romfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct romfs_super_block *rsb;
+	struct inode *root;
+	unsigned long pos, img_size;
+	const char *storage;
+	size_t len;
+	int ret;
+
+#ifdef CONFIG_BLOCK
+	if (!sb->s_mtd) {
+		sb_set_blocksize(sb, ROMBSIZE);
+	} else {
+		sb->s_blocksize = ROMBSIZE;
+		sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
+	}
+#endif
+
+	sb->s_maxbytes = 0xFFFFFFFF;
+	sb->s_magic = ROMFS_MAGIC;
+	sb->s_flags |= MS_RDONLY | MS_NOATIME;
+	sb->s_op = &romfs_super_ops;
+
+	/* read the image superblock and check it */
+	rsb = kmalloc(512, GFP_KERNEL);
+	if (!rsb)
+		return -ENOMEM;
+
+	sb->s_fs_info = (void *) 512;
+	ret = romfs_dev_read(sb, 0, rsb, 512);
+	if (ret < 0)
+		goto error_rsb;
+
+	img_size = be32_to_cpu(rsb->size);
+
+	if (sb->s_mtd && img_size > sb->s_mtd->size)
+		goto error_rsb_inval;
+
+	sb->s_fs_info = (void *) img_size;
+
+	if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
+	    img_size < ROMFH_SIZE) {
+		if (!silent)
+			printk(KERN_WARNING "VFS:"
+			       " Can't find a romfs filesystem on dev %s.\n",
+			       sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
+		printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
+		       sb->s_id);
+		goto error_rsb_inval;
+	}
+
+	storage = sb->s_mtd ? "MTD" : "the block layer";
+
+	len = strnlen(rsb->name, ROMFS_MAXFN);
+	if (!silent)
+		printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
+		       (unsigned) len, (unsigned) len, rsb->name, storage);
+
+	kfree(rsb);
+	rsb = NULL;
+
+	/* find the root directory */
+	pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
+
+	root = romfs_iget(sb, pos);
+	if (IS_ERR(root))
+		goto error_rsb_inval;
+
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root)
+		goto error_i;
+
+	return 0;
+
+error_i:
+	iput(root);
+error_rsb_inval:
+	ret = -EINVAL;
+error_rsb:
+	/* rsb is NULL if it was already released above; kfree(NULL) is safe */
+	kfree(rsb);
+	return ret;
+}
+
+/*
+ * get a superblock for mounting
+ */
+static int romfs_get_sb(struct file_system_type *fs_type,
+			int flags, const char *dev_name,
+			void *data, struct vfsmount *mnt)
+{
+	int ret = -EINVAL;
+
+#ifdef CONFIG_ROMFS_ON_MTD
+	ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
+			 mnt);
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (ret == -EINVAL)
+		ret = get_sb_bdev(fs_type, flags, dev_name, data,
+				  romfs_fill_super, mnt);
+#endif
+	return ret;
+}
+
+/*
+ * destroy a romfs superblock in the appropriate manner
+ */
+static void romfs_kill_sb(struct super_block *sb)
+{
+#ifdef CONFIG_ROMFS_ON_MTD
+	if (sb->s_mtd) {
+		kill_mtd_super(sb);
+		return;
+	}
+#endif
+#ifdef CONFIG_ROMFS_ON_BLOCK
+	if (sb->s_bdev) {
+		kill_block_super(sb);
+		return;
+	}
+#endif
+}
+
+static struct file_system_type romfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "romfs",
+	.get_sb		= romfs_get_sb,
+	.kill_sb	= romfs_kill_sb,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+/*
+ * inode storage initialiser
+ */
+static void romfs_i_init_once(void *_inode)
+{
+	struct romfs_inode_info *inode = _inode;
+
+	inode_init_once(&inode->vfs_inode);
+}
+
+/*
+ * romfs module initialisation
+ */
+static int __init init_romfs_fs(void)
+{
+	int ret;
+
+	printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
+
+	romfs_inode_cachep =
+		kmem_cache_create("romfs_i",
+				  sizeof(struct romfs_inode_info), 0,
+				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				  romfs_i_init_once);
+
+	if (!romfs_inode_cachep) {
+		printk(KERN_ERR
+		       "ROMFS error: Failed to initialise inode cache\n");
+		return -ENOMEM;
+	}
+	ret = register_filesystem(&romfs_fs_type);
+	if (ret) {
+		printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
+		goto error_register;
+	}
+	return 0;
+
+error_register:
+	kmem_cache_destroy(romfs_inode_cachep);
+	return ret;
+}
+
+/*
+ * romfs module removal
+ */
+static void __exit exit_romfs_fs(void)
+{
+	unregister_filesystem(&romfs_fs_type);
+	kmem_cache_destroy(romfs_inode_cachep);
+}
+
+module_init(init_romfs_fs);
+module_exit(exit_romfs_fs);
+
+MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/splice.c b/fs/splice.c
index dd727d43e5b..c18aa7e03e2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -737,10 +737,19 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	 * ->write_end. Most of the time, these expect i_mutex to
 	 * be held. Since this may result in an ABBA deadlock with
 	 * pipe->inode, we have to order lock acquiry here.
+	 *
+	 * Outer lock must be inode->i_mutex, as pipe_wait() will
+	 * release and reacquire pipe->inode->i_mutex, AND inode must
+	 * never be a pipe.
 	 */
-	inode_double_lock(inode, pipe->inode);
+	WARN_ON(S_ISFIFO(inode->i_mode));
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+	if (pipe->inode)
+		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
 	ret = __splice_from_pipe(pipe, &sd, actor);
-	inode_double_unlock(inode, pipe->inode);
+	if (pipe->inode)
+		mutex_unlock(&pipe->inode->i_mutex);
+	mutex_unlock(&inode->i_mutex);
 
 	return ret;
 }
@@ -831,11 +840,17 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	};
 	ssize_t ret;
 
-	inode_double_lock(inode, pipe->inode);
+	WARN_ON(S_ISFIFO(inode->i_mode));
+	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
 	ret = file_remove_suid(out);
-	if (likely(!ret))
+	if (likely(!ret)) {
+		if (pipe->inode)
+			mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
 		ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
-	inode_double_unlock(inode, pipe->inode);
+		if (pipe->inode)
+			mutex_unlock(&pipe->inode->i_mutex);
+	}
+	mutex_unlock(&inode->i_mutex);
 	if (ret > 0) {
 		unsigned long nr_pages;
 
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc..2b1b8fe5e03 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
 #include <linux/dcache.h>
 #include <linux/exportfs.h>
 #include <linux/zlib.h>
+#include <linux/slab.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
diff --git a/fs/super.c b/fs/super.c
index 77cb4ec919b..786fe7d7279 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -771,6 +771,46 @@ void kill_litter_super(struct super_block *sb)
 
 EXPORT_SYMBOL(kill_litter_super);
 
+static int ns_test_super(struct super_block *sb, void *data)
+{
+	return sb->s_fs_info == data;
+}
+
+static int ns_set_super(struct super_block *sb, void *data)
+{
+	sb->s_fs_info = data;
+	return set_anon_super(sb, NULL);
+}
+
+int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt)
+{
+	struct super_block *sb;
+
+	sb = sget(fs_type, ns_test_super, ns_set_super, data);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!sb->s_root) {
+		int err;
+		sb->s_flags = flags;
+		err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+		if (err) {
+			up_write(&sb->s_umount);
+			deactivate_super(sb);
+			return err;
+		}
+
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	simple_set_mnt(mnt, sb);
+	return 0;
+}
+
+EXPORT_SYMBOL(get_sb_ns);
+
 #ifdef CONFIG_BLOCK
 static int set_bdev_super(struct super_block *s, void *data)
 {
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890e..af1914462f0 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
 }
 
 /**
- * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
+ * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
  * @c: UBIFS file-system description object
  *
- * This function calculates and returns the number of eraseblocks which should
- * be kept for index usage.
+ * This function calculates and returns the number of LEBs which should be kept
+ * for index usage.
  */
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 {
-	int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
+	int idx_lebs;
 	long long idx_size;
 
 	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
-
 	/* And make sure we have thrice the index size of space reserved */
-	idx_size = idx_size + (idx_size << 1);
-
+	idx_size += idx_size << 1;
 	/*
 	 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
 	 * pair, nor similarly the two variables for the new index size, so we
 	 * have to do this costly 64-bit division on fast-path.
 	 */
-	idx_size += eff_leb_size - 1;
-	idx_lebs = div_u64(idx_size, eff_leb_size);
+	idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
 	/*
 	 * The index head is not available for the in-the-gaps method, so add an
 	 * extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
  * do_budget_space - reserve flash space for index and data growth.
  * @c: UBIFS file-system description object
  *
- * This function makes sure UBIFS has enough free eraseblocks for index growth
- * and data.
+ * This function makes sure UBIFS has enough free LEBs for index growth and
+ * data.
  *
  * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
  * would take if it was consolidated and written to the flash. This guarantees
  * that the "in-the-gaps" commit method always succeeds and UBIFS will always
  * be able to commit dirty index. So this function basically adds amount of
  * budgeted index space to the size of the current index, multiplies this by 3,
- * and makes sure this does not exceed the amount of free eraseblocks.
+ * and makes sure this does not exceed the amount of free LEBs.
  *
  * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
  * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
  *    be large, because UBIFS does not do any index consolidation as long as
  *    there is free space. IOW, the index may take a lot of LEBs, but the LEBs
  *    will contain a lot of dirt.
- * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be
- *   consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW,
+ *    the index may be consolidated to take up to @c->min_idx_lebs LEBs.
  *
  * This function returns zero in case of success, and %-ENOSPC in case of
  * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
  * This function calculates amount of free space to report to user-space.
  *
  * Because UBIFS may introduce substantial overhead (the index, node headers,
- * alignment, wastage at the end of eraseblocks, etc), it cannot report real
- * amount of free flash space it has (well, because not all dirty space is
- * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectations about what free space is. Users seem to
- * accustomed to assume that if the file-system reports N bytes of free space,
- * they would be able to fit a file of N bytes to the FS. This almost works for
+ * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
+ * free flash space it has (well, because not all dirty space is reclaimable,
+ * UBIFS does not actually know the real amount). If UBIFS did so, it would
+ * break user expectations about what free space is. Users seem accustomed to
+ * assuming that if the file-system reports N bytes of free space, they would
+ * be able to fit a file of N bytes to the FS. This almost works for
  * traditional file-systems, because they have way less overhead than UBIFS.
  * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38..ce2cd834361 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
 					  "bad or corrupted node)");
 		else {
 			for (i = 0; i < nlen && dent->name[i]; i++)
-				printk("%c", dent->name[i]);
+				printk(KERN_CONT "%c", dent->name[i]);
 		}
-		printk("\n");
+		printk(KERN_CONT "\n");
 
 		break;
 	}
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
 
 			/*
 			 * Make sure the last key in our znode is less or
-			 * equivalent than the the key in zbranch which goes
+			 * equivalent than the key in the zbranch which goes
 			 * after our pointing zbranch.
 			 */
 			cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e5..6d34dc7e33e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+	int skipped_read = 0;
 	struct page *page;
 
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		/* The page is not loaded from the flash */
-		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
 			/*
 			 * We change whole page so no need to load it. But we
 			 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			 * the media.
 			 */
 			SetPageChecked(page);
-		else {
+			skipped_read = 1;
+		} else {
 			err = do_readpage(page);
 			if (err) {
 				unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(err)) {
 		ubifs_assert(err == -ENOSPC);
 		/*
+		 * If we skipped reading the page because we were going to
+		 * write all of it, then it is not up to date.
+		 */
+		if (skipped_read) {
+			ClearPageChecked(page);
+			ClearPageUptodate(page);
+		}
+		/*
 		 * Budgeting failed which means it would have to force
 		 * write-back but didn't, because we set the @fast flag in the
 		 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
  * whole index and correct all inode sizes, which is long an unacceptable.
  *
  * To prevent situations like this, UBIFS writes pages back only if they are
- * within last synchronized inode size, i.e. the the size which has been
+ * within the last synchronized inode size, i.e. the size which has been
  * written to the flash media last time. Otherwise, UBIFS forces inode
  * write-back, thus making sure the on-flash inode contains current inode size,
  * and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5..1d54383d126 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
  * ubifs_find_free_space - find a data LEB with free space.
  * @c: the UBIFS file-system description object
  * @min_space: minimum amount of required free space
- * @free: contains amount of free space in the LEB on exit
+ * @offs: contains offset of where free space starts on exit
  * @squeeze: whether to try to find space in a non-empty LEB first
  *
  * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
  * failed to find a LEB with @min_space bytes of free space and other a negative
  * error codes in case of failure.
  */
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
 			  int squeeze)
 {
 	const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 		spin_unlock(&c->space_lock);
 	}
 
-	*free = lprops->free;
+	*offs = c->leb_size - lprops->free;
 	ubifs_release_lprops(c);
 
-	if (*free == c->leb_size) {
+	if (*offs == 0) {
 		/*
 		 * Ensure that empty LEBs have been unmapped. They may not have
 		 * been, for example, because of an unclean unmount.  Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
 			return err;
 	}
 
-	dbg_find("found LEB %d, free %d", lnum, *free);
-	ubifs_assert(*free >= min_space);
+	dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
+	ubifs_assert(*offs <= c->leb_size - min_space);
 	return lnum;
 
 out:
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3..f0f5f15d384 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
  * have to waste large pieces of free space at the end of LEB B, because nodes
  * from LEB A would not fit. And the worst situation is when all nodes are of
  * maximum size. So dark watermark is the amount of free + dirty space in LEB
- * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC might
  * be unable to reclaim it. So, LEBs with free + dirty greater than dark
  * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
  * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
 #include "ubifs.h"
 
 /*
- * GC tries to optimize the way it fit nodes to available space, and it sorts
- * nodes a little. The below constants are watermarks which define "large",
- * "medium", and "small" nodes.
- */
-#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
-#define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ
-
-/*
  * GC may need to move more than one LEB to make progress. The below constants
  * define "soft" and "hard" limits on the number of LEBs the garbage collector
  * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
 }
 
 /**
- * joinup - bring data nodes for an inode together.
- * @c: UBIFS file-system description object
- * @sleb: describes scanned LEB
- * @inum: inode number
- * @blk: block number
- * @data: list to which to add data nodes
+ * list_sort - sort a list.
+ * @priv: private data, passed to @cmp
+ * @head: the list to sort
+ * @cmp: the elements comparison function
  *
- * This function looks at the first few nodes in the scanned LEB @sleb and adds
- * them to @data if they are data nodes from @inum and have a larger block
- * number than @blk. This function returns %0 on success and a negative error
- * code on failure.
+ * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
+ * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
+ * in ascending order.
+ *
+ * The comparison function @cmp is supposed to return a negative value if @a is
+ * less than @b, and a positive value if @a is greater than @b. If @a and @b are
+ * equivalent, then it does not matter what this function returns.
  */
-static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
-		  unsigned int blk, struct list_head *data)
+static void list_sort(void *priv, struct list_head *head,
+		      int (*cmp)(void *priv, struct list_head *a,
+				 struct list_head *b))
 {
-	int err, cnt = 6, lnum = sleb->lnum, offs;
-	struct ubifs_scan_node *snod, *tmp;
-	union ubifs_key *key;
+	struct list_head *p, *q, *e, *list, *tail, *oldhead;
+	int insize, nmerges, psize, qsize, i;
+
+	if (list_empty(head))
+		return;
+
+	list = head->next;
+	list_del(head);
+	insize = 1;
+	for (;;) {
+		p = oldhead = list;
+		list = tail = NULL;
+		nmerges = 0;
+
+		while (p) {
+			nmerges++;
+			q = p;
+			psize = 0;
+			for (i = 0; i < insize; i++) {
+				psize++;
+				q = q->next == oldhead ? NULL : q->next;
+				if (!q)
+					break;
+			}
 
-	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
-		key = &snod->key;
-		if (key_inum(c, key) == inum &&
-		    key_type(c, key) == UBIFS_DATA_KEY &&
-		    key_block(c, key) > blk) {
-			offs = snod->offs;
-			err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
-			if (err < 0)
-				return err;
-			list_del(&snod->list);
-			if (err) {
-				list_add_tail(&snod->list, data);
-				blk = key_block(c, key);
-			} else
-				kfree(snod);
-			cnt = 6;
-		} else if (--cnt == 0)
+			qsize = insize;
+			while (psize > 0 || (qsize > 0 && q)) {
+				if (!psize) {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				} else if (!qsize || !q) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else if (cmp(priv, p, q) <= 0) {
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				}
+				if (tail)
+					tail->next = e;
+				else
+					list = e;
+				e->prev = tail;
+				tail = e;
+			}
+			p = q;
+		}
+
+		tail->next = list;
+		list->prev = tail;
+
+		if (nmerges <= 1)
 			break;
+
+		insize *= 2;
 	}
-	return 0;
+
+	head->next = list;
+	head->prev = list->prev;
+	list->prev->next = head;
+	list->prev = head;
 }
 
 /**
- * move_nodes - move nodes.
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @b: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
+
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		unsigned int blka = key_block(c, &sa->key);
+		unsigned int blkb = key_block(c, &sb->key);
+
+		if (blka <= blkb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/*
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @b: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and sorted by length in descending order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash value order.
+ */
+int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	int typea, typeb;
+	ino_t inuma, inumb;
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *sa, *sb;
+
+	cond_resched();
+	sa = list_entry(a, struct ubifs_scan_node, list);
+	sb = list_entry(b, struct ubifs_scan_node, list);
+	typea = key_type(c, &sa->key);
+	typeb = key_type(c, &sb->key);
+	ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
+
+	/* Inodes go before directory entries */
+	if (typea == UBIFS_INO_KEY) {
+		if (typeb == UBIFS_INO_KEY)
+			return sb->len - sa->len;
+		return -1;
+	}
+	if (typeb == UBIFS_INO_KEY)
+		return 1;
+
+	ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
+	inuma = key_inum(c, &sa->key);
+	inumb = key_inum(c, &sb->key);
+
+	if (inuma == inumb) {
+		uint32_t hasha = key_hash(c, &sa->key);
+		uint32_t hashb = key_hash(c, &sb->key);
+
+		if (hasha <= hashb)
+			return -1;
+	} else if (inuma <= inumb)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * sort_nodes - sort nodes for GC.
  * @c: UBIFS file-system description object
- * @sleb: describes nodes to move
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
  *
- * This function moves valid nodes from data LEB described by @sleb to the GC
- * journal head. The obsolete nodes are dropped.
+ * This function sorts the list of nodes to garbage collect. First of all, it
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number;
  *
- * When moving nodes we have to deal with classical bin-packing problem: the
- * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
- * where the nodes in the @sleb->nodes list are the elements which should be
- * fit optimally to the bins. This function uses the "first fit decreasing"
- * strategy, although it does not really sort the nodes but just split them on
- * 3 classes - large, medium, and small, so they are roughly sorted.
+ * Non-data nodes are sorted as follows.
+ *   o First go inode nodes - they are sorted in descending length order.
+ *   o Then go directory entry nodes - they are sorted in hash order, which
+ *     should supposedly optimize 'readdir()'. Direntry nodes with lower parent
+ *     inode number go before direntry nodes with higher parent inode number,
+ *     and direntry nodes with lower name hash values go before direntry nodes
+ *     with higher name hash values.
  *
- * This function returns zero in case of success, %-EAGAIN if commit is
- * required, and other negative error codes in case of other failures.
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
  */
-static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		      struct list_head *nondata, int *min)
 {
 	struct ubifs_scan_node *snod, *tmp;
-	struct list_head data, large, medium, small;
-	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
-	int avail, err, min = INT_MAX;
-	unsigned int blk = 0;
-	ino_t inum = 0;
 
-	INIT_LIST_HEAD(&data);
-	INIT_LIST_HEAD(&large);
-	INIT_LIST_HEAD(&medium);
-	INIT_LIST_HEAD(&small);
+	*min = INT_MAX;
 
-	while (!list_empty(&sleb->nodes)) {
-		struct list_head *lst = sleb->nodes.next;
-
-		snod = list_entry(lst, struct ubifs_scan_node, list);
+	/* Separate data nodes and non-data nodes */
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		int err;
 
 		ubifs_assert(snod->type != UBIFS_IDX_NODE);
 		ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
 					 snod->offs, 0);
 		if (err < 0)
-			goto out;
+			return err;
 
-		list_del(lst);
 		if (!err) {
 			/* The node is obsolete, remove it from the list */
+			list_del(&snod->list);
 			kfree(snod);
 			continue;
 		}
 
-		/*
-		 * Sort the list of nodes so that data nodes go first, large
-		 * nodes go second, and small nodes go last.
-		 */
-		if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
-			if (inum != key_inum(c, &snod->key)) {
-				if (inum) {
-					/*
-					 * Try to move data nodes from the same
-					 * inode together.
-					 */
-					err = joinup(c, sleb, inum, blk, &data);
-					if (err)
-						goto out;
-				}
-				inum = key_inum(c, &snod->key);
-				blk = key_block(c, &snod->key);
-			}
-			list_add_tail(lst, &data);
-		} else if (snod->len > MEDIUM_NODE_WM)
-			list_add_tail(lst, &large);
-		else if (snod->len > SMALL_NODE_WM)
-			list_add_tail(lst, &medium);
-		else
-			list_add_tail(lst, &small);
-
-		/* And find the smallest node */
-		if (snod->len < min)
-			min = snod->len;
+		if (snod->len < *min)
+			*min = snod->len;
+
+		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
+			list_move_tail(&snod->list, nondata);
 	}
 
-	/*
-	 * Join the tree lists so that we'd have one roughly sorted list
-	 * ('large' will be the head of the joined list).
-	 */
-	list_splice(&data, &large);
-	list_splice(&medium, large.prev);
-	list_splice(&small, large.prev);
+	/* Sort data and non-data nodes */
+	list_sort(c, &sleb->nodes, &data_nodes_cmp);
+	list_sort(c, nondata, &nondata_nodes_cmp);
+	return 0;
+}
+
+/**
+ * move_node - move a node.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ * @snod: the node to move
+ * @wbuf: write-buffer to move node to
+ *
+ * This function moves node @snod to @wbuf, changes TNC correspondingly, and
+ * destroys @snod. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
+{
+	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
+
+	cond_resched();
+	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
+	if (err)
+		return err;
+
+	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+				snod->offs, new_lnum, new_offs,
+				snod->len);
+	list_del(&snod->list);
+	kfree(snod);
+	return err;
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. This function returns zero in case of success, %-EAGAIN if
+ * commit is required, and other negative error codes in case of other
+ * failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+	int err, min;
+	LIST_HEAD(nondata);
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
 
 	if (wbuf->lnum == -1) {
 		/*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 		 */
 		err = switch_gc_head(c);
 		if (err)
-			goto out;
+			return err;
 	}
 
+	err = sort_nodes(c, sleb, &nondata, &min);
+	if (err)
+		goto out;
+
 	/* Write nodes to their new location. Use the first-fit strategy */
 	while (1) {
-		avail = c->leb_size - wbuf->offs - wbuf->used;
-		list_for_each_entry_safe(snod, tmp, &large, list) {
-			int new_lnum, new_offs;
+		int avail;
+		struct ubifs_scan_node *snod, *tmp;
+
+		/* Move data nodes */
+		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			if  (snod->len > avail)
+				/*
+				 * Do not skip data nodes in order to optimize
+				 * bulk-read.
+				 */
+				break;
+
+			err = move_node(c, sleb, snod, wbuf);
+			if (err)
+				goto out;
+		}
 
+		/* Move non-data nodes */
+		list_for_each_entry_safe(snod, tmp, &nondata, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
 			if (avail < min)
 				break;
 
-			if (snod->len > avail)
-				/* This node does not fit */
+			if  (snod->len > avail) {
+				/*
+				 * Keep going only if this is an inode with
+				 * some data. Otherwise stop and switch the GC
+				 * head. IOW, we assume that data-less inode
+				 * nodes and direntry nodes are roughly of the
+				 * same size.
+				 */
+				if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+				    snod->len == UBIFS_INO_NODE_SZ)
+					break;
 				continue;
+			}
 
-			cond_resched();
-
-			new_lnum = wbuf->lnum;
-			new_offs = wbuf->offs + wbuf->used;
-			err = ubifs_wbuf_write_nolock(wbuf, snod->node,
-						      snod->len);
+			err = move_node(c, sleb, snod, wbuf);
 			if (err)
 				goto out;
-			err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
-						snod->offs, new_lnum, new_offs,
-						snod->len);
-			if (err)
-				goto out;
-
-			avail = c->leb_size - wbuf->offs - wbuf->used;
-			list_del(&snod->list);
-			kfree(snod);
 		}
 
-		if (list_empty(&large))
+		if (list_empty(&sleb->nodes) && list_empty(&nondata))
 			break;
 
 		/*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
 	return 0;
 
 out:
-	list_for_each_entry_safe(snod, tmp, &large, list) {
-		list_del(&snod->list);
-		kfree(snod);
-	}
+	list_splice_tail(&nondata, &sleb->nodes);
 	return err;
 }
 
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a2..64b5f3a309f 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
  */
 static int reserve_space(struct ubifs_info *c, int jhead, int len)
 {
-	int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
+	int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
 	struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
 
 	/*
@@ -139,10 +139,9 @@ again:
 	 * Write buffer wasn't seek'ed or there is no enough space - look for an
 	 * LEB with some empty space.
 	 */
-	lnum = ubifs_find_free_space(c, len, &free, squeeze);
+	lnum = ubifs_find_free_space(c, len, &offs, squeeze);
 	if (lnum >= 0) {
 		/* Found an LEB, add it to the journal head */
-		offs = c->leb_size - free;
 		err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
 		if (err)
 			goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
  * @host: host inode
  *
  * This function writes the updated version of an extended attribute inode and
- * the host inode tho the journal (to the base head). The host inode is written
+ * the host inode to the journal (to the base head). The host inode is written
  * after the extended attribute inode in order to guarantee that the extended
  * attribute will be flushed when the inode is synchronized by 'fsync()' and
  * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a258..5fa27ea031b 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
  * @c: UBIFS file-system description object
  * @key: the key to get hash from
  */
-static inline int key_hash(const struct ubifs_info *c,
-			   const union ubifs_key *key)
+static inline uint32_t key_hash(const struct ubifs_info *c,
+				const union ubifs_key *key)
 {
 	return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
 }
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
  * @c: UBIFS file-system description object
  * @k: the key to get hash from
  */
-static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
+static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
 {
 	const union ubifs_key *key = k;
 
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa736755..56e33772a1e 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 	}
 
 	/*
-	 * Make sure the the amount of space in buds will not exceed
+	 * Make sure the amount of space in buds will not exceed the
 	 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
 	 * limits.
 	 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
 				bud->jhead, c->leb_size - bud->start,
 				c->cmt_bud_bytes);
 			rb_erase(p1, &c->buds);
-			list_del(&bud->list);
 			/*
 			 * If the commit does not finish, the recovery will need
 			 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
 			 * commit i.e. do not allow them to be garbage
 			 * collected.
 			 */
-			list_add(&bud->list, &c->old_buds);
+			list_move(&bud->list, &c->old_buds);
 		}
 	}
 	spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f..8cbfb824802 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		while (offs + len > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->lsave_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
 		if (offs + c->ltab_sz > c->leb_size) {
 			alen = ALIGN(offs, c->min_io_size);
 			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
-			dbg_chk_lpt_sz(c, 2, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = alloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
 						       alen, UBI_SHORTTERM);
 				if (err)
 					return err;
-				dbg_chk_lpt_sz(c, 4, alen - wlen);
 			}
-			dbg_chk_lpt_sz(c, 2, 0);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
-			from = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
-			dbg_chk_lpt_sz(c, 2, alen - wlen);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
 					      UBI_SHORTTERM);
 			if (err)
 				return err;
-			dbg_chk_lpt_sz(c, 2, alen - wlen);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
 			err = realloc_lpt_leb(c, &lnum);
 			if (err)
 				goto no_space;
-			offs = 0;
+			offs = from = 0;
 			ubifs_assert(lnum >= c->lpt_first &&
 				     lnum <= c->lpt_last);
 			err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
 /**
  * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
  * @c: the UBIFS file-system description object
- * @action: action
+ * @action: what to do
  * @len: length written
  *
  * This function returns %0 on success and a negative error code on failure.
+ * The @action argument may be one of:
+ *   o %0 - LPT debugging checking starts, initialize debugging variables;
+ *   o %1 - wrote an LPT node, increase LPT size by @len bytes;
+ *   o %2 - switched to a different LEB and wasted @len bytes;
+ *   o %3 - check that we've written the right number of bytes;
+ *   o %4 - wasted @len bytes.
  */
 int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 {
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 				       lnum, offs);
 			err = ubifs_unpack_nnode(c, buf, &nnode);
 			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
-				printk("%d:%d", nnode.nbranch[i].lnum,
+				printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
 				       nnode.nbranch[i].offs);
 				if (i != UBIFS_LPT_FANOUT - 1)
-					printk(", ");
+					printk(KERN_CONT ", ");
 			}
-			printk("\n");
+			printk(KERN_CONT "\n");
 			break;
 		}
 		case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e6..10662975d2e 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
  * @lnum: LEB number of the LEB from which @buf was read
  * @offs: offset from which @buf was read
  *
- * This function scans @buf for more nodes and returns %0 is a node is found and
- * %1 if no more nodes are found.
+ * This function ensures that the corrupted node at @offs is the last thing
+ * written to a LEB. This function returns %1 if more data is not found and
+ * %0 if more data is found.
  */
 static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
 			int lnum, int offs)
 {
-	int skip, next_offs = 0;
+	struct ubifs_ch *ch = buf;
+	int skip, dlen = le32_to_cpu(ch->len);
 
-	if (len > UBIFS_DATA_NODE_SZ) {
-		struct ubifs_ch *ch = buf;
-		int dlen = le32_to_cpu(ch->len);
-
-		if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ &&
-		    dlen <= UBIFS_MAX_DATA_NODE_SZ)
-			/* The corrupt node looks like a data node */
-			next_offs = ALIGN(offs + dlen, 8);
-	}
-
-	if (c->min_io_size == 1)
-		skip = 8;
-	else
-		skip = ALIGN(offs + 1, c->min_io_size) - offs;
-
-	offs += skip;
-	buf += skip;
-	len -= skip;
-	while (len > 8) {
-		struct ubifs_ch *ch = buf;
-		uint32_t magic = le32_to_cpu(ch->magic);
-		int ret;
-
-		if (magic == UBIFS_NODE_MAGIC) {
-			ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
-			if (ret == SCANNED_A_NODE || ret > 0) {
-				/*
-				 * There is a small chance this is just data in
-				 * a data node, so check that possibility. e.g.
-				 * this is part of a file that itself contains
-				 * a UBIFS image.
-				 */
-				if (next_offs && offs + le32_to_cpu(ch->len) <=
-				    next_offs)
-					continue;
-				dbg_rcvry("unexpected node at %d:%d", lnum,
-					  offs);
-				return 0;
-			}
-		}
-		offs += 8;
-		buf += 8;
-		len -= 8;
+	/* Check for empty space after the corrupt node's common header */
+	skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
+	if (is_empty(buf + skip, len - skip))
+		return 1;
+	/*
+	 * The area after the common header size is not empty, so the common
+	 * header must be intact. Check it.
+	 */
+	if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
+		dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
+		return 0;
 	}
-	return 1;
+	/* Now we know the corrupt node's length we can skip over it */
+	skip = ALIGN(offs + dlen, c->min_io_size) - offs;
+	/* After which there should be empty space */
+	if (is_empty(buf + skip, len - skip))
+		return 1;
+	dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
+	return 0;
 }
 
 /**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5..11cc80125a4 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
 		dirty -= c->leb_size - lp->free;
 		/*
 		 * If the replay order was perfect the dirty space would now be
-		 * zero. The order is not perfect because the the journal heads
+		 * zero. The order is not perfect because the journal heads
 		 * race with each other. This is not a problem but is does mean
 		 * that the dirty space may temporarily exceed c->leb_size
 		 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1b..57085e43320 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
 	if (tmp64 > DEFAULT_MAX_RP_SIZE)
 		tmp64 = DEFAULT_MAX_RP_SIZE;
 	sup->rp_size = cpu_to_le64(tmp64);
+	sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
 
 	err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
 	kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	if (IS_ERR(sup))
 		return PTR_ERR(sup);
 
+	c->fmt_version = le32_to_cpu(sup->fmt_version);
+	c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
+
 	/*
 	 * The software supports all previous versions but not future versions,
 	 * due to the unavailability of time-travelling equipment.
 	 */
-	c->fmt_version = le32_to_cpu(sup->fmt_version);
 	if (c->fmt_version > UBIFS_FORMAT_VERSION) {
-		ubifs_err("on-flash format version is %d, but software only "
-			  "supports up to version %d", c->fmt_version,
-			  UBIFS_FORMAT_VERSION);
-		err = -EINVAL;
-		goto out;
+		struct super_block *sb = c->vfs_sb;
+		int mounting_ro = sb->s_flags & MS_RDONLY;
+
+		ubifs_assert(!c->ro_media || mounting_ro);
+		if (!mounting_ro ||
+		    c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
+			ubifs_err("on-flash format version is w%d/r%d, but "
+				  "software only supports up to version "
+				  "w%d/r%d", c->fmt_version,
+				  c->ro_compat_version, UBIFS_FORMAT_VERSION,
+				  UBIFS_RO_COMPAT_VERSION);
+			if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
+				ubifs_msg("only R/O mounting is possible");
+				err = -EROFS;
+			} else
+				err = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * The FS is mounted R/O, and the media format is
+		 * R/O-compatible with the UBIFS implementation, so we can
+		 * mount.
+		 */
+		c->rw_incompat = 1;
 	}
 
 	if (c->fmt_version < 3) {
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
 	c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
 	c->main_first = c->leb_cnt - c->main_lebs;
-	c->report_rp_size = ubifs_reported_space(c, c->rp_size);
 
 	err = validate_sb(c, sup);
 out:
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a141..02feb59cefc 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
 		 * Move this one to the end of the list to provide some
 		 * fairness.
 		 */
-		list_del(&c->infos_list);
-		list_add_tail(&c->infos_list, &ubifs_infos);
+		list_move_tail(&c->infos_list, &ubifs_infos);
 		mutex_unlock(&c->umount_mutex);
 		if (freed >= nr)
 			break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
 			}
 
 			if (i == 1) {
-				list_del(&c->infos_list);
-				list_add_tail(&c->infos_list, &ubifs_infos);
+				list_move_tail(&c->infos_list, &ubifs_infos);
 				spin_unlock(&ubifs_infos_lock);
 
 				ubifs_request_bg_commit(c);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459..faa44f90608 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",no_chk_data_crc");
 
 	if (c->mount_opts.override_compr) {
-		seq_printf(s, ",compr=");
-		seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
+		seq_printf(s, ",compr=%s",
+			   ubifs_compr_name(c->mount_opts.compr_type));
 	}
 
 	return 0;
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
 	if (err)
 		return err;
 
+	/* Initialize effective LEB size used in budgeting calculations */
+	c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
 	return 0;
 }
 
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
 	long long tmp64;
 
 	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	c->report_rp_size = ubifs_reported_space(c, c->rp_size);
 
 	/*
 	 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
 			goto out_cbuf;
 
 		/* Create background thread */
-		c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+		c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
 		if (IS_ERR(c->bgt)) {
 			err = PTR_ERR(c->bgt);
 			c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
 		else {
 			c->need_recovery = 0;
 			ubifs_msg("recovery completed");
-			/* GC LEB has to be empty and taken at this point */
-			ubifs_assert(c->lst.taken_empty_lebs == 1);
+			/*
+			 * GC LEB has to be empty and taken at this point. But
+			 * the journal head LEBs may also be accounted as
+			 * "empty taken" if they are empty.
+			 */
+			ubifs_assert(c->lst.taken_empty_lebs > 0);
 		}
 	} else
-		ubifs_assert(c->lst.taken_empty_lebs == 1);
+		ubifs_assert(c->lst.taken_empty_lebs > 0);
 
 	err = dbg_check_filesystem(c);
 	if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
 	x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
 	ubifs_msg("journal size:       %lld bytes (%lld KiB, %lld MiB, %d "
 		  "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
-	ubifs_msg("media format:       %d (latest is %d)",
-		  c->fmt_version, UBIFS_FORMAT_VERSION);
+	ubifs_msg("media format:       w%d/r%d (latest is w%d/r%d)",
+		  c->fmt_version, c->ro_compat_version,
+		  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
 	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
 	ubifs_msg("reserved for root:  %llu bytes (%llu KiB)",
 		c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 {
 	int err, lnum;
 
+	if (c->rw_incompat) {
+		ubifs_err("the file-system is not R/W-compatible");
+		ubifs_msg("on-flash format version is w%d/r%d, but software "
+			  "only supports up to version w%d/r%d", c->fmt_version,
+			  c->ro_compat_version, UBIFS_FORMAT_VERSION,
+			  UBIFS_RO_COMPAT_VERSION);
+		return -EROFS;
+	}
+
 	mutex_lock(&c->umount_mutex);
 	dbg_save_space_info(c);
 	c->remounting_rw = 1;
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	ubifs_create_buds_lists(c);
 
 	/* Create background thread */
-	c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+	c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
 	if (IS_ERR(c->bgt)) {
 		err = PTR_ERR(c->bgt);
 		c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		c->bu.buf = NULL;
 	}
 
-	ubifs_assert(c->lst.taken_empty_lebs == 1);
+	ubifs_assert(c->lst.taken_empty_lebs > 0);
 	return 0;
 }
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1..f249f7b0d65 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
 	 * splitting in the middle of the colliding sequence. Also, when
 	 * removing the leftmost key, we would have to correct the key of the
 	 * parent node, which would introduce additional complications. Namely,
-	 * if we changed the the leftmost key of the parent znode, the garbage
+	 * if we changed the leftmost key of the parent znode, the garbage
 	 * collector would be unable to find it (GC is doing this when GC'ing
 	 * indexing LEBs). Although we already have an additional RB-tree where
 	 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72..3eee07e0c49 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
 /* UBIFS node magic number (must not have the padding byte first or last) */
 #define UBIFS_NODE_MAGIC  0x06101831
 
-/* UBIFS on-flash format version */
+/*
+ * UBIFS on-flash format version. This version is increased when the on-flash
+ * format is changing. If this happens, UBIFS will support older versions as
+ * well. But older UBIFS code will not support newer formats. Format changes
+ * will be rare and only when absolutely necessary, e.g. to fix a bug or to add
+ * a new feature.
+ *
+ * UBIFS went into mainline kernel with format version 4. The older formats
+ * were development formats.
+ */
 #define UBIFS_FORMAT_VERSION 4
 
+/*
+ * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
+ * implementations will not be able to mount newer formats in read-write mode.
+ * However, depending on the change, it may be possible to mount newer formats
+ * in R/O mode. This is indicated by the R/O compatibility version which is
+ * stored in the super-block.
+ *
+ * This is needed to support boot-loaders which only need R/O mounting. With
+ * this version number it is possible to do UBIFS format changes without a
+ * need to update boot-loaders.
+ */
+#define UBIFS_RO_COMPAT_VERSION 0
+
 /* Minimum logical eraseblock size in bytes */
 #define UBIFS_MIN_LEB_SZ (15*1024)
 
@@ -53,7 +75,7 @@
 
 /*
  * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
- * shorter than uncompressed data length, UBIFS preferes to leave this data
+ * shorter than uncompressed data length, UBIFS prefers to leave this data
  * node uncompress, because it'll be read faster.
  */
 #define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
  * @padding2: reserved for future, zeroes
  * @time_gran: time granularity in nanoseconds
  * @uuid: UUID generated when the file system image was created
+ * @ro_compat_version: UBIFS R/O compatibility version
  */
 struct ubifs_sb_node {
 	struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
 	__le64 rp_size;
 	__le32 time_gran;
 	__u8 uuid[16];
-	__u8 padding2[3972];
+	__le32 ro_compat_version;
+	__u8 padding2[3968];
 } __attribute__ ((packed));
 
 /**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29..0a8341e1408 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
  *          by @commit_sem
  * @cnt_lock: protects @highest_inum and @max_sqnum counters
  * @fmt_version: UBIFS on-flash format version
+ * @ro_compat_version: R/O compatibility version
  * @uuid: UUID from super block
  *
  * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
  *                   recovery)
  * @bulk_read: enable bulk-reads
  * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ * @rw_incompat: the media is not R/W compatible
  *
  * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
  *             @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
  * @min_io_shift: number of bits in @min_io_size minus one
  * @leb_size: logical eraseblock size in bytes
  * @half_leb_size: half LEB size
+ * @idx_leb_size: how many bytes of an LEB are effectively available when it is
+ *                used to store indexing nodes (@leb_size - @max_idx_node_sz)
  * @leb_cnt: count of logical eraseblocks
  * @max_leb_cnt: maximum count of logical eraseblocks
  * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
  *             previous commit start
  * @uncat_list: list of un-categorized LEBs
  * @empty_list: list of empty LEBs
- * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size)
- * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size)
+ * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
+ * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
  * @freeable_cnt: number of freeable LEBs in @freeable_list
  *
  * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
 	unsigned long long cmt_no;
 	spinlock_t cnt_lock;
 	int fmt_version;
+	int ro_compat_version;
 	unsigned char uuid[16];
 
 	int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
 	unsigned int default_compr:2;
+	unsigned int rw_incompat:1;
 
 	struct mutex tnc_mutex;
 	struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
 	int min_io_shift;
 	int leb_size;
 	int half_leb_size;
+	int idx_leb_size;
 	int leb_cnt;
 	int max_leb_cnt;
 	int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
 
 /* find.c */
-int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
 			  int squeeze);
 int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
 int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c13f67300fe..7ec89fc05b2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -153,23 +153,6 @@ xfs_find_bdev_for_inode(
 }
 
 /*
- * Schedule IO completion handling on a xfsdatad if this was
- * the final hold on this ioend. If we are asked to wait,
- * flush the workqueue.
- */
-STATIC void
-xfs_finish_ioend(
-	xfs_ioend_t	*ioend,
-	int		wait)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining)) {
-		queue_work(xfsdatad_workqueue, &ioend->io_work);
-		if (wait)
-			flush_workqueue(xfsdatad_workqueue);
-	}
-}
-
-/*
  * We're now finished for good with this ioend structure.
  * Update the page state via the associated buffer_heads,
  * release holds on the inode and bio, and finally free
@@ -310,6 +293,27 @@ xfs_end_bio_read(
 }
 
 /*
+ * Schedule IO completion handling if this was the final hold on this ioend.
+ * Work whose handler is xfs_end_bio_unwritten goes to xfsconvertd, all other
+ * work to xfsdatad. If we are asked to wait, flush the queue that was used.
+ */
+STATIC void
+xfs_finish_ioend(
+	xfs_ioend_t	*ioend,
+	int		wait)
+{
+	if (atomic_dec_and_test(&ioend->io_remaining)) {
+		struct workqueue_struct *wq = xfsdatad_workqueue;
+		if (ioend->io_work.func == xfs_end_bio_unwritten)
+			wq = xfsconvertd_workqueue;
+
+		queue_work(wq, &ioend->io_work);
+		if (wait)
+			flush_workqueue(wq);
+	}
+}
+
+/*
  * Allocate and initialise an IO completion structure.
  * We need to track unwritten extent write completion here initially.
  * We'll need to extend this for updating the ondisk inode size later
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 1dd52884975..221b3e66cee 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -19,6 +19,7 @@
 #define __XFS_AOPS_H__
 
 extern struct workqueue_struct *xfsdatad_workqueue;
+extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
 /*
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index aa1016bb913..e28800a9f2b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = {
 
 static struct workqueue_struct *xfslogd_workqueue;
 struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsconvertd_workqueue;
 
 #ifdef XFS_BUF_TRACE
 void
@@ -1775,6 +1776,7 @@ xfs_flush_buftarg(
 	xfs_buf_t	*bp, *n;
 	int		pincount = 0;
 
+	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
@@ -1831,9 +1833,15 @@ xfs_buf_init(void)
 	if (!xfsdatad_workqueue)
 		goto out_destroy_xfslogd_workqueue;
 
+	xfsconvertd_workqueue = create_workqueue("xfsconvertd");
+	if (!xfsconvertd_workqueue)
+		goto out_destroy_xfsdatad_workqueue;
+
 	register_shrinker(&xfs_buf_shake);
 	return 0;
 
+ out_destroy_xfsdatad_workqueue:
+	destroy_workqueue(xfsdatad_workqueue);
  out_destroy_xfslogd_workqueue:
 	destroy_workqueue(xfslogd_workqueue);
  out_free_buf_zone:
@@ -1849,6 +1857,7 @@ void
 xfs_buf_terminate(void)
 {
 	unregister_shrinker(&xfs_buf_shake);
+	destroy_workqueue(xfsconvertd_workqueue);
 	destroy_workqueue(xfsdatad_workqueue);
 	destroy_workqueue(xfslogd_workqueue);
 	kmem_zone_destroy(xfs_buf_zone);
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 5aeb7777696..08be36d7326 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -74,14 +74,14 @@ xfs_flush_pages(
 
 	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 		xfs_iflags_clear(ip, XFS_ITRUNCATED);
-		ret = filemap_fdatawrite(mapping);
-		if (flags & XFS_B_ASYNC)
-			return -ret;
-		ret2 = filemap_fdatawait(mapping);
-		if (!ret)
-			ret = ret2;
+		ret = -filemap_fdatawrite(mapping);
 	}
-	return -ret;
+	if (flags & XFS_B_ASYNC)
+		return ret;
+	ret2 = xfs_wait_on_pages(ip, first, last);
+	if (!ret)
+		ret = ret2;
+	return ret;
 }
 
 int
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7e90daa0d1d..9142192ccbe 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -751,10 +751,26 @@ start:
 			goto relock;
 		}
 	} else {
+		int enospc = 0;
+		ssize_t ret2 = 0;
+
+write_retry:
 		xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
 				*offset, ioflags);
-		ret = generic_file_buffered_write(iocb, iovp, segs,
+		ret2 = generic_file_buffered_write(iocb, iovp, segs,
 				pos, offset, count, ret);
+		/*
+		 * If we just got an ENOSPC, flush the inode now that we
+		 * aren't holding any page locks and retry *once*.
+		 */
+		if (ret2 == -ENOSPC && !enospc) {
+			error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
+			if (error)
+				goto out_unlock_internal;
+			enospc = 1;
+			goto write_retry;
+		}
+		ret = ret2;
 	}
 
 	current->backing_dev_info = NULL;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a608e72fa40..f7ba76633c2 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -62,12 +62,6 @@ xfs_sync_inodes_ag(
 	uint32_t	first_index = 0;
 	int		error = 0;
 	int		last_error = 0;
-	int		fflag = XFS_B_ASYNC;
-
-	if (flags & SYNC_DELWRI)
-		fflag = XFS_B_DELWRI;
-	if (flags & SYNC_WAIT)
-		fflag = 0;		/* synchronous overrides all */
 
 	do {
 		struct inode	*inode;
@@ -128,11 +122,23 @@ xfs_sync_inodes_ag(
 		 * If we have to flush data or wait for I/O completion
 		 * we need to hold the iolock.
 		 */
-		if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
-			xfs_ilock(ip, XFS_IOLOCK_SHARED);
-			lock_flags |= XFS_IOLOCK_SHARED;
-			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
-			if (flags & SYNC_IOWAIT)
+		if (flags & SYNC_DELWRI) {
+			if (VN_DIRTY(inode)) {
+				if (flags & SYNC_TRYLOCK) {
+					if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+						lock_flags |= XFS_IOLOCK_SHARED;
+				} else {
+					xfs_ilock(ip, XFS_IOLOCK_SHARED);
+					lock_flags |= XFS_IOLOCK_SHARED;
+				}
+				if (lock_flags & XFS_IOLOCK_SHARED) {
+					error = xfs_flush_pages(ip, 0, -1,
+							(flags & SYNC_WAIT) ? 0
+								: XFS_B_ASYNC,
+							FI_NONE);
+				}
+			}
+			if (VN_CACHED(inode) && (flags & SYNC_IOWAIT))
 				xfs_ioend_wait(ip);
 		}
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -398,15 +404,17 @@ STATIC void
 xfs_syncd_queue_work(
 	struct xfs_mount *mp,
 	void		*data,
-	void		(*syncer)(struct xfs_mount *, void *))
+	void		(*syncer)(struct xfs_mount *, void *),
+	struct completion *completion)
 {
-	struct bhv_vfs_sync_work *work;
+	struct xfs_sync_work *work;
 
-	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
 	INIT_LIST_HEAD(&work->w_list);
 	work->w_syncer = syncer;
 	work->w_data = data;
 	work->w_mount = mp;
+	work->w_completion = completion;
 	spin_lock(&mp->m_sync_lock);
 	list_add_tail(&work->w_list, &mp->m_sync_list);
 	spin_unlock(&mp->m_sync_lock);
@@ -420,49 +428,26 @@ xfs_syncd_queue_work(
  * heads, looking about for more room...
  */
 STATIC void
-xfs_flush_inode_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	filemap_flush(inode->i_mapping);
-	iput(inode);
-}
-
-void
-xfs_flush_inode(
-	xfs_inode_t	*ip)
-{
-	struct inode	*inode = VFS_I(ip);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
-	delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
+xfs_flush_inodes_work(
 	struct xfs_mount *mp,
 	void		*arg)
 {
 	struct inode	*inode = arg;
-	sync_blockdev(mp->m_super->s_bdev);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK);
+	xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT);
 	iput(inode);
 }
 
 void
-xfs_flush_device(
+xfs_flush_inodes(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = VFS_I(ip);
+	DECLARE_COMPLETION_ONSTACK(completion);
 
 	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
-	delay(msecs_to_jiffies(500));
+	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
+	wait_for_completion(&completion);
 	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
 }
 
@@ -497,7 +482,7 @@ xfssyncd(
 {
 	struct xfs_mount	*mp = arg;
 	long			timeleft;
-	bhv_vfs_sync_work_t	*work, *n;
+	xfs_sync_work_t		*work, *n;
 	LIST_HEAD		(tmp);
 
 	set_freezable();
@@ -532,6 +517,8 @@ xfssyncd(
 			list_del(&work->w_list);
 			if (work == &mp->m_sync_work)
 				continue;
+			if (work->w_completion)
+				complete(work->w_completion);
 			kmem_free(work);
 		}
 	}
@@ -545,6 +532,7 @@ xfs_syncd_init(
 {
 	mp->m_sync_work.w_syncer = xfs_sync_worker;
 	mp->m_sync_work.w_mount = mp;
+	mp->m_sync_work.w_completion = NULL;
 	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
 	if (IS_ERR(mp->m_sync_task))
 		return -PTR_ERR(mp->m_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 04f058c848a..308d5bf6dfb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -21,18 +21,20 @@
 struct xfs_mount;
 struct xfs_perag;
 
-typedef struct bhv_vfs_sync_work {
+typedef struct xfs_sync_work {
 	struct list_head	w_list;
 	struct xfs_mount	*w_mount;
 	void			*w_data;	/* syncer routine argument */
 	void			(*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
+	struct completion	*w_completion;
+} xfs_sync_work_t;
 
 #define SYNC_ATTR		0x0001	/* sync attributes */
 #define SYNC_DELWRI		0x0002	/* look at delayed writes */
 #define SYNC_WAIT		0x0004	/* wait for i/o to complete */
 #define SYNC_BDFLUSH		0x0008	/* BDFLUSH is calling -- don't block */
 #define SYNC_IOWAIT		0x0010  /* wait for all I/O to complete */
+#define SYNC_TRYLOCK		0x0020  /* only try to lock inodes */
 
 int xfs_syncd_init(struct xfs_mount *mp);
 void xfs_syncd_stop(struct xfs_mount *mp);
@@ -43,8 +45,7 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inode(struct xfs_inode *ip);
-void xfs_flush_device(struct xfs_inode *ip);
+void xfs_flush_inodes(struct xfs_inode *ip);
 
 int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 478e587087f..89b81eedce6 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -69,15 +69,6 @@ xfs_inode_alloc(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
-	/*
-	 * initialise the VFS inode here to get failures
-	 * out of the way early.
-	 */
-	if (!inode_init_always(mp->m_super, VFS_I(ip))) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		return NULL;
-	}
-
 	/* initialise the xfs inode */
 	ip->i_ino = ino;
 	ip->i_mount = mp;
@@ -113,6 +104,20 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
 	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
+	/*
+	 * Now initialise the VFS inode. We do this after the xfs_inode
+	 * initialisation as internal failures will result in ->destroy_inode
+	 * being called and that will pass down through the reclaim path and
+	 * free the XFS inode. This path requires the XFS inode to already be
+	 * initialised. Hence if this call fails, the xfs_inode has already
+	 * been freed and we should not reference it at all in the error
+	 * handling.
+	 */
+	if (!inode_init_always(mp->m_super, VFS_I(ip)))
+		return NULL;
+
+	/* prevent anyone from using this yet */
+	VFS_I(ip)->i_state = I_NEW|I_LOCK;
 
 	return ip;
 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 08ce72316bf..5aaa2d7ec15 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -338,38 +338,6 @@ xfs_iomap_eof_align_last_fsb(
 }
 
 STATIC int
-xfs_flush_space(
-	xfs_inode_t	*ip,
-	int		*fsynced,
-	int		*ioflags)
-{
-	switch (*fsynced) {
-	case 0:
-		if (ip->i_delayed_blks) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inode(ip);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			*fsynced = 1;
-		} else {
-			*ioflags |= BMAPI_SYNC;
-			*fsynced = 2;
-		}
-		return 0;
-	case 1:
-		*fsynced = 2;
-		*ioflags |= BMAPI_SYNC;
-		return 0;
-	case 2:
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		xfs_flush_device(ip);
-		xfs_ilock(ip, XFS_ILOCK_EXCL);
-		*fsynced = 3;
-		return 0;
-	}
-	return 1;
-}
-
-STATIC int
 xfs_cmn_err_fsblock_zero(
 	xfs_inode_t	*ip,
 	xfs_bmbt_irec_t	*imap)
@@ -538,15 +506,9 @@ error_out:
 }
 
 /*
- * If the caller is doing a write at the end of the file,
- * then extend the allocation out to the file system's write
- * iosize.  We clean up any extra space left over when the
- * file is closed in xfs_inactive().
- *
- * For sync writes, we are flushing delayed allocate space to
- * try to make additional space available for allocation near
- * the filesystem full boundary - preallocation hurts in that
- * situation, of course.
+ * If the caller is doing a write at the end of the file, then extend the
+ * allocation out to the file system's write iosize.  We clean up any extra
+ * space left over when the file is closed in xfs_inactive().
  */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate(
 	int		n, error, imaps;
 
 	*prealloc = 0;
-	if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
+	if ((offset + count) <= ip->i_size)
 		return 0;
 
 	/*
@@ -611,7 +573,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc, fsynced = 0;
+	int		prealloc, flushed = 0;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -627,12 +589,12 @@ xfs_iomap_write_delay(
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-retry:
 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
 				ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
 	if (error)
 		return error;
 
+retry:
 	if (prealloc) {
 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
@@ -659,15 +621,22 @@ retry:
 
 	/*
 	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-	 * then we must have run out of space - flush delalloc, and retry..
+	 * then we must have run out of space - flush all other inodes with
+	 * delalloc blocks and retry without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE,
 					ip, offset, count);
-		if (xfs_flush_space(ip, &fsynced, &ioflag))
+		if (flushed)
 			return XFS_ERROR(ENOSPC);
 
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_flush_inodes(ip);
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+		flushed = 1;
 		error = 0;
+		prealloc = 0;
 		goto retry;
 	}
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index a1cc1322fc0..fdcf7b82747 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -40,8 +40,7 @@ typedef enum {
 	BMAPI_IGNSTATE = (1 << 4),	/* ignore unwritten state on read */
 	BMAPI_DIRECT = (1 << 5),	/* direct instead of buffered write */
 	BMAPI_MMAP = (1 << 6),		/* allocate for mmap write */
-	BMAPI_SYNC = (1 << 7),		/* sync write to flush delalloc space */
-	BMAPI_TRYLOCK = (1 << 8),	/* non-blocking request */
+	BMAPI_TRYLOCK = (1 << 7),	/* non-blocking request */
 } bmapi_flags_t;
 
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f76c6d7cea2..3750f04ede0 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -562,9 +562,8 @@ xfs_log_mount(
 	}
 
 	mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
-	if (!mp->m_log) {
-		cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!");
-		error = ENOMEM;
+	if (IS_ERR(mp->m_log)) {
+		error = -PTR_ERR(mp->m_log);
 		goto out;
 	}
 
@@ -1180,10 +1179,13 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	xfs_buf_t		*bp;
 	int			i;
 	int			iclogsize;
+	int			error = ENOMEM;
 
 	log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
-	if (!log)
-		return NULL;
+	if (!log) {
+		xlog_warn("XFS: Log allocation failed: No memory!");
+		goto out;
+	}
 
 	log->l_mp	   = mp;
 	log->l_targ	   = log_target;
@@ -1201,19 +1203,35 @@ xlog_alloc_log(xfs_mount_t	*mp,
 	log->l_grant_reserve_cycle = 1;
 	log->l_grant_write_cycle = 1;
 
+	error = EFSCORRUPTED;
 	if (xfs_sb_version_hassector(&mp->m_sb)) {
 		log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT;
-		ASSERT(log->l_sectbb_log <= mp->m_sectbb_log);
+		if (log->l_sectbb_log < 0 ||
+		    log->l_sectbb_log > mp->m_sectbb_log) {
+			xlog_warn("XFS: Log sector size (0x%x) out of range.",
+						log->l_sectbb_log);
+			goto out_free_log;
+		}
+
 		/* for larger sector sizes, must have v2 or external log */
-		ASSERT(log->l_sectbb_log == 0 ||
-			log->l_logBBstart == 0 ||
-			xfs_sb_version_haslogv2(&mp->m_sb));
-		ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT);
+		if (log->l_sectbb_log != 0 &&
+		    (log->l_logBBstart != 0 &&
+		     !xfs_sb_version_haslogv2(&mp->m_sb))) {
+			xlog_warn("XFS: log sector size (0x%x) invalid "
+				  "for configuration.", log->l_sectbb_log);
+			goto out_free_log;
+		}
+		if (mp->m_sb.sb_logsectlog < BBSHIFT) {
+			xlog_warn("XFS: Log sector log (0x%x) too small.",
+						mp->m_sb.sb_logsectlog);
+			goto out_free_log;
+		}
 	}
 	log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1;
 
 	xlog_get_iclog_buffer_size(mp, log);
 
+	error = ENOMEM;
 	bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
 	if (!bp)
 		goto out_free_log;
@@ -1313,7 +1331,8 @@ out_free_iclog:
 	xfs_buf_free(log->l_xbuf);
 out_free_log:
 	kmem_free(log);
-	return NULL;
+out:
+	return ERR_PTR(-error);
 }	/* xlog_alloc_log */
 
 
@@ -2541,18 +2560,19 @@ redo:
 			xlog_ins_ticketq(&log->l_reserve_headq, tic);
 		xlog_trace_loggrant(log, tic,
 				    "xlog_grant_log_space: sleep 2");
+		spin_unlock(&log->l_grant_lock);
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		spin_lock(&log->l_grant_lock);
+
 		XFS_STATS_INC(xs_sleep_logspace);
 		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
-		if (XLOG_FORCED_SHUTDOWN(log)) {
-			spin_lock(&log->l_grant_lock);
+		spin_lock(&log->l_grant_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
-		}
 
 		xlog_trace_loggrant(log, tic,
 				    "xlog_grant_log_space: wake 2");
-		xlog_grant_push_ail(log->l_mp, need_bytes);
-		spin_lock(&log->l_grant_lock);
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_reserve_headq, tic);
@@ -2631,7 +2651,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 	 * for more free space, otherwise try to get some space for
 	 * this transaction.
 	 */
-
+	need_bytes = tic->t_unit_res;
 	if ((ntic = log->l_write_headq)) {
 		free_bytes = xlog_space_left(log, log->l_grant_write_cycle,
 					     log->l_grant_write_bytes);
@@ -2651,26 +2671,25 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 
 			xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: sleep 1");
+			spin_unlock(&log->l_grant_lock);
+			xlog_grant_push_ail(log->l_mp, need_bytes);
+			spin_lock(&log->l_grant_lock);
+
 			XFS_STATS_INC(xs_sleep_logspace);
 			sv_wait(&tic->t_wait, PINOD|PLTWAIT,
 				&log->l_grant_lock, s);
 
 			/* If we're shutting down, this tic is already
 			 * off the queue */
-			if (XLOG_FORCED_SHUTDOWN(log)) {
-				spin_lock(&log->l_grant_lock);
+			spin_lock(&log->l_grant_lock);
+			if (XLOG_FORCED_SHUTDOWN(log))
 				goto error_return;
-			}
 
 			xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: wake 1");
-			xlog_grant_push_ail(log->l_mp, tic->t_unit_res);
-			spin_lock(&log->l_grant_lock);
 		}
 	}
 
-	need_bytes = tic->t_unit_res;
-
 redo:
 	if (XLOG_FORCED_SHUTDOWN(log))
 		goto error_return;
@@ -2680,19 +2699,20 @@ redo:
 	if (free_bytes < need_bytes) {
 		if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
 			xlog_ins_ticketq(&log->l_write_headq, tic);
+		spin_unlock(&log->l_grant_lock);
+		xlog_grant_push_ail(log->l_mp, need_bytes);
+		spin_lock(&log->l_grant_lock);
+
 		XFS_STATS_INC(xs_sleep_logspace);
 		sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
 
 		/* If we're shutting down, this tic is already off the queue */
-		if (XLOG_FORCED_SHUTDOWN(log)) {
-			spin_lock(&log->l_grant_lock);
+		spin_lock(&log->l_grant_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
-		}
 
 		xlog_trace_loggrant(log, tic,
 				    "xlog_regrant_write_log_space: wake 2");
-		xlog_grant_push_ail(log->l_mp, need_bytes);
-		spin_lock(&log->l_grant_lock);
 		goto redo;
 	} else if (tic->t_flags & XLOG_TIC_IN_Q)
 		xlog_del_ticketq(&log->l_write_headq, tic);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7af44adffc8..d6a64392f98 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -313,7 +313,7 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct task_struct	*m_sync_task;	/* generalised sync thread */
-	bhv_vfs_sync_work_t	m_sync_work;	/* work item for VFS_SYNC */
+	xfs_sync_work_t		m_sync_work;	/* work item for VFS_SYNC */
 	struct list_head	m_sync_list;	/* sync thread work item list */
 	spinlock_t		m_sync_lock;	/* work item list lock */
 	int			m_sync_seq;	/* sync thread generation no. */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 7394c7af5de..19cf90a9c76 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1457,6 +1457,13 @@ xfs_create(
 	error = xfs_trans_reserve(tp, resblks, log_res, 0,
 			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
+		/* flush outstanding delalloc blocks and retry */
+		xfs_flush_inodes(dp);
+		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
+			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+	}
+	if (error == ENOSPC) {
+		/* No space at all so try a "no-allocation" reservation */
 		resblks = 0;
 		error = xfs_trans_reserve(tp, 0, log_res, 0,
 				XFS_TRANS_PERM_LOG_RES, log_count);