diff options
Diffstat (limited to 'fs')
123 files changed, 23328 insertions, 1544 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 86b203fc3c5..9f7270f36b2 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -175,9 +175,34 @@ source "fs/qnx4/Kconfig" source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" - source "fs/exofs/Kconfig" +config NILFS2_FS + tristate "NILFS2 file system support (EXPERIMENTAL)" + depends on BLOCK && EXPERIMENTAL + select CRC32 + help + NILFS2 is a log-structured file system (LFS) supporting continuous + snapshotting. In addition to versioning capability of the entire + file system, users can even restore files mistakenly overwritten or + destroyed just a few seconds ago. Since this file system can keep + consistency like conventional LFS, it achieves quick recovery after + system crashes. + + NILFS2 creates a number of checkpoints every few seconds or per + synchronous write basis (unless there is no change). Users can + select significant versions among continuously created checkpoints, + and can change them into snapshots which will be preserved for long + periods until they are changed back to checkpoints. Each + snapshot is mountable as a read-only file system concurrently with + its writable mount, and this feature is convenient for online backup. + + Some features including atime, extended attributes, and POSIX ACLs, + are not supported yet. + + To compile this file system support as a module, choose M here: the + module will be called nilfs2. If unsure, say N. 
+ endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 70b2aed8713..af6d04700d9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -114,6 +114,7 @@ obj-$(CONFIG_JFS_FS) += jfs/ obj-$(CONFIG_XFS_FS) += xfs/ obj-$(CONFIG_9P_FS) += 9p/ obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c index 49f18942306..7ad36506c25 100644 --- a/fs/afs/netdevices.c +++ b/fs/afs/netdevices.c @@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen) struct net_device *dev; int ret = -ENODEV; - if (maclen != ETH_ALEN) - BUG(); + BUG_ON(maclen != ETH_ALEN); rtnl_lock(); dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER); diff --git a/fs/befs/debug.c b/fs/befs/debug.c index b8e304a0661..622e73775c8 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -17,6 +17,7 @@ #include <linux/spinlock.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/slab.h> #endif /* __KERNEL__ */ diff --git a/fs/befs/super.c b/fs/befs/super.c index 41f2b4d0093..ca40f828f64 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -8,6 +8,7 @@ */ #include <linux/fs.h> +#include <asm/page.h> /* for PAGE_SIZE */ #include "befs.h" #include "super.h" diff --git a/fs/buffer.c b/fs/buffer.c index 5d55a896ff7..13edf7ad3ff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) { struct buffer_head *bh; struct list_head tmp; - struct address_space *mapping; + struct address_space *mapping, *prev_mapping = NULL; int err = 0, err2; INIT_LIST_HEAD(&tmp); @@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * contents - it is a noop if I/O is still in * flight on potentially older contents. 
*/ - ll_rw_block(SWRITE_SYNC, 1, &bh); + ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); + + /* + * Kick off IO for the previous mapping. Note + * that we will not run the very last mapping, + * wait_on_buffer() will do that for us + * through sync_buffer(). + */ + if (prev_mapping && prev_mapping != mapping) + blk_run_address_space(prev_mapping); + prev_mapping = mapping; + brelse(bh); spin_lock(lock); } @@ -1585,6 +1596,16 @@ EXPORT_SYMBOL(unmap_underlying_metadata); * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. + * + * If block_write_full_page() is called with wbc->sync_mode == + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this + * causes the writes to be flagged as synchronous writes, but the + * block device queue will NOT be unplugged, since usually many pages + * will be pushed to the out before the higher-level caller actually + * waits for the writes to be completed. The various wait functions, + * such as wait_on_writeback_range() will ultimately call sync_page() + * which will ultimately call blk_run_backing_dev(), which will end up + * unplugging the device queue. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc) @@ -1595,7 +1616,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; const unsigned blocksize = 1 << inode->i_blkbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 
+ WRITE_SYNC_PLUG : WRITE); BUG_ON(!PageLocked(page)); @@ -2957,12 +2979,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE || rw == SWRITE_SYNC) + if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) lock_buffer(bh); else if (!trylock_buffer(bh)) continue; - if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { + if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC || + rw == SWRITE_SYNC_PLUG) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); @@ -2998,7 +3021,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); diff --git a/fs/direct-io.c b/fs/direct-io.c index b6d43908ff7..da258e7249c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b43b9556366..acf67883110 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -590,9 +590,8 @@ static int ext2_get_blocks(struct inode *inode, if (depth == 0) return (err); -reread: - partial = ext2_get_branch(inode, depth, offsets, chain, &err); + partial = ext2_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); @@ -602,15 +601,16 @@ reread: while (count < maxblocks && count <= blocks_to_boundary) { ext2_fsblk_t blk; - if (!verify_chain(chain, partial)) { + if (!verify_chain(chain, chain + depth - 1)) { /* * Indirect block might be removed by * truncate while we were 
reading it. * Handling of that case: forget what we've * got now, go to reread. */ + err = -EAGAIN; count = 0; - goto changed; + break; } blk = le32_to_cpu(*(chain[depth-1].p + count)); if (blk == first_block + count) @@ -618,7 +618,8 @@ reread: else break; } - goto got_it; + if (err != -EAGAIN) + goto got_it; } /* Next simple case - plain lookup or failed read of indirect block */ @@ -626,6 +627,33 @@ reread: goto cleanup; mutex_lock(&ei->truncate_mutex); + /* + * If the indirect block is missing while we are reading + * the chain(ext3_get_branch() returns -EAGAIN err), or + * if the chain has been changed after we grab the semaphore, + * (either because another process truncated this branch, or + * another get_block allocated this branch) re-grab the chain to see if + * the request block has been allocated or not. + * + * Since we already block the truncate/other get_block + * at this point, we will have the current copy of the chain when we + * splice the branch into the tree. + */ + if (err == -EAGAIN || !verify_chain(chain, partial)) { + while (partial > chain) { + brelse(partial->bh); + partial--; + } + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + if (!partial) { + count++; + mutex_unlock(&ei->truncate_mutex); + if (err) + goto cleanup; + clear_buffer_new(bh_result); + goto got_it; + } + } /* * Okay, we need to do block allocation. Lazily initialize the block @@ -683,12 +711,6 @@ cleanup: partial--; } return err; -changed: - while (partial > chain) { - brelse(partial->bh); - partial--; - } - goto reread; } int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index 8e0cfe44b0f..fb3c1a21b13 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -28,6 +28,25 @@ config EXT3_FS To compile this file system support as a module, choose M here: the module will be called ext3. 
+config EXT3_DEFAULTS_TO_ORDERED + bool "Default to 'data=ordered' in ext3 (legacy option)" + depends on EXT3_FS + help + If a filesystem does not explicitly specify a data ordering + mode, and the journal capability allowed it, ext3 used to + historically default to 'data=ordered'. + + That was a rather unfortunate choice, because it leads to all + kinds of latency problems, and the 'data=writeback' mode is more + appropriate these days. + + You should probably always answer 'n' here, and if you really + want to use 'data=ordered' mode, set it in the filesystem itself + with 'tune2fs -o journal_data_ordered'. + + But if you really want to enable the legacy default, you can do + so by answering 'y' to this question. + config EXT3_FS_XATTR bool "Ext3 extended attributes" depends on EXT3_FS diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 466a332e0bd..fcfa2436185 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1521,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page, if (!page_has_buffers(page)) { create_empty_buffers(page, inode->i_sb->s_blocksize, (1 << BH_Dirty)|(1 << BH_Uptodate)); - } else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { - /* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */ - return block_write_full_page(page, NULL, wbc); + page_bufs = page_buffers(page); + } else { + page_bufs = page_buffers(page); + if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE, + NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } } - page_bufs = page_buffers(page); - handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { @@ -1581,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page, if (ext3_journal_current_handle()) goto out_fail; + if (page_has_buffers(page)) { + if 
(!walk_page_buffers(NULL, page_buffers(page), 0, + PAGE_CACHE_SIZE, NULL, buffer_unmapped)) { + /* Provide NULL get_block() to catch bugs if buffers + * weren't really mapped */ + return block_write_full_page(page, NULL, wbc); + } + } + handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 9e5b8e387e1..599dbfe504c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -44,6 +44,12 @@ #include "acl.h" #include "namei.h" +#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA +#else + #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA +#endif + static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, @@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) cope, else JOURNAL_DATA */ if (journal_check_available_features (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) - set_opt(sbi->s_mount_opt, ORDERED_DATA); + set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE); else set_opt(sbi->s_mount_opt, JOURNAL_DATA); break; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac77d8b8251..6132353dcf6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -342,7 +342,7 @@ static int ext4_valid_extent_idx(struct inode *inode, ext4_fsblk_t block = idx_pblock(ext_idx); struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block > ext4_blocks_count(es)))) + (block >= ext4_blocks_count(es)))) return 0; else return 1; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a2e7952bc5f..c6bd6ced3bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, 
struct inode *inode, - unsigned int *p, unsigned int max) { + __le32 *p, unsigned int max) { unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); - unsigned int *bref = p; + __le32 *bref = p; while (bref < p+max) { - if (unlikely(*bref >= maxblocks)) { + if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { ext4_error(inode->i_sb, function, "block reference %u >= max (%u) " "in inode #%lu, offset=%d", - *bref, maxblocks, + le32_to_cpu(*bref), maxblocks, inode->i_ino, (int)(bref-p)); return -EIO; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9987bba99db..2958f4e6f22 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; + /* check blocks count against device size */ + blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + if (blocks_count && ext4_blocks_count(es) > blocks_count) { + printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " + "exceeds size of device (%llu blocks)\n", + ext4_blocks_count(es), blocks_count); + goto failed_mount; + } + /* * It makes no sense for the first data block to be beyond the end * of the filesystem. 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2b25133524a..06f30e96567 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -938,9 +938,9 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, - unsigned *nbytesp, int write) + size_t *nbytesp, int write) { - unsigned nbytes = *nbytesp; + size_t nbytes = *nbytesp; unsigned long user_addr = (unsigned long) buf; unsigned offset = user_addr & ~PAGE_MASK; int npages; @@ -955,7 +955,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, return 0; } - nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); + nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); down_read(&current->mm->mmap_sem); @@ -1298,6 +1298,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_MAYSHARE) return -ENODEV; + invalidate_inode_pages2(file->f_mapping); + return generic_file_mmap(file, vma); } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 9435dda8f1e..a1cbff2b4d9 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -70,6 +70,10 @@ static int hfs_releasepage(struct page *page, gfp_t mask) BUG(); return 0; } + + if (!tree) + return 0; + if (tree->node_size >= PAGE_CACHE_SIZE) { nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 36ca2e1a4fa..7b6165f25fb 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -349,6 +349,7 @@ void hfs_mdb_put(struct super_block *sb) if (HFS_SB(sb)->nls_disk) unload_nls(HFS_SB(sb)->nls_disk); + free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 
1 : 0); kfree(HFS_SB(sb)); sb->s_fs_info = NULL; } diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f8077b9c898..a8e8513a78a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + /* + * Use plugged writes here, since we want to submit several before + * we unplug the device. We don't do explicit unplugging in here, + * instead we rely on sync_buffer() doing the unplug for us. + */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC; + write_op = WRITE_SYNC_PLUG; spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { DEFINE_WAIT(wait); diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index c7bd649bbbd..3e9afc2a91d 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -55,6 +55,25 @@ * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. + * + * Locking rules: + * We keep two hash tables of revoke records. One hashtable belongs to the + * running transaction (is pointed to by journal->j_revoke), the other one + * belongs to the committing transaction. Accesses to the second hash table + * happen only from the kjournald and no other thread touches this table. Also + * journal_switch_revoke_table() which switches which hashtable belongs to the + * running and which to the committing transaction is called only from + * kjournald. Therefore we need no locks when accessing the hashtable belonging + * to the committing transaction. + * + * All users operating on the hash table belonging to the running transaction + * have a handle to the transaction. Therefore they are safe from kjournald + * switching hash tables under them. For operations on the lists of entries in + * the hash table j_revoke_lock is used. 
+ * + * Finally, also replay code uses the hash tables but at this moment noone else + * can touch them (filesystem isn't mounted yet) and hence no locking is + * needed. */ #ifndef __KERNEL__ @@ -402,8 +421,6 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. - * - * The caller must have the journal locked. */ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { @@ -481,10 +498,7 @@ void journal_switch_revoke_table(journal_t *journal) /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. - * - * Called with the journal lock held. */ - void journal_write_revoke_records(journal_t *journal, transaction_t *transaction) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4ea72377c7a..073c8c3df7c 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal, set_buffer_ordered(bh); barrier_done = 1; } - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); if (barrier_done) clear_buffer_ordered(bh); @@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal, lock_buffer(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); } *cbh = bh; return ret; @@ -190,7 +190,7 @@ retry: set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); if (ret) { unlock_buffer(bh); return ret; @@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; + /* + * Use plugged writes here, since we want to submit several before + * we unplug the device. 
We don't do explicit unplugging in here, + * instead we rely on sync_buffer() doing the unplug for us. + */ if (commit_transaction->t_synchronous_commit) - write_op = WRITE_SYNC; + write_op = WRITE_SYNC_PLUG; stats.u.run.rs_wait = commit_transaction->t_max_wait; stats.u.run.rs_locked = jiffies; stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 77ccf8cb082..043740dde20 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size) size_t s; size -= sizeof(struct jffs2_acl_header); - s = size - 4 * sizeof(struct jffs2_acl_entry_short); - if (s < 0) { + if (size < 4 * sizeof(struct jffs2_acl_entry_short)) { if (size % sizeof(struct jffs2_acl_entry_short)) return -1; return size / sizeof(struct jffs2_acl_entry_short); } else { + s = size - 4 * sizeof(struct jffs2_acl_entry_short); if (s % sizeof(struct jffs2_acl_entry)) return -1; return s / sizeof(struct jffs2_acl_entry) + 4; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index f9211252b5f..9eff2bdae8a 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x) struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) { struct jffs2_xattr_datum *xd; - xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL); + xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL); dbg_memalloc("%p\n", xd); - memset(xd, 0, sizeof(struct jffs2_xattr_datum)); xd->class = RAWNODE_CLASS_XATTR_DATUM; xd->node = (void *)xd; INIT_LIST_HEAD(&xd->xindex); @@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd) struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) { struct jffs2_xattr_ref *ref; - ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL); + ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL); dbg_memalloc("%p\n", ref); - memset(ref, 0, sizeof(struct jffs2_xattr_ref)); ref->class = RAWNODE_CLASS_XATTR_REF; ref->node = (void *)ref; 
return ref; diff --git a/fs/libfs.c b/fs/libfs.c index 4910a36f516..cd223190c4e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, * possibly a read which collects the result - which is stored in a * file-local buffer. */ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. + */ + smp_mb(); + ar->size = n; +} + char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; @@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(memory_read_from_buffer); +EXPORT_SYMBOL(simple_transaction_set); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 763b78a6e9d..83ee34203bd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlm_granted; goto out; case -EAGAIN: + /* + * If this is a blocking request for an + * already pending lock request then we need + * to put it back on lockd's block list + */ + if (wait) + break; ret = nlm_lck_denied; - break; + goto out; case FILE_LOCK_DEFERRED: if (wait) break; @@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - ret = nlm_lck_denied; - if (!wait) - goto out; - ret = nlm_lck_blocked; /* Append to list of blocked */ diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 3523b895eb4..5a97bcfe03e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -516,8 +516,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 
goto out_unlock; ret = nfs_updatepage(filp, page, 0, pagelen); - if (ret == 0) - ret = pagelen; out_unlock: unlock_page(page); if (ret) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 82eaadbff40..6717200923f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1228,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); - kfree(string); switch (token) { case Opt_xprt_udp: @@ -1258,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw, goto out_nomem; token = match_token(string, nfs_xprt_protocol_tokens, args); + kfree(string); switch (token) { case Opt_xprt_udp: diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 44d7d04dab9..503b9da159a 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -1,6 +1,7 @@ config NFSD tristate "NFS server support" depends on INET + depends on FILE_LOCKING select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9dbd2eb9128..7c9fe838f03 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -18,6 +18,7 @@ #include <linux/unistd.h> #include <linux/slab.h> #include <linux/major.h> +#include <linux/magic.h> #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> @@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, struct nfsd3_writeres *resp) { __be32 nfserr; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE(3) %s %d bytes at %ld%s\n", SVCFH_fmt(&argp->fh), @@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &resp->committed); - resp->count = argp->count; + resp->count = cnt; RETURN_STATUS(nfserr); } @@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; /* Note that we don't care for remote fs's here */ - if (sb->s_magic 
== 0x4d44 /* MSDOS_SUPER_MAGIC */) { + if (sb->s_magic == MSDOS_SUPER_MAGIC) { resp->f_properties = NFS3_FSF_BILLYBOY; } resp->f_maxfilesize = sb->s_maxbytes; @@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->p_link_max = EXT2_LINK_MAX; resp->p_name_max = EXT2_NAME_LEN; break; - case 0x4d44: /* MSDOS_SUPER_MAGIC */ + case MSDOS_SUPER_MAGIC: resp->p_case_insensitive = 1; resp->p_case_preserving = 0; break; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c464181b599..290289bd44f 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -218,7 +218,7 @@ static int encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) { __be32 *p; - int len = cb_rec->cbr_fhlen; + int len = cb_rec->cbr_fh.fh_size; RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); @@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); - WRITEMEM(cb_rec->cbr_fhval, len); + WRITEMEM(&cb_rec->cbr_fh.fh_base, len); return 0; } @@ -361,9 +361,8 @@ static struct rpc_program cb_program = { /* Reference counting, callback cleanup, etc., all look racy as heck. * And why is cb_set an atomic? 
*/ -static int do_probe_callback(void *data) +static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp) { - struct nfs4_client *clp = data; struct sockaddr_in addr; struct nfs4_callback *cb = &clp->cl_callback; struct rpc_timeout timeparms = { @@ -384,17 +383,10 @@ static int do_probe_callback(void *data) .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), .client_name = clp->cl_principal, }; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; struct rpc_clnt *client; - int status; - if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) { - status = nfserr_cb_path_down; - goto out_err; - } + if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) + return ERR_PTR(-EINVAL); /* Initialize address */ memset(&addr, 0, sizeof(addr)); @@ -404,9 +396,29 @@ static int do_probe_callback(void *data) /* Create RPC client */ client = rpc_create(&args); + if (IS_ERR(client)) + dprintk("NFSD: couldn't create callback client: %ld\n", + PTR_ERR(client)); + return client; + +} + +static int do_probe_callback(void *data) +{ + struct nfs4_client *clp = data; + struct nfs4_callback *cb = &clp->cl_callback; + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + struct rpc_clnt *client; + int status; + + client = setup_callback_client(clp); if (IS_ERR(client)) { - dprintk("NFSD: couldn't create callback client\n"); status = PTR_ERR(client); + dprintk("NFSD: couldn't create callback client: %d\n", + status); goto out_err; } @@ -422,10 +434,10 @@ static int do_probe_callback(void *data) out_release_client: rpc_shutdown_client(client); out_err: - dprintk("NFSD: warning: no callback path to client %.*s\n", - (int)clp->cl_name.len, clp->cl_name.data); + dprintk("NFSD: warning: no callback path to client %.*s: error %d\n", + (int)clp->cl_name.len, clp->cl_name.data, status); put_nfs4_client(clp); - return status; + return 0; } /* @@ 
-451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) /* * called with dp->dl_count inc'ed. - * nfs4_lock_state() may or may not have been called. */ void nfsd4_cb_recall(struct nfs4_delegation *dp) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 9fa60a3ad48..b2883e9c638 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o open->op_truncate = 0; if (open->op_create) { + /* FIXME: check session persistence and pnfs flags. + * The nfsv4.1 spec requires the following semantics: + * + * Persistent | pNFS | Server REQUIRED | Client Allowed + * Reply Cache | server | | + * -------------+--------+-----------------+-------------------- + * no | no | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * | | | (SHOULD) + * | | and EXCLUSIVE4 | or EXCLUSIVE4 + * | | | (SHOULD NOT) + * no | yes | EXCLUSIVE4_1 | EXCLUSIVE4_1 + * yes | no | GUARDED4 | GUARDED4 + * yes | yes | GUARDED4 | GUARDED4 + */ + /* * Note: create modes (UNCHECKED,GUARDED...) are the same * in NFSv4 as in v3. 
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o (u32 *)open->op_verf.data, &open->op_truncate, &created); - /* If we ever decide to use different attrs to store the - * verifier in nfsd_create_v3, then we'll need to change this + /* + * Following rfc 3530 14.2.16, use the returned bitmask + * to indicate which attributes we used to store the + * verifier: */ if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) - open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | + open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); } else { status = nfsd_lookup(rqstp, current_fh, @@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o goto out; set_change_info(&open->op_cinfo, current_fh); - - /* set reply cache */ fh_dup2(current_fh, &resfh); - open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - &resfh.fh_handle.fh_base, resfh.fh_handle.fh_size); + /* set reply cache */ + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &resfh.fh_handle); if (!created) status = do_open_permission(rqstp, current_fh, open, NFSD_MAY_NOP); @@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info)); /* set replay cache */ - open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size; - memcpy(open->op_stateowner->so_replay.rp_openfh, - &current_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh, + &current_fh->fh_handle); open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) && (open->op_iattr.ia_size == 0); @@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_ return status; } +static void +copy_clientid(clientid_t *clid, struct nfsd4_session *session) +{ + 
struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)session->se_sessionid.data; + + clid->cl_boot = sid->clientid.cl_boot; + clid->cl_id = sid->clientid.cl_id; +} static __be32 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open) { __be32 status; + struct nfsd4_compoundres *resp; + dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n", (int)open->op_fname.len, open->op_fname.data, open->op_stateowner); @@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; + if (nfsd4_has_session(cstate)) + copy_clientid(&open->op_clientid, cstate->session); + nfs4_lock_state(); /* check seqid for replay. set nfs4_owner */ - status = nfsd4_process_open1(open); + resp = rqstp->rq_resp; + status = nfsd4_process_open1(&resp->cstate, open); if (status == nfserr_replay_me) { struct nfs4_replay *rp = &open->op_stateowner->so_replay; fh_put(&cstate->current_fh); - cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len; - memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh, - rp->rp_openfh_len); + fh_copy_shallow(&cstate->current_fh.fh_handle, + &rp->rp_openfh); status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) dprintk("nfsd4_open: replay failed" @@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: - status = nfserr_inval; - if (open->op_create) - goto out; - /* fall through */ case NFS4_OPEN_CLAIM_NULL: /* * (1) set CURRENT_FH to the file being opened, @@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + getattr->ga_bmval[0] &= 
nfsd_suppattrs0(cstate->minorversion); + getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); getattr->ga_fhp = &cstate->current_fh; return nfs_ok; @@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &read->rd_stateid, - CHECK_FH | RD_STATE, &read->rd_filp))) { + if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0; - readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1; + readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); + readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); + readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) @@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); + status = nfs4_preprocess_stateid_op(cstate, + &setattr->sa_stateid, WR_STATE, NULL); nfs4_unlock_state(); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); @@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct file *filp = NULL; u32 *p; __be32 status = nfs_ok; + unsigned long cnt; /* no need to check permission - this will be done in nfsd_write() */ @@ -692,8 +717,7 @@ 
nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid, - CHECK_FH | WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp); if (filp) get_file(filp); nfs4_unlock_state(); @@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - write->wr_bytes_written = write->wr_buflen; + cnt = write->wr_buflen; write->wr_how_written = write->wr_stable_how; p = (u32 *)write->wr_verifier.data; *p++ = nfssvc_boot.tv_sec; @@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfsd_write(rqstp, &cstate->current_fh, filp, write->wr_offset, rqstp->rq_vec, write->wr_vlen, - write->wr_buflen, &write->wr_how_written); + &cnt, &write->wr_how_written); if (filp) fput(filp); + write->wr_bytes_written = cnt; + if (status == nfserr_symlink) status = nfserr_inval; return status; @@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) - || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) + if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) + || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) + || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) return nfserr_attrnotsupp; if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR) || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)) @@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out_kfree; - p = buf + 3; + /* skip bitmap */ + p = buf + 1 + ntohl(buf[0]); status = nfserr_not_same; if (ntohl(*p++) != verify->ve_attrlen) goto out_kfree; @@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum) nfsdstats.nfs4_opcount[opnum]++; } -static void 
cstate_free(struct nfsd4_compound_state *cstate) -{ - if (cstate == NULL) - return; - fh_put(&cstate->current_fh); - fh_put(&cstate->save_fh); - BUG_ON(cstate->replay_owner); - kfree(cstate); -} - -static struct nfsd4_compound_state *cstate_alloc(void) -{ - struct nfsd4_compound_state *cstate; - - cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL); - if (cstate == NULL) - return NULL; - fh_init(&cstate->current_fh, NFS4_FHSIZE); - fh_init(&cstate->save_fh, NFS4_FHSIZE); - cstate->replay_owner = NULL; - return cstate; -} - typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 3 << 0, /* ops required first in compound */ +}; struct nfsd4_operation { nfsd4op_func op_func; u32 op_flags; -/* Most ops require a valid current filehandle; a few don't: */ -#define ALLOWED_WITHOUT_FH 1 -/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */ -#define ALLOWED_ON_ABSENT_FS 2 char *op_name; }; @@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); /* + * This is a replay of a compound for which no cache entry pages + * were used. Encode the sequence operation, and if cachethis is FALSE + * encode the uncache rep error on the next operation. + */ +static __be32 +nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args, + struct nfsd4_compoundres *resp) +{ + struct nfsd4_op *op; + + dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__, + resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis); + + /* Encode the replayed sequence operation */ + BUG_ON(resp->opcnt != 1); + op = &args->ops[resp->opcnt - 1]; + nfsd4_encode_operation(resp, op); + + /*return nfserr_retry_uncached_rep in next operation. 
*/ + if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) { + op = &args->ops[resp->opcnt++]; + op->status = nfserr_retry_uncached_rep; + nfsd4_encode_operation(resp, op); + } + return op->status; +} + +/* + * Enforce NFSv4.1 COMPOUND ordering rules. + * + * TODO: + * - enforce NFS4ERR_NOT_ONLY_OP, + * - DESTROY_SESSION MUST be the final operation in the COMPOUND request. + */ +static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args) +{ + if (args->minorversion && args->opcnt > 0) { + struct nfsd4_op *op = &args->ops[0]; + return (op->status == nfserr_op_illegal) || + (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP); + } + return true; +} + +/* * COMPOUND call. */ static __be32 @@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, { struct nfsd4_op *op; struct nfsd4_operation *opdesc; - struct nfsd4_compound_state *cstate = NULL; + struct nfsd4_compound_state *cstate = &resp->cstate; int slack_bytes; __be32 status; resp->xbuf = &rqstp->rq_res; - resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; + resp->p = rqstp->rq_res.head[0].iov_base + + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; /* reserve space for: taglen, tag, and opcnt */ resp->p += 2 + XDR_QUADLEN(args->taglen); @@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, resp->tag = args->tag; resp->opcnt = 0; resp->rqstp = rqstp; + resp->cstate.minorversion = args->minorversion; + resp->cstate.replay_owner = NULL; + fh_init(&resp->cstate.current_fh, NFS4_FHSIZE); + fh_init(&resp->cstate.save_fh, NFS4_FHSIZE); + /* Use the deferral mechanism only for NFSv4.0 compounds */ + rqstp->rq_usedeferral = (args->minorversion == 0); /* * According to RFC3010, this takes precedence over all other errors. 
*/ status = nfserr_minor_vers_mismatch; - if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) + if (args->minorversion > nfsd_supported_minorversion) goto out; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; + if (!nfs41_op_ordering_ok(args)) { + op = &args->ops[0]; + op->status = nfserr_sequence_pos; + goto encode_op; + } status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { @@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, dprintk("nfsv4 compound op #%d/%d: %d (%s)\n", resp->opcnt, args->opcnt, op->opnum, nfsd4_op_name(op->opnum)); - /* * The XDR decode routines may have pre-set op->status; * for example, if there is a miscellaneous XDR error @@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, BUG_ON(op->status == nfs_ok); encode_op: + /* Only from SEQUENCE or CREATE_SESSION */ + if (resp->cstate.status == nfserr_replay_cache) { + dprintk("%s NFS4.1 replay from cache\n", __func__); + if (nfsd4_not_cached(resp)) + status = nfsd4_enc_uncached_replay(args, resp); + else + status = op->status; + goto out; + } if (op->status == nfserr_replay_me) { op->replay = &cstate->replay_owner->so_replay; nfsd4_encode_replay(resp, op); @@ -961,15 +1028,24 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + if (!rqstp->rq_usedeferral && status == nfserr_dropit) { + dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__); + status = nfserr_jukebox; + } - cstate_free(cstate); + resp->cstate.status = status; + fh_put(&resp->cstate.current_fh); + fh_put(&resp->cstate.save_fh); + BUG_ON(resp->cstate.replay_owner); out: nfsd4_release_compoundargs(args); + /* Reset deferral mechanism for RPC deferrals */ + rqstp->rq_usedeferral = 1; dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } -static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { +static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, .op_name = "OP_ACCESS", @@ -1045,7 
+1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { .op_name = "OP_PUTFH", }, [OP_PUTPUBFH] = { - /* unsupported, just for future reference: */ + .op_func = (nfsd4op_func)nfsd4_putrootfh, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_PUTPUBFH", }, @@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = { .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, .op_name = "OP_RELEASE_LOCKOWNER", }, + + /* NFSv4.1 operations */ + [OP_EXCHANGE_ID] = { + .op_func = (nfsd4op_func)nfsd4_exchange_id, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_EXCHANGE_ID", + }, + [OP_CREATE_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_create_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_CREATE_SESSION", + }, + [OP_DESTROY_SESSION] = { + .op_func = (nfsd4op_func)nfsd4_destroy_session, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_DESTROY_SESSION", + }, + [OP_SEQUENCE] = { + .op_func = (nfsd4op_func)nfsd4_sequence, + .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, + .op_name = "OP_SEQUENCE", + }, }; static const char *nfsd4_op_name(unsigned opnum) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 74f7b67567f..3444c0052a8 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -182,36 +182,26 @@ out_unlock: typedef int (recdir_func)(struct dentry *, struct dentry *); -struct dentry_list { - struct dentry *dentry; +struct name_list { + char name[HEXDIR_LEN]; struct list_head list; }; -struct dentry_list_arg { - struct list_head dentries; - struct dentry *parent; -}; - static int -nfsd4_build_dentrylist(void *arg, const char *name, int namlen, +nfsd4_build_namelist(void *arg, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct dentry_list_arg *dla = arg; - struct list_head *dentries = &dla->dentries; - struct dentry *parent = dla->parent; - struct dentry 
*dentry; - struct dentry_list *child; + struct list_head *names = arg; + struct name_list *entry; - if (name && isdotent(name, namlen)) + if (namlen != HEXDIR_LEN - 1) return 0; - dentry = lookup_one_len(name, parent, namlen); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - child = kmalloc(sizeof(*child), GFP_KERNEL); - if (child == NULL) + entry = kmalloc(sizeof(struct name_list), GFP_KERNEL); + if (entry == NULL) return -ENOMEM; - child->dentry = dentry; - list_add(&child->list, dentries); + memcpy(entry->name, name, HEXDIR_LEN - 1); + entry->name[HEXDIR_LEN - 1] = '\0'; + list_add(&entry->list, names); return 0; } @@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) { const struct cred *original_cred; struct file *filp; - struct dentry_list_arg dla = { - .parent = dir, - }; - struct list_head *dentries = &dla.dentries; - struct dentry_list *child; + LIST_HEAD(names); + struct name_list *entry; + struct dentry *dentry; int status; if (!rec_dir_init) @@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) status = nfs4_save_creds(&original_cred); if (status < 0) return status; - INIT_LIST_HEAD(dentries); filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, current_cred()); status = PTR_ERR(filp); if (IS_ERR(filp)) goto out; - INIT_LIST_HEAD(dentries); - status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla); + status = vfs_readdir(filp, nfsd4_build_namelist, &names); fput(filp); - while (!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - status = f(dir, child->dentry); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + + dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + goto out; + } + status = f(dir, dentry); + dput(dentry); if (status) goto out; - list_del(&child->list); - dput(child->dentry); - kfree(child); + list_del(&entry->list); + kfree(entry); } out: - while 
(!list_empty(dentries)) { - child = list_entry(dentries->next, struct dentry_list, list); - list_del(&child->list); - dput(child->dentry); - kfree(child); + while (!list_empty(&names)) { + entry = list_entry(names.next, struct name_list, list); + list_del(&entry->list); + kfree(entry); } nfs4_reset_creds(original_cred); return status; @@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child) { int status; - if (nfs4_has_reclaimed_state(child->d_name.name)) + /* note: we currently use this path only for minorversion 0 */ + if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; status = nfsd4_clear_clid_dir(parent, child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b6f60f48e94..c65a27b76a9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -68,6 +68,7 @@ static u32 current_delegid = 1; static u32 nfs4_init; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ +static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) @@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */ /* forward declarations */ static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); -static void release_stateid_lockowners(struct nfs4_stateid *open_stp); static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; static void nfs4_set_recdir(char *recdir); -/* Locking: - * - * client_mutex: - * protects clientid_hashtbl[], clientstr_hashtbl[], - * unconfstr_hashtbl[], uncofid_hashtbl[]. - */ +/* Locking: */ + +/* Currently used for almost all code touching nfsv4 state: */ static DEFINE_MUTEX(client_mutex); +/* + * Currently used for the del_recall_lru and file hash table. 
In an + * effort to decrease the scope of the client_mutex, this spinlock may + * eventually cover more: + */ +static DEFINE_SPINLOCK(recall_lock); + static struct kmem_cache *stateowner_slab = NULL; static struct kmem_cache *file_slab = NULL; static struct kmem_cache *stateid_slab = NULL; @@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -/* forward declarations */ -static void release_stateowner(struct nfs4_stateowner *sop); -static void release_stateid(struct nfs4_stateid *stp, int flags); - -/* - * Delegation state - */ - -/* recall_lock protects the del_recall_lru */ -static DEFINE_SPINLOCK(recall_lock); static struct list_head del_recall_lru; -static void -free_nfs4_file(struct kref *kref) -{ - struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref); - list_del(&fp->fi_hash); - iput(fp->fi_inode); - kmem_cache_free(file_slab, fp); -} - static inline void put_nfs4_file(struct nfs4_file *fi) { - kref_put(&fi->fi_ref, free_nfs4_file); + if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) { + list_del(&fi->fi_hash); + spin_unlock(&recall_lock); + iput(fi->fi_inode); + kmem_cache_free(file_slab, fi); + } } static inline void get_nfs4_file(struct nfs4_file *fi) { - kref_get(&fi->fi_ref); + atomic_inc(&fi->fi_ref); } static int num_delegations; @@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_stateid.si_stateownerid = current_delegid++; dp->dl_stateid.si_fileid = 0; dp->dl_stateid.si_generation = 0; - dp->dl_fhlen = current_fh->fh_handle.fh_size; - memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base, - current_fh->fh_handle.fh_size); + fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); list_add(&dp->dl_perfile, &fp->fi_delegations); @@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE]; static struct list_head client_lru; static struct list_head close_lru; +static void unhash_generic_stateid(struct 
nfs4_stateid *stp) +{ + list_del(&stp->st_hash); + list_del(&stp->st_perfile); + list_del(&stp->st_perstateowner); +} + +static void free_generic_stateid(struct nfs4_stateid *stp) +{ + put_nfs4_file(stp->st_file); + kmem_cache_free(stateid_slab, stp); +} + +static void release_lock_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); + free_generic_stateid(stp); +} + +static void unhash_lockowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perstateid); + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_lock_stateid(stp); + } +} + +static void release_lockowner(struct nfs4_stateowner *sop) +{ + unhash_lockowner(sop); + nfs4_put_stateowner(sop); +} + +static void +release_stateid_lockowners(struct nfs4_stateid *open_stp) +{ + struct nfs4_stateowner *lock_sop; + + while (!list_empty(&open_stp->st_lockowners)) { + lock_sop = list_entry(open_stp->st_lockowners.next, + struct nfs4_stateowner, so_perstateid); + /* list_del(&open_stp->st_lockowners); */ + BUG_ON(lock_sop->so_is_open_owner); + release_lockowner(lock_sop); + } +} + +static void release_open_stateid(struct nfs4_stateid *stp) +{ + unhash_generic_stateid(stp); + release_stateid_lockowners(stp); + nfsd_close(stp->st_vfs_file); + free_generic_stateid(stp); +} + +static void unhash_openowner(struct nfs4_stateowner *sop) +{ + struct nfs4_stateid *stp; + + list_del(&sop->so_idhash); + list_del(&sop->so_strhash); + list_del(&sop->so_perclient); + list_del(&sop->so_perstateid); /* XXX: necessary? 
*/ + while (!list_empty(&sop->so_stateids)) { + stp = list_first_entry(&sop->so_stateids, + struct nfs4_stateid, st_perstateowner); + release_open_stateid(stp); + } +} + +static void release_openowner(struct nfs4_stateowner *sop) +{ + unhash_openowner(sop); + list_del(&sop->so_close_lru); + nfs4_put_stateowner(sop); +} + +static DEFINE_SPINLOCK(sessionid_lock); +#define SESSION_HASH_SIZE 512 +static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE]; + +static inline int +hash_sessionid(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid; + + return sid->sequence % SESSION_HASH_SIZE; +} + +static inline void +dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) +{ + u32 *ptr = (u32 *)(&sessionid->data[0]); + dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]); +} + +static void +gen_sessionid(struct nfsd4_session *ses) +{ + struct nfs4_client *clp = ses->se_client; + struct nfsd4_sessionid *sid; + + sid = (struct nfsd4_sessionid *)ses->se_sessionid.data; + sid->clientid = clp->cl_clientid; + sid->sequence = current_sessionid++; + sid->reserved = 0; +} + +/* + * Give the client the number of slots it requests bound by + * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. + * + * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we + * should (up to a point) re-negotiate active sessions and reduce their + * slot usage to make room for new connections. For now we just fail the + * create session. 
+ */ +static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) +{ + int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; + + spin_lock(&nfsd_serv->sv_lock); + if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) + np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; + nfsd_serv->sv_drc_pages_used += np; + spin_unlock(&nfsd_serv->sv_lock); + + if (np <= 0) { + status = nfserr_resource; + fchan->maxreqs = 0; + } else + fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; + + return status; +} + +/* + * fchan holds the client values on input, and the server values on output + */ +static int init_forechannel_attrs(struct svc_rqst *rqstp, + struct nfsd4_session *session, + struct nfsd4_channel_attrs *fchan) +{ + int status = 0; + __u32 maxcount = svc_max_payload(rqstp); + + /* headerpadsz set to zero in encode routine */ + + /* Use the client's max request and max response size if possible */ + if (fchan->maxreq_sz > maxcount) + fchan->maxreq_sz = maxcount; + session->se_fmaxreq_sz = fchan->maxreq_sz; + + if (fchan->maxresp_sz > maxcount) + fchan->maxresp_sz = maxcount; + session->se_fmaxresp_sz = fchan->maxresp_sz; + + /* Set the max response cached size our default which is + * a multiple of PAGE_SIZE and small */ + session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE; + fchan->maxresp_cached = session->se_fmaxresp_cached; + + /* Use the client's maxops if possible */ + if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) + fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; + session->se_fmaxops = fchan->maxops; + + /* try to use the client requested number of slots */ + if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION) + fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION; + + /* FIXME: Error means no more DRC pages so the server should + * recover pages from existing sessions. For now fail session + * creation. 
+ */ + status = set_forechannel_maxreqs(fchan); + + session->se_fnumslots = fchan->maxreqs; + return status; +} + +static int +alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, + struct nfsd4_create_session *cses) +{ + struct nfsd4_session *new, tmp; + int idx, status = nfserr_resource, slotsize; + + memset(&tmp, 0, sizeof(tmp)); + + /* FIXME: For now, we just accept the client back channel attributes. */ + status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel); + if (status) + goto out; + + /* allocate struct nfsd4_session and slot table in one piece */ + slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot); + new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); + if (!new) + goto out; + + memcpy(new, &tmp, sizeof(*new)); + + new->se_client = clp; + gen_sessionid(new); + idx = hash_sessionid(&new->se_sessionid); + memcpy(clp->cl_sessionid.data, new->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + + new->se_flags = cses->flags; + kref_init(&new->se_ref); + spin_lock(&sessionid_lock); + list_add(&new->se_hash, &sessionid_hashtbl[idx]); + list_add(&new->se_perclnt, &clp->cl_sessions); + spin_unlock(&sessionid_lock); + + status = nfs_ok; +out: + return status; +} + +/* caller must hold sessionid_lock */ +static struct nfsd4_session * +find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) +{ + struct nfsd4_session *elem; + int idx; + + dump_sessionid(__func__, sessionid); + idx = hash_sessionid(sessionid); + dprintk("%s: idx is %d\n", __func__, idx); + /* Search in the appropriate list */ + list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { + dump_sessionid("list traversal", &elem->se_sessionid); + if (!memcmp(elem->se_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + return elem; + } + } + + dprintk("%s: session not found\n", __func__); + return NULL; +} + +/* caller must hold sessionid_lock */ +static void +unhash_session(struct nfsd4_session *ses) +{ + list_del(&ses->se_hash); + 
list_del(&ses->se_perclnt); +} + +static void +release_session(struct nfsd4_session *ses) +{ + spin_lock(&sessionid_lock); + unhash_session(ses); + spin_unlock(&sessionid_lock); + nfsd4_put_session(ses); +} + +static void nfsd4_release_respages(struct page **respages, short resused); + +void +free_session(struct kref *kref) +{ + struct nfsd4_session *ses; + int i; + + ses = container_of(kref, struct nfsd4_session, se_ref); + for (i = 0; i < ses->se_fnumslots; i++) { + struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; + nfsd4_release_respages(e->ce_respages, e->ce_resused); + } + kfree(ses->se_slots); + kfree(ses); +} + static inline void renew_client(struct nfs4_client *clp) { @@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid) { if (clid->cl_boot == boot_time) return 0; - dprintk("NFSD stale clientid (%08x/%08x)\n", - clid->cl_boot, clid->cl_id); + dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", + clid->cl_boot, clid->cl_id, boot_time); return 1; } @@ -376,6 +649,8 @@ static inline void free_client(struct nfs4_client *clp) { shutdown_callback_client(clp); + nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, + clp->cl_slot.sl_cache_entry.ce_resused); if (clp->cl_cred.cr_group_info) put_group_info(clp->cl_cred.cr_group_info); kfree(clp->cl_principal); @@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp) list_del(&clp->cl_lru); while (!list_empty(&clp->cl_openowners)) { sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); - release_stateowner(sop); + release_openowner(sop); + } + while (!list_empty(&clp->cl_sessions)) { + struct nfsd4_session *ses; + ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, + se_perclnt); + release_session(ses); } put_nfs4_client(clp); } @@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) INIT_LIST_HEAD(&clp->cl_strhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); + 
INIT_LIST_HEAD(&clp->cl_sessions); INIT_LIST_HEAD(&clp->cl_lru); return clp; } @@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid) return NULL; } +/* + * Return 1 iff clp's clientid establishment method matches the use_exchange_id + * parameter. Matching is based on the fact that at least one of the + * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1 + * + * FIXME: we need to unify the clientid namespaces for nfsv4.x + * and correctly deal with client upgrade/downgrade in EXCHANGE_ID + * and SET_CLIENTID{,_CONFIRM} + */ +static inline int +match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id) +{ + bool has_exchange_flags = (clp->cl_exchange_flags != 0); + return use_exchange_id == has_exchange_flags; +} + static struct nfs4_client * -find_confirmed_client_by_str(const char *dname, unsigned int hashval) +find_confirmed_client_by_str(const char *dname, unsigned int hashval, + bool use_exchange_id) { struct nfs4_client *clp; list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) + if (same_name(clp->cl_recdir, dname) && + match_clientid_establishment(clp, use_exchange_id)) return clp; } return NULL; } static struct nfs4_client * -find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval, + bool use_exchange_id) { struct nfs4_client *clp; list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { - if (same_name(clp->cl_recdir, dname)) + if (same_name(clp->cl_recdir, dname) && + match_clientid_establishment(clp, use_exchange_id)) return clp; } return NULL; @@ -685,6 +987,534 @@ out_err: return; } +void +nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + + resp->cstate.statp = statp; +} + +/* + * Dereference the result pages. 
+ */ +static void +nfsd4_release_respages(struct page **respages, short resused) +{ + int i; + + dprintk("--> %s\n", __func__); + for (i = 0; i < resused; i++) { + if (!respages[i]) + continue; + put_page(respages[i]); + respages[i] = NULL; + } +} + +static void +nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) +{ + int i; + + for (i = 0; i < count; i++) { + topages[i] = frompages[i]; + if (!topages[i]) + continue; + get_page(topages[i]); + } +} + +/* + * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous + * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total + * length of the XDR response is less than se_fmaxresp_cached + * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a + * part of the reply (e.g. readdir). + * + * Store the base and length of the rq_res.head[0] page + * of the NFSv4.1 data, just past the rpc header. + */ +void +nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) +{ + struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + struct svc_rqst *rqstp = resp->rqstp; + struct nfsd4_compoundargs *args = rqstp->rq_argp; + struct nfsd4_op *op = &args->ops[resp->opcnt]; + struct kvec *resv = &rqstp->rq_res.head[0]; + + dprintk("--> %s entry %p\n", __func__, entry); + + /* Don't cache a failed OP_SEQUENCE. */ + if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status) + return; + + nfsd4_release_respages(entry->ce_respages, entry->ce_resused); + entry->ce_opcnt = resp->opcnt; + entry->ce_status = resp->cstate.status; + + /* + * Don't need a page to cache just the sequence operation - the slot + * does this for us! + */ + + if (nfsd4_not_cached(resp)) { + entry->ce_resused = 0; + entry->ce_rpchdrlen = 0; + dprintk("%s Just cache SEQUENCE. 
ce_cachethis %d\n", __func__, + resp->cstate.slot->sl_cache_entry.ce_cachethis); + return; + } + entry->ce_resused = rqstp->rq_resused; + if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1) + entry->ce_resused = NFSD_PAGES_PER_SLOT + 1; + nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages, + entry->ce_resused); + entry->ce_datav.iov_base = resp->cstate.statp; + entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0])); + /* Current request rpc header length*/ + entry->ce_rpchdrlen = (char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0]); +} + +/* + * We keep the rpc header, but take the nfs reply from the replycache. + */ +static int +nfsd41_copy_replay_data(struct nfsd4_compoundres *resp, + struct nfsd4_cache_entry *entry) +{ + struct svc_rqst *rqstp = resp->rqstp; + struct kvec *resv = &resp->rqstp->rq_res.head[0]; + int len; + + /* Current request rpc header length*/ + len = (char *)resp->cstate.statp - + (char *)page_address(rqstp->rq_respages[0]); + if (entry->ce_datav.iov_len + len > PAGE_SIZE) { + dprintk("%s v41 cached reply too large (%Zd).\n", __func__, + entry->ce_datav.iov_len); + return 0; + } + /* copy the cached reply nfsd data past the current rpc header */ + memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base, + entry->ce_datav.iov_len); + resv->iov_len = len + entry->ce_datav.iov_len; + return 1; +} + +/* + * Keep the first page of the replay. Copy the NFSv4.1 data from the first + * cached page. Replace any futher replay pages from the cache. 
+ */ +__be32 +nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, + struct nfsd4_sequence *seq) +{ + struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; + __be32 status; + + dprintk("--> %s entry %p\n", __func__, entry); + + /* + * If this is just the sequence operation, we did not keep + * a page in the cache entry because we can just use the + * slot info stored in struct nfsd4_sequence that was checked + * against the slot in nfsd4_sequence(). + * + * This occurs when seq->cachethis is FALSE, or when the client + * session inactivity timer fires and a solo sequence operation + * is sent (lease renewal). + */ + if (seq && nfsd4_not_cached(resp)) { + seq->maxslots = resp->cstate.session->se_fnumslots; + return nfs_ok; + } + + if (!nfsd41_copy_replay_data(resp, entry)) { + /* + * Not enough room to use the replay rpc header, send the + * cached header. Release all the allocated result pages. + */ + svc_free_res_pages(resp->rqstp); + nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages, + entry->ce_resused); + } else { + /* Release all but the first allocated result page */ + + resp->rqstp->rq_resused--; + svc_free_res_pages(resp->rqstp); + + nfsd4_copy_pages(&resp->rqstp->rq_respages[1], + &entry->ce_respages[1], + entry->ce_resused - 1); + } + + resp->rqstp->rq_resused = entry->ce_resused; + resp->opcnt = entry->ce_opcnt; + resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; + status = entry->ce_status; + + return status; +} + +/* + * Set the exchange_id flags returned by the server. + */ +static void +nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) +{ + /* pNFS is not supported */ + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; + + /* Referrals are supported, Migration is not. */ + new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; + + /* set the wire flags to return to client. 
*/ + clid->flags = new->cl_exchange_flags; +} + +__be32 +nfsd4_exchange_id(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_exchange_id *exid) +{ + struct nfs4_client *unconf, *conf, *new; + int status; + unsigned int strhashval; + char dname[HEXDIR_LEN]; + nfs4_verifier verf = exid->verifier; + u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + + dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " + " ip_addr=%u flags %x, spa_how %d\n", + __func__, rqstp, exid, exid->clname.len, exid->clname.data, + ip_addr, exid->flags, exid->spa_how); + + if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) + return nfserr_inval; + + /* Currently only support SP4_NONE */ + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_SSV: + return nfserr_encr_alg_unsupp; + default: + BUG(); /* checked by xdr code */ + case SP4_MACH_CRED: + return nfserr_serverfault; /* no excuse :-/ */ + } + + status = nfs4_make_rec_clidname(dname, &exid->clname); + + if (status) + goto error; + + strhashval = clientstr_hashval(dname); + + nfs4_lock_state(); + status = nfs_ok; + + conf = find_confirmed_client_by_str(dname, strhashval, true); + if (conf) { + if (!same_verf(&verf, &conf->cl_verifier)) { + /* 18.35.4 case 8 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_not_same; + goto out; + } + /* Client reboot: destroy old state */ + expire_client(conf); + goto out_new; + } + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + /* 18.35.4 case 9 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_perm; + goto out; + } + expire_client(conf); + goto out_new; + } + if (ip_addr != conf->cl_addr && + !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) { + /* Client collision. 18.35.4 case 3 */ + status = nfserr_clid_inuse; + goto out; + } + /* + * Set bit when the owner id and verifier map to an already + * confirmed client id (18.35.3). 
+ */ + exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; + + /* + * Falling into 18.35.4 case 2, possible router replay. + * Leave confirmed record intact and return same result. + */ + copy_verf(conf, &verf); + new = conf; + goto out_copy; + } else { + /* 18.35.4 case 7 */ + if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) { + status = nfserr_noent; + goto out; + } + } + + unconf = find_unconfirmed_client_by_str(dname, strhashval, true); + if (unconf) { + /* + * Possible retry or client restart. Per 18.35.4 case 4, + * a new unconfirmed record should be generated regardless + * of whether any properties have changed. + */ + expire_client(unconf); + } + +out_new: + /* Normal case */ + new = create_client(exid->clname, dname); + if (new == NULL) { + status = nfserr_resource; + goto out; + } + + copy_verf(new, &verf); + copy_cred(&new->cl_cred, &rqstp->rq_cred); + new->cl_addr = ip_addr; + gen_clid(new); + gen_confirm(new); + add_to_unconfirmed(new, strhashval); +out_copy: + exid->clientid.cl_boot = new->cl_clientid.cl_boot; + exid->clientid.cl_id = new->cl_clientid.cl_id; + + new->cl_slot.sl_seqid = 0; + exid->seqid = 1; + nfsd4_set_ex_flags(new, exid); + + dprintk("nfsd4_exchange_id seqid %d flags %x\n", + new->cl_slot.sl_seqid, new->cl_exchange_flags); + status = nfs_ok; + +out: + nfs4_unlock_state(); +error: + dprintk("nfsd4_exchange_id returns %d\n", ntohl(status)); + return status; +} + +static int +check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) +{ + dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, + slot->sl_seqid); + + /* The slot is in use, and no response has been sent. 
*/ + if (slot->sl_inuse) { + if (seqid == slot->sl_seqid) + return nfserr_jukebox; + else + return nfserr_seq_misordered; + } + /* Normal */ + if (likely(seqid == slot->sl_seqid + 1)) + return nfs_ok; + /* Replay */ + if (seqid == slot->sl_seqid) + return nfserr_replay_cache; + /* Wraparound */ + if (seqid == 1 && (slot->sl_seqid + 1) == 0) + return nfs_ok; + /* Misordered replay or misordered new request */ + return nfserr_seq_misordered; +} + +__be32 +nfsd4_create_session(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_create_session *cr_ses) +{ + u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfs4_client *conf, *unconf; + struct nfsd4_slot *slot = NULL; + int status = 0; + + nfs4_lock_state(); + unconf = find_unconfirmed_client(&cr_ses->clientid); + conf = find_confirmed_client(&cr_ses->clientid); + + if (conf) { + slot = &conf->cl_slot; + status = check_slot_seqid(cr_ses->seqid, slot); + if (status == nfserr_replay_cache) { + dprintk("Got a create_session replay! 
seqid= %d\n", + slot->sl_seqid); + cstate->slot = slot; + cstate->status = status; + /* Return the cached reply status */ + status = nfsd4_replay_cache_entry(resp, NULL); + goto out; + } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { + status = nfserr_seq_misordered; + dprintk("Sequence misordered!\n"); + dprintk("Expected seqid= %d but got seqid= %d\n", + slot->sl_seqid, cr_ses->seqid); + goto out; + } + conf->cl_slot.sl_seqid++; + } else if (unconf) { + if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || + (ip_addr != unconf->cl_addr)) { + status = nfserr_clid_inuse; + goto out; + } + + slot = &unconf->cl_slot; + status = check_slot_seqid(cr_ses->seqid, slot); + if (status) { + /* an unconfirmed replay returns misordered */ + status = nfserr_seq_misordered; + goto out; + } + + slot->sl_seqid++; /* from 0 to 1 */ + move_to_confirmed(unconf); + + /* + * We do not support RDMA or persistent sessions + */ + cr_ses->flags &= ~SESSION4_PERSIST; + cr_ses->flags &= ~SESSION4_RDMA; + + conf = unconf; + } else { + status = nfserr_stale_clientid; + goto out; + } + + status = alloc_init_session(rqstp, conf, cr_ses); + if (status) + goto out; + + memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + cr_ses->seqid = slot->sl_seqid; + + slot->sl_inuse = true; + cstate->slot = slot; + /* Ensure a page is used for the cache */ + slot->sl_cache_entry.ce_cachethis = 1; +out: + nfs4_unlock_state(); + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +__be32 +nfsd4_destroy_session(struct svc_rqst *r, + struct nfsd4_compound_state *cstate, + struct nfsd4_destroy_session *sessionid) +{ + struct nfsd4_session *ses; + u32 status = nfserr_badsession; + + /* Notes: + * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid + * - Should we return nfserr_back_chan_busy if waiting for + * callbacks on to-be-destroyed session? + * - Do we need to clear any callback info from previous session? 
+ */ + + dump_sessionid(__func__, &sessionid->sessionid); + spin_lock(&sessionid_lock); + ses = find_in_sessionid_hashtbl(&sessionid->sessionid); + if (!ses) { + spin_unlock(&sessionid_lock); + goto out; + } + + unhash_session(ses); + spin_unlock(&sessionid_lock); + + /* wait for callbacks */ + shutdown_callback_client(ses->se_client); + nfsd4_put_session(ses); + status = nfs_ok; +out: + dprintk("%s returns %d\n", __func__, ntohl(status)); + return status; +} + +__be32 +nfsd4_sequence(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_sequence *seq) +{ + struct nfsd4_compoundres *resp = rqstp->rq_resp; + struct nfsd4_session *session; + struct nfsd4_slot *slot; + int status; + + if (resp->opcnt != 1) + return nfserr_sequence_pos; + + spin_lock(&sessionid_lock); + status = nfserr_badsession; + session = find_in_sessionid_hashtbl(&seq->sessionid); + if (!session) + goto out; + + status = nfserr_badslot; + if (seq->slotid >= session->se_fnumslots) + goto out; + + slot = &session->se_slots[seq->slotid]; + dprintk("%s: slotid %d\n", __func__, seq->slotid); + + status = check_slot_seqid(seq->seqid, slot); + if (status == nfserr_replay_cache) { + cstate->slot = slot; + cstate->session = session; + /* Return the cached reply status and set cstate->status + * for nfsd4_svc_encode_compoundres processing */ + status = nfsd4_replay_cache_entry(resp, seq); + cstate->status = nfserr_replay_cache; + goto replay_cache; + } + if (status) + goto out; + + /* Success! bump slot seqid */ + slot->sl_inuse = true; + slot->sl_seqid = seq->seqid; + slot->sl_cache_entry.ce_cachethis = seq->cachethis; + /* Always set the cache entry cachethis for solo sequence */ + if (nfsd4_is_solo_sequence(resp)) + slot->sl_cache_entry.ce_cachethis = 1; + + cstate->slot = slot; + cstate->session = session; + +replay_cache: + /* Renew the clientid on success and on replay. 
+ * Hold a session reference until done processing the compound: + * nfsd4_put_session called only if the cstate slot is set. + */ + renew_client(session->se_client); + nfsd4_get_session(session); +out: + spin_unlock(&sessionid_lock); + dprintk("%s: return %d\n", __func__, ntohl(status)); + return status; +} + __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) @@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, strhashval = clientstr_hashval(dname); nfs4_lock_state(); - conf = find_confirmed_client_by_str(dname, strhashval); + conf = find_confirmed_client_by_str(dname, strhashval, false); if (conf) { /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; - if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) - || conf->cl_addr != sin->sin_addr.s_addr) { - dprintk("NFSD: setclientid: string in use by clientat %pI4\n", - &conf->cl_addr); + if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { + dprintk("NFSD: setclientid: string in use by client" + " at %pI4\n", &conf->cl_addr); goto out; } } @@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * has a description of SETCLIENTID request processing consisting * of 5 bullet points, labeled as CASE0 - CASE4 below. 
*/ - unconf = find_unconfirmed_client_by_str(dname, strhashval); + unconf = find_unconfirmed_client_by_str(dname, strhashval, false); status = nfserr_resource; if (!conf) { /* @@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, unsigned int hash = clientstr_hashval(unconf->cl_recdir); conf = find_confirmed_client_by_str(unconf->cl_recdir, - hash); + hash, false); if (conf) { nfsd4_remove_clid_dir(conf); expire_client(conf); @@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino) fp = kmem_cache_alloc(file_slab, GFP_KERNEL); if (fp) { - kref_init(&fp->fi_ref); + atomic_set(&fp->fi_ref, 1); INIT_LIST_HEAD(&fp->fi_hash); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + spin_lock(&recall_lock); list_add(&fp->fi_hash, &file_hashtbl[hashval]); + spin_unlock(&recall_lock); fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; @@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return sop; } -static void -release_stateid_lockowners(struct nfs4_stateid *open_stp) -{ - struct nfs4_stateowner *lock_sop; - - while (!list_empty(&open_stp->st_lockowners)) { - lock_sop = list_entry(open_stp->st_lockowners.next, - struct nfs4_stateowner, so_perstateid); - /* list_del(&open_stp->st_lockowners); */ - BUG_ON(lock_sop->so_is_open_owner); - release_stateowner(lock_sop); - } -} - -static void -unhash_stateowner(struct nfs4_stateowner *sop) -{ - struct nfs4_stateid *stp; - - list_del(&sop->so_idhash); - list_del(&sop->so_strhash); - if (sop->so_is_open_owner) - list_del(&sop->so_perclient); - list_del(&sop->so_perstateid); - while (!list_empty(&sop->so_stateids)) { - stp = list_entry(sop->so_stateids.next, - struct nfs4_stateid, st_perstateowner); - if (sop->so_is_open_owner) - release_stateid(stp, OPEN_STATE); - else - release_stateid(stp, LOCK_STATE); - } -} - -static void -release_stateowner(struct nfs4_stateowner *sop) -{ - unhash_stateowner(sop); - 
list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); -} - static inline void init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_stateowner *sop = open->op_stateowner; @@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_stateid.si_generation = 0; stp->st_access_bmap = 0; stp->st_deny_bmap = 0; - __set_bit(open->op_share_access, &stp->st_access_bmap); + __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK, + &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); stp->st_openstp = NULL; } static void -release_stateid(struct nfs4_stateid *stp, int flags) -{ - struct file *filp = stp->st_vfs_file; - - list_del(&stp->st_hash); - list_del(&stp->st_perfile); - list_del(&stp->st_perstateowner); - if (flags & OPEN_STATE) { - release_stateid_lockowners(stp); - stp->st_vfs_file = NULL; - nfsd_close(filp); - } else if (flags & LOCK_STATE) - locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner); - put_nfs4_file(stp->st_file); - kmem_cache_free(stateid_slab, stp); -} - -static void move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); @@ -1160,20 +1932,33 @@ find_file(struct inode *ino) unsigned int hashval = file_hashval(ino); struct nfs4_file *fp; + spin_lock(&recall_lock); list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { if (fp->fi_inode == ino) { get_nfs4_file(fp); + spin_unlock(&recall_lock); return fp; } } + spin_unlock(&recall_lock); return NULL; } -static inline int access_valid(u32 x) +static inline int access_valid(u32 x, u32 minorversion) { - if (x < NFS4_SHARE_ACCESS_READ) + if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ) return 0; - if (x > NFS4_SHARE_ACCESS_BOTH) + if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH) + return 0; + x &= ~NFS4_SHARE_ACCESS_MASK; + if (minorversion && x) { + if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL) 
+ return 0; + if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED) + return 0; + x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK); + } + if (x) return 0; return 1; } @@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = { __be32 -nfsd4_process_open1(struct nfsd4_open *open) +nfsd4_process_open1(struct nfsd4_compound_state *cstate, + struct nfsd4_open *open) { clientid_t *clientid = &open->op_clientid; struct nfs4_client *clp = NULL; @@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open) return nfserr_expired; goto renew; } + /* When sessions are used, skip open sequenceid processing */ + if (nfsd4_has_session(cstate)) + goto renew; if (!sop->so_confirmed) { /* Replace unconfirmed owners without checking for replay. */ clp = sop->so_client; - release_stateowner(sop); + release_openowner(sop); open->op_stateowner = NULL; goto renew; } @@ -1709,6 +2498,7 @@ out: __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { + struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_stateid *stp = NULL; @@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf __be32 status; status = nfserr_inval; - if (!access_valid(open->op_share_access) + if (!access_valid(open->op_share_access, resp->cstate.minorversion) || !deny_valid(open->op_share_deny)) goto out; /* @@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf init_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); if (status) { - release_stateid(stp, OPEN_STATE); + release_open_stateid(stp); goto out; } + if (nfsd4_has_session(&resp->cstate)) + update_stateid(&stp->st_stateid); } memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); + if (nfsd4_has_session(&resp->cstate)) + 
open->op_stateowner->so_confirmed = 1; + /* * Attempt to hand out a delegation. No error return, because the * OPEN succeeds even if we fail. @@ -1790,7 +2585,8 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!open->op_stateowner->so_confirmed) + if (!open->op_stateowner->so_confirmed && + !nfsd4_has_session(&resp->cstate)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; return status; @@ -1898,7 +2694,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - release_stateowner(sop); + release_openowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -1983,10 +2779,7 @@ out: static inline __be32 check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) { - /* Trying to call delegreturn with a special stateid? Yuch: */ - if (!(flags & (RD_STATE | WR_STATE))) - return nfserr_bad_stateid; - else if (ONE_STATEID(stateid) && (flags & RD_STATE)) + if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of @@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) * that are not able to provide mandatory locking. */ static inline int -io_during_grace_disallowed(struct inode *inode, int flags) +grace_disallows_io(struct inode *inode) { - return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) - && mandatory_lock(inode); + return locks_in_grace() && mandatory_lock(inode); } -static int check_stateid_generation(stateid_t *in, stateid_t *ref) +static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags) { + /* + * When sessions are used the stateid generation number is ignored + * when it is zero. 
+ */ + if ((flags & HAS_SESSION) && in->si_generation == 0) + goto out; + /* If the client sends us a stateid from the future, it's buggy: */ if (in->si_generation > ref->si_generation) return nfserr_bad_stateid; @@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref) */ if (in->si_generation < ref->si_generation) return nfserr_old_stateid; +out: return nfs_ok; } +static int is_delegation_stateid(stateid_t *stateid) +{ + return stateid->si_fileid == 0; +} + /* * Checks for stateid operations */ __be32 -nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) +nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, + stateid_t *stateid, int flags, struct file **filpp) { struct nfs4_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; - stateid_t *stidp; + struct svc_fh *current_fh = &cstate->current_fh; struct inode *ino = current_fh->fh_dentry->d_inode; __be32 status; - dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n", - stateid->si_boot, stateid->si_stateownerid, - stateid->si_fileid, stateid->si_generation); if (filpp) *filpp = NULL; - if (io_during_grace_disallowed(ino, flags)) + if (grace_disallows_io(ino)) return nfserr_grace; + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) return check_special_stateids(current_fh, stateid, flags); - /* STALE STATEID */ status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) goto out; - /* BAD STATEID */ status = nfserr_bad_stateid; - if (!stateid->si_fileid) { /* delegation stateid */ - if(!(dp = find_delegation_stateid(ino, stateid))) { - dprintk("NFSD: delegation stateid not found\n"); + if (is_delegation_stateid(stateid)) { + dp = find_delegation_stateid(ino, stateid); + if (!dp) goto out; - } - stidp = &dp->dl_stateid; + status = check_stateid_generation(stateid, &dp->dl_stateid, + flags); + if (status) + goto out; + status = 
nfs4_check_delegmode(dp, flags); + if (status) + goto out; + renew_client(dp->dl_client); + if (filpp) + *filpp = dp->dl_vfs_file; } else { /* open or lock stateid */ - if (!(stp = find_stateid(stateid, flags))) { - dprintk("NFSD: open or lock stateid not found\n"); + stp = find_stateid(stateid, flags); + if (!stp) goto out; - } - if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) + if (nfs4_check_fh(current_fh, stp)) goto out; if (!stp->st_stateowner->so_confirmed) goto out; - stidp = &stp->st_stateid; - } - status = check_stateid_generation(stateid, stidp); - if (status) - goto out; - if (stp) { - if ((status = nfs4_check_openmode(stp,flags))) + status = check_stateid_generation(stateid, &stp->st_stateid, + flags); + if (status) + goto out; + status = nfs4_check_openmode(stp, flags); + if (status) goto out; renew_client(stp->st_stateowner->so_client); if (filpp) *filpp = stp->st_vfs_file; - } else { - if ((status = nfs4_check_delegmode(dp, flags))) - goto out; - renew_client(dp->dl_client); - if (flags & DELEG_RET) - unhash_delegation(dp); - if (filpp) - *filpp = dp->dl_vfs_file; } status = nfs_ok; out: @@ -2113,10 +2915,14 @@ setlkflg (int type) * Checks for sequence id mutating operations. 
*/ static __be32 -nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) +nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, + stateid_t *stateid, int flags, + struct nfs4_stateowner **sopp, + struct nfs4_stateid **stpp, struct nfsd4_lock *lock) { struct nfs4_stateid *stp; struct nfs4_stateowner *sop; + struct svc_fh *current_fh = &cstate->current_fh; __be32 status; dprintk("NFSD: preprocess_seqid_op: seqid=%d " @@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei if (STALE_STATEID(stateid)) return nfserr_stale_stateid; + + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; + /* * We return BAD_STATEID if filehandle doesn't match stateid, * the confirmed flag is incorrecly set, or the generation @@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei if (lock->lk_is_new) { if (!sop->so_is_open_owner) return nfserr_bad_stateid; - if (!same_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; + if (!(flags & HAS_SESSION) && + !same_clid(&clp->cl_clientid, lockclid)) + return nfserr_bad_stateid; /* stp is the open stateid */ status = nfs4_check_openmode(stp, lkflg); if (status) @@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei * For the moment, we ignore the possibility of * generation number wraparound. 
*/ - if (seqid != sop->so_seqid) + if (!(flags & HAS_SESSION) && seqid != sop->so_seqid) goto check_replay; if (sop->so_confirmed && flags & CONFIRM) { @@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei " confirmed yet!\n"); return nfserr_bad_stateid; } - status = check_stateid_generation(stateid, &stp->st_stateid); + status = check_stateid_generation(stateid, &stp->st_stateid, flags); if (status) return status; renew_client(sop->so_client); @@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, oc->oc_seqid, &oc->oc_req_stateid, CONFIRM | OPEN_STATE, &oc->oc_stateowner, &stp, NULL))) @@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, (int)cstate->current_fh.fh_dentry->d_name.len, cstate->current_fh.fh_dentry->d_name.name); - if (!access_valid(od->od_share_access) + if (!access_valid(od->od_share_access, cstate->minorversion) || !deny_valid(od->od_share_deny)) return nfserr_inval; nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, od->od_seqid, &od->od_stateid, OPEN_STATE, @@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); /* check close_lru for replay */ - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, &close->cl_stateid, OPEN_STATE | CLOSE_STATE, @@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); /* release_stateid() calls nfsd_close() if needed */ - release_stateid(stp, OPEN_STATE); + release_open_stateid(stp); /* place unused nfs4_stateowners on so_close_lru list to be * released by the 
laundromat service after the lease period @@ -2394,16 +3205,40 @@ __be32 nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_delegreturn *dr) { + struct nfs4_delegation *dp; + stateid_t *stateid = &dr->dr_stateid; + struct inode *inode; __be32 status; + int flags = 0; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) - goto out; + return status; + inode = cstate->current_fh.fh_dentry->d_inode; + if (nfsd4_has_session(cstate)) + flags |= HAS_SESSION; nfs4_lock_state(); - status = nfs4_preprocess_stateid_op(&cstate->current_fh, - &dr->dr_stateid, DELEG_RET, NULL); - nfs4_unlock_state(); + status = nfserr_bad_stateid; + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + goto out; + status = nfserr_stale_stateid; + if (STALE_STATEID(stateid)) + goto out; + status = nfserr_bad_stateid; + if (!is_delegation_stateid(stateid)) + goto out; + dp = find_delegation_stateid(inode, stateid); + if (!dp) + goto out; + status = check_stateid_generation(stateid, &dp->dl_stateid, flags); + if (status) + goto out; + renew_client(dp->dl_client); + + unhash_delegation(dp); out: + nfs4_unlock_state(); + return status; } @@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_file *fp; status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lock->lk_new_clientid)) + if (!nfsd4_has_session(cstate) && + STALE_CLIENTID(&lock->lk_new_clientid)) goto out; /* validate and update open stateid and open seqid */ - status = nfs4_preprocess_seqid_op(&cstate->current_fh, + status = nfs4_preprocess_seqid_op(cstate, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, OPEN_STATE, @@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } else { /* lock (lock owner + lock stateid) already exists */ - status = nfs4_preprocess_seqid_op(&cstate->current_fh, + status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, 
LOCK_STATE, @@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } out: if (status && lock->lk_is_new && lock_sop) - release_stateowner(lock_sop); + release_lockowner(lock_sop); if (lock->lk_replay_owner) { nfs4_get_stateowner(lock->lk_replay_owner); cstate->replay_owner = lock->lk_replay_owner; @@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); status = nfserr_stale_clientid; - if (STALE_CLIENTID(&lockt->lt_clientid)) + if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid)) goto out; if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { @@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); - if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, + if ((status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, &locku->lu_stateid, LOCK_STATE, @@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, /* unhash_stateowner deletes so_perclient only * for openowners. */ list_del(&sop->so_perclient); - release_stateowner(sop); + release_lockowner(sop); } out: nfs4_unlock_state(); @@ -3051,12 +3887,12 @@ alloc_reclaim(void) } int -nfs4_has_reclaimed_state(const char *name) +nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) { unsigned int strhashval = clientstr_hashval(name); struct nfs4_client *clp; - clp = find_confirmed_client_by_str(name, strhashval); + clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id); return clp ? 
1 : 0; } @@ -3153,6 +3989,8 @@ nfs4_state_init(void) INIT_LIST_HEAD(&unconf_str_hashtbl[i]); INIT_LIST_HEAD(&unconf_id_hashtbl[i]); } + for (i = 0; i < SESSION_HASH_SIZE; i++) + INIT_LIST_HEAD(&sessionid_hashtbl[i]); for (i = 0; i < FILE_HASH_SIZE; i++) { INIT_LIST_HEAD(&file_hashtbl[i]); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9250067943d..b820c311931 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -45,6 +45,7 @@ #include <linux/fs.h> #include <linux/namei.h> #include <linux/vfs.h> +#include <linux/utsname.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/clnt.h> @@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) return p; } +static int zero_clientid(clientid_t *clid) +{ + return (clid->cl_boot == 0) && (clid->cl_id == 0); +} + static int defer_free(struct nfsd4_compoundargs *argp, void (*release)(const void *), void *p) @@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) bmval[0] = 0; bmval[1] = 0; + bmval[2] = 0; READ_BUF(4); READ32(bmlen); @@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) READ32(bmval[0]); if (bmlen > 1) READ32(bmval[1]); + if (bmlen > 2) + READ32(bmval[2]); DECODE_TAIL; } +static u32 nfsd_attrmask[] = { + NFSD_WRITEABLE_ATTRS_WORD0, + NFSD_WRITEABLE_ATTRS_WORD1, + NFSD_WRITEABLE_ATTRS_WORD2 +}; + +static u32 nfsd41_ex_attrmask[] = { + NFSD_SUPPATTR_EXCLCREAT_WORD0, + NFSD_SUPPATTR_EXCLCREAT_WORD1, + NFSD_SUPPATTR_EXCLCREAT_WORD2 +}; + static __be32 -nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, - struct nfs4_acl **acl) +nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable, + struct iattr *iattr, struct nfs4_acl **acl) { int expected_len, len = 0; u32 dummy32; @@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia * According to spec, unsupported attributes return 
ERR_ATTRNOTSUPP; * read-only attributes return ERR_INVAL. */ - if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) + if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) || + (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) || + (bmval[2] & ~nfsd_suppattrs2(argp->minorversion))) return nfserr_attrnotsupp; - if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) + if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || + (bmval[2] & ~writable[2])) return nfserr_inval; READ_BUF(4); @@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia goto xdr_error; } } + BUG_ON(bmval[2]); /* no such writeable attr supported yet */ if (len != expected_len) goto xdr_error; @@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) return status; - if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) + status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask, + &create->cr_iattr, &create->cr_acl); + if (status) goto out; DECODE_TAIL; @@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt) READ_BUF(lockt->lt_owner.len); READMEM(lockt->lt_owner.data, lockt->lt_owner.len); + if (argp->minorversion && !zero_clientid(&lockt->lt_clientid)) + return nfserr_inval; DECODE_TAIL; } @@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: - if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) + status = nfsd4_decode_fattr(argp, open->op_bmval, + nfsd_attrmask, &open->op_iattr, &open->op_acl); + if (status) goto out; break; case NFS4_CREATE_EXCLUSIVE: READ_BUF(8); COPYMEM(open->op_verf.data, 8); 
break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (argp->minorversion < 1) + goto xdr_error; + READ_BUF(8); + COPYMEM(open->op_verf.data, 8); + status = nfsd4_decode_fattr(argp, open->op_bmval, + nfsd41_ex_attrmask, &open->op_iattr, + &open->op_acl); + if (status) + goto out; + break; default: goto xdr_error; } @@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); if (status) return status; - return nfsd4_decode_fattr(argp, setattr->sa_bmval, + return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask, &setattr->sa_iattr, &setattr->sa_acl); } @@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel READ_BUF(rlockowner->rl_owner.len); READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); + if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid)) + return nfserr_inval; + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp, + struct nfsd4_exchange_id *exid) +{ + int dummy; + DECODE_HEAD; + + READ_BUF(NFS4_VERIFIER_SIZE); + COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE); + + READ_BUF(4); + READ32(exid->clname.len); + + READ_BUF(exid->clname.len); + SAVEMEM(exid->clname.data, exid->clname.len); + + READ_BUF(4); + READ32(exid->flags); + + /* Ignore state_protect4_a */ + READ_BUF(4); + READ32(exid->spa_how); + switch (exid->spa_how) { + case SP4_NONE: + break; + case SP4_MACH_CRED: + /* spo_must_enforce */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* spo_must_allow */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + break; + case SP4_SSV: + /* ssp_ops */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + p += dummy; + + /* ssp_hash_algs<> */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* 
ssp_encr_algs<> */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* ssp_window and ssp_num_gss_handles */ + READ_BUF(8); + READ32(dummy); + READ32(dummy); + break; + default: + goto xdr_error; + } + + /* Ignore Implementation ID */ + READ_BUF(4); /* nfs_impl_id4 array length */ + READ32(dummy); + + if (dummy > 1) + goto xdr_error; + + if (dummy == 1) { + /* nii_domain */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_name */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + + /* nii_date */ + READ_BUF(12); + p += 3; + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, + struct nfsd4_create_session *sess) +{ + DECODE_HEAD; + + u32 dummy; + char *machine_name; + int i; + int nr_secflavs; + + READ_BUF(16); + COPYMEM(&sess->clientid, 8); + READ32(sess->seqid); + READ32(sess->flags); + + /* Fore channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->fore_channel.maxreq_sz); + READ32(sess->fore_channel.maxresp_sz); + READ32(sess->fore_channel.maxresp_cached); + READ32(sess->fore_channel.maxops); + READ32(sess->fore_channel.maxreqs); + READ32(sess->fore_channel.nr_rdma_attrs); + if (sess->fore_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->fore_channel.rdma_attrs); + } else if (sess->fore_channel.nr_rdma_attrs > 1) { + dprintk("Too many fore channel attr bitmaps!\n"); + goto xdr_error; + } + + /* Back channel attrs */ + READ_BUF(28); + READ32(dummy); /* headerpadsz is always 0 */ + READ32(sess->back_channel.maxreq_sz); + READ32(sess->back_channel.maxresp_sz); + READ32(sess->back_channel.maxresp_cached); + READ32(sess->back_channel.maxops); + READ32(sess->back_channel.maxreqs); + READ32(sess->back_channel.nr_rdma_attrs); + if (sess->back_channel.nr_rdma_attrs == 1) { + READ_BUF(4); + READ32(sess->back_channel.rdma_attrs); + } else if (sess->back_channel.nr_rdma_attrs 
> 1) { + dprintk("Too many back channel attr bitmaps!\n"); + goto xdr_error; + } + + READ_BUF(8); + READ32(sess->callback_prog); + + /* callback_sec_params4 */ + READ32(nr_secflavs); + for (i = 0; i < nr_secflavs; ++i) { + READ_BUF(4); + READ32(dummy); + switch (dummy) { + case RPC_AUTH_NULL: + /* Nothing to read */ + break; + case RPC_AUTH_UNIX: + READ_BUF(8); + /* stamp */ + READ32(dummy); + + /* machine name */ + READ32(dummy); + READ_BUF(dummy); + SAVEMEM(machine_name, dummy); + + /* uid, gid */ + READ_BUF(8); + READ32(sess->uid); + READ32(sess->gid); + + /* more gids */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy * 4); + for (i = 0; i < dummy; ++i) + READ32(dummy); + break; + case RPC_AUTH_GSS: + dprintk("RPC_AUTH_GSS callback secflavor " + "not supported!\n"); + READ_BUF(8); + /* gcbp_service */ + READ32(dummy); + /* gcbp_handle_from_server */ + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + /* gcbp_handle_from_client */ + READ_BUF(4); + READ32(dummy); + READ_BUF(dummy); + p += XDR_QUADLEN(dummy); + break; + default: + dprintk("Illegal callback secflavor\n"); + return nfserr_inval; + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp, + struct nfsd4_destroy_session *destroy_session) +{ + DECODE_HEAD; + READ_BUF(NFS4_MAX_SESSIONID_LEN); + COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_sequence(struct nfsd4_compoundargs *argp, + struct nfsd4_sequence *seq) +{ + DECODE_HEAD; + + READ_BUF(NFS4_MAX_SESSIONID_LEN + 16); + COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + READ32(seq->seqid); + READ32(seq->slotid); + READ32(seq->maxslots); + READ32(seq->cachethis); + DECODE_TAIL; } @@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) static __be32 nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) { - return nfserr_opnotsupp; + return nfserr_notsupp; } typedef 
__be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); @@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, - [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, [OP_READ] = (nfsd4_dec)nfsd4_decode_read, [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, @@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, }; +static nfsd4_dec nfsd41_dec_ops[] = { + [OP_ACCESS] (nfsd4_dec)nfsd4_decode_access, + [OP_CLOSE] (nfsd4_dec)nfsd4_decode_close, + [OP_COMMIT] (nfsd4_dec)nfsd4_decode_commit, + [OP_CREATE] (nfsd4_dec)nfsd4_decode_create, + [OP_DELEGPURGE] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DELEGRETURN] (nfsd4_dec)nfsd4_decode_delegreturn, + [OP_GETATTR] (nfsd4_dec)nfsd4_decode_getattr, + [OP_GETFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_LINK] (nfsd4_dec)nfsd4_decode_link, + [OP_LOCK] (nfsd4_dec)nfsd4_decode_lock, + [OP_LOCKT] (nfsd4_dec)nfsd4_decode_lockt, + [OP_LOCKU] (nfsd4_dec)nfsd4_decode_locku, + [OP_LOOKUP] (nfsd4_dec)nfsd4_decode_lookup, + [OP_LOOKUPP] (nfsd4_dec)nfsd4_decode_noop, + [OP_NVERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_OPEN] (nfsd4_dec)nfsd4_decode_open, + [OP_OPENATTR] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_CONFIRM] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OPEN_DOWNGRADE] (nfsd4_dec)nfsd4_decode_open_downgrade, + [OP_PUTFH] (nfsd4_dec)nfsd4_decode_putfh, + [OP_PUTPUBFH] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_PUTROOTFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_READ] (nfsd4_dec)nfsd4_decode_read, + [OP_READDIR] (nfsd4_dec)nfsd4_decode_readdir, + [OP_READLINK] (nfsd4_dec)nfsd4_decode_noop, + [OP_REMOVE] (nfsd4_dec)nfsd4_decode_remove, + [OP_RENAME] (nfsd4_dec)nfsd4_decode_rename, + [OP_RENEW] (nfsd4_dec)nfsd4_decode_notsupp, 
+ [OP_RESTOREFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SAVEFH] (nfsd4_dec)nfsd4_decode_noop, + [OP_SECINFO] (nfsd4_dec)nfsd4_decode_secinfo, + [OP_SETATTR] (nfsd4_dec)nfsd4_decode_setattr, + [OP_SETCLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SETCLIENTID_CONFIRM](nfsd4_dec)nfsd4_decode_notsupp, + [OP_VERIFY] (nfsd4_dec)nfsd4_decode_verify, + [OP_WRITE] (nfsd4_dec)nfsd4_decode_write, + [OP_RELEASE_LOCKOWNER] (nfsd4_dec)nfsd4_decode_notsupp, + + /* new operations for NFSv4.1 */ + [OP_BACKCHANNEL_CTL] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_BIND_CONN_TO_SESSION](nfsd4_dec)nfsd4_decode_notsupp, + [OP_EXCHANGE_ID] (nfsd4_dec)nfsd4_decode_exchange_id, + [OP_CREATE_SESSION] (nfsd4_dec)nfsd4_decode_create_session, + [OP_DESTROY_SESSION] (nfsd4_dec)nfsd4_decode_destroy_session, + [OP_FREE_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GET_DIR_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICEINFO] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_GETDEVICELIST] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTGET] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTRETURN] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SECINFO_NO_NAME] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEQUENCE] (nfsd4_dec)nfsd4_decode_sequence, + [OP_SET_SSV] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_TEST_STATEID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_WANT_DELEGATION] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DESTROY_CLIENTID] (nfsd4_dec)nfsd4_decode_notsupp, + [OP_RECLAIM_COMPLETE] (nfsd4_dec)nfsd4_decode_notsupp, +}; + struct nfsd4_minorversion_ops { nfsd4_dec *decoders; int nops; @@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops { static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, + [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) }, }; static __be32 @@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, { u32 bmval0 = bmval[0]; u32 bmval1 = bmval[1]; + u32 bmval2 = 
bmval[2]; struct kstat stat; struct svc_fh tempfh; struct kstatfs statfs; @@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, int err; int aclsupport = 0; struct nfs4_acl *acl = NULL; + struct nfsd4_compoundres *resp = rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); - BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); + BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); + BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); + BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { + BUG_ON(bmval[2]); status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); if (status) goto out; @@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((buflen -= 16) < 0) goto out_resource; - WRITE32(2); - WRITE32(bmval0); - WRITE32(bmval1); + if (unlikely(bmval2)) { + WRITE32(3); + WRITE32(bmval0); + WRITE32(bmval1); + WRITE32(bmval2); + } else if (likely(bmval1)) { + WRITE32(2); + WRITE32(bmval0); + WRITE32(bmval1); + } else { + WRITE32(1); + WRITE32(bmval0); + } attrlenp = p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; + u32 word0 = nfsd_suppattrs0(minorversion); + u32 word1 = nfsd_suppattrs1(minorversion); + u32 word2 = nfsd_suppattrs2(minorversion); + if ((buflen -= 12) < 0) goto out_resource; if (!aclsupport) word0 &= ~FATTR4_WORD0_ACL; if (!exp->ex_fslocs.locations) word0 &= ~FATTR4_WORD0_FS_LOCATIONS; - WRITE32(2); - WRITE32(word0); - WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); + if (!word2) { + WRITE32(2); + WRITE32(word0); + WRITE32(word1); + } else { + WRITE32(3); + WRITE32(word0); + WRITE32(word1); + WRITE32(word2); + } } if (bmval0 & FATTR4_WORD0_TYPE) { if ((buflen -= 4) < 0) @@ -1801,6 +2165,13 @@ out_acl: } WRITE64(stat.ino); } + if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { + WRITE32(3); + 
WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1); + WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2); + } + *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; status = nfs_ok; @@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w } static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_exchange_id *exid) +{ + ENCODE_HEAD; + char *major_id; + char *server_scope; + int major_id_sz; + int server_scope_sz; + uint64_t minor_id = 0; + + if (nfserr) + return nfserr; + + major_id = utsname()->nodename; + major_id_sz = strlen(major_id); + server_scope = utsname()->nodename; + server_scope_sz = strlen(server_scope); + + RESERVE_SPACE( + 8 /* eir_clientid */ + + 4 /* eir_sequenceid */ + + 4 /* eir_flags */ + + 4 /* spr_how (SP4_NONE) */ + + 8 /* so_minor_id */ + + 4 /* so_major_id.len */ + + (XDR_QUADLEN(major_id_sz) * 4) + + 4 /* eir_server_scope.len */ + + (XDR_QUADLEN(server_scope_sz) * 4) + + 4 /* eir_server_impl_id.count (0) */); + + WRITEMEM(&exid->clientid, 8); + WRITE32(exid->seqid); + WRITE32(exid->flags); + + /* state_protect4_r. 
Currently only support SP4_NONE */ + BUG_ON(exid->spa_how != SP4_NONE); + WRITE32(exid->spa_how); + + /* The server_owner struct */ + WRITE64(minor_id); /* Minor id */ + /* major id */ + WRITE32(major_id_sz); + WRITEMEM(major_id, major_id_sz); + + /* Server scope */ + WRITE32(server_scope_sz); + WRITEMEM(server_scope, server_scope_sz); + + /* Implementation id */ + WRITE32(0); /* zero length nfs_impl_id4 array */ + ADJUST_ARGS(); + return 0; +} + +static __be32 +nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_create_session *sess) +{ + ENCODE_HEAD; + + if (nfserr) + return nfserr; + + RESERVE_SPACE(24); + WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(sess->seqid); + WRITE32(sess->flags); + ADJUST_ARGS(); + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->fore_channel.maxreq_sz); + WRITE32(sess->fore_channel.maxresp_sz); + WRITE32(sess->fore_channel.maxresp_cached); + WRITE32(sess->fore_channel.maxops); + WRITE32(sess->fore_channel.maxreqs); + WRITE32(sess->fore_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->fore_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->fore_channel.rdma_attrs); + ADJUST_ARGS(); + } + + RESERVE_SPACE(28); + WRITE32(0); /* headerpadsz */ + WRITE32(sess->back_channel.maxreq_sz); + WRITE32(sess->back_channel.maxresp_sz); + WRITE32(sess->back_channel.maxresp_cached); + WRITE32(sess->back_channel.maxops); + WRITE32(sess->back_channel.maxreqs); + WRITE32(sess->back_channel.nr_rdma_attrs); + ADJUST_ARGS(); + + if (sess->back_channel.nr_rdma_attrs) { + RESERVE_SPACE(4); + WRITE32(sess->back_channel.rdma_attrs); + ADJUST_ARGS(); + } + return 0; +} + +static __be32 +nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_destroy_session *destroy_session) +{ + return nfserr; +} + +__be32 +nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_sequence *seq) +{ + ENCODE_HEAD; + + if (nfserr) + 
return nfserr; + + RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20); + WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN); + WRITE32(seq->seqid); + WRITE32(seq->slotid); + WRITE32(seq->maxslots); + /* + * FIXME: for now: + * target_maxslots = maxslots + * status_flags = 0 + */ + WRITE32(seq->maxslots); + WRITE32(0); + + ADJUST_ARGS(); + return 0; +} + +static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { return nfserr; @@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); +/* + * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1 + * since we don't need to filter out obsolete ops as this is + * done in the decoding phase. + */ static nfsd4_enc nfsd4_enc_ops[] = { [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, @@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.1 operations */ + [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id, + [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session, + [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session, + [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, + [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, + [OP_TEST_STATEID] = 
(nfsd4_enc)nfsd4_encode_noop, + [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, + [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, }; +/* + * Calculate the total amount of memory that the compound response has taken + * after encoding the current operation. + * + * pad: add on 8 bytes for the next operation's op_code and status so that + * there is room to cache a failure on the next operation. + * + * Compare this length to the session se_fmaxresp_cached. + * + * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so + * will be at least a page and will therefore hold the xdr_buf head. + */ +static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp) +{ + int status = 0; + struct xdr_buf *xb = &resp->rqstp->rq_res; + struct nfsd4_compoundargs *args = resp->rqstp->rq_argp; + struct nfsd4_session *session = NULL; + struct nfsd4_slot *slot = resp->cstate.slot; + u32 length, tlen = 0, pad = 8; + + if (!nfsd4_has_session(&resp->cstate)) + return status; + + session = resp->cstate.session; + if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0) + return status; + + if (resp->opcnt >= args->opcnt) + pad = 0; /* this is the last operation */ + + if (xb->page_len == 0) { + length = (char *)resp->p - (char *)xb->head[0].iov_base + pad; + } else { + if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0) + tlen = (char *)resp->p - (char *)xb->tail[0].iov_base; + + length = xb->head[0].iov_len + xb->page_len + tlen + pad; + } + dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__, + length, xb->page_len, tlen, pad); + + if (length <= session->se_fmaxresp_cached) + return status; + else + return nfserr_rep_too_big_to_cache; +} + void nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) { @@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 
!nfsd4_enc_ops[op->opnum]); op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); + /* nfsd4_check_drc_limit guarantees enough room for error status */ + if (!op->status && nfsd4_check_drc_limit(resp)) + op->status = nfserr_rep_too_big_to_cache; status: /* * Note: We write the status directly, instead of using WRITE32(), @@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo iov = &rqstp->rq_res.head[0]; iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; BUG_ON(iov->iov_len > PAGE_SIZE); + if (nfsd4_has_session(&resp->cstate)) { + if (resp->cstate.status == nfserr_replay_cache && + !nfsd4_not_cached(resp)) { + iov->iov_len = resp->cstate.iovlen; + } else { + nfsd4_store_cache_entry(resp); + dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__); + resp->cstate.slot->sl_inuse = 0; + } + if (resp->cstate.session) + nfsd4_put_session(resp->cstate.session); + } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index a4ed8644d69..af16849d243 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -60,6 +60,7 @@ enum { NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, + NFSD_Pool_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@ -172,6 +173,16 @@ static const struct file_operations exports_operations = { .owner = THIS_MODULE, }; +extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); + +static struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -781,8 +792,9 @@ out_free: static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; - char *vers, sign; + char *vers, *minorp, sign; int len, num; + unsigned minor; ssize_t tlen = 0; char *sep; @@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file 
*file, char *buf, size_t size) do { sign = *vers; if (sign == '+' || sign == '-') - num = simple_strtol((vers+1), NULL, 0); + num = simple_strtol((vers+1), &minorp, 0); else - num = simple_strtol(vers, NULL, 0); + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num < 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } switch(num) { case 2: case 3: @@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) default: return -EINVAL; } + next: vers += len + 1; tlen += len; } while ((len = qword_get(&mesg, vers, size)) > 0); @@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) num); sep = " "; } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++) + len += sprintf(buf+len, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? 
+ '+' : '-', + minor); len += sprintf(buf+len, "\n"); return len; } @@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6f7f2635122..e298e260b5f 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, { __be32 nfserr; int stable = 1; + unsigned long cnt = argp->len; dprintk("nfsd: WRITE %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, argp->offset, rqstp->rq_vec, argp->vlen, - argp->len, + &cnt, &stable); return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 7c09852be71..cbba4a93578 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -22,6 +22,7 @@ #include <linux/freezer.h> #include <linux/fs_struct.h> #include <linux/kthread.h> +#include <linux/swap.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> @@ -40,9 +41,6 @@ extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); struct timeval nfssvc_boot; -static atomic_t nfsd_busy; -static unsigned long nfsd_last_call; -static DEFINE_SPINLOCK(nfsd_call_lock); /* * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members @@ -123,6 +121,8 @@ struct svc_program nfsd_program = { }; +u32 nfsd_supported_minorversion; + int 
nfsd_vers(int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) @@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change) } return 0; } + +int nfsd_minorversion(u32 minorversion, enum vers_op change) +{ + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversion = minorversion; + break; + case NFSD_CLEAR: + if (minorversion == 0) + return -1; + nfsd_supported_minorversion = minorversion - 1; + break; + case NFSD_TEST: + return minorversion <= nfsd_supported_minorversion; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; +} + /* * Maximum number of nfsd processes */ @@ -200,6 +222,28 @@ void nfsd_reset_versions(void) } } +/* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with multiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machine's free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. 
+ */ +static void set_max_drc(void) +{ + /* Right-shift applied to nr_free_buffer_pages (i.e. 1/128 of them) to size the V4.1 server DRC */ + #define NFSD_DRC_SIZE_SHIFT 7 + nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT; + nfsd_serv->sv_drc_pages_used = 0; + dprintk("%s svc_drc_max_pages %u\n", __func__, + nfsd_serv->sv_drc_max_pages); +} int nfsd_create_serv(void) { @@ -227,11 +271,12 @@ int nfsd_create_serv(void) nfsd_max_blksize /= 2; } - atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; + else + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; @@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs) return error; } -static inline void -update_thread_usage(int busy_threads) -{ - unsigned long prev_call; - unsigned long diff; - int decile; - - spin_lock(&nfsd_call_lock); - prev_call = nfsd_last_call; - nfsd_last_call = jiffies; - decile = busy_threads*10/nfsdstats.th_cnt; - if (decile>0 && decile <= 10) { - diff = nfsd_last_call - prev_call; - if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP) - nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP; - if (decile == 10) - nfsdstats.th_fullcnt++; - } - spin_unlock(&nfsd_call_lock); -} /* * This is the NFS server kernel thread @@ -460,8 +485,6 @@ nfsd(void *vrqstp) continue; } - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. 
*/ exp_readlock(); @@ -470,8 +493,6 @@ nfsd(void *vrqstp) /* Unlock export hash tables */ exp_readunlock(); - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_dec(&nfsd_busy); } /* Clear signals before calling svc_exit_thread() */ @@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) + rqstp->rq_res.head[0].iov_len; rqstp->rq_res.head[0].iov_len += sizeof(__be32); + /* NFSv4.1 DRC requires statp */ + if (rqstp->rq_vers == 4) + nfsd4_set_statp(rqstp, statp); + /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); @@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } + +int nfsd_pool_stats_open(struct inode *inode, struct file *file) +{ + if (nfsd_serv == NULL) + return -ENODEV; + return svc_pool_stats_open(nfsd_serv, file); +} diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 78376b6c023..ab93fcfef25 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } /* Revoke setuid/setgid on chown */ - if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || - ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { + if (!S_ISDIR(inode->i_mode) && + (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry) static __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long cnt, int *stablep) + unsigned long *cnt, int *stablep) { struct svc_export *exp; struct dentry *dentry; @@ -974,7 +975,7 @@ 
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, err = nfserr_perm; if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) + (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) goto out; #endif @@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { - nfsdstats.io_write += cnt; + nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) + if (host_err >= 0) { err = 0; - else + *cnt = host_err; + } else err = nfserrno(host_err); out: return err; @@ -1098,7 +1100,7 @@ out: */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) { __be32 err = 0; @@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, return 0; } +/* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * Setting size to 0 may fail on some file systems because the permission + * check requires WRITE permission but the mode is 000. + * We ignore the resizing (to 0) on the newly created file, since the size is + * already 0 right after the file is created. + * + * Call this only after vfs_create() is called. + * */ +static void +nfsd_check_ignore_resizing(struct iattr *iap) +{ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; +} + /* * Create a file (regular, directory, device, fifo); UNIX sockets * not yet implemented. 
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (!host_err) + nfsd_check_ignore_resizing(iap); break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); @@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, /* setattr will sync the child (or not) */ } + nfsd_check_ignore_resizing(iap); + if (createmode == NFS3_CREATE_EXCLUSIVE) { /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile new file mode 100644 index 00000000000..df3e62c1ddc --- /dev/null +++ b/fs/nilfs2/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_NILFS2_FS) += nilfs2.o +nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ + btnode.o bmap.o btree.o direct.o dat.o recovery.o \ + the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ + ifile.o alloc.o gcinode.o ioctl.o gcdat.o diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c new file mode 100644 index 00000000000..d69e6ae5925 --- /dev/null +++ b/fs/nilfs2/alloc.c @@ -0,0 +1,504 @@ +/* + * alloc.c - NILFS dat/inode allocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/bitops.h> +#include "mdt.h" +#include "alloc.h" + + +static inline unsigned long +nilfs_palloc_groups_per_desc_block(const struct inode *inode) +{ + return (1UL << inode->i_blkbits) / + sizeof(struct nilfs_palloc_group_desc); +} + +static inline unsigned long +nilfs_palloc_groups_count(const struct inode *inode) +{ + return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */)); +} + +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS); + if (!mi->mi_bgl) + return -ENOMEM; + + bgl_lock_init(mi->mi_bgl); + + nilfs_mdt_set_entry_size(inode, entry_size, 0); + + mi->mi_blocks_per_group = + DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), + mi->mi_entries_per_block) + 1; + /* Number of blocks in a group including entry blocks and + a bitmap block */ + mi->mi_blocks_per_desc_block = + nilfs_palloc_groups_per_desc_block(inode) * + mi->mi_blocks_per_group + 1; + /* Number of blocks per descriptor including the + descriptor block */ + return 0; +} + +static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr, + unsigned long *offset) +{ + __u64 group = nr; + + *offset = do_div(group, nilfs_palloc_entries_per_group(inode)); + return group; +} + +static unsigned long +nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_block = + group / nilfs_palloc_groups_per_desc_block(inode); + return 
desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block; +} + +static unsigned long +nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group) +{ + unsigned long desc_offset = + group % nilfs_palloc_groups_per_desc_block(inode); + return nilfs_palloc_desc_blkoff(inode, group) + 1 + + desc_offset * NILFS_MDT(inode)->mi_blocks_per_group; +} + +static unsigned long +nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group, + const struct nilfs_palloc_group_desc *desc) +{ + unsigned long nfree; + + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + nfree = le32_to_cpu(desc->pg_nfrees); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); + return nfree; +} + +static void +nilfs_palloc_group_desc_add_entries(struct inode *inode, + unsigned long group, + struct nilfs_palloc_group_desc *desc, + u32 n) +{ + spin_lock(nilfs_mdt_bgl_lock(inode, group)); + le32_add_cpu(&desc->pg_nfrees, n); + spin_unlock(nilfs_mdt_bgl_lock(inode, group)); +} + +static unsigned long +nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr) +{ + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, nr, &group_offset); + + return nilfs_palloc_bitmap_blkoff(inode, group) + 1 + + group_offset / NILFS_MDT(inode)->mi_entries_per_block; +} + +static void nilfs_palloc_desc_block_init(struct inode *inode, + struct buffer_head *bh, void *kaddr) +{ + struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh); + unsigned long n = nilfs_palloc_groups_per_desc_block(inode); + __le32 nfrees; + + nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode)); + while (n-- > 0) { + desc->pg_nfrees = nfrees; + desc++; + } +} + +static int nilfs_palloc_get_desc_block(struct inode *inode, + unsigned long group, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_desc_blkoff(inode, group), + create, nilfs_palloc_desc_block_init, bhp); +} + +static int nilfs_palloc_get_bitmap_block(struct inode *inode, + unsigned long group, + 
int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, + nilfs_palloc_bitmap_blkoff(inode, group), + create, NULL, bhp); +} + +int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr), + create, NULL, bhp); +} + +static struct nilfs_palloc_group_desc * +nilfs_palloc_block_get_group_desc(const struct inode *inode, + unsigned long group, + const struct buffer_head *bh, void *kaddr) +{ + return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) + + group % nilfs_palloc_groups_per_desc_block(inode); +} + +static unsigned char * +nilfs_palloc_block_get_bitmap(const struct inode *inode, + const struct buffer_head *bh, void *kaddr) +{ + return (unsigned char *)(kaddr + bh_offset(bh)); +} + +void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, + const struct buffer_head *bh, void *kaddr) +{ + unsigned long entry_offset, group_offset; + + nilfs_palloc_group(inode, nr, &group_offset); + entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block; + + return kaddr + bh_offset(bh) + + entry_offset * NILFS_MDT(inode)->mi_entry_size; +} + +static int nilfs_palloc_find_available_slot(struct inode *inode, + unsigned long group, + unsigned long target, + unsigned char *bitmap, + int bsize) /* size in bits */ +{ + int curr, pos, end, i; + + if (target > 0) { + end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1); + if (end > bsize) + end = bsize; + pos = nilfs_find_next_zero_bit(bitmap, end, target); + if (pos < end && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, bitmap)) + return pos; + } else + end = 0; + + for (i = 0, curr = end; + i < bsize; + i += BITS_PER_LONG, curr += BITS_PER_LONG) { + /* wrap around */ + if (curr >= bsize) + curr = 0; + while (*((unsigned long *)bitmap + curr / BITS_PER_LONG) + != ~0UL) { + end = curr + BITS_PER_LONG; + if (end > bsize) + end = bsize; + 
pos = nilfs_find_next_zero_bit(bitmap, end, curr); + if ((pos < end) && + !nilfs_set_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), pos, + bitmap)) + return pos; + } + } + return -ENOSPC; +} + +static unsigned long +nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, + unsigned long curr, unsigned long max) +{ + return min_t(unsigned long, + nilfs_palloc_groups_per_desc_block(inode) - + curr % nilfs_palloc_groups_per_desc_block(inode), + max - curr + 1); +} + +int nilfs_palloc_prepare_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, maxgroup, ngroups; + unsigned long group_offset, maxgroup_offset; + unsigned long n, entries_per_group, groups_per_desc_block; + unsigned long i, j; + int pos, ret; + + ngroups = nilfs_palloc_groups_count(inode); + maxgroup = ngroups - 1; + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + entries_per_group = nilfs_palloc_entries_per_group(inode); + groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode); + + for (i = 0; i < ngroups; i += n) { + if (group >= ngroups) { + /* wrap around */ + group = 0; + maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr, + &maxgroup_offset) - 1; + } + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + n = nilfs_palloc_rest_groups_in_desc_block(inode, group, + maxgroup); + for (j = 0; j < n; j++, desc++, group++) { + if (nilfs_palloc_group_desc_nfrees(inode, group, desc) + > 0) { + ret = nilfs_palloc_get_bitmap_block( + inode, group, 1, &bitmap_bh); + if (ret < 0) + goto out_desc; + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + pos = 
nilfs_palloc_find_available_slot( + inode, group, group_offset, bitmap, + entries_per_group); + if (pos >= 0) { + /* found a free entry */ + nilfs_palloc_group_desc_add_entries( + inode, group, desc, -1); + req->pr_entry_nr = + entries_per_group * group + pos; + kunmap(desc_bh->b_page); + kunmap(bitmap_bh->b_page); + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; + } + kunmap(bitmap_bh->b_page); + brelse(bitmap_bh); + } + + group_offset = 0; + } + + kunmap(desc_bh->b_page); + brelse(desc_bh); + } + + /* no entries left */ + return -ENOSPC; + + out_desc: + kunmap(desc_bh->b_page); + brelse(desc_bh); + return ret; +} + +void nilfs_palloc_commit_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +void nilfs_palloc_commit_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + unsigned long group, group_offset; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + 
brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); +} + +/** + * nilfs_palloc_abort_alloc_entry - undo a prepared entry allocation + * @inode: inode used to locate the group and its lock + * @req: request holding the entry number and the descriptor/bitmap buffers + * + * Clears the bitmap bit of @req->pr_entry_nr (warning if it was already + * clear), adds one back to the group's free count, releases both buffers + * and resets the fields of @req. + */ +void nilfs_palloc_abort_alloc_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct nilfs_palloc_group_desc *desc; + void *desc_kaddr, *bitmap_kaddr; + unsigned char *bitmap; + unsigned long group, group_offset; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + desc_kaddr = kmap(req->pr_desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc(inode, group, + req->pr_desc_bh, desc_kaddr); + bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh, + bitmap_kaddr); + if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) + printk(KERN_WARNING "%s: entry number %llu already freed\n", + __func__, (unsigned long long)req->pr_entry_nr); + + nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); + + kunmap(req->pr_bitmap_bh->b_page); + kunmap(req->pr_desc_bh->b_page); + + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +int nilfs_palloc_prepare_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + struct buffer_head *desc_bh, *bitmap_bh; + unsigned long group, group_offset; + int ret; + + group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + + req->pr_desc_bh = desc_bh; + req->pr_bitmap_bh = bitmap_bh; + return 0; +} + +void nilfs_palloc_abort_free_entry(struct inode *inode, + struct nilfs_palloc_req *req) +{ + brelse(req->pr_bitmap_bh); + brelse(req->pr_desc_bh); + + req->pr_entry_nr = 0; + req->pr_bitmap_bh = NULL; + req->pr_desc_bh = NULL; +} + +static int +nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr) +{ + __u64
first, last; + + first = group * nilfs_palloc_entries_per_group(inode); + last = first + nilfs_palloc_entries_per_group(inode) - 1; + return (nr >= first) && (nr <= last); +} + +int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) +{ + struct buffer_head *desc_bh, *bitmap_bh; + struct nilfs_palloc_group_desc *desc; + unsigned char *bitmap; + void *desc_kaddr, *bitmap_kaddr; + unsigned long group, group_offset; + int i, j, n, ret; + + for (i = 0; i < nitems; i += n) { + group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset); + ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh); + if (ret < 0) + return ret; + ret = nilfs_palloc_get_bitmap_block(inode, group, 0, + &bitmap_bh); + if (ret < 0) { + brelse(desc_bh); + return ret; + } + desc_kaddr = kmap(desc_bh->b_page); + desc = nilfs_palloc_block_get_group_desc( + inode, group, desc_bh, desc_kaddr); + bitmap_kaddr = kmap(bitmap_bh->b_page); + bitmap = nilfs_palloc_block_get_bitmap( + inode, bitmap_bh, bitmap_kaddr); + for (j = i, n = 0; + (j < nitems) && nilfs_palloc_group_is_in(inode, group, + entry_nrs[j]); + j++, n++) { + nilfs_palloc_group(inode, entry_nrs[j], &group_offset); + if (!nilfs_clear_bit_atomic( + nilfs_mdt_bgl_lock(inode, group), + group_offset, bitmap)) { + printk(KERN_WARNING + "%s: entry number %llu already freed\n", + __func__, + (unsigned long long)entry_nrs[j]); + } + } + nilfs_palloc_group_desc_add_entries(inode, group, desc, n); + + kunmap(bitmap_bh->b_page); + kunmap(desc_bh->b_page); + + nilfs_mdt_mark_buffer_dirty(desc_bh); + nilfs_mdt_mark_buffer_dirty(bitmap_bh); + nilfs_mdt_mark_dirty(inode); + + brelse(bitmap_bh); + brelse(desc_bh); + } + return 0; +} diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h new file mode 100644 index 00000000000..4ace5475c2c --- /dev/null +++ b/fs/nilfs2/alloc.h @@ -0,0 +1,72 @@ +/* + * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone 
Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Original code was written by Koji Sato <koji@osrg.net>. + * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, + * Amagai Yoshiji <amagai@osrg.net>. + */ + +#ifndef _NILFS_ALLOC_H +#define _NILFS_ALLOC_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +static inline unsigned long +nilfs_palloc_entries_per_group(const struct inode *inode) +{ + return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); +} + +int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_get_entry_block(struct inode *, __u64, int, + struct buffer_head **); +void *nilfs_palloc_block_get_entry(const struct inode *, __u64, + const struct buffer_head *, void *); + +/** + * nilfs_palloc_req - persistent allocator request and reply + * @pr_entry_nr: entry number (vblocknr or inode number) + * @pr_desc_bh: buffer head of the buffer containing block group descriptors + * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap + * @pr_entry_bh: buffer head of the buffer containing translation entries + */ +struct nilfs_palloc_req { + __u64 pr_entry_nr; + struct buffer_head *pr_desc_bh; + struct buffer_head *pr_bitmap_bh; + struct buffer_head *pr_entry_bh; +}; + +int
nilfs_palloc_prepare_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_commit_alloc_entry(struct inode *, + struct nilfs_palloc_req *); +void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *); +void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *); +int nilfs_palloc_freev(struct inode *, __u64 *, size_t); + +#define nilfs_set_bit_atomic ext2_set_bit_atomic +#define nilfs_clear_bit_atomic ext2_clear_bit_atomic +#define nilfs_find_next_zero_bit ext2_find_next_zero_bit + +#endif /* _NILFS_ALLOC_H */ diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c new file mode 100644 index 00000000000..064279e33bb --- /dev/null +++ b/fs/nilfs2/bmap.c @@ -0,0 +1,788 @@ +/* + * bmap.c - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. 
+ */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "bmap.h" +#include "sb.h" +#include "btnode.h" +#include "mdt.h" +#include "dat.h" +#include "alloc.h" + +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, + __u64 *ptrp) +{ + __u64 ptr; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); + if (ret < 0) + goto out; + if (bmap->b_pops->bpop_translate != NULL) { + ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr); + if (ret < 0) + goto out; + *ptrp = ptr; + } + + out: + up_read(&bmap->b_sem); + return ret; +} + + +/** + * nilfs_bmap_lookup - find a record + * @bmap: bmap + * @key: key + * @recp: pointer to record + * + * Description: nilfs_bmap_lookup() finds a record whose key matches @key in + * @bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @recp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. 
+ */ +int nilfs_bmap_lookup(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long *recp) +{ + __u64 ptr; + int ret; + + /* XXX: use macro for level 1 */ + ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); + if (recp != NULL) + *recp = ptr; + return ret; +} + +static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; + __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_insert != NULL) { + ret = bmap->b_ops->bop_check_insert(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1); + if (n < 0) + return n; + ret = nilfs_btree_convert_and_insert( + bmap, key, ptr, keys, ptrs, n, + NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH); + if (ret == 0) + bmap->b_u.u_flags |= NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_insert(bmap, key, ptr); +} + +/** + * nilfs_bmap_insert - insert a new key-record pair into a bmap + * @bmap: bmap + * @key: key + * @rec: record + * + * Description: nilfs_bmap_insert() inserts the new key-record pair specified + * by @key and @rec into @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EEXIST - A record associated with @key already exist. 
+ */ +int nilfs_bmap_insert(struct nilfs_bmap *bmap, + unsigned long key, + unsigned long rec) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_insert(bmap, key, rec); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) +{ + __u64 keys[NILFS_BMAP_LARGE_LOW + 1]; + __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1]; + int ret, n; + + if (bmap->b_ops->bop_check_delete != NULL) { + ret = bmap->b_ops->bop_check_delete(bmap, key); + if (ret > 0) { + n = bmap->b_ops->bop_gather_data( + bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1); + if (n < 0) + return n; + ret = nilfs_direct_delete_and_convert( + bmap, key, keys, ptrs, n, + NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH); + if (ret == 0) + bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE; + + return ret; + } else if (ret < 0) + return ret; + } + + return bmap->b_ops->bop_delete(bmap, key); +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +{ + __u64 lastkey; + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (!ret) + *key = lastkey; + up_read(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_delete - delete a key-record pair from a bmap + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_delete() deletes the key-record pair specified by + * @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. 
+ */ +int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_delete(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + __u64 lastkey; + int ret; + + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + + while (key <= lastkey) { + ret = nilfs_bmap_do_delete(bmap, lastkey); + if (ret < 0) + return ret; + ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + return ret; + } + } + return 0; +} + +/** + * nilfs_bmap_truncate - truncate a bmap to a specified key + * @bmap: bmap + * @key: key + * + * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are + * greater than or equal to @key from @bmap. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_do_truncate(bmap, key); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_clear - free resources a bmap holds + * @bmap: bmap + * + * Description: nilfs_bmap_clear() frees resources associated with @bmap. + */ +void nilfs_bmap_clear(struct nilfs_bmap *bmap) +{ + down_write(&bmap->b_sem); + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + up_write(&bmap->b_sem); +} + +/** + * nilfs_bmap_propagate - propagate dirty state + * @bmap: bmap + * @bh: buffer head + * + * Description: nilfs_bmap_propagate() marks the buffers that directly or + * indirectly refer to the block specified by @bh dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. 
+ * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_propagate(bmap, bh); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_lookup_dirty_buffers - invoke bop_lookup_dirty_buffers if set + * @bmap: bmap + * @listp: pointer to buffer head list + */ +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + if (bmap->b_ops->bop_lookup_dirty_buffers != NULL) + bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp); +} + +/** + * nilfs_bmap_assign - assign a new block number to a block + * @bmap: bmap + * @bh: pointer to buffer head + * @blocknr: block number + * @binfo: block information + * + * Description: nilfs_bmap_assign() assigns the block number @blocknr to the + * buffer specified by @bh. + * + * Return Value: On success, 0 is returned and the buffer head of a newly + * created buffer and the block information associated with the buffer are + * stored in the place pointed by @bh and @binfo, respectively. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_bmap_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + unsigned long blocknr, + union nilfs_binfo *binfo) +{ + int ret; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_mark - mark block dirty + * @bmap: bmap + * @key: key + * @level: level + * + * Description: nilfs_bmap_mark() marks the block specified by @key and @level + * as dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available.
+ */ +int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + int ret; + + if (bmap->b_ops->bop_mark == NULL) + return 0; + + down_write(&bmap->b_sem); + ret = bmap->b_ops->bop_mark(bmap, key, level); + up_write(&bmap->b_sem); + return ret; +} + +/** + * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state + * @bmap: bmap + * + * Description: nilfs_bmap_test_and_clear_dirty() is the atomic operation to + * test and clear the dirty state of @bmap. + * + * Return Value: 1 is returned if @bmap is dirty, or 0 if clear. + */ +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap) +{ + int ret; + + down_write(&bmap->b_sem); + ret = nilfs_bmap_dirty(bmap); + nilfs_bmap_clear_dirty(bmap); + up_write(&bmap->b_sem); + return ret; +} + + +/* + * Internal use only + */ + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n) +{ + inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n); + if (NILFS_MDT(bmap->b_inode)) + nilfs_mdt_mark_dirty(bmap->b_inode); + else + mark_inode_dirty(bmap->b_inode); +} + +int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 0); +} + +void nilfs_bmap_put_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + brelse(bh); +} + +int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr, + struct buffer_head **bhp) +{ + int ret; + + ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache, + ptr, 0, bhp, 1); + if (ret < 0) + return ret; + set_buffer_nilfs_volatile(*bhp); + return 0; +} + +void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + nilfs_btnode_delete(bh);
+} + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, + const struct buffer_head *bh) +{ + struct buffer_head *pbh; + __u64 key; + + key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - + bmap->b_inode->i_blkbits); + for (pbh = page_buffers(bh->b_page); pbh != bh; + pbh = pbh->b_this_page, key++); + + return key; +} + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key) +{ + __s64 diff; + + diff = key - bmap->b_last_allocated_key; + if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) && + (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) && + (bmap->b_last_allocated_ptr + diff > 0)) + return bmap->b_last_allocated_ptr + diff; + else + return NILFS_BMAP_INVALID_PTR; +} + +static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) +{ + return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); +} + +#define NILFS_BMAP_GROUP_DIV 8 +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) +{ + struct inode *dat = nilfs_bmap_get_dat(bmap); + unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat); + unsigned long group = bmap->b_inode->i_ino / entries_per_group; + + return group * entries_per_group + + (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) * + (entries_per_group / NILFS_BMAP_GROUP_DIV); +} + +static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + 
+static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req, + sector_t blocknr) +{ + nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req, + blocknr); +} + +static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0); +} + +static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1); +} + +static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); +} + +int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, + sector_t blocknr) +{ + return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); +} + +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) +{ + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); +} + +int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + int ret; + + ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq); + if (ret < 0) + return ret; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq); + if (ret < 0) + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + + return ret; +} + +void nilfs_bmap_commit_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq); +} + +void 
nilfs_bmap_abort_update(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *oldreq, + union nilfs_bmap_ptr_req *newreq) +{ + bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq); + bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq); +} + +static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr, + __u64 *ptrp) +{ + sector_t blocknr; + int ret; + + ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr); + if (ret < 0) + return ret; + if (ptrp != NULL) + *ptrp = blocknr; + return 0; +} + +static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* ignore target ptr */ + req->bpr_ptr = bmap->b_last_allocated_ptr++; + return 0; +} + +static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + /* do nothing */ +} + +static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap, + union nilfs_bmap_ptr_req *req) +{ + bmap->b_last_allocated_ptr--; +} + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_v, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v, + .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v, + .bpop_commit_start_ptr = nilfs_bmap_commit_start_v, + .bpop_abort_start_ptr = nilfs_bmap_abort_start_v, + .bpop_prepare_end_ptr = 
nilfs_bmap_prepare_end_v, + .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt, + .bpop_abort_end_ptr = nilfs_bmap_abort_end_v, + + .bpop_translate = nilfs_bmap_translate_v, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = { + .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p, + .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p, + .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = { + .bpop_prepare_alloc_ptr = NULL, + .bpop_commit_alloc_ptr = NULL, + .bpop_abort_alloc_ptr = NULL, + .bpop_prepare_start_ptr = NULL, + .bpop_commit_start_ptr = NULL, + .bpop_abort_start_ptr = NULL, + .bpop_prepare_end_ptr = NULL, + .bpop_commit_end_ptr = NULL, + .bpop_abort_end_ptr = NULL, + + .bpop_translate = NULL, +}; + +static struct lock_class_key nilfs_bmap_dat_lock_key; + +/** + * nilfs_bmap_read - read a bmap from an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_read() initializes the bmap @bmap. + * + * Return Value: On success, 0 is returned. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient amount of memory available. 
+ */ +int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + if (raw_inode == NULL) + memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE); + else + memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE); + + init_rwsem(&bmap->b_sem); + bmap->b_state = 0; + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_p; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); + break; + case NILFS_CPFILE_INO: + case NILFS_SUFILE_INO: + bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + default: + bmap->b_pops = &nilfs_bmap_ptr_ops_v; + bmap->b_last_allocated_key = 0; /* XXX: use macro */ + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + break; + } + + return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ? + nilfs_btree_init(bmap, + NILFS_BMAP_LARGE_LOW, + NILFS_BMAP_LARGE_HIGH) : + nilfs_direct_init(bmap, + NILFS_BMAP_SMALL_LOW, + NILFS_BMAP_SMALL_HIGH); +} + +/** + * nilfs_bmap_write - write back a bmap to an inode + * @bmap: bmap + * @raw_inode: on-disk inode + * + * Description: nilfs_bmap_write() stores @bmap in @raw_inode. 
+ */ +void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) +{ + down_write(&bmap->b_sem); + memcpy(raw_inode->i_bmap, bmap->b_u.u_data, + NILFS_INODE_BMAP_SIZE * sizeof(__le64)); + if (bmap->b_inode->i_ino == NILFS_DAT_INO) + bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT; /* DAT bmap: put the new-ptr counter back to its initial value */ + + up_write(&bmap->b_sem); +} +/** + * nilfs_bmap_init_gc - initialize a bmap for GC use + * @bmap: bmap + * + * Zeroes the raw bmap data, installs nilfs_bmap_ptr_ops_gc as the ptr + * operation table and finishes with nilfs_btree_init_gc(). + */ +void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) +{ + memset(&bmap->b_u, 0, NILFS_BMAP_SIZE); + init_rwsem(&bmap->b_sem); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; + bmap->b_pops = &nilfs_bmap_ptr_ops_gc; + bmap->b_last_allocated_key = 0; + bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; + bmap->b_state = 0; + nilfs_btree_init_gc(bmap); +} +/** + * nilfs_bmap_init_gcdat - set up @gcbmap as a copy of @bmap + * @gcbmap: bmap to initialize + * @bmap: bmap to copy from + * + * Copies sizeof(union nilfs_bmap_union) bytes from @bmap, then gives @gcbmap + * a freshly initialized semaphore and points b_inode at its own inode. + * + * NOTE(review): the lockdep class is set on @bmap->b_sem, not on the newly + * initialized @gcbmap->b_sem -- confirm this is intended. + */ +void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) +{ + memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); + init_rwsem(&gcbmap->b_sem); + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); + gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; +} +/** + * nilfs_bmap_commit_gcdat - copy @gcbmap back over @bmap + * @gcbmap: bmap to copy from + * @bmap: bmap to overwrite + * + * Copies sizeof(union nilfs_bmap_union) bytes from @gcbmap, then + * re-initializes @bmap's semaphore (with the DAT lock class) and its + * b_inode pointer. + */ +void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) +{ + memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); + init_rwsem(&bmap->b_sem); + lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); + bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; +} diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h new file mode 100644 index 00000000000..4f2708abb1b --- /dev/null +++ b/fs/nilfs2/bmap.h @@ -0,0 +1,244 @@ +/* + * bmap.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BMAP_H +#define _NILFS_BMAP_H + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "alloc.h" + +#define NILFS_BMAP_INVALID_PTR 0 + +#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey) +#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key) +#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr) +#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr) + +#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) + + +struct nilfs_bmap; + +/** + * union nilfs_bmap_ptr_req - request for bmap ptr + * @bpr_ptr: bmap pointer + * @bpr_req: request for persistent allocator + */ +union nilfs_bmap_ptr_req { + __u64 bpr_ptr; + struct nilfs_palloc_req bpr_req; +}; + +/** + * struct nilfs_bmap_stats - bmap statistics + * @bs_nblocks: number of blocks created or deleted + */ +struct nilfs_bmap_stats { + unsigned int bs_nblocks; +}; + +/** + * struct nilfs_bmap_operations - bmap operation table + */ +struct nilfs_bmap_operations { + int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); + int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); + int (*bop_delete)(struct nilfs_bmap *, __u64); + void (*bop_clear)(struct nilfs_bmap *); + + int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); + void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, + struct list_head *); + + int (*bop_assign)(struct nilfs_bmap *, + struct buffer_head **, + sector_t, + union nilfs_binfo *); + int (*bop_mark)(struct nilfs_bmap *, __u64, int); + + /* The following functions are internal use only. 
*/ + int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + int (*bop_check_insert)(const struct nilfs_bmap *, __u64); + int (*bop_check_delete)(struct nilfs_bmap *, __u64); + int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); +}; + + +/** + * struct nilfs_bmap_ptr_operations - bmap ptr operation table + */ +struct nilfs_bmap_ptr_operations { + int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + sector_t); + void (*bpop_abort_start_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + int (*bpop_prepare_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_commit_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + void (*bpop_abort_end_ptr)(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *); + + int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *); +}; + + +#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64)) +#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */) +#define NILFS_BMAP_NEW_PTR_INIT \ + (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1)) + +static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) +{ + return !!(ptr & NILFS_BMAP_NEW_PTR_INIT); +} + + +/** + * struct nilfs_bmap - bmap structure + * @b_u: raw data + * @b_sem: semaphore + * @b_inode: owner of bmap + * @b_ops: bmap operation table + * @b_pops: bmap ptr operation table + * @b_low: low watermark of conversion + * @b_high: high watermark of conversion + * @b_last_allocated_key: last allocated key for data block + * @b_last_allocated_ptr: last allocated ptr for data block + * @b_state: state + */ +struct nilfs_bmap { + union { + __u8 
u_flags; + __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)]; + } b_u; + struct rw_semaphore b_sem; + struct inode *b_inode; + const struct nilfs_bmap_operations *b_ops; + const struct nilfs_bmap_ptr_operations *b_pops; + __u64 b_low; + __u64 b_high; + __u64 b_last_allocated_key; + __u64 b_last_allocated_ptr; + int b_state; +}; + +/* state */ +#define NILFS_BMAP_DIRTY 0x00000001 + + +int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); +int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); +void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); +int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); +int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); +int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); +int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +void nilfs_bmap_clear(struct nilfs_bmap *); +int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); +void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); +int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, + unsigned long, union nilfs_binfo *); +int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *); +int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int); + +void nilfs_bmap_init_gc(struct nilfs_bmap *); +void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); + + +/* + * Internal use only + */ + +int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); +int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); + + +__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, + const struct buffer_head *); + +__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); +__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); + +int nilfs_bmap_prepare_update(struct nilfs_bmap *, + union 
nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_commit_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); +void nilfs_bmap_abort_update(struct nilfs_bmap *, + union nilfs_bmap_ptr_req *, + union nilfs_bmap_ptr_req *); + +void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); +void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); + + +int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *); +int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64, + struct buffer_head **); +void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *); + + +/* Assume that bmap semaphore is locked. */ +static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) +{ + return !!(bmap->b_state & NILFS_BMAP_DIRTY); +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state |= NILFS_BMAP_DIRTY; +} + +/* Assume that bmap semaphore is locked. */ +static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap) +{ + bmap->b_state &= ~NILFS_BMAP_DIRTY; +} + + +#define NILFS_BMAP_LARGE 0x1 + +#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN +#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX +#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX +#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX + +#endif /* _NILFS_BMAP_H */ diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h new file mode 100644 index 00000000000..d41509bff47 --- /dev/null +++ b/fs/nilfs2/bmap_union.h @@ -0,0 +1,42 @@ +/* + * bmap_union.h - NILFS block mapping. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_BMAP_UNION_H +#define _NILFS_BMAP_UNION_H + +#include "bmap.h" +#include "direct.h" +#include "btree.h" + +/** + * union nilfs_bmap_union - storage shared by the bmap variants + * @bi_bmap: bmap structure + * @bi_direct: direct map structure + * @bi_btree: B-tree structure + */ +union nilfs_bmap_union { + struct nilfs_bmap bi_bmap; + struct nilfs_direct bi_direct; + struct nilfs_btree bi_btree; +}; + +#endif /* _NILFS_BMAP_UNION_H */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c new file mode 100644 index 00000000000..4cc07b2c30e --- /dev/null +++ b/fs/nilfs2/btnode.c @@ -0,0 +1,316 @@ +/* + * btnode.c - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * This file was originally written by Seiji Kihara <kihara@osrg.net> + * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for + * stabilization and simplification. + * + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include <linux/backing-dev.h> +#include "nilfs.h" +#include "mdt.h" +#include "dat.h" +#include "page.h" +#include "btnode.h" + + +void nilfs_btnode_cache_init_once(struct address_space *btnc) +{ + INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); + spin_lock_init(&btnc->tree_lock); + INIT_LIST_HEAD(&btnc->private_list); + spin_lock_init(&btnc->private_lock); + + spin_lock_init(&btnc->i_mmap_lock); + INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap); + INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); +} + +static struct address_space_operations def_btnode_aops; + +void nilfs_btnode_cache_init(struct address_space *btnc) +{ + btnc->host = NULL; /* can safely set to host inode ? 
*/ + btnc->flags = 0; + mapping_set_gfp_mask(btnc, GFP_NOFS); + btnc->assoc_mapping = NULL; + btnc->backing_dev_info = &default_backing_dev_info; + btnc->a_ops = &def_btnode_aops; +} +/** + * nilfs_btnode_cache_clear - drop all pages of a B-tree node cache + * @btnc: B-tree node cache + * + * Calls invalidate_mapping_pages() over the whole index range and then + * truncate_inode_pages() from offset 0. + */ +void nilfs_btnode_cache_clear(struct address_space *btnc) +{ + invalidate_mapping_pages(btnc, 0, -1); + truncate_inode_pages(btnc, 0); +} +/** + * nilfs_btnode_submit_block - grab a B-tree node buffer and start its read + * @btnc: B-tree node cache + * @blocknr: block number used as the cache key + * @pblocknr: block address to read from; if 0 it is derived from @blocknr + * (through nilfs_dat_translate() unless the inode is the DAT) + * @pbh: where the buffer head is stored + * @newblk: nonzero to set up a new block instead of reading one + * + * Return Value: 0 when a read was submitted and -EEXIST (internal code) + * when no read is needed (@newblk set, or the buffer is already uptodate + * or dirty); *@pbh is set in both cases. -ENOMEM is returned when + * nilfs_grab_buffer() fails, and a failure of nilfs_dat_translate() is + * returned as is; *@pbh is not set in those cases. + * + * The page of the buffer is unlocked and released before returning + * (presumably it comes back locked from nilfs_grab_buffer() -- confirm). + */ +int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, struct buffer_head **pbh, + int newblk) +{ + struct buffer_head *bh; + struct inode *inode = NILFS_BTNC_I(btnc); + int err; + + bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); + if (unlikely(!bh)) + return -ENOMEM; + + err = -EEXIST; /* internal code */ + if (newblk) { + if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) || + buffer_dirty(bh))) { + brelse(bh); + BUG(); + } + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = blocknr; + set_buffer_mapped(bh); + set_buffer_uptodate(bh); + goto found; + } + + if (buffer_uptodate(bh) || buffer_dirty(bh)) + goto found; + + if (pblocknr == 0) { + pblocknr = blocknr; + if (inode->i_ino != NILFS_DAT_INO) { + struct inode *dat = + nilfs_dat_inode(NILFS_I_NILFS(inode)); + + /* blocknr is a virtual block number */ + err = nilfs_dat_translate(dat, blocknr, &pblocknr); + if (unlikely(err)) { + brelse(bh); /* NOTE(review): bh->b_page is still used at out_locked after this brelse(); presumably safe while the page stays locked -- confirm */ + goto out_locked; + } + } + } + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + err = -EEXIST; /* internal code */ + goto found; + } + set_buffer_mapped(bh); + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_blocknr = pblocknr; /* set block address for read */ + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + bh->b_blocknr = blocknr; /* set back to the given block address */ + err = 0; +found: + *pbh = bh; + +out_locked: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} +/** + * nilfs_btnode_get - get a B-tree node buffer, waiting for its read + * + * Takes the same arguments as nilfs_btnode_submit_block(). Returns 0 on a + * cache hit or once the buffer is uptodate, -EIO (after releasing the + * buffer) if it is not uptodate after waiting, or the error returned by + * nilfs_btnode_submit_block(). + */ +int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr, + sector_t pblocknr, struct buffer_head **pbh, int newblk) +{ +
struct buffer_head *bh; + int err; + + err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk); + if (err == -EEXIST) /* internal code (cache hit) */ + return 0; + if (unlikely(err)) + return err; + + bh = *pbh; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + return 0; +} + +/** + * nilfs_btnode_delete - delete B-tree node buffer + * @bh: buffer to be deleted + * + * nilfs_btnode_delete() invalidates the specified buffer and delete the page + * including the buffer if the page gets unbusy. + */ +void nilfs_btnode_delete(struct buffer_head *bh) +{ + struct address_space *mapping; + struct page *page = bh->b_page; + pgoff_t index = page_index(page); + int still_dirty; + + page_cache_get(page); + lock_page(page); + wait_on_page_writeback(page); + + nilfs_forget_buffer(bh); + still_dirty = PageDirty(page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); + + if (!still_dirty && mapping) + invalidate_inode_pages2_range(mapping, index, index); +} + +/** + * nilfs_btnode_prepare_change_key + * prepare to move contents of the block for old key to one of new key. + * the old buffer will not be removed, but might be reused for new buffer. + * it might return -ENOMEM because of memory allocation errors, + * and might return -EIO because of disk read errors. + * @btnc: B-tree node cache holding the node + * @ctxt: change-key context; oldkey, newkey and bh are read, newbh is set + * to NULL when the page itself is re-keyed or to the new buffer in + * copy mode + * + * When the block size equals the page size, the page of ctxt->bh is also + * inserted under newkey and is left locked on success; otherwise (or when + * an existing page at newkey cannot be invalidated) a new buffer for + * newkey is obtained with nilfs_btnode_get(). + */ +int nilfs_btnode_prepare_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh, *nbh; + struct inode *inode = NILFS_BTNC_I(btnc); + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + int err; + + if (oldkey == newkey) + return 0; + + obh = ctxt->bh; + ctxt->newbh = NULL; + + if (inode->i_blkbits == PAGE_CACHE_SHIFT) { + lock_page(obh->b_page); + /* + * We cannot call radix_tree_preload for the kernels older + * than 2.6.23, because it is not exported for modules.
+ */ +retry: + /* + * Preload on every pass: radix_tree_preload_end() below ends the + * preload, so a retried insert must not run without a fresh one. + */ + err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (err) + goto failed_unlock; + /* BUG_ON(oldkey != obh->b_page->index); */ + if (unlikely(oldkey != obh->b_page->index)) + NILFS_PAGE_BUG(obh->b_page, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + + spin_lock_irq(&btnc->tree_lock); + err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); + spin_unlock_irq(&btnc->tree_lock); + /* + * Note: page->index will not change to newkey until + * nilfs_btnode_commit_change_key() will be called. + * To protect the page in intermediate state, the page lock + * is held. + */ + radix_tree_preload_end(); + if (!err) + return 0; + else if (err != -EEXIST) + goto failed_unlock; + + err = invalidate_inode_pages2_range(btnc, newkey, newkey); + if (!err) + goto retry; + /* fallback to copy mode */ + unlock_page(obh->b_page); + } + + err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1); + if (likely(!err)) { + BUG_ON(nbh == obh); + ctxt->newbh = nbh; + } + return err; + + failed_unlock: + unlock_page(obh->b_page); + return err; +} + +/** + * nilfs_btnode_commit_change_key + * commit the change_key operation prepared by prepare_change_key().
+ */ +void nilfs_btnode_commit_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + struct page *opage; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + opage = obh->b_page; + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, + "invalid oldkey %lld (newkey=%lld)", + (unsigned long long)oldkey, + (unsigned long long)newkey); + if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage)) + BUG(); + + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, oldkey); + radix_tree_tag_set(&btnc->page_tree, newkey, + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&btnc->tree_lock); + + opage->index = obh->b_blocknr = newkey; + unlock_page(opage); + } else { + nilfs_copy_buffer(nbh, obh); + nilfs_btnode_mark_dirty(nbh); + + nbh->b_blocknr = newkey; + ctxt->bh = nbh; + nilfs_btnode_delete(obh); /* will decrement bh->b_count */ + } +} + +/** + * nilfs_btnode_abort_change_key + * abort the change_key operation prepared by prepare_change_key(). + */ +void nilfs_btnode_abort_change_key(struct address_space *btnc, + struct nilfs_btnode_chkey_ctxt *ctxt) +{ + struct buffer_head *nbh = ctxt->newbh; + __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey; + + if (oldkey == newkey) + return; + + if (nbh == NULL) { /* blocksize == pagesize */ + spin_lock_irq(&btnc->tree_lock); + radix_tree_delete(&btnc->page_tree, newkey); + spin_unlock_irq(&btnc->tree_lock); + unlock_page(ctxt->bh->b_page); + } else + brelse(nbh); +} diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h new file mode 100644 index 00000000000..35faa86444a --- /dev/null +++ b/fs/nilfs2/btnode.h @@ -0,0 +1,58 @@ +/* + * btnode.h - NILFS B-tree node cache + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_BTNODE_H +#define _NILFS_BTNODE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/backing-dev.h> + + +struct nilfs_btnode_chkey_ctxt { + __u64 oldkey; + __u64 newkey; + struct buffer_head *bh; + struct buffer_head *newbh; +}; + +void nilfs_btnode_cache_init_once(struct address_space *); +void nilfs_btnode_cache_init(struct address_space *); +void nilfs_btnode_cache_clear(struct address_space *); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +int nilfs_btnode_get(struct address_space *, __u64, sector_t, + struct buffer_head **, int); +void nilfs_btnode_delete(struct buffer_head *); +int nilfs_btnode_prepare_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_commit_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); +void nilfs_btnode_abort_change_key(struct address_space *, + struct nilfs_btnode_chkey_ctxt *); + +#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh) + + +#endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c new 
file mode 100644 index 00000000000..6b37a276729 --- /dev/null +++ b/fs/nilfs2/btree.c @@ -0,0 +1,2269 @@ +/* + * btree.c - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "page.h" +#include "btnode.h" +#include "btree.h" +#include "alloc.h" + +/** + * struct nilfs_btree_path - A path on which B-tree operations are executed + * @bp_bh: buffer head of node block + * @bp_sib_bh: buffer head of sibling node block + * @bp_index: index of child node + * @bp_oldreq: ptr end request for old ptr + * @bp_newreq: ptr alloc request for new ptr + * @bp_ctxt: context for the btnode change-key operation + * @bp_op: rebalance operation + */ +struct nilfs_btree_path { + struct buffer_head *bp_bh; + struct buffer_head *bp_sib_bh; + int bp_index; + union nilfs_bmap_ptr_req bp_oldreq; + union nilfs_bmap_ptr_req bp_newreq; + struct nilfs_btnode_chkey_ctxt bp_ctxt; + void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, + int, __u64 *, __u64 *); +}; + +/* + * B-tree path operations + */ + +static struct kmem_cache *nilfs_btree_path_cache; + +int __init nilfs_btree_path_cache_init(void) +{ +
nilfs_btree_path_cache = + kmem_cache_create("nilfs2_btree_path_cache", + sizeof(struct nilfs_btree_path) * + NILFS_BTREE_LEVEL_MAX, 0, 0, NULL); + return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM; +} + +void nilfs_btree_path_cache_destroy(void) +{ + kmem_cache_destroy(nilfs_btree_path_cache); +} + +static inline struct nilfs_btree_path * +nilfs_btree_alloc_path(const struct nilfs_btree *btree) +{ + return (struct nilfs_btree_path *) + kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); +} + +static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + kmem_cache_free(nilfs_btree_path_cache, path); +} + +static void nilfs_btree_init_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_DATA; + level < NILFS_BTREE_LEVEL_MAX; + level++) { + path[level].bp_bh = NULL; + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } +} + +static void nilfs_btree_clear_path(const struct nilfs_btree *btree, + struct nilfs_btree_path *path) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_DATA; + level < NILFS_BTREE_LEVEL_MAX; + level++) { + if (path[level].bp_bh != NULL) { + nilfs_bmap_put_block(&btree->bt_bmap, + path[level].bp_bh); + path[level].bp_bh = NULL; + } + /* sib_bh is released or deleted by prepare or commit + * operations. 
*/ + path[level].bp_sib_bh = NULL; + path[level].bp_index = 0; + path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; + path[level].bp_op = NULL; + } +} + + +/* + * B-tree node operations + */ + +static inline int +nilfs_btree_node_get_flags(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_flags; +} + +static inline void +nilfs_btree_node_set_flags(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags) +{ + node->bn_flags = flags; +} + +static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; +} + +static inline int +nilfs_btree_node_get_level(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return node->bn_level; +} + +static inline void +nilfs_btree_node_set_level(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int level) +{ + node->bn_level = level; +} + +static inline int +nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return le16_to_cpu(node->bn_nchildren); +} + +static inline void +nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int nchildren) +{ + node->bn_nchildren = cpu_to_le16(nchildren); +} + +static inline int +nilfs_btree_node_size(const struct nilfs_btree *btree) +{ + return 1 << btree->bt_bmap.b_inode->i_blkbits; +} + +static inline int +nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? 
+ NILFS_BTREE_ROOT_NCHILDREN_MIN : + NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); +} + +static inline int +nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return nilfs_btree_node_root(btree, node) ? + NILFS_BTREE_ROOT_NCHILDREN_MAX : + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); +} + +static inline __le64 * +nilfs_btree_node_dkeys(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)((char *)(node + 1) + + (nilfs_btree_node_root(btree, node) ? + 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); +} + +static inline __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node) +{ + return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + + nilfs_btree_node_nchildren_max(btree, node)); +} + +static inline __u64 +nilfs_btree_node_get_key(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, int index) +{ + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_key(struct nilfs_btree *btree, + struct nilfs_btree_node *node, int index, __u64 key) +{ + *(nilfs_btree_node_dkeys(btree, node) + index) = + nilfs_bmap_key_to_dkey(key); +} + +static inline __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + int index) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + index)); +} + +static inline void +nilfs_btree_node_set_ptr(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int index, + __u64 ptr) +{ + *(nilfs_btree_node_dptrs(btree, node) + index) = + nilfs_bmap_ptr_to_dptr(ptr); +} + +static void nilfs_btree_node_init(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + int flags, int level, int nchildren, + const __u64 *keys, const __u64 *ptrs) +{ + __le64 *dkeys; + __le64 *dptrs; + int i; + + 
nilfs_btree_node_set_flags(btree, node, flags); + nilfs_btree_node_set_level(btree, node, level); + nilfs_btree_node_set_nchildren(btree, node, nchildren); + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nchildren; i++) { + dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); + dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); + } +} + +/* Move the first n key/pointer pairs of right to the end of left and adjust both child counts. Assume the buffer heads corresponding to left and right are locked. */ +static void nilfs_btree_node_move_left(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); +/* append right's first n entries to left, then slide the rest of right down to index 0 */ + memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); + memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); + memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys)); + memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs)); + + lnchildren += n; + rnchildren -= n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Move the last n key/pointer pairs of left to the front of right and adjust both child counts. Assume that the buffer heads corresponding to left and right are locked. 
*/ +static void nilfs_btree_node_move_right(struct nilfs_btree *btree, + struct nilfs_btree_node *left, + struct nilfs_btree_node *right, + int n) +{ + __le64 *ldkeys, *rdkeys; + __le64 *ldptrs, *rdptrs; + int lnchildren, rnchildren; + + ldkeys = nilfs_btree_node_dkeys(btree, left); + ldptrs = nilfs_btree_node_dptrs(btree, left); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + + rdkeys = nilfs_btree_node_dkeys(btree, right); + rdptrs = nilfs_btree_node_dptrs(btree, right); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); +/* open an n-entry gap at the front of right, then fill it with the last n entries of left */ + memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); + memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); + memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys)); + memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs)); + + lnchildren -= n; + rnchildren += n; + nilfs_btree_node_set_nchildren(btree, left, lnchildren); + nilfs_btree_node_set_nchildren(btree, right, rnchildren); +} + +/* Insert the key/ptr pair at position index, shifting the entries from index onwards up by one, and increment the child count. Assume that the buffer head corresponding to node is locked. */ +static void nilfs_btree_node_insert(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 key, __u64 ptr, int index) +{ + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (index < nchildren) { + memmove(dkeys + index + 1, dkeys + index, + (nchildren - index) * sizeof(*dkeys)); + memmove(dptrs + index + 1, dptrs + index, + (nchildren - index) * sizeof(*dptrs)); + } + dkeys[index] = nilfs_bmap_key_to_dkey(key); + dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); + nchildren++; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} + +/* Remove the entry at index, shifting later entries down by one and decrementing the child count; the removed key and pointer are stored through keyp and ptrp when those are non-NULL. Assume that the buffer head corresponding to node is locked. 
*/ +static void nilfs_btree_node_delete(struct nilfs_btree *btree, + struct nilfs_btree_node *node, + __u64 *keyp, __u64 *ptrp, int index) +{ + __u64 key; + __u64 ptr; + __le64 *dkeys; + __le64 *dptrs; + int nchildren; + + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + key = nilfs_bmap_dkey_to_key(dkeys[index]); + ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (keyp != NULL) + *keyp = key; + if (ptrp != NULL) + *ptrp = ptr; + + if (index < nchildren - 1) { + memmove(dkeys + index, dkeys + index + 1, + (nchildren - index - 1) * sizeof(*dkeys)); + memmove(dptrs + index, dptrs + index + 1, + (nchildren - index - 1) * sizeof(*dptrs)); + } + nchildren--; + nilfs_btree_node_set_nchildren(btree, node, nchildren); +} +/* Binary-search node for key. Stores the resulting index in *indexp and returns nonzero only on an exact match. NOTE(review): for a node with no children the loop is skipped and s stays 0, so a match is reported; confirm callers never pass an empty node. */ +static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, + const struct nilfs_btree_node *node, + __u64 key, int *indexp) +{ + __u64 nkey; + int index, low, high, s; + + /* binary search; s records how the last probed key compared: -1 smaller than key, 1 larger, 0 equal */ + low = 0; + high = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = 0; + s = 0; + while (low <= high) { + index = (low + high) / 2; + nkey = nilfs_btree_node_get_key(btree, node, index); + if (nkey == key) { + s = 0; + goto out; + } else if (nkey < key) { + low = index + 1; + s = -1; + } else { + high = index - 1; + s = 1; + } + } + + /* adjust index: above the lowest node level step back to the entry preceding a larger key; at the lowest node level step past a smaller key */ + if (nilfs_btree_node_get_level(btree, node) > + NILFS_BTREE_LEVEL_NODE_MIN) { + if ((s > 0) && (index > 0)) + index--; + } else if (s < 0) + index++; + + out: + *indexp = index; + + return s == 0; +} +/* Return the root node, which lives in bt_bmap.b_u.u_data. */ +static inline struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_btree *btree) +{ + return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; +} +/* Return the node held in the buffer path[level].bp_bh. */ +static inline struct nilfs_btree_node * +nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_bh->b_data; +} +/* Return the node held in the sibling buffer path[level].bp_sib_bh. */ 
+static inline struct nilfs_btree_node * +nilfs_btree_get_sib_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; +} +/* Tree height: the level of the root node plus one. */ +static inline int nilfs_btree_height(const struct nilfs_btree *btree) +{ + return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) + + 1; +} +/* Return the node at level: the root when level is the top level, otherwise the node in path[level].bp_bh. */ +static inline struct nilfs_btree_node * +nilfs_btree_get_node(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + int level) +{ + return (level == nilfs_btree_height(btree) - 1) ? + nilfs_btree_get_root(btree) : + nilfs_btree_get_nonroot_node(btree, path, level); +} +/* Descend from the root down to minlevel looking for key, recording the buffer and index used at each level in path. Returns -ENOENT if the root is below minlevel, the root is empty or key is not found, a negative error from nilfs_bmap_get_block(), or 0 with the pointer stored in *ptrp when ptrp is non-NULL. *ptrp is written only on success. */ +static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 key, __u64 *ptrp, int minlevel) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int level, index, found, ret; + + node = nilfs_btree_get_root(btree); + level = nilfs_btree_node_get_level(btree, node); + if ((level < minlevel) || + (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + return -ENOENT; + + found = nilfs_btree_node_lookup(btree, node, key, &index); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; +/* once an exact match has been seen at an upper level, entry 0 is followed at every lower level */ + for (level--; level >= minlevel; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + if (!found) + found = nilfs_btree_node_lookup(btree, node, key, + &index); + else + index = 0; + if (index < nilfs_btree_node_nchildren_max(btree, node)) + ptr = nilfs_btree_node_get_ptr(btree, node, index); + else { + WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); + /* insert: the index is one past the last slot of a full node, so there is no pointer to read */ + ptr = NILFS_BMAP_INVALID_PTR; + } + path[level].bp_index = index; + } + if (!found) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} +/* Follow the last entry of each node from the root down to level 1, recording the path. Returns -ENOENT for an empty root or a negative error from nilfs_bmap_get_block(); on success stores the last key and its pointer through keyp and ptrp when they are non-NULL and returns 0. */ +static int 
nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, + struct nilfs_btree_path *path, + __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + __u64 ptr; + int index, level, ret; + + node = nilfs_btree_get_root(btree); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + if (index < 0) + return -ENOENT; + level = nilfs_btree_node_get_level(btree, node); + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_bh = NULL; + path[level].bp_index = index; + + for (level--; level > 0; level--) { + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, + &path[level].bp_bh); + if (ret < 0) + return ret; + node = nilfs_btree_get_nonroot_node(btree, path, level); + BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + index = nilfs_btree_node_get_nchildren(btree, node) - 1; + ptr = nilfs_btree_node_get_ptr(btree, node, index); + path[level].bp_index = index; + } + + if (keyp != NULL) + *keyp = nilfs_btree_node_get_key(btree, node, index); + if (ptrp != NULL) + *ptrp = ptr; + + return 0; +} + +/* + * Look up key down to the given level using a temporary path. + * Returns -ENOMEM if the path cannot be allocated, otherwise the result + * of nilfs_btree_do_lookup(). *ptrp (when non-NULL) is written only when + * the lookup succeeds. + */ +static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + /* + * Hand ptrp straight to the lookup: it stores the result only on + * success and accepts NULL, so a failed lookup no longer copies an + * uninitialized local into *ptrp. + */ + ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level); + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} + +/* + * Store key at path[level].bp_index in the node at level and keep moving + * up one level for as long as the slot just updated is index 0. The root, + * if reached, is updated without buffer locking. + */ +static void nilfs_btree_promote_key(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 key) +{ + if (level < nilfs_btree_height(btree) - 1) { + do { + lock_buffer(path[level].bp_bh); + nilfs_btree_node_set_key( + btree, + nilfs_btree_get_nonroot_node( + btree, path, level), + path[level].bp_index, key); + if (!buffer_dirty(path[level].bp_bh)) + 
nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + } while ((path[level].bp_index == 0) && + (++level < nilfs_btree_height(btree) - 1)); + } + + /* root: reached only when every node below was updated at index 0, or when the caller started at the top level */ + if (level == nilfs_btree_height(btree) - 1) { + nilfs_btree_node_set_key(btree, + nilfs_btree_get_root(btree), + path[level].bp_index, key); + } +} +/* Insert *keyp / *ptrp at path[level].bp_index in the node at level. A non-root node is modified under its buffer lock and marked dirty, and when the entry went into slot 0 the node's new first key is promoted to the level above. */ +static void nilfs_btree_do_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) - 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key( + btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_insert(btree, node, *keyp, *ptrp, + path[level].bp_index); + } +} +/* Make room in the node at level by moving entries into its left sibling (path[level].bp_sib_bh), fix up the parent key and path, then insert *keyp / *ptrp. */ +static void nilfs_btree_carry_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); + move = 0; +/* n is the number of entries handed to the left sibling */ + n = (nchildren + lnchildren + 1) / 2 - lnchildren; + if (n > path[level].bp_index) { + /* move insert point: the insert position lies among the entries being moved, so move one fewer and insert into the left sibling instead */ + n--; + move = 1; + } + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + 
nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); +/* continue in the left sibling when the insert point moved there (its parent slot is one to the left); otherwise release the sibling and shift the index by the n entries that left this node */ + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += lnchildren; + path[level + 1].bp_index--; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index -= n; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} +/* Make room in the node at level by moving entries into its right sibling (path[level].bp_sib_bh), fix up the parent key and path, then insert *keyp / *ptrp. */ +static void nilfs_btree_carry_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); + move = 0; +/* n is the number of entries handed to the right sibling */ + n = (nchildren + rnchildren + 1) / 2 - rnchildren; + if (n > nchildren - path[level].bp_index) { + /* move insert point: the insert position lies among the entries being moved, so move one fewer and insert into the right sibling instead */ + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); +/* the first key of right changed: point the parent index at right's slot for the promotion, then restore it */ + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + if (move) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index -= + 
nilfs_btree_node_get_nchildren(btree, node); + path[level + 1].bp_index++; + } else { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); +} + +/* + * Split the node at level: move about half of its entries into the + * sibling in path[level].bp_sib_bh, insert *keyp / *ptrp into whichever + * of the two nodes now covers the insert position, and hand the sibling's + * first key and path[level].bp_newreq.bpr_ptr back through keyp / ptrp. + * The parent index is advanced by one before returning. + */ +static void nilfs_btree_split(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, n, move; + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + move = 0; + + /* n entries go to the sibling; one fewer when the insert position is among them */ + n = (nchildren + 1) / 2; + if (n > nchildren - path[level].bp_index) { + n--; + move = 1; + } + + nilfs_btree_node_move_right(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + if (move) { + /* the new entry belongs in the sibling: rebase the index and continue there */ + path[level].bp_index -= + nilfs_btree_node_get_nchildren(btree, node); + nilfs_btree_node_insert(btree, right, *keyp, *ptrp, + path[level].bp_index); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + } else { + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, right, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + } + + path[level + 1].bp_index++; +} + +/* + * Grow the tree by one level: move every root entry into the node in + * path[level].bp_sib_bh, raise the root to level + 1, insert + * *keyp / *ptrp at level, and hand the child's first key and + * path[level].bp_newreq.bpr_ptr back through keyp / ptrp. + */ +static void nilfs_btree_grow(struct 
nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + lock_buffer(path[level].bp_sib_bh); + + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, root); +/* empty the root into the child and raise the root one level */ + nilfs_btree_node_move_right(btree, root, child, n); + nilfs_btree_node_set_level(btree, root, level + 1); + + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_sib_bh); +/* the child becomes the path node at this level */ + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + + nilfs_btree_do_insert(btree, path, level, keyp, ptrp); + + *keyp = nilfs_btree_node_get_key(btree, child, 0); + *ptrp = path[level].bp_newreq.bpr_ptr; +} +/* Suggest a pointer near the insert position: the entry just left of the path index at the lowest node level if there is one, else the parent's entry at its path index, else NILFS_BMAP_INVALID_PTR (also returned when path is NULL). */ +static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path) +{ + struct nilfs_btree_node *node; + int level; + + if (path == NULL) + return NILFS_BMAP_INVALID_PTR; + + /* left sibling */ + level = NILFS_BTREE_LEVEL_NODE_MIN; + if (path[level].bp_index > 0) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index - 1); + } + + /* parent */ + level = NILFS_BTREE_LEVEL_NODE_MIN + 1; + if (level <= nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_node(btree, path, level); + return nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + } + + return NILFS_BMAP_INVALID_PTR; +} +/* Choose a target pointer for a new block: try nilfs_bmap_find_target_seq(), then nilfs_btree_find_near(), and fall back to nilfs_bmap_find_target_in_group(). */ +static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, + const struct nilfs_btree_path *path, + __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else { + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + } + /* block group */ + return 
nilfs_bmap_find_target_in_group(&btree->bt_bmap); +} +/* Record key and ptr in b_last_allocated_key / b_last_allocated_ptr of the bmap. */ +static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, + __u64 ptr) +{ + btree->bt_bmap.b_last_allocated_key = key; + btree->bt_bmap.b_last_allocated_ptr = ptr; +} +/* Plan the insertion of key along path: reserve a pointer for the data block, then walk up from the lowest node level choosing one operation per level (plain insert, carry into a non-full sibling, or split into a newly reserved node) and finally handle the root (insert or grow). On success returns the last result of the reservation calls, stores the highest level to process in *levelp and a block count in stats->bs_nblocks. On failure the reservations made so far are aborted, stats->bs_nblocks is reset to 0 and a negative error is returned. NOTE(review): the ptr argument is not used in this function. */ +static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int *levelp, __u64 key, __u64 ptr, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + stats->bs_nblocks = 0; + level = NILFS_BTREE_LEVEL_DATA; + + /* allocate a new ptr for data block; btop_find_target, when provided, seeds the request with a preferred pointer */ + if (btree->bt_ops->btop_find_target != NULL) + path[level].bp_newreq.bpr_ptr = + btree->bt_ops->btop_find_target(btree, path, key); + + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_data; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; + + /* left sibling: usable when it still has room */ + if (pindex > 0) { + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_left; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* right sibling: usable when it still has room */ + if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + sibptr = 
nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_child_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) < + nilfs_btree_node_nchildren_max(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_carry_right; + stats->bs_nblocks++; + goto out; + } else + nilfs_bmap_put_block(&btree->bt_bmap, bh); + } + + /* split: request the pointer following the one reserved at the level below and set up an empty sibling node for it; the loop then continues with the parent */ + path[level].bp_newreq.bpr_ptr = + path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + + stats->bs_nblocks++; + + lock_buffer(bh); + nilfs_btree_node_init(btree, + (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_split; + } + + /* root: every node below was full and will be split */ + node = nilfs_btree_get_root(btree); + if (nilfs_btree_node_get_nchildren(btree, node) < + nilfs_btree_node_nchildren_max(btree, node)) { + path[level].bp_op = nilfs_btree_do_insert; + stats->bs_nblocks++; + goto out; + } + + /* grow: the root is full as well, so reserve and initialise an empty node for nilfs_btree_grow and schedule a plain insert one level higher */ + path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; + ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + if (ret < 0) + goto err_out_child_node; + ret = nilfs_bmap_get_new_block(&btree->bt_bmap, + path[level].bp_newreq.bpr_ptr, &bh); + if (ret < 0) + goto err_out_curr_node; + + lock_buffer(bh); + nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, + 0, level, 0, NULL, NULL); + unlock_buffer(bh); + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_grow; + + level++; + path[level].bp_op = nilfs_btree_do_insert; + + /* a newly-created node block and a data block 
are added */ + stats->bs_nblocks += 2; + + /* success */ + out: + *levelp = level; + return ret; + + /* error: undo the reservations, newest first */ + err_out_curr_node: + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_child_node: + for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr( + &btree->bt_bmap, &path[level].bp_newreq); + + } + + btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap, + &path[level].bp_newreq); + err_out_data: + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} +/* Carry out the operations chosen by nilfs_btree_prepare_insert() for levels NILFS_BTREE_LEVEL_NODE_MIN..maxlevel, committing the pointer reserved at level - 1 before each one, then mark the bmap dirty. NOTE(review): on entry ptr is cast to a struct buffer_head pointer for set_buffer_nilfs_volatile() and is then overwritten with the reserved data-block pointer. */ +static void nilfs_btree_commit_insert(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel, __u64 key, __u64 ptr) +{ + int level; + + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, ptr); +/* each bp_op may rewrite key and ptr with the pair to insert at the next level up */ + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) { + btree->bt_bmap.b_pops->bpop_commit_alloc_ptr( + &btree->bt_bmap, &path[level - 1].bp_newreq); + } + path[level].bp_op(btree, path, level, &key, &ptr); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} +/* Insert key into the tree. Returns -ENOMEM if no path can be allocated, -EEXIST if the key is already present, another negative error from the lookup or the preparation step, or the preparation result after the insert has been committed and the block count added to the bmap. */ +static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); +/* only -ENOENT (key absent) lets the insert proceed */ + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret != -ENOENT) { + if (ret == 0) + ret = -EEXIST; + goto out; + } + + ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats); + if 
(ret < 0) + goto out; + nilfs_btree_commit_insert(btree, path, level, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} +/* Delete the entry at path[level].bp_index from the node at level, returning the removed key and pointer through keyp / ptrp when they are non-NULL. A non-root node is modified under its buffer lock and marked dirty, and when slot 0 was removed the node's new first key is promoted to the level above. */ +static void nilfs_btree_do_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node; + + if (level < nilfs_btree_height(btree) - 1) { + lock_buffer(path[level].bp_bh); + node = nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + unlock_buffer(path[level].bp_bh); + if (path[level].bp_index == 0) + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, node, 0)); + } else { + node = nilfs_btree_get_root(btree); + nilfs_btree_node_delete(btree, node, keyp, ptrp, + path[level].bp_index); + } +} +/* Delete the entry, then refill the node by moving n entries over from its left sibling (path[level].bp_sib_bh), promote the node's new first key, release the sibling and advance the path index by n. */ +static void nilfs_btree_borrow_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int nchildren, lnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + lnchildren = nilfs_btree_node_get_nchildren(btree, left); +/* n brings this node up to half of the combined entry count */ + n = (nchildren + lnchildren) / 2 - nchildren; + + nilfs_btree_node_move_right(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_btree_promote_key(btree, path, level + 
1, + nilfs_btree_node_get_key(btree, node, 0)); + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level].bp_index += n; +} +/* Delete the entry, then refill the node by moving n entries over from its right sibling (path[level].bp_sib_bh), promote the sibling's new first key into the parent slot after this node's, and release the sibling. */ +static void nilfs_btree_borrow_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int nchildren, rnchildren, n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + nchildren = nilfs_btree_node_get_nchildren(btree, node); + rnchildren = nilfs_btree_node_get_nchildren(btree, right); +/* n brings this node up to half of the combined entry count */ + n = (nchildren + rnchildren) / 2 - nchildren; + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + if (!buffer_dirty(path[level].bp_sib_bh)) + nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); +/* the first key of right changed: point the parent index at right's slot for the promotion, then restore it */ + path[level + 1].bp_index++; + nilfs_btree_promote_key(btree, path, level + 1, + nilfs_btree_node_get_key(btree, right, 0)); + path[level + 1].bp_index--; + + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; +} +/* Delete the entry, then append every remaining entry of the node to its left sibling (path[level].bp_sib_bh), delete the emptied node's block and continue with the sibling as the path node at this level. */ +static void nilfs_btree_concat_left(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *left; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + left = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, node); + + nilfs_btree_node_move_left(btree, left, node, n); + + if (!buffer_dirty(path[level].bp_sib_bh)) + 
nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = path[level].bp_sib_bh; + path[level].bp_sib_bh = NULL; + path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); +} +/* Delete the entry, then pull every entry of the right sibling (path[level].bp_sib_bh) into the node, delete the sibling's block and advance the parent's path index by one. */ +static void nilfs_btree_concat_right(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *node, *right; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + lock_buffer(path[level].bp_sib_bh); + + node = nilfs_btree_get_nonroot_node(btree, path, level); + right = nilfs_btree_get_sib_node(btree, path, level); + + n = nilfs_btree_node_get_nchildren(btree, right); + + nilfs_btree_node_move_left(btree, node, right, n); + + if (!buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + unlock_buffer(path[level].bp_bh); + unlock_buffer(path[level].bp_sib_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh); + path[level].bp_sib_bh = NULL; + path[level + 1].bp_index++; +} +/* Delete the entry, then collapse the tree by one level: remove entry 0 of the root, lower the root to level, move every entry of the child at level into the root and delete the child's block. */ +static void nilfs_btree_shrink(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ + struct nilfs_btree_node *root, *child; + int n; + + nilfs_btree_do_delete(btree, path, level, keyp, ptrp); + + lock_buffer(path[level].bp_bh); + root = nilfs_btree_get_root(btree); + child = nilfs_btree_get_nonroot_node(btree, path, level); + + nilfs_btree_node_delete(btree, root, NULL, NULL, 0); + nilfs_btree_node_set_level(btree, root, level); + n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_move_left(btree, root, child, n); + unlock_buffer(path[level].bp_bh); + + nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh); + path[level].bp_bh = NULL; +} + +/* Plan the deletion of the entry that path points at: for each level from the lowest node level upwards, record the pointer being ended, choose an operation (plain delete, borrow from a sibling, concatenate with a sibling, or shrink) and stop at the first level whose change does not propagate further up. Returns 0 (or the last prepare result) with the top level in *levelp and a block count in stats->bs_nblocks, or a negative error after releasing what was acquired. */ +static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, + struct 
nilfs_btree_path *path, + int *levelp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree_node *node, *parent, *sib; + __u64 sibptr; + int pindex, level, ret; + + ret = 0; + stats->bs_nblocks = 0; + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < nilfs_btree_height(btree) - 1; + level++) { + node = nilfs_btree_get_nonroot_node(btree, path, level); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, + path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } +/* a node above its minimum can simply lose the entry; nothing changes further up */ + if (nilfs_btree_node_get_nchildren(btree, node) > + nilfs_btree_node_nchildren_min(btree, node)) { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + goto out; + } + + parent = nilfs_btree_get_node(btree, path, level + 1); + pindex = path[level + 1].bp_index; +/* the node is at its minimum: use the left sibling if there is one, else the right one, and either borrow from it or merge with it; a merge continues the loop at the parent */ + if (pindex > 0) { + /* left sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex - 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_left; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_left; + stats->bs_nblocks++; + /* continue; */ + } + } else if (pindex < + nilfs_btree_node_get_nchildren(btree, parent) - 1) { + /* right sibling */ + sibptr = nilfs_btree_node_get_ptr(btree, parent, + pindex + 1); + ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr, + &bh); + if (ret < 0) + goto err_out_curr_node; + sib = (struct nilfs_btree_node *)bh->b_data; + if (nilfs_btree_node_get_nchildren(btree, sib) > + nilfs_btree_node_nchildren_min(btree, sib)) { + 
path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_borrow_right; + stats->bs_nblocks++; + goto out; + } else { + path[level].bp_sib_bh = bh; + path[level].bp_op = nilfs_btree_concat_right; + stats->bs_nblocks++; + /* continue; */ + } + } else { + /* no siblings */ + /* the only child of the root node: shrink the tree when the child's remaining entries fit into the root, otherwise just delete */ + WARN_ON(level != nilfs_btree_height(btree) - 2); + if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + NILFS_BTREE_ROOT_NCHILDREN_MAX) { + path[level].bp_op = nilfs_btree_shrink; + stats->bs_nblocks += 2; + } else { + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + } + + goto out; + + } + } +/* every level below was merged, so the root loses an entry as well */ + node = nilfs_btree_get_root(btree); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); + if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + if (ret < 0) + goto err_out_child_node; + } + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + + /* success */ + out: + *levelp = level; + return ret; + + /* error: abort the end-pointer request of the current level, then release the sibling buffer and request of every level below */ + err_out_curr_node: + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + err_out_child_node: + for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { + nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh); + if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_abort_end_ptr( + &btree->bt_bmap, &path[level].bp_oldreq); + } + *levelp = level; + stats->bs_nblocks = 0; + return ret; +} +/* Carry out the operations chosen by nilfs_btree_prepare_delete() for levels NILFS_BTREE_LEVEL_NODE_MIN..maxlevel, committing the end-pointer request of each level first when bpop_commit_end_ptr is provided, then mark the bmap dirty. */ +static void nilfs_btree_commit_delete(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int maxlevel) +{ + int level; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { + if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL) + btree->bt_bmap.b_pops->bpop_commit_end_ptr( 
+ &btree->bt_bmap, &path[level].bp_oldreq); + path[level].bp_op(btree, path, level, NULL, NULL); + } + + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); +} +/** + * nilfs_btree_delete - remove the entry for @key from the B-tree + * @bmap: bmap, cast to struct nilfs_btree + * @key: key to delete + * + * Looks @key up down to NILFS_BTREE_LEVEL_NODE_MIN, runs the prepare and + * commit phases, and subtracts the block count reported by the prepare phase + * through nilfs_bmap_sub_blocks(). The path is cleared and freed on every + * exit taken after it was allocated. + * + * Return: -ENOMEM if no path could be allocated; a negative value passed + * through from the lookup or the prepare phase on failure; otherwise the + * non-negative result of the prepare phase. + */ +static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) + +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_bmap_stats stats; + int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, + NILFS_BTREE_LEVEL_NODE_MIN); + if (ret < 0) + goto out; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + if (ret < 0) + goto out; + nilfs_btree_commit_delete(btree, path, level); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + +out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} +/** + * nilfs_btree_last_key - look up the last key via nilfs_btree_do_lookup_last() + * @bmap: bmap, cast to struct nilfs_btree + * @keyp: handed to nilfs_btree_do_lookup_last() as its key output + * + * Return: -ENOMEM if no path could be allocated, otherwise the return value + * of nilfs_btree_do_lookup_last(). + */ +static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); + + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} +/** + * nilfs_btree_check_delete - test @key against the two largest keys of a shallow tree + * @bmap: bmap, cast to struct nilfs_btree + * @key: key to compare with the largest key + * + * Only trees of height 2, or of height 3 whose root has at most one child, + * are examined; the node inspected is the root or the block referenced by + * the last root entry, respectively. + * + * Return: nonzero when @key equals the largest key of that node and the + * second largest key (0 if there is none) is below @bmap->b_low; 0 for any + * other tree shape or when the condition does not hold; a negative value if + * the child block cannot be read. + * NOTE(review): presumably tells the caller that the mapping can fall back + * to a smaller representation after the delete - confirm against the caller. + */ +static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *root, *node; + __u64 maxkey, nextmaxkey; + __u64 ptr; + int nchildren, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + if (nchildren > 1) + return 0; + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret =
nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + return 0; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nextmaxkey = (nchildren > 1) ? + nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return (maxkey == key) && (nextmaxkey < bmap->b_low); +} +/** + * nilfs_btree_gather_data - copy keys and pointers out of a shallow tree + * @bmap: bmap, cast to struct nilfs_btree + * @keys: output array for keys + * @ptrs: output array for pointers + * @nitems: capacity of @keys and @ptrs + * + * For height 2 the root is read; for height 3 the block referenced by the + * last root entry is read (more than one root child triggers WARN_ON). + * Entries are converted from on-disk form with nilfs_bmap_dkey_to_key() and + * nilfs_bmap_dptr_to_ptr(). + * + * Return: number of entries copied (the smaller of @nitems and the node's + * child count), -EINVAL for any other height, or the negative value from + * nilfs_bmap_get_block(). + * NOTE(review): the "node = NULL" store in the default case is dead; the + * function returns on the next statement. + */ +static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_node *node, *root; + __le64 *dkeys; + __le64 *dptrs; + __u64 ptr; + int nchildren, i, ret; + + btree = (struct nilfs_btree *)bmap; + root = nilfs_btree_get_root(btree); + switch (nilfs_btree_height(btree)) { + case 2: + bh = NULL; + node = root; + break; + case 3: + nchildren = nilfs_btree_node_get_nchildren(btree, root); + WARN_ON(nchildren > 1); + ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ret = nilfs_bmap_get_block(bmap, ptr, &bh); + if (ret < 0) + return ret; + node = (struct nilfs_btree_node *)bh->b_data; + break; + default: + node = NULL; + return -EINVAL; + } + + nchildren = nilfs_btree_node_get_nchildren(btree, node); + if (nchildren < nitems) + nitems = nchildren; + dkeys = nilfs_btree_node_dkeys(btree, node); + dptrs = nilfs_btree_node_dptrs(btree, node); + for (i = 0; i < nitems; i++) { + keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); + ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); + } + + if (bh != NULL) + nilfs_bmap_put_block(bmap, bh); + + return nitems; +} +/** + * nilfs_btree_prepare_convert_and_insert - reserve pointers for a conversion + * @bmap: bmap being converted + * @key: key of the entry to be inserted; passed to btop_find_target + * @dreq: pointer request for the new entry + * @nreq: pointer request for an extra node block, or NULL if none is needed + * @bhp: on success receives the new node's buffer head, or NULL when @nreq + * is NULL + * @stats: bs_nblocks is set to the number of pointers reserved + * + * When a later step fails, the reservations made so far are aborted and + * bs_nblocks is reset to 0. + * + * Return: 0 on success or the negative value from the failing step. + */ +static int +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head **bhp, + struct nilfs_bmap_stats *stats) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + int ret; + + btree = (struct nilfs_btree *)bmap; +
stats->bs_nblocks = 0; + + /* for data */ + /* cannot find near ptr */ + if (btree->bt_ops->btop_find_target != NULL) + dreq->bpr_ptr + = btree->bt_ops->btop_find_target(btree, NULL, key); + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq); + if (ret < 0) + return ret; + + *bhp = NULL; + stats->bs_nblocks++; + if (nreq != NULL) { + nreq->bpr_ptr = dreq->bpr_ptr + 1; + ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq); + if (ret < 0) + goto err_out_dreq; + + ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh); + if (ret < 0) + goto err_out_nreq; + + *bhp = bh; + stats->bs_nblocks++; + } + + /* success */ + return 0; + + /* error */ + err_out_nreq: + bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq); + err_out_dreq: + bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq); + stats->bs_nblocks = 0; + return ret; + +} +/** + * nilfs_btree_commit_convert_and_insert - rebuild @bmap as a B-tree and add @key + * @bmap: bmap to convert; its current bop_clear hook, if any, is run first + * @key: key of the new entry + * @ptr: address of a buffer head carried in a __u64; that buffer is flagged + * nilfs-volatile + * @keys: array of @n keys copied into the new tree + * @ptrs: array of @n pointers copied into the new tree + * @n: number of entries in @keys and @ptrs + * @low: passed to nilfs_btree_init() + * @high: passed to nilfs_btree_init() + * @dreq: pointer request whose bpr_ptr becomes the pointer stored for @key + * @nreq: pointer request for the child node, or NULL to keep everything in + * the root + * @bh: buffer of the child node; only used when @nreq is not NULL, and + * released here with nilfs_bmap_put_block() + * + * With @nreq set, a level-1 child node holding the @n entries plus @key is + * built in @bh and the root becomes a level-2 node with that single child. + * Otherwise the root itself is a level-1 node holding all entries. + */ +static void +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high, + union nilfs_bmap_ptr_req *dreq, + union nilfs_bmap_ptr_req *nreq, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 tmpptr; + + /* free resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* ptr must be a pointer to a buffer head.
*/ + set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); + + /* convert and insert */ + btree = (struct nilfs_btree *)bmap; + nilfs_btree_init(bmap, low, high); + if (nreq != NULL) { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) { + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq); + } + + /* create child node at level 1 */ + lock_buffer(bh); + node = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + + unlock_buffer(bh); + nilfs_bmap_put_block(bmap, bh); + + /* create root node at level 2 */ + node = nilfs_btree_get_root(btree); + tmpptr = nreq->bpr_ptr; + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 2, 1, &keys[0], &tmpptr); + } else { + if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) + bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq); + + /* create root node at level 1 */ + node = nilfs_btree_get_root(btree); + nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, + 1, n, keys, ptrs); + nilfs_btree_node_insert(btree, node, + key, dreq->bpr_ptr, n); + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); + } + + if (btree->bt_ops->btop_set_target != NULL) + btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr); +} + +/** + * nilfs_btree_convert_and_insert - convert a bmap to a B-tree and insert one entry + * @bmap: bmap to be re-initialized as a B-tree + * @key: key of the entry to insert + * @ptr: address of a buffer head carried in a __u64; that buffer is flagged + * nilfs-volatile + * @keys: array of @n keys copied into the new tree + * @ptrs: array of @n pointers copied into the new tree + * @n: number of entries in @keys and @ptrs + * @low: value stored in @bmap->b_low + * @high: value stored in @bmap->b_high + * + * The @n existing entries plus the new one are kept in the root when they fit + * within NILFS_BTREE_ROOT_NCHILDREN_MAX, or in one child node when they fit + * within NILFS_BTREE_NODE_NCHILDREN_MAX for the inode's block size. Anything + * larger hits BUG(). + * + * Return: 0 on success, or the negative value returned by the prepare step. + */ +int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, + __u64 key, __u64 ptr, + const __u64 *keys, const __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct buffer_head *bh; + union nilfs_bmap_ptr_req dreq, nreq, *di, *ni; + struct nilfs_bmap_stats stats; + int ret; + + if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { + di = &dreq; + ni = NULL; + } else if ((n + 1) <=
NILFS_BTREE_NODE_NCHILDREN_MAX( + 1 << bmap->b_inode->i_blkbits)) { + di = &dreq; + ni = &nreq; + } else { + di = NULL; + ni = NULL; + BUG(); + } + + ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, + &stats); + if (ret < 0) + return ret; + nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, + low, high, di, ni, bh); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + return 0; +} +/** + * nilfs_btree_propagate_p - btop_propagate handler of nilfs_btree_ops_p + * @btree: B-tree + * @path: lookup path leading to the block + * @level: level of the block; marking starts one level above it + * @bh: not used by this variant + * + * Climbs the path while the level stays below height - 1 and stops at the + * first buffer that is already dirty; every buffer visited before that is + * marked with nilfs_btnode_mark_dirty(). + * + * Return: always 0. + */ +static int nilfs_btree_propagate_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) + nilfs_btnode_mark_dirty(path[level].bp_bh); + + return 0; +} +/** + * nilfs_btree_prepare_update_v - prepare replacing the pointer to a block + * @btree: B-tree + * @path: lookup path; bp_oldreq, bp_newreq and bp_ctxt of @level are filled in + * @level: level of the block whose pointer changes + * + * The old pointer is read from the parent slot at @level + 1 and the new + * request starts from old + 1. When the block is a B-tree node buffer, a key + * change from the old to the new pointer is also prepared in the btnode + * cache; if that fails the pointer update is aborted again. + * + * Return: 0 on success or the negative value from the failing step. + */ +static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + path[level].bp_oldreq.bpr_ptr = + nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; + ret = nilfs_bmap_prepare_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(path[level].bp_bh)) { + path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr; + path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; + path[level].bp_ctxt.bh = path[level].bp_bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) { + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + return ret; + } + } + + return 0; +} +/** + * nilfs_btree_commit_update_v - commit what nilfs_btree_prepare_update_v() prepared + * @btree: B-tree + * @path: lookup path prepared for @level + * @level: level of the block whose pointer changes + * + * Commits the pointer update and, for node buffers, the btnode cache key + * change (picking up the buffer head left in bp_ctxt). The buffer is then + * flagged nilfs-volatile and the new pointer is written to the parent slot. + */ +static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + struct nilfs_btree_node *parent; + + nilfs_bmap_commit_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + + if
(buffer_nilfs_node(path[level].bp_bh)) { + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + path[level].bp_bh = path[level].bp_ctxt.bh; + } + set_buffer_nilfs_volatile(path[level].bp_bh); + + parent = nilfs_btree_get_node(btree, path, level + 1); + nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr); +} +/** + * nilfs_btree_abort_update_v - undo nilfs_btree_prepare_update_v() for one level + * @btree: B-tree + * @path: lookup path prepared for @level + * @level: level to roll back + * + * Aborts the pointer update and, for node buffers, the key change that was + * prepared in the btnode cache. + */ +static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level) +{ + nilfs_bmap_abort_update(&btree->bt_bmap, + &path[level].bp_oldreq, + &path[level].bp_newreq); + if (buffer_nilfs_node(path[level].bp_bh)) + nilfs_btnode_abort_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); +} +/** + * nilfs_btree_prepare_propagate_v - prepare pointer updates along the path + * @btree: B-tree + * @path: lookup path + * @minlevel: level of the block being propagated + * @maxlevelp: on success, set to the level just below the one where the + * climb stopped + * + * The block at @minlevel is prepared only if its buffer is not flagged + * nilfs-volatile. The climb then continues upward while the level is below + * height - 1 and the buffer found there is not dirty, preparing each one. + * On failure, levels above @minlevel that were prepared are aborted, and + * @minlevel itself is aborted under the same non-volatile condition. + * + * Return: 0 on success or the negative value from the failing prepare. + */ +static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int *maxlevelp) +{ + int level, ret; + + level = minlevel; + if (!buffer_nilfs_volatile(path[level].bp_bh)) { + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + return ret; + } + while ((++level < nilfs_btree_height(btree) - 1) && + !buffer_dirty(path[level].bp_bh)) { + + WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); + ret = nilfs_btree_prepare_update_v(btree, path, level); + if (ret < 0) + goto out; + } + + /* success */ + *maxlevelp = level - 1; + return 0; + + /* error */ + out: + while (--level > minlevel) + nilfs_btree_abort_update_v(btree, path, level); + if (!buffer_nilfs_volatile(path[level].bp_bh)) + nilfs_btree_abort_update_v(btree, path, level); + return ret; +} +/** + * nilfs_btree_commit_propagate_v - commit the updates prepared for propagation + * @btree: B-tree + * @path: lookup path + * @minlevel: level of the block being propagated + * @maxlevel: highest level to commit, as reported by the prepare step + * @bh: not used + * + * @minlevel is committed only if its buffer is not flagged nilfs-volatile; + * levels @minlevel + 1 through @maxlevel are always committed. + */ +static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int minlevel, + int maxlevel, + struct buffer_head *bh) +{ + int level; + + if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) + nilfs_btree_commit_update_v(btree, path, minlevel); + + for (level = minlevel + 1; level <= maxlevel; level++) + nilfs_btree_commit_update_v(btree,
path, level); +} +/** + * nilfs_btree_propagate_v - btop_propagate handler of nilfs_btree_ops_v + * @btree: B-tree + * @path: lookup path reaching down to @level + 1 + * @level: level of @bh + * @bh: buffer to propagate; referenced with get_bh() for the duration + * + * @bh is placed in the path at @level, the pointer updates are prepared, and + * if the buffer at @level is flagged nilfs-volatile the pointer held in its + * parent slot is passed to nilfs_bmap_mark_dirty(). The updates are then + * committed. The buffer left in the path at @level is released with brelse() + * and the slot is cleared on every exit. + * + * NOTE(review): when nilfs_bmap_mark_dirty() fails, the function exits without + * calling nilfs_btree_abort_update_v() for levels that were prepared - + * confirm whether that is intended. + * + * Return: 0 on success or the negative value from the failing step. + */ +static int nilfs_btree_propagate_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head *bh) +{ + int maxlevel, ret; + struct nilfs_btree_node *parent; + __u64 ptr; + + get_bh(bh); + path[level].bp_bh = bh; + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + if (ret < 0) + goto out; + + if (buffer_nilfs_volatile(path[level].bp_bh)) { + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + if (ret < 0) + goto out; + } + + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + + out: + brelse(path[level].bp_bh); + path[level].bp_bh = NULL; + return ret; +} +/** + * nilfs_btree_propagate - bop_propagate handler of nilfs_btree_ops + * @bmap: bmap, cast to struct nilfs_btree + * @bh: dirty buffer (WARN_ON fires if it is not dirty) + * + * Derives key and level from @bh: a node buffer supplies its first key and + * its stored level, any other buffer uses nilfs_bmap_data_get_key() and + * NILFS_BTREE_LEVEL_DATA. The key is looked up down to level + 1 and the + * work is handed to the btop_propagate hook. A lookup failing with -ENOENT + * is reported with a KERN_CRIT message. + * + * Return: -ENOMEM if no path could be allocated, a negative lookup error, or + * the return value of the btop_propagate hook. + */ +static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key; + int level, ret; + + WARN_ON(!buffer_dirty(bh)); + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(bh)) { + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + if (unlikely(ret == -ENOENT)) + printk(KERN_CRIT "%s: key = %llu, level == %d\n", + __func__, (unsigned long long)key, level); + goto out; + } + + ret = btree->bt_ops->btop_propagate(btree, path, level, bh); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} +/** + * nilfs_btree_propagate_gc - bop_propagate handler of nilfs_btree_ops_gc + * @bmap: bmap + * @bh: buffer whose b_blocknr is passed to nilfs_bmap_mark_dirty() + * + * Return: the return value of nilfs_bmap_mark_dirty(). + */ +static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + return
nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); +} +/** + * nilfs_btree_add_dirty_buffer - queue a node buffer on its per-level list + * @btree: B-tree + * @lists: array of list heads indexed by node level + * @bh: node buffer to queue; an extra reference is taken with get_bh() + * + * The buffer is linked through b_assoc_buffers in front of the first queued + * buffer whose first key is larger, or at the tail if there is none, so each + * list stays ordered by first key. + * + * NOTE(review): the level read from the node is used as an index into @lists + * without a range check - confirm that node contents are validated earlier. + */ +static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, + struct list_head *lists, + struct buffer_head *bh) +{ + struct list_head *head; + struct buffer_head *cbh; + struct nilfs_btree_node *node, *cnode; + __u64 key, ckey; + int level; + + get_bh(bh); + node = (struct nilfs_btree_node *)bh->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + list_for_each(head, &lists[level]) { + cbh = list_entry(head, struct buffer_head, b_assoc_buffers); + cnode = (struct nilfs_btree_node *)cbh->b_data; + ckey = nilfs_btree_node_get_key(btree, cnode, 0); + if (key < ckey) + break; + } + list_add_tail(&bh->b_assoc_buffers, head); +} +/** + * nilfs_btree_lookup_dirty_buffers - collect the dirty B-tree node buffers + * @bmap: bmap whose i_btnode_cache is scanned + * @listp: list that receives the buffers + * + * Pages tagged PAGECACHE_TAG_DIRTY are fetched in batches of PAGEVEC_SIZE and + * every dirty buffer on them is queued with nilfs_btree_add_dirty_buffer(). + * The per-level lists are then spliced onto the tail of @listp in order of + * increasing level. + */ +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, + struct list_head *listp) +{ + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache; + struct list_head lists[NILFS_BTREE_LEVEL_MAX]; + struct pagevec pvec; + struct buffer_head *bh, *head; + pgoff_t index = 0; + int level, i; + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + INIT_LIST_HEAD(&lists[level]); + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh)) + nilfs_btree_add_dirty_buffer(btree, + lists, bh); + } while ((bh = bh->b_this_page) != head); + } + pagevec_release(&pvec); + cond_resched(); + } + + for (level = NILFS_BTREE_LEVEL_NODE_MIN; + level < NILFS_BTREE_LEVEL_MAX; + level++) + list_splice(&lists[level], listp->prev); +} +/** + * nilfs_btree_assign_p - btop_assign handler of nilfs_btree_ops_p + * @btree: B-tree + * @path: lookup path reaching down to @level + 1 + * @level: level of the block + * @bh: in/out buffer head; overwritten with the buffer head left in bp_ctxt + * after the btnode cache key change + * @blocknr: block number to store in the parent slot + * @binfo: bi_dat is filled with the on-disk key and the level + * + * For a node buffer the btnode cache entry is rekeyed from the pointer + * currently stored in the parent slot to @blocknr before the slot is updated. + * + * Return: 0 on success or the negative value from + * nilfs_btnode_prepare_change_key(). + */ +static int nilfs_btree_assign_p(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64
key; + __u64 ptr; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + if (buffer_nilfs_node(*bh)) { + path[level].bp_ctxt.oldkey = ptr; + path[level].bp_ctxt.newkey = blocknr; + path[level].bp_ctxt.bh = *bh; + ret = nilfs_btnode_prepare_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + if (ret < 0) + return ret; + nilfs_btnode_commit_change_key( + &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &path[level].bp_ctxt); + *bh = path[level].bp_ctxt.bh; + } + + nilfs_btree_node_set_ptr(btree, parent, + path[level + 1].bp_index, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = level; + + return 0; +} +/** + * nilfs_btree_assign_v - btop_assign handler of nilfs_btree_ops_v + * @btree: B-tree + * @path: lookup path reaching down to @level + 1 + * @level: level of the block + * @bh: not used + * @blocknr: block number handed to bpop_commit_start_ptr + * @binfo: bi_v is filled with the on-disk pointer and key + * + * The pointer stored in the parent slot is left unchanged; it is run through + * the bpop_prepare_start_ptr and bpop_commit_start_ptr hooks together with + * @blocknr. + * + * Return: 0 on success or the negative value from bpop_prepare_start_ptr. + */ +static int nilfs_btree_assign_v(struct nilfs_btree *btree, + struct nilfs_btree_path *path, + int level, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree_node *parent; + __u64 key; + __u64 ptr; + union nilfs_bmap_ptr_req req; + int ret; + + parent = nilfs_btree_get_node(btree, path, level + 1); + ptr = nilfs_btree_node_get_ptr(btree, parent, + path[level + 1].bp_index); + req.bpr_ptr = ptr; + ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap, + &req); + if (ret < 0) + return ret; + btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap, + &req, blocknr); + + key = nilfs_btree_node_get_key(btree, parent, + path[level + 1].bp_index); + /* on-disk format */ + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} +/** + * nilfs_btree_assign - bop_assign handler of nilfs_btree_ops + * @bmap: bmap, cast to struct nilfs_btree + * @bh: in/out buffer head of the block being assigned + * @blocknr: block number to assign + * @binfo: on-disk block information filled in by the btop_assign hook + * + * Key and level are derived from the buffer in the same way as in + * nilfs_btree_propagate(), the key is looked up down to level + 1, and the + * btop_assign hook does the rest. + * + * Return: -ENOMEM if no path could be allocated, a negative lookup error + * (-ENOENT also triggers WARN_ON), or the return value of the hook. + */ +static int nilfs_btree_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + struct nilfs_btree_node *node; + __u64 key;
+ int level, ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + level = nilfs_btree_node_get_level(btree, node); + } else { + key = nilfs_bmap_data_get_key(bmap, *bh); + level = NILFS_BTREE_LEVEL_DATA; + } + + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + ret = btree->bt_ops->btop_assign(btree, path, level, bh, + blocknr, binfo); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + + return ret; +} +/** + * nilfs_btree_assign_gc - bop_assign handler of nilfs_btree_ops_gc + * @bmap: bmap + * @bh: buffer head of the block being moved + * @blocknr: destination block number + * @binfo: bi_v is filled with the buffer's b_blocknr and the on-disk key + * + * Calls nilfs_bmap_move_v() with the buffer's b_blocknr and @blocknr. No + * tree lookup is done; the key comes from the buffer itself. + * + * Return: 0 on success or the negative value from nilfs_bmap_move_v(). + */ +static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_btree *btree; + struct nilfs_btree_node *node; + __u64 key; + int ret; + + btree = (struct nilfs_btree *)bmap; + ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + if (ret < 0) + return ret; + + if (buffer_nilfs_node(*bh)) { + node = (struct nilfs_btree_node *)(*bh)->b_data; + key = nilfs_btree_node_get_key(btree, node, 0); + } else + key = nilfs_bmap_data_get_key(bmap, *bh); + + /* on-disk format */ + binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} +/** + * nilfs_btree_mark - bop_mark handler of nilfs_btree_ops + * @bmap: bmap, cast to struct nilfs_btree + * @key: key used for the lookup + * @level: level of the block to mark; the lookup stops at @level + 1 + * + * The pointer found by the lookup is read with nilfs_bmap_get_block(); the + * buffer is marked dirty with nilfs_btnode_mark_dirty() if it is not dirty + * yet, and the bmap is flagged dirty as well. + * + * Return: -ENOMEM if no path could be allocated, otherwise the result of the + * lookup or of the block read (negative on failure; -ENOENT also triggers + * WARN_ON). + */ +static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) +{ + struct buffer_head *bh; + struct nilfs_btree *btree; + struct nilfs_btree_path *path; + __u64 ptr; + int ret; + + btree = (struct nilfs_btree *)bmap; + path = nilfs_btree_alloc_path(btree); + if (path == NULL) + return -ENOMEM; + nilfs_btree_init_path(btree, path); + + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh); +
if (ret < 0) { + WARN_ON(ret == -ENOENT); + goto out; + } + + if (!buffer_dirty(bh)) + nilfs_btnode_mark_dirty(bh); + nilfs_bmap_put_block(&btree->bt_bmap, bh); + if (!nilfs_bmap_dirty(&btree->bt_bmap)) + nilfs_bmap_set_dirty(&btree->bt_bmap); + + out: + nilfs_btree_clear_path(btree, path); + nilfs_btree_free_path(btree, path); + return ret; +} +/* + * bmap operations installed by nilfs_btree_init(); bop_clear and + * bop_check_insert are left unset. + */ +static const struct nilfs_bmap_operations nilfs_btree_ops = { + .bop_lookup = nilfs_btree_lookup, + .bop_insert = nilfs_btree_insert, + .bop_delete = nilfs_btree_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign, + .bop_mark = nilfs_btree_mark, + + .bop_last_key = nilfs_btree_last_key, + .bop_check_insert = NULL, + .bop_check_delete = nilfs_btree_check_delete, + .bop_gather_data = nilfs_btree_gather_data, +}; +/* + * reduced table installed by nilfs_btree_init_gc(): only propagate, + * lookup_dirty_buffers and assign are provided. + */ +static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { + .bop_lookup = NULL, + .bop_insert = NULL, + .bop_delete = NULL, + .bop_clear = NULL, + + .bop_propagate = nilfs_btree_propagate_gc, + + .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers, + + .bop_assign = nilfs_btree_assign_gc, + .bop_mark = NULL, + + .bop_last_key = NULL, + .bop_check_insert = NULL, + .bop_check_delete = NULL, + .bop_gather_data = NULL, +}; +/* + * B-tree hooks that nilfs_btree_init() selects for every inode other than + * NILFS_DAT_INO. + */ +static const struct nilfs_btree_operations nilfs_btree_ops_v = { + .btop_find_target = nilfs_btree_find_target_v, + .btop_set_target = nilfs_btree_set_target_v, + .btop_propagate = nilfs_btree_propagate_v, + .btop_assign = nilfs_btree_assign_v, +}; +/* + * B-tree hooks that nilfs_btree_init() selects for the NILFS_DAT_INO inode; + * the target hooks are unset. + */ +static const struct nilfs_btree_operations nilfs_btree_ops_p = { + .btop_find_target = NULL, + .btop_set_target = NULL, + .btop_propagate = nilfs_btree_propagate_p, + .btop_assign = nilfs_btree_assign_p, +}; +/** + * nilfs_btree_init - set up @bmap as a B-tree mapping + * @bmap: bmap to initialize + * @low: value stored in b_low + * @high: value stored in b_high + * + * Installs nilfs_btree_ops and picks the B-tree hooks by inode number: + * nilfs_btree_ops_p for NILFS_DAT_INO, nilfs_btree_ops_v otherwise. + * + * Return: always 0. + */ +int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_btree *btree; + + btree = (struct nilfs_btree *)bmap; + bmap->b_ops = &nilfs_btree_ops; + bmap->b_low = low; +
bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + btree->bt_ops = &nilfs_btree_ops_p; + break; + default: + btree->bt_ops = &nilfs_btree_ops_v; + break; + } + + return 0; +} +/** + * nilfs_btree_init_gc - set up @bmap with the GC operation table + * @bmap: bmap to initialize + * + * Sets b_low and b_high to NILFS_BMAP_LARGE_LOW and NILFS_BMAP_LARGE_HIGH and + * installs nilfs_btree_ops_gc. Unlike nilfs_btree_init(), bt_ops is not + * assigned here. + */ +void nilfs_btree_init_gc(struct nilfs_bmap *bmap) +{ + bmap->b_low = NILFS_BMAP_LARGE_LOW; + bmap->b_high = NILFS_BMAP_LARGE_HIGH; + bmap->b_ops = &nilfs_btree_ops_gc; +} diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h new file mode 100644 index 00000000000..4766deb52fb --- /dev/null +++ b/fs/nilfs2/btree.h @@ -0,0 +1,117 @@ +/* + * btree.h - NILFS B-tree. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>.
+ */ + +#ifndef _NILFS_BTREE_H +#define _NILFS_BTREE_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/list.h> +#include <linux/nilfs2_fs.h> +#include "btnode.h" +#include "bmap.h" + +struct nilfs_btree; +struct nilfs_btree_path; + +/** + * struct nilfs_btree_operations - B-tree operation table + * @btop_find_target: returns a pointer value used to seed a new pointer + * request; called with a key and a path that may be NULL + * @btop_set_target: called with a key and the pointer chosen for it once a + * conversion has been committed + * @btop_get_nilfs: presumably returns the owning struct the_nilfs; neither + * nilfs_btree_ops_v nor nilfs_btree_ops_p sets it + * @btop_propagate: per-variant handler behind nilfs_btree_propagate() + * @btop_assign: per-variant handler behind nilfs_btree_assign() + * + * @btop_find_target and @btop_set_target may be NULL; callers test for that. + */ +struct nilfs_btree_operations { + __u64 (*btop_find_target)(const struct nilfs_btree *, + const struct nilfs_btree_path *, __u64); + void (*btop_set_target)(struct nilfs_btree *, __u64, __u64); + + struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *); + + int (*btop_propagate)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head *); + int (*btop_assign)(struct nilfs_btree *, + struct nilfs_btree_path *, + int, + struct buffer_head **, + sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_btree_node - B-tree node + * @bn_flags: flags (see NILFS_BTREE_NODE_ROOT) + * @bn_level: level + * @bn_nchildren: number of children + * @bn_pad: padding + */ +struct nilfs_btree_node { + __u8 bn_flags; + __u8 bn_level; + __le16 bn_nchildren; + __le32 bn_pad; +}; + +/* flags (bn_flags) */ +#define NILFS_BTREE_NODE_ROOT 0x01 + +/* level (bn_level): 0 is the data level, node levels start at 1 */ +#define NILFS_BTREE_LEVEL_DATA 0 +#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) +#define NILFS_BTREE_LEVEL_MAX 14 + +/** + * struct nilfs_btree - B-tree structure + * @bt_bmap: bmap base structure + * @bt_ops: B-tree operation table + * + * @bt_bmap has to stay the first member: the B-tree code casts + * struct nilfs_bmap pointers directly to struct nilfs_btree pointers. + */ +struct nilfs_btree { + struct nilfs_bmap bt_bmap; + + /* B-tree-specific members */ + const struct nilfs_btree_operations *bt_ops; +}; + +/* + * Fan-out limits. Each entry costs one __le64 key plus one __le64 pointer, + * and sizeof(struct nilfs_btree_node) is deducted for the header first. + * The root is sized by NILFS_BMAP_SIZE; other nodes are sized by the + * nodesize argument and additionally reserve NILFS_BTREE_NODE_EXTRA_PAD_SIZE. + */ +#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE +#define NILFS_BTREE_ROOT_NCHILDREN_MAX \ + ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0 +#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64)) +#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \ + (((nodesize) - sizeof(struct nilfs_btree_node) - \ +
NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \ + (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */))) +#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \ + ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1) +#define NILFS_BTREE_KEY_MIN ((__u64)0) +#define NILFS_BTREE_KEY_MAX (~(__u64)0) + + +int nilfs_btree_path_cache_init(void); +void nilfs_btree_path_cache_destroy(void); +int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, + const __u64 *, const __u64 *, + int, __u64, __u64); +void nilfs_btree_init_gc(struct nilfs_bmap *); + +#endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c new file mode 100644 index 00000000000..e90b60dfced --- /dev/null +++ b/fs/nilfs2/cpfile.c @@ -0,0 +1,925 @@ +/* + * cpfile.c - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. 
+ */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "cpfile.h" + +/* number of checkpoint entries per block, read from mi_entries_per_block */ +static inline unsigned long +nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile) +{ + return NILFS_MDT(cpfile)->mi_entries_per_block; +} + +/* + * block number from the beginning of the file: + * (cno + mi_first_entry_offset - 1) divided by the checkpoints per block; + * do_div() leaves the quotient in tcno + */ +static unsigned long +nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); + return (unsigned long)tcno; +} + +/* + * offset in block: the remainder of the same division, which is the value + * do_div() returns + */ +static unsigned long +nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) +{ + __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); +} +/* + * number of checkpoints to handle in the block holding curr: the entries + * from curr to the end of that block, capped at max - curr + */ +static unsigned long +nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, + __u64 curr, + __u64 max) +{ + return min_t(__u64, + nilfs_cpfile_checkpoints_per_block(cpfile) - + nilfs_cpfile_get_offset(cpfile, curr), + max - curr); +} +/* nonzero if checkpoint cno is located in block 0 of the file */ +static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile, + __u64 cno) +{ + return nilfs_cpfile_get_blkoff(cpfile, cno) == 0; +} +/* + * add n to the cp_checkpoints_count field of the checkpoint entry at the + * start of bh's data (kaddr + bh_offset(bh)) and return the new count + */ +static unsigned int +nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + count = le32_to_cpu(cp->cp_checkpoints_count) + n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} +/* + * subtract n from the same cp_checkpoints_count field and return the new + * count; WARN_ON fires if the stored count is smaller than n + */ +static unsigned int +nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr, + unsigned int n) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + unsigned int count; + + WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n); + count = le32_to_cpu(cp->cp_checkpoints_count) -
n; + cp->cp_checkpoints_count = cpu_to_le32(count); + return count; +} +/* + * address of the cpfile header: the start of bh's data, located at + * kaddr + bh_offset(bh) + */ +static inline struct nilfs_cpfile_header * +nilfs_cpfile_block_get_header(const struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} +/* + * address of checkpoint cno inside bh: its in-block index from + * nilfs_cpfile_get_offset() times mi_entry_size, added to the start of + * bh's data + */ +static struct nilfs_checkpoint * +nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) * + NILFS_MDT(cpfile)->mi_entry_size; +} +/* + * block initializer handed to nilfs_mdt_get_block(): marks each of the + * checkpoints-per-block entries of the block invalid + */ +static void nilfs_cpfile_block_init(struct inode *cpfile, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_checkpoint *cp = kaddr + bh_offset(bh); + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + int n = nilfs_cpfile_checkpoints_per_block(cpfile); + + while (n-- > 0) { + nilfs_checkpoint_set_invalid(cp); + cp = (void *)cp + cpsz; + } +} +/* + * get block 0 of the file, which carries the header; no create flag and no + * initializer are passed + */ +static inline int nilfs_cpfile_get_header_block(struct inode *cpfile, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp); +} +/* + * get the block holding checkpoint cno; the create flag and + * nilfs_cpfile_block_init() are forwarded to nilfs_mdt_get_block() + */ +static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, + __u64 cno, + int create, + struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno), + create, nilfs_cpfile_block_init, bhp); +} +/* delete the block holding checkpoint cno through nilfs_mdt_delete_block() */ +static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, + __u64 cno) +{ + return nilfs_mdt_delete_block(cpfile, + nilfs_cpfile_get_blkoff(cpfile, cno)); +} + +/** + * nilfs_cpfile_get_checkpoint - get a checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @create: create flag + * @cpp: pointer to a checkpoint + * @bhp: pointer to a buffer head + * + * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint + * specified by @cno. A new checkpoint will be created if @cno is the current + * checkpoint number and @create is nonzero.
+ * + * Return Value: On success, 0 is returned, and the checkpoint and the + * buffer head of the buffer on which the checkpoint is located are stored in + * the place pointed by @cpp and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + * + * %-EINVAL - invalid checkpoint. + */ +int nilfs_cpfile_get_checkpoint(struct inode *cpfile, + __u64 cno, + int create, + struct nilfs_checkpoint **cpp, + struct buffer_head **bhp) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; +/* + * cno must lie in 1..nilfs_mdt_cno(); creation is refused for any number + * below nilfs_mdt_cno() + */ + if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) || + (cno < nilfs_mdt_cno(cpfile) && create))) + return -EINVAL; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh); + if (ret < 0) + goto out_header; + kaddr = kmap(cp_bh->b_page); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + if (!create) { + kunmap(cp_bh->b_page); + brelse(cp_bh); + ret = -ENOENT; + goto out_header; + } + /* a newly-created checkpoint */ + nilfs_checkpoint_clear_invalid(cp); + if (!nilfs_cpfile_is_in_first(cpfile, cno)) + nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, + kaddr, 1); + nilfs_mdt_mark_buffer_dirty(cp_bh); +/* bump the checkpoint total (ch_ncheckpoints) kept in the header block */ + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, 1); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + } +/* + * on this path cp_bh stays mapped by kmap() and referenced; + * nilfs_cpfile_put_checkpoint() is the function that undoes both + */ + if (cpp != NULL) + *cpp = cp; + *bhp = cp_bh; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} +
+/** + * nilfs_cpfile_put_checkpoint - put a checkpoint + * @cpfile: inode of checkpoint file (not referenced by the implementation) + * @cno: checkpoint number (not referenced by the implementation) + * @bh: buffer head + * + * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint + * specified by @cno. @bh must be the buffer head which has been returned by + * a previous call to nilfs_cpfile_get_checkpoint() with @cno. + */ +void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno, + struct buffer_head *bh) +{ + /* unmap the page of @bh, then drop the buffer reference */ kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_cpfile_delete_checkpoints - delete checkpoints + * @cpfile: inode of checkpoint file + * @start: start checkpoint number + * @end: end checkpoint number (exclusive) + * + * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in + * the period from @start to @end, excluding @end itself. The checkpoints + * which have been already deleted are ignored. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - invalid checkpoints. + * + * %-EPERM - @start is the latest checkpoint, which cannot be deleted. 
+ */ +int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, + __u64 start, + __u64 end) +{ + struct buffer_head *header_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cno; + void *kaddr; + unsigned long tnicps; + int ret, ncps, nicps, count, i; + + if (unlikely(start == 0 || start > end)) { + printk(KERN_ERR "%s: invalid range of checkpoint numbers: " + "[%llu, %llu)\n", __func__, + (unsigned long long)start, (unsigned long long)end); + return -EINVAL; + } + + /* cannot delete the latest checkpoint */ + if (start == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_sem; + tnicps = 0; + + for (cno = start; cno < end; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_sem; + /* skip hole */ + ret = 0; + continue; + } + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, cno, cp_bh, kaddr); + nicps = 0; + for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) { + WARN_ON(nilfs_checkpoint_snapshot(cp)); + if (!nilfs_checkpoint_invalid(cp)) { + nilfs_checkpoint_set_invalid(cp); + nicps++; + } + } + if (nicps > 0) { + tnicps += nicps; + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_dirty(cpfile); + if (!nilfs_cpfile_is_in_first(cpfile, cno) && + (count = nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps)) == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR "%s: cannot delete block\n", + __func__); + goto out_sem; + } + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + } + + if (tnicps > 0) { + kaddr = 
kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, + kaddr); + le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + kunmap_atomic(kaddr, KM_USER0); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, + struct nilfs_checkpoint *cp, + struct nilfs_cpinfo *ci) +{ + ci->ci_flags = le32_to_cpu(cp->cp_flags); + ci->ci_cno = le64_to_cpu(cp->cp_cno); + ci->ci_create = le64_to_cpu(cp->cp_create); + ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc); + ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count); + ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count); + ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); +} + +static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct nilfs_checkpoint *cp; + struct buffer_head *bh; + size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size; + __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop; + void *kaddr; + int n, ret; + int ncps, i; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + for (n = 0; cno < cur_cno && n < nci; cno += ncps) { + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + continue; /* skip hole */ + } + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) { + if (!nilfs_checkpoint_invalid(cp)) + nilfs_cpfile_checkpoint_to_cpinfo( + cpfile, cp, &ci[n++]); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + } + + ret = n; + if (n > 0) + *cnop = ci[n - 1].ci_cno + 1; + + out: + 
up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, + struct nilfs_cpinfo *ci, size_t nci) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + __u64 curr = *cnop, next; + unsigned long curr_blkoff, next_blkoff; + void *kaddr; + int n = 0, ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + /* *cnop == 0 means "start at the first entry of the header's snapshot list"; ~0 is the end-of-walk marker stored by an earlier call */ if (curr == 0) { + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + curr = le64_to_cpu(header->ch_snapshot_list.ssl_next); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + if (curr == 0) { + ret = 0; + goto out; + } + } else if (unlikely(curr == ~(__u64)0)) { + ret = 0; + goto out; + } + + curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh); + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + ret = 0; /* No snapshots (started from a hole block) */ + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + /* follow the ssl_next links, copying at most nci entries into ci[]; the mapped block is swapped only when the next entry lives in a different block */ while (n < nci) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr); + curr = ~(__u64)0; /* Terminator */ + if (unlikely(nilfs_checkpoint_invalid(cp) || + !nilfs_checkpoint_snapshot(cp))) + break; + nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]); + next = le64_to_cpu(cp->cp_snapshot_list.ssl_next); + if (next == 0) + break; /* reach end of the snapshot list */ + + next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next); + if (curr_blkoff != next_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, + 0, &bh); + if (unlikely(ret < 0)) { + WARN_ON(ret == -ENOENT); + goto out; + } + kaddr = kmap_atomic(bh->b_page, KM_USER0); + } + curr = next; + curr_blkoff = next_blkoff; + } + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + /* resume point for the next call: the next snapshot number, or ~0 if the walk ended */ *cnop = curr; + ret = n; + + out: + 
up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_get_cpinfo - get information on checkpoints or snapshots + * @cpfile: inode of checkpoint file + * @cnop: in/out checkpoint number, handed through to the helper chosen by @mode + * @mode: NILFS_CHECKPOINT or NILFS_SNAPSHOT + * @ci: array of checkpoint information to be filled in + * @nci: number of entries available in @ci + * + * Return Value: the return value of the helper selected by @mode, or + * %-EINVAL if @mode is neither NILFS_CHECKPOINT nor NILFS_SNAPSHOT. + */ + +ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, + struct nilfs_cpinfo *ci, size_t nci) +{ + switch (mode) { + case NILFS_CHECKPOINT: + return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci); + case NILFS_SNAPSHOT: + return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_delete_checkpoint - delete a single checkpoint + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Return Value: the result of nilfs_cpfile_delete_checkpoints() on the range + * [@cno, @cno + 1), or one of the following negative error codes. + * + * %-ENOENT - no valid checkpoint numbered @cno was found. + * + * %-EPERM - @cno is a snapshot or is the latest checkpoint. + */ +int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno) +{ + struct nilfs_cpinfo ci; + __u64 tcno = cno; + ssize_t nci; + int ret; + + /* fetch one entry starting at cno; a different ci_cno means cno itself is not a valid checkpoint */ nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1); + if (nci < 0) + return nci; + else if (nci == 0 || ci.ci_cno != cno) + return -ENOENT; + + /* cannot delete the latest checkpoint nor snapshots */ + ret = nilfs_cpinfo_snapshot(&ci); + if (ret < 0) + return ret; + else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1) + return -EPERM; + + return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1); +} + +static struct nilfs_snapshot_list * +nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile, + __u64 cno, + struct buffer_head *bh, + void *kaddr) +{ + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + + /* cno == 0 selects the list head kept in the cpfile header; any other cno selects that checkpoint's own links */ if (cno != 0) { + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + list = &cp->cp_snapshot_list; + } else { + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + list = &header->ch_snapshot_list; + } + return list; +} + +static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 curr, prev; + unsigned long curr_blkoff, prev_blkoff; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* 
checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + /* already a snapshot: report success without touching the list */ if (nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + list = &header->ch_snapshot_list; + curr_bh = header_bh; + get_bh(curr_bh); + curr = 0; + curr_blkoff = 0; + prev = le64_to_cpu(list->ssl_prev); + /* walk backwards along ssl_prev, starting from the header, until prev <= cno; curr ends as the entry that will follow cno (0 stands for the header) */ while (prev > cno) { + prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev); + curr = prev; + if (curr_blkoff != prev_blkoff) { + kunmap_atomic(kaddr, KM_USER0); + brelse(curr_bh); + ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, + 0, &curr_bh); + /* curr_bh was released just above, so the exit path skips out_curr */ if (ret < 0) + goto out_header; + kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + } + curr_blkoff = prev_blkoff; + cp = nilfs_cpfile_block_get_checkpoint( + cpfile, curr, curr_bh, kaddr); + list = &cp->cp_snapshot_list; + prev = le64_to_cpu(list->ssl_prev); + } + kunmap_atomic(kaddr, KM_USER0); + + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_curr; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + /* splice cno in between prev and curr: curr->ssl_prev = cno first */ kaddr = kmap_atomic(curr_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, curr, curr_bh, kaddr); + list->ssl_prev = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = 
cpu_to_le64(curr); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev); + nilfs_checkpoint_set_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(cno); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, 1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(curr_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_curr: + brelse(curr_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh; + struct nilfs_cpfile_header *header; + struct nilfs_checkpoint *cp; + struct nilfs_snapshot_list *list; + __u64 next, prev; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_write(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + if (nilfs_checkpoint_invalid(cp)) { + ret = -ENOENT; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + /* not a snapshot: report success without touching the list */ if (!nilfs_checkpoint_snapshot(cp)) { + ret = 0; + kunmap_atomic(kaddr, KM_USER0); + goto out_cp; + } + + /* remember both neighbours before the page is unmapped */ list = &cp->cp_snapshot_list; + next = le64_to_cpu(list->ssl_next); + prev = le64_to_cpu(list->ssl_prev); + kunmap_atomic(kaddr, KM_USER0); + + ret = nilfs_cpfile_get_header_block(cpfile, &header_bh); + if (ret < 0) + goto out_cp; + 
/* a neighbour number of 0 denotes the list head stored in the header block, so header_bh is reused with an extra reference */ if (next != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0, + &next_bh); + if (ret < 0) + goto out_header; + } else { + next_bh = header_bh; + get_bh(next_bh); + } + if (prev != 0) { + ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0, + &prev_bh); + if (ret < 0) + goto out_next; + } else { + prev_bh = header_bh; + get_bh(prev_bh); + } + + /* unlink cno: next->ssl_prev = prev, then prev->ssl_next = next */ kaddr = kmap_atomic(next_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, next, next_bh, kaddr); + list->ssl_prev = cpu_to_le64(prev); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(prev_bh->b_page, KM_USER0); + list = nilfs_cpfile_block_get_snapshot_list( + cpfile, prev, prev_bh, kaddr); + list->ssl_next = cpu_to_le64(next); + kunmap_atomic(kaddr, KM_USER0); + + /* reset cno's own links and drop its snapshot flag */ kaddr = kmap_atomic(cp_bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr); + cp->cp_snapshot_list.ssl_next = cpu_to_le64(0); + cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0); + nilfs_checkpoint_clear_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); + le64_add_cpu(&header->ch_nsnapshots, -1); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(next_bh); + nilfs_mdt_mark_buffer_dirty(prev_bh); + nilfs_mdt_mark_buffer_dirty(cp_bh); + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(cpfile); + + brelse(prev_bh); + + out_next: + brelse(next_bh); + + out_header: + brelse(header_bh); + + out_cp: + brelse(cp_bh); + + out_sem: + up_write(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_is_snapshot - + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * + * Description: + * + * Return Value: On success, 1 is returned if the checkpoint specified by + * @cno is a snapshot, or 0 if not. On error, one of the following negative + * error codes is returned. + * + * %-EIO - I/O error. 
+ * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) +{ + struct buffer_head *bh; + struct nilfs_checkpoint *cp; + void *kaddr; + int ret; + + if (cno == 0) + return -ENOENT; /* checkpoint number 0 is invalid */ + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + if (ret < 0) + goto out; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); + ret = nilfs_checkpoint_snapshot(cp); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} + +/** + * nilfs_cpfile_change_cpmode - change checkpoint mode + * @cpfile: inode of checkpoint file + * @cno: checkpoint number + * @status: mode of checkpoint + * + * Description: nilfs_change_cpmode() changes the mode of the checkpoint + * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No such checkpoint. + */ +int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) +{ + struct the_nilfs *nilfs; + int ret; + + nilfs = NILFS_MDT(cpfile)->mi_nilfs; + + switch (mode) { + case NILFS_CHECKPOINT: + /* + * Check for protecting existing snapshot mounts: + * bd_mount_sem is used to make this operation atomic and + * exclusive with a new mount job. Though it doesn't cover + * umount, it's enough for the purpose. + */ + down(&nilfs->ns_bdev->bd_mount_sem); + if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { + /* Current implementation does not have to protect + plain read-only mounts since they are exclusive + with a read/write mount and are protected from the + cleaner. 
*/ + ret = -EBUSY; + } else + ret = nilfs_cpfile_clear_snapshot(cpfile, cno); + up(&nilfs->ns_bdev->bd_mount_sem); + return ret; + case NILFS_SNAPSHOT: + return nilfs_cpfile_set_snapshot(cpfile, cno); + default: + return -EINVAL; + } +} + +/** + * nilfs_cpfile_get_stat - get checkpoint statistics + * @cpfile: inode of checkpoint file + * @cpstat: pointer to a structure of checkpoint statistics + * + * Description: nilfs_cpfile_get_stat() returns information about checkpoints. + * + * Return Value: On success, 0 is returned, and checkpoints information is + * stored in the place pointed by @cpstat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat) +{ + struct buffer_head *bh; + struct nilfs_cpfile_header *header; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(cpfile)->mi_sem); + + ret = nilfs_cpfile_get_header_block(cpfile, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(bh->b_page, KM_USER0); + header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr); + /* cs_cno comes from nilfs_mdt_cno(); the two counters are read from the on-disk header */ cpstat->cs_cno = nilfs_mdt_cno(cpfile); + cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints); + cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh); + + out_sem: + up_read(&NILFS_MDT(cpfile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h new file mode 100644 index 00000000000..1a8a1008c34 --- /dev/null +++ b/fs/nilfs2/cpfile.h @@ -0,0 +1,45 @@ +/* + * cpfile.h - NILFS checkpoint file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_CPFILE_H +#define _NILFS_CPFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> + +#define NILFS_CPFILE_GFP NILFS_MDT_GFP + + +int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, + struct nilfs_checkpoint **, + struct buffer_head **); +void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *); +int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64); +int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); +int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); +int nilfs_cpfile_is_snapshot(struct inode *, __u64); +int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, + struct nilfs_cpinfo *, size_t); + +#endif /* _NILFS_CPFILE_H */ diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c new file mode 100644 index 00000000000..bb8a5818e7f --- /dev/null +++ b/fs/nilfs2/dat.c @@ -0,0 +1,430 @@ +/* + * dat.c - NILFS disk address translation. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/string.h> +#include <linux/errno.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "dat.h" + + +#define NILFS_CNO_MIN ((__u64)1) +#define NILFS_CNO_MAX (~(__u64)0) + +static int nilfs_dat_prepare_entry(struct inode *dat, + struct nilfs_palloc_req *req, int create) +{ + /* fetch the block holding entry req->pr_entry_nr into req->pr_entry_bh; create is handed through unchanged */ return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr, + create, &req->pr_entry_bh); +} + +static void nilfs_dat_commit_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + /* mark the entry buffer and the DAT inode dirty, then drop the buffer reference */ nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh); + nilfs_mdt_mark_dirty(dat); + brelse(req->pr_entry_bh); +} + +static void nilfs_dat_abort_entry(struct inode *dat, + struct nilfs_palloc_req *req) +{ + /* nothing was modified: only drop the buffer reference */ brelse(req->pr_entry_bh); +} + +int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_alloc_entry(dat, req); + if (ret < 0) + return ret; + + ret = nilfs_dat_prepare_entry(dat, req, 1); + /* undo the palloc step if the entry block could not be obtained */ if (ret < 0) + nilfs_palloc_abort_alloc_entry(dat, req); + + return ret; +} + +void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + /* a fresh entry spans [NILFS_CNO_MIN, NILFS_CNO_MAX) */ entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MAX); + 
entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_palloc_commit_alloc_entry(dat, req); + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_alloc_entry(dat, req); +} + +int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) + return ret; + ret = nilfs_dat_prepare_entry(dat, req, 0); + /* undo the palloc step if the entry block could not be obtained */ if (ret < 0) { + nilfs_palloc_abort_free_entry(dat, req); + return ret; + } + return 0; +} + +void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + /* reset the entry: start == end == NILFS_CNO_MIN and no block number */ entry->de_start = cpu_to_le64(NILFS_CNO_MIN); + entry->de_end = cpu_to_le64(NILFS_CNO_MIN); + entry->de_blocknr = cpu_to_le64(0); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); + nilfs_palloc_commit_free_entry(dat, req); +} + +void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); + nilfs_palloc_abort_free_entry(dat, req); +} + +int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + WARN_ON(ret == -ENOENT); + return ret; +} + +void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, + sector_t blocknr) +{ + struct nilfs_dat_entry *entry; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat)); + /* log if the entry already has a block number or its end is not NILFS_CNO_MAX; execution continues either way */ if (entry->de_blocknr != cpu_to_le64(0) || + entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) { + printk(KERN_CRIT + "%s: vbn = 
%llu, start = %llu, end = %llu, pbn = %llu\n", + __func__, (unsigned long long)req->pr_entry_nr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end), + (unsigned long long)le64_to_cpu(entry->de_blocknr)); + } + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) +{ + nilfs_dat_abort_entry(dat, req); +} + +int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + __u64 start; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_dat_prepare_entry(dat, req, 0); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) { + ret = nilfs_palloc_prepare_free_entry(dat, req); + if (ret < 0) { + nilfs_dat_abort_entry(dat, req); + return ret; + } + } + + return 0; +} + +void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req, + int dead) +{ + struct nilfs_dat_entry *entry; + __u64 start, end; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + end = start = le64_to_cpu(entry->de_start); + if (!dead) { + end = nilfs_mdt_cno(dat); + WARN_ON(start > end); + } + entry->de_end = cpu_to_le64(end); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (blocknr == 0) + nilfs_dat_commit_free(dat, req); + else + nilfs_dat_commit_entry(dat, req); +} + +void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) +{ + struct nilfs_dat_entry *entry; + 
__u64 start; + sector_t blocknr; + void *kaddr; + + kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr, + req->pr_entry_bh, kaddr); + start = le64_to_cpu(entry->de_start); + blocknr = le64_to_cpu(entry->de_blocknr); + kunmap_atomic(kaddr, KM_USER0); + + if (start == nilfs_mdt_cno(dat) && blocknr == 0) + nilfs_palloc_abort_free_entry(dat, req); + nilfs_dat_abort_entry(dat, req); +} + +/** + * nilfs_dat_mark_dirty - mark a DAT entry dirty + * @dat: DAT file inode + * @vblocknr: virtual block number + * + * Description: nilfs_dat_mark_dirty() looks up the DAT block holding the + * entry for @vblocknr and, if it is found, marks that buffer and the DAT + * inode dirty. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = vblocknr; + ret = nilfs_dat_prepare_entry(dat, &req, 0); + if (ret == 0) + nilfs_dat_commit_entry(dat, &req); + return ret; +} + +/** + * nilfs_dat_freev - free virtual block numbers + * @dat: DAT file inode + * @vblocknrs: array of virtual block numbers + * @nitems: number of virtual block numbers + * + * Description: nilfs_dat_freev() frees the virtual block numbers specified by + * @vblocknrs and @nitems. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The virtual block number has not been allocated. + */ +int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems) +{ + return nilfs_palloc_freev(dat, vblocknrs, nitems); +} + +/** + * nilfs_dat_move - change a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknr: block number + * + * Description: nilfs_dat_move() changes the block number associated with + * @vblocknr to @blocknr. 
+ * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + /* an entry with no block number cannot be moved: log it and fail with -EINVAL (not listed in the header comment) */ if (unlikely(entry->de_blocknr == cpu_to_le64(0))) { + printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__, + (unsigned long long)vblocknr, + (unsigned long long)le64_to_cpu(entry->de_start), + (unsigned long long)le64_to_cpu(entry->de_end)); + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return -EINVAL; + } + /* a zero @blocknr only triggers a warning; the store below still happens */ WARN_ON(blocknr == 0); + entry->de_blocknr = cpu_to_le64(blocknr); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(entry_bh); + nilfs_mdt_mark_dirty(dat); + + brelse(entry_bh); + + return 0; +} + +/** + * nilfs_dat_translate - translate a virtual block number to a block number + * @dat: DAT file inode + * @vblocknr: virtual block number + * @blocknrp: pointer to a block number + * + * Description: nilfs_dat_translate() maps the virtual block number @vblocknr + * to the corresponding block number. + * + * Return Value: On success, 0 is returned and the block number associated + * with @vblocknr is stored in the place pointed by @blocknrp. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A block number associated with @vblocknr does not exist. 
+ */ +int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + sector_t blocknr; + void *kaddr; + int ret; + + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); + if (ret < 0) + return ret; + + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); + blocknr = le64_to_cpu(entry->de_blocknr); + /* a stored block number of 0 means nothing is mapped for @vblocknr */ if (blocknr == 0) { + ret = -ENOENT; + goto out; + } + /* @blocknrp may be NULL when the caller only wants the existence check */ if (blocknrp != NULL) + *blocknrp = blocknr; + + out: + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + return ret; +} + +ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo, + size_t nvi) +{ + struct buffer_head *entry_bh; + struct nilfs_dat_entry *entry; + __u64 first, last; + void *kaddr; + unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block; + int i, j, n, ret; + + /* consecutive vinfo[] items whose vi_vblocknr falls into the same DAT block are filled from a single block lookup; n counts the items consumed per lookup */ for (i = 0; i < nvi; i += n) { + ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr, + 0, &entry_bh); + if (ret < 0) + return ret; + kaddr = kmap_atomic(entry_bh->b_page, KM_USER0); + /* first and last virtual block numbers in this block */ + first = vinfo[i].vi_vblocknr; + do_div(first, entries_per_block); + first *= entries_per_block; + last = first + entries_per_block - 1; + for (j = i, n = 0; + j < nvi && vinfo[j].vi_vblocknr >= first && + vinfo[j].vi_vblocknr <= last; + j++, n++) { + entry = nilfs_palloc_block_get_entry( + dat, vinfo[j].vi_vblocknr, entry_bh, kaddr); + vinfo[j].vi_start = le64_to_cpu(entry->de_start); + vinfo[j].vi_end = le64_to_cpu(entry->de_end); + vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(entry_bh); + } + + return nvi; +} diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h new file mode 100644 index 00000000000..d9560654a4b --- /dev/null +++ b/fs/nilfs2/dat.h @@ -0,0 +1,52 @@ +/* + * dat.h - NILFS disk address translation. 
+ * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_DAT_H +#define _NILFS_DAT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> + +#define NILFS_DAT_GFP NILFS_MDT_GFP + +struct nilfs_palloc_req; + +int nilfs_dat_translate(struct inode *, __u64, sector_t *); + +int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, + sector_t); +void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); +void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); +void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); + +int nilfs_dat_mark_dirty(struct inode *, __u64); +int nilfs_dat_freev(struct inode *, __u64 *, size_t); +int nilfs_dat_move(struct inode *, __u64, sector_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, struct 
nilfs_vinfo *, size_t); + +#endif /* _NILFS_DAT_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c new file mode 100644 index 00000000000..54100acc110 --- /dev/null +++ b/fs/nilfs2/dir.c @@ -0,0 +1,711 @@ +/* + * dir.c - NILFS directory entry operations + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net> + */ +/* + * linux/fs/ext2/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext2 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * All code that works with directory layout had been switched to pagecache + * and moved here. AV + */ + +#include <linux/pagemap.h> +#include <linux/smp_lock.h> +#include "nilfs.h" +#include "page.h" + +/* + * nilfs uses block-sized chunks. 
Arguably, sector-sized ones would be + * more robust, but we have what we have + */ +static inline unsigned nilfs_chunk_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + +static inline void nilfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static inline unsigned long dir_pages(struct inode *inode) +{ + return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; +} + +/* + * Return the offset into page `page_nr' of the last valid + * byte in that page, plus one. + */ +static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +{ + unsigned last_byte = inode->i_size; + + last_byte -= page_nr << PAGE_CACHE_SHIFT; + if (last_byte > PAGE_CACHE_SIZE) + last_byte = PAGE_CACHE_SIZE; + return last_byte; +} + +static int nilfs_prepare_chunk_uninterruptible(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, + AOP_FLAG_UNINTERRUPTIBLE, &page, + NULL, nilfs_get_block); +} + +static int nilfs_prepare_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + loff_t pos = page_offset(page) + from; + return block_write_begin(NULL, mapping, pos, to - from, 0, &page, + NULL, nilfs_get_block); +} + +static int nilfs_commit_chunk(struct page *page, + struct address_space *mapping, + unsigned from, unsigned to) +{ + struct inode *dir = mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb); + loff_t pos = page_offset(page) + from; + unsigned len = to - from; + unsigned nr_dirty, copied; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, from, to); + copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (pos + copied > dir->i_size) { + i_size_write(dir, pos + copied); + mark_inode_dirty(dir); + } + if (IS_DIRSYNC(dir)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + err = nilfs_set_file_dirty(sbi, dir, 
nr_dirty); + unlock_page(page); + return err; +} + +static void nilfs_check_page(struct page *page) +{ + struct inode *dir = page->mapping->host; + struct super_block *sb = dir->i_sb; + unsigned chunk_size = nilfs_chunk_size(dir); + char *kaddr = page_address(page); + unsigned offs, rec_len; + unsigned limit = PAGE_CACHE_SIZE; + struct nilfs_dir_entry *p; + char *error; + + if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) { + limit = dir->i_size & ~PAGE_CACHE_MASK; + if (limit & (chunk_size - 1)) + goto Ebadsize; + if (!limit) + goto out; + } + for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { + p = (struct nilfs_dir_entry *)(kaddr + offs); + rec_len = le16_to_cpu(p->rec_len); + + if (rec_len < NILFS_DIR_REC_LEN(1)) + goto Eshort; + if (rec_len & 3) + goto Ealign; + if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) + goto Enamelen; + if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) + goto Espan; + } + if (offs != limit) + goto Eend; +out: + SetPageChecked(page); + return; + + /* Too bad, we had an error */ + +Ebadsize: + nilfs_error(sb, "nilfs_check_page", + "size of directory #%lu is not a multiple of chunk size", + dir->i_ino + ); + goto fail; +Eshort: + error = "rec_len is smaller than minimal"; + goto bad_entry; +Ealign: + error = "unaligned directory entry"; + goto bad_entry; +Enamelen: + error = "rec_len is too small for name_len"; + goto bad_entry; +Espan: + error = "directory entry across blocks"; +bad_entry: + nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode), + rec_len, p->name_len); + goto fail; +Eend: + p = (struct nilfs_dir_entry *)(kaddr + offs); + nilfs_error(sb, "nilfs_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le64_to_cpu(p->inode)); 
+fail: + SetPageChecked(page); + SetPageError(page); +} + +static struct page *nilfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page = read_cache_page(mapping, n, + (filler_t *)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + if (!PageChecked(page)) + nilfs_check_page(page); + if (PageError(page)) + goto fail; + } + return page; + +fail: + nilfs_put_page(page); + return ERR_PTR(-EIO); +} + +/* + * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. + * + * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. + */ +static int +nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +/* + * p is at least 6 bytes before the end of page + */ +static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) +{ + return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); +} + +static unsigned char +nilfs_filetype_table[NILFS_FT_MAX] = { + [NILFS_FT_UNKNOWN] = DT_UNKNOWN, + [NILFS_FT_REG_FILE] = DT_REG, + [NILFS_FT_DIR] = DT_DIR, + [NILFS_FT_CHRDEV] = DT_CHR, + [NILFS_FT_BLKDEV] = DT_BLK, + [NILFS_FT_FIFO] = DT_FIFO, + [NILFS_FT_SOCK] = DT_SOCK, + [NILFS_FT_SYMLINK] = DT_LNK, +}; + +#define S_SHIFT 12 +static unsigned char +nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, +}; + +static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) +{ + mode_t mode = inode->i_mode; + + de->file_type = nilfs_type_by_mode[(mode & 
S_IFMT)>>S_SHIFT]; +} + +static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + loff_t pos = filp->f_pos; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + unsigned int offset = pos & ~PAGE_CACHE_MASK; + unsigned long n = pos >> PAGE_CACHE_SHIFT; + unsigned long npages = dir_pages(inode); +/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ + unsigned char *types = NULL; + int ret; + + if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) + goto success; + + types = nilfs_filetype_table; + + for ( ; n < npages; n++, offset = 0) { + char *kaddr, *limit; + struct nilfs_dir_entry *de; + struct page *page = nilfs_get_page(inode, n); + + if (IS_ERR(page)) { + nilfs_error(sb, __func__, "bad page in #%lu", + inode->i_ino); + filp->f_pos += PAGE_CACHE_SIZE - offset; + ret = -EIO; + goto done; + } + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)(kaddr + offset); + limit = kaddr + nilfs_last_byte(inode, n) - + NILFS_DIR_REC_LEN(1); + for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { + if (de->rec_len == 0) { + nilfs_error(sb, __func__, + "zero-length directory entry"); + ret = -EIO; + nilfs_put_page(page); + goto done; + } + if (de->inode) { + int over; + unsigned char d_type = DT_UNKNOWN; + + if (types && de->file_type < NILFS_FT_MAX) + d_type = types[de->file_type]; + + offset = (char *)de - kaddr; + over = filldir(dirent, de->name, de->name_len, + (n<<PAGE_CACHE_SHIFT) | offset, + le64_to_cpu(de->inode), d_type); + if (over) { + nilfs_put_page(page); + goto success; + } + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + nilfs_put_page(page); + } + +success: + ret = 0; +done: + return ret; +} + +/* + * nilfs_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the page in which the entry was found, and the entry itself + * (as a parameter - res_dir). Page is returned mapped and unlocked. + * Entry is guaranteed to be valid. 
+ */ +struct nilfs_dir_entry * +nilfs_find_entry(struct inode *dir, struct dentry *dentry, + struct page **res_page) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned long start, n; + unsigned long npages = dir_pages(dir); + struct page *page = NULL; + struct nilfs_inode_info *ei = NILFS_I(dir); + struct nilfs_dir_entry *de; + + if (npages == 0) + goto out; + + /* OFFSET_CACHE */ + *res_page = NULL; + + start = ei->i_dir_start_lookup; + if (start >= npages) + start = 0; + n = start; + do { + char *kaddr; + page = nilfs_get_page(dir, n); + if (!IS_ERR(page)) { + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(dir, n) - reclen; + while ((char *) de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + nilfs_put_page(page); + goto out; + } + if (nilfs_match(namelen, name, de)) + goto found; + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + if (++n >= npages) + n = 0; + /* next page is past the blocks we've got */ + if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { + nilfs_error(dir->i_sb, __func__, + "dir %lu size %lld exceeds block cout %llu", + dir->i_ino, dir->i_size, + (unsigned long long)dir->i_blocks); + goto out; + } + } while (n != start); +out: + return NULL; + +found: + *res_page = page; + ei->i_dir_start_lookup = n; + return de; +} + +struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) +{ + struct page *page = nilfs_get_page(dir, 0); + struct nilfs_dir_entry *de = NULL; + + if (!IS_ERR(page)) { + de = nilfs_next_entry( + (struct nilfs_dir_entry *)page_address(page)); + *p = page; + } + return de; +} + +ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry) +{ + ino_t res = 0; + struct nilfs_dir_entry *de; + struct page *page; + + de = nilfs_find_entry(dir, dentry, &page); + if (de) { + res = 
le64_to_cpu(de->inode); + kunmap(page); + page_cache_release(page); + } + return res; +} + +/* Releases the page */ +void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, + struct page *page, struct inode *inode) +{ + unsigned from = (char *) de - (char *) page_address(page); + unsigned to = from + le16_to_cpu(de->rec_len); + struct address_space *mapping = page->mapping; + int err; + + lock_page(page); + err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to); + BUG_ON(err); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, mapping, from, to); + nilfs_put_page(page); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); +} + +/* + * Parent is locked. + */ +int nilfs_add_link(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned chunk_size = nilfs_chunk_size(dir); + unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned short rec_len, name_len; + struct page *page = NULL; + struct nilfs_dir_entry *de; + unsigned long npages = dir_pages(dir); + unsigned long n; + char *kaddr; + unsigned from, to; + int err; + + /* + * We take care of directory expansion in the same loop. + * This code plays outside i_size, so it locks the page + * to protect that region. 
+ */ + for (n = 0; n <= npages; n++) { + char *dir_end; + + page = nilfs_get_page(dir, n); + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; + lock_page(page); + kaddr = page_address(page); + dir_end = kaddr + nilfs_last_byte(dir, n); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += PAGE_CACHE_SIZE - reclen; + while ((char *)de <= kaddr) { + if ((char *)de == dir_end) { + /* We hit i_size */ + name_len = 0; + rec_len = chunk_size; + de->rec_len = cpu_to_le16(chunk_size); + de->inode = 0; + goto got_it; + } + if (de->rec_len == 0) { + nilfs_error(dir->i_sb, __func__, + "zero-length directory entry"); + err = -EIO; + goto out_unlock; + } + err = -EEXIST; + if (nilfs_match(namelen, name, de)) + goto out_unlock; + name_len = NILFS_DIR_REC_LEN(de->name_len); + rec_len = le16_to_cpu(de->rec_len); + if (!de->inode && rec_len >= reclen) + goto got_it; + if (rec_len >= name_len + reclen) + goto got_it; + de = (struct nilfs_dir_entry *)((char *)de + rec_len); + } + unlock_page(page); + nilfs_put_page(page); + } + BUG(); + return -EINVAL; + +got_it: + from = (char *)de - (char *)page_address(page); + to = from + rec_len; + err = nilfs_prepare_chunk(page, page->mapping, from, to); + if (err) + goto out_unlock; + if (de->inode) { + struct nilfs_dir_entry *de1; + + de1 = (struct nilfs_dir_entry *)((char *)de + name_len); + de1->rec_len = cpu_to_le16(rec_len - name_len); + de->rec_len = cpu_to_le16(name_len); + de = de1; + } + de->name_len = namelen; + memcpy(de->name, name, namelen); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + err = nilfs_commit_chunk(page, page->mapping, from, to); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; +/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */ + mark_inode_dirty(dir); + /* OFFSET_CACHE */ +out_put: + nilfs_put_page(page); +out: + return err; +out_unlock: + unlock_page(page); + goto out_put; +} + +/* + * nilfs_delete_entry deletes a directory entry by merging it with the + * previous entry. Page is up-to-date. 
Releases the page.
 */
int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	char *base = page_address(page);
	/* byte offset of the victim entry inside the page */
	unsigned victim = (char *)dir - base;
	/* start of the chunk holding the victim ... */
	unsigned from = victim & ~(nilfs_chunk_size(inode) - 1);
	/* ... and the first byte past the victim */
	unsigned to = victim + le16_to_cpu(dir->rec_len);
	struct nilfs_dir_entry *prev = NULL;
	struct nilfs_dir_entry *cur;
	int err;

	/* Walk the chunk to find the entry immediately before the victim. */
	for (cur = (struct nilfs_dir_entry *)(base + from);
	     (char *)cur < (char *)dir;
	     cur = nilfs_next_entry(cur)) {
		if (cur->rec_len == 0) {
			nilfs_error(inode->i_sb, __func__,
				    "zero-length directory entry");
			err = -EIO;
			goto out;
		}
		prev = cur;
	}

	/* With a predecessor, the modified range starts at that entry. */
	if (prev)
		from = (char *)prev - base;

	lock_page(page);
	err = nilfs_prepare_chunk(page, mapping, from, to);
	BUG_ON(err);
	/* Let the predecessor swallow the victim's space. */
	if (prev)
		prev->rec_len = cpu_to_le16(to - from);
	dir->inode = 0;
	err = nilfs_commit_chunk(page, mapping, from, to);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
/*	NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
	mark_inode_dirty(inode);
out:
	nilfs_put_page(page);
	return err;
}

/*
 * Set the first fragment of directory.
+ */ +int nilfs_make_empty(struct inode *inode, struct inode *parent) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page = grab_cache_page(mapping, 0); + unsigned chunk_size = nilfs_chunk_size(inode); + struct nilfs_dir_entry *de; + int err; + void *kaddr; + + if (!page) + return -ENOMEM; + + err = nilfs_prepare_chunk(page, mapping, 0, chunk_size); + if (unlikely(err)) { + unlock_page(page); + goto fail; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, chunk_size); + de = (struct nilfs_dir_entry *)kaddr; + de->name_len = 1; + de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); + memcpy(de->name, ".\0\0", 4); + de->inode = cpu_to_le64(inode->i_ino); + nilfs_set_de_type(de, inode); + + de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); + de->name_len = 2; + de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); + de->inode = cpu_to_le64(parent->i_ino); + memcpy(de->name, "..\0", 4); + nilfs_set_de_type(de, inode); + kunmap_atomic(kaddr, KM_USER0); + err = nilfs_commit_chunk(page, mapping, 0, chunk_size); +fail: + page_cache_release(page); + return err; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int nilfs_empty_dir(struct inode *inode) +{ + struct page *page = NULL; + unsigned long i, npages = dir_pages(inode); + + for (i = 0; i < npages; i++) { + char *kaddr; + struct nilfs_dir_entry *de; + + page = nilfs_get_page(inode, i); + if (IS_ERR(page)) + continue; + + kaddr = page_address(page); + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); + + while ((char *)de <= kaddr) { + if (de->rec_len == 0) { + nilfs_error(inode->i_sb, __func__, + "zero-length directory entry " + "(kaddr=%p, de=%p)\n", kaddr, de); + goto not_empty; + } + if (de->inode != 0) { + /* check for . and .. 
*/ + if (de->name[0] != '.') + goto not_empty; + if (de->name_len > 2) + goto not_empty; + if (de->name_len < 2) { + if (de->inode != + cpu_to_le64(inode->i_ino)) + goto not_empty; + } else if (de->name[1] != '.') + goto not_empty; + } + de = nilfs_next_entry(de); + } + nilfs_put_page(page); + } + return 1; + +not_empty: + nilfs_put_page(page); + return 0; +} + +struct file_operations nilfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = nilfs_readdir, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .fsync = nilfs_sync_file, + +}; diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c new file mode 100644 index 00000000000..c6379e48278 --- /dev/null +++ b/fs/nilfs2/direct.c @@ -0,0 +1,436 @@ +/* + * direct.c - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. 
+ */ + +#include <linux/errno.h> +#include "nilfs.h" +#include "page.h" +#include "direct.h" +#include "alloc.h" + +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) +{ + return (__le64 *) + ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1); +} + +static inline __u64 +nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key) +{ + return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key)); +} + +static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr); +} + +static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, + __u64 key, int level, __u64 *ptrp) +{ + struct nilfs_direct *direct; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + (level != 1) || /* XXX: use macro for level 1 */ + ((ptr = nilfs_direct_get_ptr(direct, key)) == + NILFS_BMAP_INVALID_PTR)) + return -ENOENT; + + if (ptrp != NULL) + *ptrp = ptr; + return 0; +} + +static __u64 +nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) +{ + __u64 ptr; + + ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* sequential access */ + return ptr; + else + /* block group */ + return nilfs_bmap_find_target_in_group(&direct->d_bmap); +} + +static void nilfs_direct_set_target_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr) +{ + direct->d_bmap.b_last_allocated_key = key; + direct->d_bmap.b_last_allocated_ptr = ptr; +} + +static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, + __u64 key, + union nilfs_bmap_ptr_req *req, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_ops->dop_find_target != NULL) + req->bpr_ptr = direct->d_ops->dop_find_target(direct, key); + ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap, + req); + if (ret < 0) + return ret; + + stats->bs_nblocks = 1; + return 0; +} + 
+static void nilfs_direct_commit_insert(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, __u64 ptr) +{ + struct buffer_head *bh; + + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); + + if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_alloc_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, req->bpr_ptr); + + if (!nilfs_bmap_dirty(&direct->d_bmap)) + nilfs_bmap_set_dirty(&direct->d_bmap); + + if (direct->d_ops->dop_set_target != NULL) + direct->d_ops->dop_set_target(direct, key, req->bpr_ptr); +} + +static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) + return -EEXIST; + + ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_insert(direct, &req, key, ptr); + nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key, + struct nilfs_bmap_stats *stats) +{ + int ret; + + if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) { + req->bpr_ptr = nilfs_direct_get_ptr(direct, key); + ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr( + &direct->d_bmap, req); + if (ret < 0) + return ret; + } + + stats->bs_nblocks = 1; + return 0; +} + +static void nilfs_direct_commit_delete(struct nilfs_direct *direct, + union nilfs_bmap_ptr_req *req, + __u64 key) +{ + if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL) + direct->d_bmap.b_pops->bpop_commit_end_ptr( + &direct->d_bmap, req); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); 
+} + +static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) +{ + struct nilfs_direct *direct; + union nilfs_bmap_ptr_req req; + struct nilfs_bmap_stats stats; + int ret; + + direct = (struct nilfs_direct *)bmap; + if ((key > NILFS_DIRECT_KEY_MAX) || + nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) + return -ENOENT; + + ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); + if (ret < 0) + return ret; + nilfs_direct_commit_delete(direct, &req, key); + nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + + return 0; +} + +static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +{ + struct nilfs_direct *direct; + __u64 key, lastkey; + + direct = (struct nilfs_direct *)bmap; + lastkey = NILFS_DIRECT_KEY_MAX + 1; + for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) + lastkey = key; + + if (lastkey == NILFS_DIRECT_KEY_MAX + 1) + return -ENOENT; + + *keyp = lastkey; + + return 0; +} + +static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) +{ + return key > NILFS_DIRECT_KEY_MAX; +} + +static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, + __u64 *keys, __u64 *ptrs, int nitems) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + int n; + + direct = (struct nilfs_direct *)bmap; + if (nitems > NILFS_DIRECT_NBLOCKS) + nitems = NILFS_DIRECT_NBLOCKS; + n = 0; + for (key = 0; key < nitems; key++) { + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr != NILFS_BMAP_INVALID_PTR) { + keys[n] = key; + ptrs[n] = ptr; + n++; + } + } + return n; +} + +int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, + __u64 key, __u64 *keys, __u64 *ptrs, + int n, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + __le64 *dptrs; + int ret, i, j; + + /* no need to allocate any resource for conversion */ + + /* delete */ + ret = bmap->b_ops->bop_delete(bmap, key); + if (ret < 0) + return ret; + + /* free 
resources */ + if (bmap->b_ops->bop_clear != NULL) + bmap->b_ops->bop_clear(bmap); + + /* convert */ + direct = (struct nilfs_direct *)bmap; + dptrs = nilfs_direct_dptrs(direct); + for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { + if ((j < n) && (i == keys[j])) { + dptrs[i] = (i != key) ? + nilfs_bmap_ptr_to_dptr(ptrs[j]) : + NILFS_BMAP_INVALID_PTR; + j++; + } else + dptrs[i] = NILFS_BMAP_INVALID_PTR; + } + + nilfs_direct_init(bmap, low, high); + + return 0; +} + +static int nilfs_direct_propagate_v(struct nilfs_direct *direct, + struct buffer_head *bh) +{ + union nilfs_bmap_ptr_req oldreq, newreq; + __u64 key; + __u64 ptr; + int ret; + + key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + ptr = nilfs_direct_get_ptr(direct, key); + if (!buffer_nilfs_volatile(bh)) { + oldreq.bpr_ptr = ptr; + newreq.bpr_ptr = ptr; + ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq, + &newreq); + if (ret < 0) + return ret; + nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq); + set_buffer_nilfs_volatile(bh); + nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + } else + ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + + return ret; +} + +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + return (direct->d_ops->dop_propagate != NULL) ? 
+ direct->d_ops->dop_propagate(direct, bh) : + 0; +} + +static int nilfs_direct_assign_v(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + union nilfs_bmap_ptr_req req; + int ret; + + req.bpr_ptr = ptr; + ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr( + &direct->d_bmap, &req); + if (ret < 0) + return ret; + direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap, + &req, blocknr); + + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + + return 0; +} + +static int nilfs_direct_assign_p(struct nilfs_direct *direct, + __u64 key, __u64 ptr, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + nilfs_direct_set_ptr(direct, key, blocknr); + + binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_level = 0; + + return 0; +} + +static int nilfs_direct_assign(struct nilfs_bmap *bmap, + struct buffer_head **bh, + sector_t blocknr, + union nilfs_binfo *binfo) +{ + struct nilfs_direct *direct; + __u64 key; + __u64 ptr; + + direct = (struct nilfs_direct *)bmap; + key = nilfs_bmap_data_get_key(bmap, *bh); + if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { + printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, + (unsigned long long)key); + return -EINVAL; + } + ptr = nilfs_direct_get_ptr(direct, key); + if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { + printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, + (unsigned long long)ptr); + return -EINVAL; + } + + return direct->d_ops->dop_assign(direct, key, ptr, bh, + blocknr, binfo); +} + +static const struct nilfs_bmap_operations nilfs_direct_ops = { + .bop_lookup = nilfs_direct_lookup, + .bop_insert = nilfs_direct_insert, + .bop_delete = nilfs_direct_delete, + .bop_clear = NULL, + + .bop_propagate = nilfs_direct_propagate, + + .bop_lookup_dirty_buffers = NULL, + + .bop_assign = nilfs_direct_assign, + .bop_mark = NULL, + + 
.bop_last_key = nilfs_direct_last_key, + .bop_check_insert = nilfs_direct_check_insert, + .bop_check_delete = NULL, + .bop_gather_data = nilfs_direct_gather_data, +}; + + +static const struct nilfs_direct_operations nilfs_direct_ops_v = { + .dop_find_target = nilfs_direct_find_target_v, + .dop_set_target = nilfs_direct_set_target_v, + .dop_propagate = nilfs_direct_propagate_v, + .dop_assign = nilfs_direct_assign_v, +}; + +static const struct nilfs_direct_operations nilfs_direct_ops_p = { + .dop_find_target = NULL, + .dop_set_target = NULL, + .dop_propagate = NULL, + .dop_assign = nilfs_direct_assign_p, +}; + +int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high) +{ + struct nilfs_direct *direct; + + direct = (struct nilfs_direct *)bmap; + bmap->b_ops = &nilfs_direct_ops; + bmap->b_low = low; + bmap->b_high = high; + switch (bmap->b_inode->i_ino) { + case NILFS_DAT_INO: + direct->d_ops = &nilfs_direct_ops_p; + break; + default: + direct->d_ops = &nilfs_direct_ops_v; + break; + } + + return 0; +} diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h new file mode 100644 index 00000000000..45d2c5cda81 --- /dev/null +++ b/fs/nilfs2/direct.h @@ -0,0 +1,78 @@ +/* + * direct.h - NILFS direct block pointer. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_DIRECT_H +#define _NILFS_DIRECT_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include "bmap.h" + + +struct nilfs_direct; + +/** + * struct nilfs_direct_operations - direct mapping operation table + */ +struct nilfs_direct_operations { + __u64 (*dop_find_target)(const struct nilfs_direct *, __u64); + void (*dop_set_target)(struct nilfs_direct *, __u64, __u64); + int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *); + int (*dop_assign)(struct nilfs_direct *, __u64, __u64, + struct buffer_head **, sector_t, + union nilfs_binfo *); +}; + +/** + * struct nilfs_direct_node - direct node + * @dn_flags: flags + * @pad: padding + */ +struct nilfs_direct_node { + __u8 dn_flags; + __u8 pad[7]; +}; + +/** + * struct nilfs_direct - direct mapping + * @d_bmap: bmap structure + * @d_ops: direct mapping operation table + */ +struct nilfs_direct { + struct nilfs_bmap d_bmap; + + /* direct-mapping-specific members */ + const struct nilfs_direct_operations *d_ops; +}; + + +#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) +#define NILFS_DIRECT_KEY_MIN 0 +#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) + + +int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64); +int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *, + __u64 *, int, __u64, __u64); + + +#endif /* _NILFS_DIRECT_H */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c new file mode 100644 index 00000000000..6bd84a0d823 --- /dev/null +++ b/fs/nilfs2/file.c @@ -0,0 +1,160 @@ +/* + * file.c - NILFS regular file handling primitives including fsync(). + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include "nilfs.h" +#include "segment.h" + +int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + /* + * Called from fsync() system call + * This is the only entry point that can catch write and synch + * timing for both data blocks and intermediate blocks. + * + * This function should be implemented when the writeback function + * will be implemented. 
+ */ + struct inode *inode = dentry->d_inode; + int err; + + if (!nilfs_inode_dirty(inode)) + return 0; + + if (datasync) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0, + LLONG_MAX); + else + err = nilfs_construct_segment(inode->i_sb); + + return err; +} + +static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct nilfs_transaction_info ti; + int ret; + + if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) + return VM_FAULT_SIGBUS; /* -ENOSPC */ + + lock_page(page); + if (page->mapping != inode->i_mapping || + page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { + unlock_page(page); + return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + } + + /* + * check to see if the page is mapped already (no holes) + */ + if (PageMappedToDisk(page)) { + unlock_page(page); + goto mapped; + } + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int fully_mapped = 1; + + bh = head = page_buffers(page); + do { + if (!buffer_mapped(bh)) { + fully_mapped = 0; + break; + } + } while (bh = bh->b_this_page, bh != head); + + if (fully_mapped) { + SetPageMappedToDisk(page); + unlock_page(page); + goto mapped; + } + } + unlock_page(page); + + /* + * fill hole blocks + */ + ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); + /* never returns -ENOMEM, but may return -ENOSPC */ + if (unlikely(ret)) + return VM_FAULT_SIGBUS; + + ret = block_page_mkwrite(vma, vmf, nilfs_get_block); + if (unlikely(ret)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); + + mapped: + SetPageChecked(page); + wait_on_page_writeback(page); + return 0; +} + +struct vm_operations_struct nilfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = nilfs_page_mkwrite, +}; + +static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + 
vma->vm_ops = &nilfs_file_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + return 0; +} + +/* + * We have mostly NULL's here: the current defaults are ok for + * the nilfs filesystem. + */ +struct file_operations nilfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = nilfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = nilfs_ioctl, +#endif /* CONFIG_COMPAT */ + .mmap = nilfs_file_mmap, + .open = generic_file_open, + /* .release = nilfs_release_file, */ + .fsync = nilfs_sync_file, + .splice_read = generic_file_splice_read, +}; + +struct inode_operations nilfs_file_inode_operations = { + .truncate = nilfs_truncate, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +/* end of file */ diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c new file mode 100644 index 00000000000..93383c5cee9 --- /dev/null +++ b/fs/nilfs2/gcdat.c @@ -0,0 +1,84 @@ +/* + * gcdat.c - NILFS shadow DAT inode for GC + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, + * and Ryusuke Konishi <ryusuke@osrg.net>. 
+ * + */ + +#include <linux/buffer_head.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + +int nilfs_init_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + int err; + + gcdat->i_state = 0; + gcdat->i_blocks = dat->i_blocks; + gii->i_flags = dii->i_flags; + gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT); + gii->i_cno = 0; + nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap); + err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping); + if (unlikely(err)) + return err; + + return nilfs_copy_dirty_pages(&gii->i_btnode_cache, + &dii->i_btnode_cache); +} + +void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat); + struct address_space *mapping = dat->i_mapping; + struct address_space *gmapping = gcdat->i_mapping; + + down_write(&NILFS_MDT(dat)->mi_sem); + dat->i_blocks = gcdat->i_blocks; + dii->i_flags = gii->i_flags; + dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT); + + nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap); + + nilfs_clear_dirty_pages(mapping); + nilfs_copy_back_pages(mapping, gmapping); + /* note: mdt dirty flags should be cleared by segctor. 
*/ + + nilfs_clear_dirty_pages(&dii->i_btnode_cache); + nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache); + + up_write(&NILFS_MDT(dat)->mi_sem); +} + +void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs) +{ + struct inode *gcdat = nilfs->ns_gc_dat; + struct nilfs_inode_info *gii = NILFS_I(gcdat); + + gcdat->i_state = I_CLEAR; + gii->i_flags = 0; + + truncate_inode_pages(gcdat->i_mapping, 0); + truncate_inode_pages(&gii->i_btnode_cache, 0); +} diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c new file mode 100644 index 00000000000..19d2102b6a6 --- /dev/null +++ b/fs/nilfs2/gcinode.c @@ -0,0 +1,288 @@ +/* + * gcinode.c - dummy inodes to buffer blocks for garbage collection + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, + * and Ryusuke Konishi <ryusuke@osrg.net>. + * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * + */ +/* + * This file adds the cache of on-disk blocks to be moved in garbage + * collection. The disk blocks are held with dummy inodes (called + * gcinodes), and this file provides lookup function of the dummy + * inodes and their buffer read function. 
+ * + * Since NILFS2 keeps up multiple checkpoints/snapshots across GC, it + * has to treat blocks that belong to the same file but have different + * checkpoint numbers. To avoid interference among generations, dummy + * inodes are managed separately from actual inodes, and their lookup + * function (nilfs_gc_iget) is designed to be specified with a + * checkpoint number argument as well as an inode number. + * + * Buffers and pages held by the dummy inodes will be released each + * time after they are copied to a new log. Dirty blocks made on the + * current generation and the blocks to be moved by GC never overlap + * because the dirty blocks make a new generation; they rather must be + * written individually. + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/hash.h> +#include <linux/swap.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" +#include "dat.h" +#include "ifile.h" + +static struct address_space_operations def_gcinode_aops = {}; +/* XXX need def_gcinode_iops/fops? */ + +/* + * nilfs_gccache_submit_read_data() - add data buffer and submit read request + * @inode - gc inode + * @blkoff - dummy offset treated as the key for the page cache + * @pbn - physical block number of the block + * @vbn - virtual block number of the block, 0 for non-virtual block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_data() registers the data buffer + * specified by @pbn to the GC pagecache with the key @blkoff. + * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The block specified with @pbn does not exist. 
+ */ +int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, + sector_t pbn, __u64 vbn, + struct buffer_head **out_bh) +{ + struct buffer_head *bh; + int err; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + return -ENOMEM; + + if (buffer_uptodate(bh)) + goto out; + + if (pbn == 0) { + struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat; + /* use original dat, not gc dat. */ + err = nilfs_dat_translate(dat_inode, vbn, &pbn); + if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ + brelse(bh); + goto failed; + } + } + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + + if (!buffer_mapped(bh)) { + bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + set_buffer_mapped(bh); + } + bh->b_blocknr = pbn; + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ, bh); + if (vbn) + bh->b_blocknr = vbn; + out: + err = 0; + *out_bh = bh; + + failed: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + return err; +} + +/* + * nilfs_gccache_submit_read_node() - add node buffer and submit read request + * @inode - gc inode + * @pbn - physical block number for the block + * @vbn - virtual block number for the block + * @out_bh - indirect pointer to a buffer_head struct to receive the results + * + * Description: nilfs_gccache_submit_read_node() registers the node buffer + * specified by @vbn to the GC pagecache. @pbn can be supplied by the + * caller to avoid translation of the disk block address. + * + * Return Value: On success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, + __u64 vbn, struct buffer_head **out_bh) +{ + int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? 
: pbn, pbn, out_bh, 0); + if (ret == -EEXIST) /* internal code (cache hit) */ + ret = 0; + return ret; +} + +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) +{ + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + return -EIO; + if (buffer_dirty(bh)) + return -EEXIST; + + if (buffer_nilfs_node(bh)) + nilfs_btnode_mark_dirty(bh); + else + nilfs_mdt_mark_buffer_dirty(bh); + return 0; +} + +/* + * nilfs_init_gccache() - allocate and initialize gc_inode hash table + * @nilfs - the_nilfs + * + * Return Value: On success, 0. + * On error, a negative error code is returned. + */ +int nilfs_init_gccache(struct the_nilfs *nilfs) +{ + int loop; + + BUG_ON(nilfs->ns_gc_inodes_h); + + INIT_LIST_HEAD(&nilfs->ns_gc_inodes); + + nilfs->ns_gc_inodes_h = + kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE, + GFP_NOFS); + if (nilfs->ns_gc_inodes_h == NULL) + return -ENOMEM; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++) + INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]); + return 0; +} + +/* + * nilfs_destroy_gccache() - free gc_inode hash table + * @nilfs - the nilfs + */ +void nilfs_destroy_gccache(struct the_nilfs *nilfs) +{ + if (nilfs->ns_gc_inodes_h) { + nilfs_remove_all_gcinode(nilfs); + kfree(nilfs->ns_gc_inodes_h); + nilfs->ns_gc_inodes_h = NULL; + } +} + +static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino, + __u64 cno) +{ + struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS); + struct nilfs_inode_info *ii; + + if (!inode) + return NULL; + + inode->i_op = NULL; + inode->i_fop = NULL; + inode->i_mapping->a_ops = &def_gcinode_aops; + + ii = NILFS_I(inode); + ii->i_cno = cno; + ii->i_flags = 0; + ii->i_state = 1 << NILFS_I_GCINODE; + ii->i_bh = NULL; + nilfs_bmap_init_gc(ii->i_bmap); + + return inode; +} + +static unsigned long ihash(ino_t ino, __u64 cno) +{ + return hash_long((unsigned long)((ino << 2) + cno), + NILFS_GCINODE_HASH_BITS); +} + +/* + * nilfs_gc_iget() - find or create gc inode with specified 
(ino,cno) + */ +struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno); + struct hlist_node *node; + struct inode *inode; + + hlist_for_each_entry(inode, node, head, i_hash) { + if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno) + return inode; + } + + inode = alloc_gcinode(nilfs, ino, cno); + if (likely(inode)) { + hlist_add_head(&inode->i_hash, head); + list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes); + } + return inode; +} + +/* + * nilfs_clear_gcinode() - clear and free a gc inode + */ +void nilfs_clear_gcinode(struct inode *inode) +{ + nilfs_mdt_clear(inode); + nilfs_mdt_destroy(inode); +} + +/* + * nilfs_remove_all_gcinode() - remove all inodes from the_nilfs + */ +void nilfs_remove_all_gcinode(struct the_nilfs *nilfs) +{ + struct hlist_head *head = nilfs->ns_gc_inodes_h; + struct hlist_node *node, *n; + struct inode *inode; + int loop; + + for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) { + hlist_for_each_entry_safe(inode, node, n, head, i_hash) { + hlist_del_init(&inode->i_hash); + list_del_init(&NILFS_I(inode)->i_dirty); + nilfs_clear_gcinode(inode); /* might sleep */ + } + } +} diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c new file mode 100644 index 00000000000..de86401f209 --- /dev/null +++ b/fs/nilfs2/ifile.c @@ -0,0 +1,150 @@ +/* + * ifile.c - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net>. + * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * + */ + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "ifile.h" + +/** + * nilfs_ifile_create_inode - create a new disk inode + * @ifile: ifile inode + * @out_ino: pointer to a variable to store inode number + * @out_bh: buffer_head contains newly allocated disk inode + * + * Return Value: On success, 0 is returned and the newly allocated inode + * number is stored in the place pointed to by @out_ino, and buffer_head pointer + * that contains newly allocated disk inode structure is stored in the + * place pointed to by @out_bh. + * On error, one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No inode left. + */ +int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, + struct buffer_head **out_bh) +{ + struct nilfs_palloc_req req; + int ret; + + req.pr_entry_nr = 0; /* 0 says find free inode from beginning of + a group. dull code!! 
*/ + req.pr_entry_bh = NULL; + + ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_alloc_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + nilfs_palloc_commit_alloc_entry(ifile, &req); + nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + nilfs_mdt_mark_dirty(ifile); + *out_ino = (ino_t)req.pr_entry_nr; + *out_bh = req.pr_entry_bh; + return 0; +} + +/** + * nilfs_ifile_delete_inode - delete a disk inode + * @ifile: ifile inode + * @ino: inode number + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - The inode number @ino have not been allocated. + */ +int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) +{ + struct nilfs_palloc_req req = { + .pr_entry_nr = ino, .pr_entry_bh = NULL + }; + struct nilfs_inode *raw_inode; + void *kaddr; + int ret; + + ret = nilfs_palloc_prepare_free_entry(ifile, &req); + if (!ret) { + ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, + &req.pr_entry_bh); + if (ret < 0) + nilfs_palloc_abort_free_entry(ifile, &req); + } + if (ret < 0) { + brelse(req.pr_entry_bh); + return ret; + } + + kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0); + raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, + req.pr_entry_bh, kaddr); + raw_inode->i_flags = 0; + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + brelse(req.pr_entry_bh); + + nilfs_palloc_commit_free_entry(ifile, &req); + + return 0; +} + +int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, + struct buffer_head **out_bh) +{ + struct super_block *sb = ifile->i_sb; + int err; + + if (unlikely(!NILFS_VALID_INODE(sb, ino))) { + nilfs_error(sb, __func__, "bad inode number: %lu", 
+ (unsigned long) ino); + return -EINVAL; + } + + err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); + if (unlikely(err)) { + if (err == -EINVAL) + nilfs_error(sb, __func__, "ifile is broken"); + else + nilfs_warning(sb, __func__, + "unable to read inode: %lu", + (unsigned long) ino); + } + return err; +} diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h new file mode 100644 index 00000000000..5d30a35679b --- /dev/null +++ b/fs/nilfs2/ifile.h @@ -0,0 +1,53 @@ +/* + * ifile.h - NILFS inode file + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Amagai Yoshiji <amagai@osrg.net> + * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_IFILE_H +#define _NILFS_IFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "alloc.h" + +#define NILFS_IFILE_GFP NILFS_MDT_GFP + +static inline struct nilfs_inode * +nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) +{ + void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); +} + +static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino, + struct buffer_head *ibh) +{ + kunmap(ibh->b_page); +} + +int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **); +int nilfs_ifile_delete_inode(struct inode *, ino_t); +int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **); + +#endif /* _NILFS_IFILE_H */ diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c new file mode 100644 index 00000000000..49ab4a49bb4 --- /dev/null +++ b/fs/nilfs2/inode.c @@ -0,0 +1,785 @@ +/* + * inode.c - NILFS inode operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/uio.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" +#include "cpfile.h" +#include "ifile.h" + + +/** + * nilfs_get_block() - get a file block on the filesystem (callback function) + * @inode - inode struct of the target file + * @blkoff - file block number + * @bh_result - buffer head to be mapped on + * @create - indicate whether allocating the block or not when it has not + * been allocated yet. + * + * This function does not issue actual read request of the specified data + * block. It is done by VFS. + * Bulk read for direct-io is not supported yet. (should be supported) + */ +int nilfs_get_block(struct inode *inode, sector_t blkoff, + struct buffer_head *bh_result, int create) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + unsigned long blknum = 0; + int err = 0, ret; + struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode)); + + /* This exclusion control is a workaround; should be revised */ + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (ret == 0) { /* found */ + map_bh(bh_result, inode->i_sb, blknum); + goto out; + } + /* data block was not found */ + if (ret == -ENOENT && create) { + struct nilfs_transaction_info ti; + + bh_result->b_blocknr = 0; + err = nilfs_transaction_begin(inode->i_sb, &ti, 1); + if (unlikely(err)) + goto out; + err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, + (unsigned long)bh_result); + if (unlikely(err != 0)) { + if (err == -EEXIST) { + /* + * The get_block() function could be 
called + * from multiple callers for an inode. + * However, the page having this block must + * be locked in this case. + */ + printk(KERN_WARNING + "nilfs_get_block: a race condition " + "while inserting a data block. " + "(inode number=%lu, file block " + "offset=%llu)\n", + inode->i_ino, + (unsigned long long)blkoff); + err = 0; + } else if (err == -EINVAL) { + nilfs_error(inode->i_sb, __func__, + "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + nilfs_transaction_abort(inode->i_sb); + goto out; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + /* Error handling should be detailed */ + set_buffer_new(bh_result); + map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed + to proper value */ + } else if (ret == -ENOENT) { + /* not found is not error (e.g. hole); must return without + the mapped state flag. */ + ; + } else { + err = ret; + } + + out: + return err; +} + +/** + * nilfs_readpage() - implement readpage() method of nilfs_aops {} + * address_space_operations. + * @file - file struct of the file to be read + * @page - the page to be read + */ +static int nilfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, nilfs_get_block); +} + +/** + * nilfs_readpages() - implement readpages() method of nilfs_aops {} + * address_space_operations. 
+ * @file - file struct of the file to be read + * @mapping - address_space struct used for reading multiple pages + * @pages - the pages to be read + * @nr_pages - number of pages to be read + */ +static int nilfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); +} + +static int nilfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + int err = 0; + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_dsync_segment(inode->i_sb, inode, + wbc->range_start, + wbc->range_end); + return err; +} + +static int nilfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + int err; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (wbc->sync_mode == WB_SYNC_ALL) { + err = nilfs_construct_segment(inode->i_sb); + if (unlikely(err)) + return err; + } else if (wbc->for_reclaim) + nilfs_flush_segment(inode->i_sb, inode->i_ino); + + return 0; +} + +static int nilfs_set_page_dirty(struct page *page) +{ + int ret = __set_page_dirty_buffers(page); + + if (ret) { + struct inode *inode = page->mapping->host; + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + + nilfs_set_file_dirty(sbi, inode, nr_dirty); + } + return ret; +} + +static int nilfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + +{ + struct inode *inode = mapping->host; + int err = nilfs_transaction_begin(inode->i_sb, NULL, 1); + + if (unlikely(err)) + return err; + + *pagep = NULL; + err = block_write_begin(file, mapping, pos, len, flags, pagep, + fsdata, nilfs_get_block); + if (unlikely(err)) + nilfs_transaction_abort(inode->i_sb); + return err; +} + +static int nilfs_write_end(struct 
file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start = pos & (PAGE_CACHE_SIZE - 1); + unsigned nr_dirty; + int err; + + nr_dirty = nilfs_page_count_clean_buffers(page, start, + start + copied); + copied = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty); + err = nilfs_transaction_commit(inode->i_sb); + return err ? : copied; +} + +static ssize_t +nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t size; + + if (rw == WRITE) + return 0; + + /* Needs synchronization with the cleaner */ + size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, nilfs_get_block, NULL); + return size; +} + +struct address_space_operations nilfs_aops = { + .writepage = nilfs_writepage, + .readpage = nilfs_readpage, + /* .sync_page = nilfs_sync_page, */ + .writepages = nilfs_writepages, + .set_page_dirty = nilfs_set_page_dirty, + .readpages = nilfs_readpages, + .write_begin = nilfs_write_begin, + .write_end = nilfs_write_end, + /* .releasepage = nilfs_releasepage, */ + .invalidatepage = block_invalidatepage, + .direct_IO = nilfs_direct_IO, +}; + +struct inode *nilfs_new_inode(struct inode *dir, int mode) +{ + struct super_block *sb = dir->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct inode *inode; + struct nilfs_inode_info *ii; + int err = -ENOMEM; + ino_t ino; + + inode = new_inode(sb); + if (unlikely(!inode)) + goto failed; + + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + + ii = NILFS_I(inode); + ii->i_state = 1 << NILFS_I_NEW; + + err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh); + if (unlikely(err)) + goto 
failed_ifile_create_inode; + /* reference count of i_bh inherits from nilfs_mdt_read_block() */ + + atomic_inc(&sbi->s_inodes_count); + + inode->i_uid = current_fsuid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + + inode->i_mode = mode; + inode->i_ino = ino; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { + err = nilfs_bmap_read(ii->i_bmap, NULL); + if (err < 0) + goto failed_bmap; + + set_bit(NILFS_I_BMAP, &ii->i_state); + /* No lock is needed; iget() ensures it. */ + } + + ii->i_flags = NILFS_I(dir)->i_flags; + if (S_ISLNK(mode)) + ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL); + if (!S_ISDIR(mode)) + ii->i_flags &= ~NILFS_DIRSYNC_FL; + + /* ii->i_file_acl = 0; */ + /* ii->i_dir_acl = 0; */ + ii->i_dir_start_lookup = 0; +#ifdef CONFIG_NILFS_FS_POSIX_ACL + ii->i_acl = NULL; + ii->i_default_acl = NULL; +#endif + ii->i_cno = 0; + nilfs_set_inode_flags(inode); + spin_lock(&sbi->s_next_gen_lock); + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + insert_inode_hash(inode); + + err = nilfs_init_acl(inode, dir); + if (unlikely(err)) + goto failed_acl; /* never occur. When supporting + nilfs_init_acl(), proper cancellation of + above jobs should be considered */ + + mark_inode_dirty(inode); + return inode; + + failed_acl: + failed_bmap: + inode->i_nlink = 0; + iput(inode); /* raw_inode will be deleted through + generic_delete_inode() */ + goto failed; + + failed_ifile_create_inode: + make_bad_inode(inode); + iput(inode); /* if i_nlink == 1, generic_forget_inode() will be + called */ + failed: + return ERR_PTR(err); +} + +void nilfs_free_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + clear_inode(inode); + /* XXX: check error code? Is there any thing I can do? 
*/ + (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino); + atomic_dec(&sbi->s_inodes_count); +} + +void nilfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = NILFS_I(inode)->i_flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC); + if (flags & NILFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & NILFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & NILFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; +#ifndef NILFS_ATIME_DISABLE + if (flags & NILFS_NOATIME_FL) +#endif + inode->i_flags |= S_NOATIME; + if (flags & NILFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); +} + +int nilfs_read_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid); + inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid); + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le64_to_cpu(raw_inode->i_size); + inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + if (inode->i_nlink == 0 && inode->i_mode == 0) + return -EINVAL; /* this inode is deleted */ + + inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); + ii->i_flags = le32_to_cpu(raw_inode->i_flags); +#if 0 + ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + ii->i_dir_acl = S_ISREG(inode->i_mode) ? 
+		0 : le32_to_cpu(raw_inode->i_dir_acl);
+#endif
+	ii->i_cno = 0;
+	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+
+	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+	    S_ISLNK(inode->i_mode)) {
+		err = nilfs_bmap_read(ii->i_bmap, raw_inode);
+		if (err < 0)
+			return err;
+		set_bit(NILFS_I_BMAP, &ii->i_state);
+		/* No lock is needed; iget() ensures it. */
+	}
+	return 0;
+}
+
+/**
+ * __nilfs_read_inode - fill an in-core inode from its on-disk ifile entry
+ * @sb: super block the inode belongs to
+ * @ino: inode number to look up in the ifile
+ * @inode: in-core inode to initialize
+ *
+ * Holds the DAT metadata semaphore (mi_sem) for reading while the inode
+ * block is fetched, mapped and decoded, then installs the inode, file and
+ * address-space operations that match the file type.
+ *
+ * Returns 0 on success, or the negative error code reported by
+ * nilfs_ifile_get_inode_block() or nilfs_read_inode_common().
+ */
+static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
+			      struct inode *inode)
+{
+	struct nilfs_sb_info *sbi = NILFS_SB(sb);
+	struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
+	struct buffer_head *bh;
+	struct nilfs_inode *raw_inode;
+	int err;
+
+	down_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
+	if (unlikely(err))
+		goto bad_inode;
+
+	raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
+
+#ifdef CONFIG_NILFS_FS_POSIX_ACL
+	/* no local nilfs_inode_info pointer exists here; go through NILFS_I() */
+	NILFS_I(inode)->i_acl = NILFS_ACL_NOT_CACHED;
+	NILFS_I(inode)->i_default_acl = NILFS_ACL_NOT_CACHED;
+#endif
+	/*
+	 * Keep the error code: the failure path below returns err, and a
+	 * stale 0 would make the caller accept a half-initialized inode.
+	 */
+	err = nilfs_read_inode_common(inode, raw_inode);
+	if (err)
+		goto failed_unmap;
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &nilfs_file_inode_operations;
+		inode->i_fop = &nilfs_file_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &nilfs_dir_inode_operations;
+		inode->i_fop = &nilfs_dir_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &nilfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &nilfs_aops;
+	} else {
+		inode->i_op = &nilfs_special_inode_operations;
+		init_special_inode(
+			inode, inode->i_mode,
+			new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+	}
+	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	brelse(bh);
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+	nilfs_set_inode_flags(inode);
+	return 0;
+
+ failed_unmap:
+	nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
+	brelse(bh);
+
+ bad_inode:
+	up_read(&NILFS_MDT(dat)->mi_sem);	/* XXX */
+
return err; +} + +struct inode *nilfs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + int err; + + inode = iget_locked(sb, ino); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = __nilfs_read_inode(sb, ino, inode); + if (unlikely(err)) { + iget_failed(inode); + return ERR_PTR(err); + } + unlock_new_inode(inode); + return inode; +} + +void nilfs_write_inode_common(struct inode *inode, + struct nilfs_inode *raw_inode, int has_bmap) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + raw_inode->i_uid = cpu_to_le32(inode->i_uid); + raw_inode->i_gid = cpu_to_le32(inode->i_gid); + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le64(inode->i_size); + raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); + + raw_inode->i_flags = cpu_to_le32(ii->i_flags); + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + + if (has_bmap) + nilfs_bmap_write(ii->i_bmap, raw_inode); + else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_device_code = + cpu_to_le64(new_encode_dev(inode->i_rdev)); + /* When extending inode, nilfs->ns_inode_size should be checked + for substitutions of appended fields */ +} + +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) +{ + ino_t ino = inode->i_ino; + struct nilfs_inode_info *ii = NILFS_I(inode); + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_inode *raw_inode; + + raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh); + + /* The buffer is guarded with lock_buffer() by the caller */ + if (test_and_clear_bit(NILFS_I_NEW, 
&ii->i_state)) + memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size); + set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + + nilfs_write_inode_common(inode, raw_inode, 0); + /* XXX: call with has_bmap = 0 is a workaround to avoid + deadlock of bmap. This delays update of i_bmap to just + before writing */ + nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh); +} + +#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */ + +static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, + unsigned long from) +{ + unsigned long b; + int ret; + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + repeat: + ret = nilfs_bmap_last_key(ii->i_bmap, &b); + if (ret == -ENOENT) + return; + else if (ret < 0) + goto failed; + + if (b < from) + return; + + b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + ret = nilfs_bmap_truncate(ii->i_bmap, b); + nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); + if (!ret || (ret == -ENOMEM && + nilfs_bmap_truncate(ii->i_bmap, b) == 0)) + goto repeat; + + failed: + if (ret == -EINVAL) + nilfs_error(ii->vfs_inode.i_sb, __func__, + "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino); + else + nilfs_warning(ii->vfs_inode.i_sb, __func__, + "failed to truncate bmap (ino=%lu, err=%d)", + ii->vfs_inode.i_ino, ret); +} + +void nilfs_truncate(struct inode *inode) +{ + unsigned long blkoff; + unsigned int blocksize; + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (!test_bit(NILFS_I_BMAP, &ii->i_state)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + blocksize = sb->s_blocksize; + blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits; + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block); + + nilfs_truncate_bmap(ii, blkoff); + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + 
nilfs_set_transaction_flag(NILFS_TI_SYNC); + + nilfs_set_file_dirty(NILFS_SB(sb), inode, 0); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But truncate has no return value. */ +} + +void nilfs_delete_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = inode->i_sb; + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (unlikely(is_bad_inode(inode))) { + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + return; + } + nilfs_transaction_begin(sb, &ti, 0); /* never fails */ + + if (inode->i_data.nrpages) + truncate_inode_pages(&inode->i_data, 0); + + nilfs_truncate_bmap(ii, 0); + nilfs_free_inode(inode); + /* nilfs_free_inode() marks inode buffer dirty */ + if (IS_SYNC(inode)) + nilfs_set_transaction_flag(NILFS_TI_SYNC); + nilfs_transaction_commit(sb); + /* May construct a logical segment and may fail in sync mode. + But delete_inode has no return value. */ +} + +int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct nilfs_transaction_info ti; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + int err; + + err = inode_change_ok(inode, iattr); + if (err) + return err; + + err = nilfs_transaction_begin(sb, &ti, 0); + if (unlikely(err)) + return err; + err = inode_setattr(inode, iattr); + if (!err && (iattr->ia_valid & ATTR_MODE)) + err = nilfs_acl_chmod(inode); + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + + return err; +} + +int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode, + struct buffer_head **pbh) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + spin_lock(&sbi->s_inode_lock); + /* Caller of this function MUST lock s_inode_lock */ + if (ii->i_bh == NULL) { + spin_unlock(&sbi->s_inode_lock); + err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, + pbh); + if (unlikely(err)) + return 
err; + spin_lock(&sbi->s_inode_lock); + if (ii->i_bh == NULL) + ii->i_bh = *pbh; + else { + brelse(*pbh); + *pbh = ii->i_bh; + } + } else + *pbh = ii->i_bh; + + get_bh(*pbh); + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_inode_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + int ret = 0; + + if (!list_empty(&ii->i_dirty)) { + spin_lock(&sbi->s_inode_lock); + ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || + test_bit(NILFS_I_BUSY, &ii->i_state); + spin_unlock(&sbi->s_inode_lock); + } + return ret; +} + +int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode, + unsigned nr_dirty) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); + + if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) + return 0; + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + /* Because this routine may race with nilfs_dispose_list(), + we have to check NILFS_I_QUEUED here, too. */ + if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { + /* This will happen when somebody is freeing + this inode. 
*/ + nilfs_warning(sbi->s_super, __func__, + "cannot get inode (ino=%lu)\n", + inode->i_ino); + spin_unlock(&sbi->s_inode_lock); + return -EINVAL; /* NILFS_I_DIRTY may remain for + freeing inode */ + } + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); + set_bit(NILFS_I_QUEUED, &ii->i_state); + } + spin_unlock(&sbi->s_inode_lock); + return 0; +} + +int nilfs_mark_inode_dirty(struct inode *inode) +{ + struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); + struct buffer_head *ibh; + int err; + + err = nilfs_load_inode_block(sbi, inode, &ibh); + if (unlikely(err)) { + nilfs_warning(inode->i_sb, __func__, + "failed to reget inode block.\n"); + return err; + } + lock_buffer(ibh); + nilfs_update_inode(inode, ibh); + unlock_buffer(ibh); + nilfs_mdt_mark_buffer_dirty(ibh); + nilfs_mdt_mark_dirty(sbi->s_ifile); + brelse(ibh); + return 0; +} + +/** + * nilfs_dirty_inode - reflect changes on given inode to an inode block. + * @inode: inode of the file to be registered. + * + * nilfs_dirty_inode() loads a inode block containing the specified + * @inode and copies data from a nilfs_inode to a corresponding inode + * entry in the inode block. This operation is excluded from the segment + * construction. This function can be called both as a single operation + * and as a part of indivisible file operations. + */ +void nilfs_dirty_inode(struct inode *inode) +{ + struct nilfs_transaction_info ti; + + if (is_bad_inode(inode)) { + nilfs_warning(inode->i_sb, __func__, + "tried to mark bad_inode dirty. ignored.\n"); + dump_stack(); + return; + } + nilfs_transaction_begin(inode->i_sb, &ti, 0); + nilfs_mark_inode_dirty(inode); + nilfs_transaction_commit(inode->i_sb); /* never fails */ +} diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c new file mode 100644 index 00000000000..108d281ebca --- /dev/null +++ b/fs/nilfs2/ioctl.c @@ -0,0 +1,654 @@ +/* + * ioctl.c - NILFS ioctl operations. + * + * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/fs.h> +#include <linux/wait.h> +#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */ +#include <linux/capability.h> /* capable() */ +#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ +#include <linux/nilfs2_fs.h> +#include "nilfs.h" +#include "segment.h" +#include "bmap.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" + + +static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs, + struct nilfs_argv *argv, int dir, + ssize_t (*dofunc)(struct the_nilfs *, + __u64 *, int, + void *, size_t, size_t)) +{ + void *buf; + void __user *base = (void __user *)(unsigned long)argv->v_base; + size_t maxmembs, total, n; + ssize_t nr; + int ret, i; + __u64 pos, ppos; + + if (argv->v_nmembs == 0) + return 0; + + if (argv->v_size > PAGE_SIZE) + return -EINVAL; + + buf = (void *)__get_free_pages(GFP_NOFS, 0); + if (unlikely(!buf)) + return -ENOMEM; + maxmembs = PAGE_SIZE / argv->v_size; + + ret = 0; + total = 0; + pos = argv->v_index; + for (i = 0; i < argv->v_nmembs; i += n) { + n = (argv->v_nmembs - i < maxmembs) ? 
+ argv->v_nmembs - i : maxmembs; + if ((dir & _IOC_WRITE) && + copy_from_user(buf, base + argv->v_size * i, + argv->v_size * n)) { + ret = -EFAULT; + break; + } + ppos = pos; + nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size, + n); + if (nr < 0) { + ret = nr; + break; + } + if ((dir & _IOC_READ) && + copy_to_user(base + argv->v_size * i, buf, + argv->v_size * nr)) { + ret = -EFAULT; + break; + } + total += nr; + if ((size_t)nr < n) + break; + if (pos == ppos) + pos += n; + } + argv->v_nmembs = total; + + free_pages((unsigned long)buf, 0); + return ret; +} + +static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + struct nilfs_cpmode cpmode; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cpmode, argp, sizeof(cpmode))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_change_cpmode( + cpfile, cpmode.cm_cno, cpmode.cm_mode); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static int +nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; + struct nilfs_transaction_info ti; + __u64 cno; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&cno, argp, sizeof(cno))) + return -EFAULT; + + nilfs_transaction_begin(inode->i_sb, &ti, 0); + ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); + if (unlikely(ret < 0)) { + nilfs_transaction_abort(inode->i_sb); + return ret; + } + nilfs_transaction_commit(inode->i_sb); /* never fails */ + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t 
size, size_t nmembs) +{ + return nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf, + nmembs); +} + +static int nilfs_ioctl_get_cpinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_cpinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_cpstat cpstat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &cpstat, sizeof(cpstat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs); +} + +static int nilfs_ioctl_get_suinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_suinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, + unsigned int 
cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_sustat sustat; + int ret; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &sustat, sizeof(sustat))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + return nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs); +} + +static int nilfs_ioctl_get_vinfo(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_vinfo); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static ssize_t +nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + } + return nmembs; +} + +static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; + struct nilfs_argv argv; + int ret; + + if (copy_from_user(&argv, argp, sizeof(argv))) + return -EFAULT; + + 
down_read(&nilfs->ns_segctor_sem); + ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd), + nilfs_ioctl_do_get_bdescs); + up_read(&nilfs->ns_segctor_sem); + if (ret < 0) + return ret; + + if (copy_to_user(argp, &argv, sizeof(argv))) + ret = -EFAULT; + return ret; +} + +static int nilfs_ioctl_move_inode_block(struct inode *inode, + struct nilfs_vdesc *vdesc, + struct list_head *buffers) +{ + struct buffer_head *bh; + int ret; + + if (vdesc->vd_flags == 0) + ret = nilfs_gccache_submit_read_data( + inode, vdesc->vd_offset, vdesc->vd_blocknr, + vdesc->vd_vblocknr, &bh); + else + ret = nilfs_gccache_submit_read_node( + inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh); + + if (unlikely(ret < 0)) { + if (ret == -ENOENT) + printk(KERN_CRIT + "%s: invalid virtual block address (%s): " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + return ret; + } + bh->b_private = vdesc; + list_add_tail(&bh->b_assoc_buffers, buffers); + return 0; +} + +static ssize_t +nilfs_ioctl_do_move_blocks(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct inode *inode; + struct nilfs_vdesc *vdesc; + struct buffer_head *bh, *n; + LIST_HEAD(buffers); + ino_t ino; + __u64 cno; + int i, ret; + + for (i = 0, vdesc = buf; i < nmembs; ) { + ino = vdesc->vd_ino; + cno = vdesc->vd_cno; + inode = nilfs_gc_iget(nilfs, ino, cno); + if (unlikely(inode == NULL)) { + ret = -ENOMEM; + goto failed; + } + do { + ret = nilfs_ioctl_move_inode_block(inode, vdesc, + &buffers); + if (unlikely(ret < 0)) + goto failed; + vdesc++; + } while (++i < nmembs && + vdesc->vd_ino == ino && vdesc->vd_cno == cno); + } + + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + ret = 
nilfs_gccache_wait_and_mark_dirty(bh); + if (unlikely(ret < 0)) { + if (ret == -EEXIST) { + vdesc = bh->b_private; + printk(KERN_CRIT + "%s: conflicting %s buffer: " + "ino=%llu, cno=%llu, offset=%llu, " + "blocknr=%llu, vblocknr=%llu\n", + __func__, + vdesc->vd_flags ? "node" : "data", + (unsigned long long)vdesc->vd_ino, + (unsigned long long)vdesc->vd_cno, + (unsigned long long)vdesc->vd_offset, + (unsigned long long)vdesc->vd_blocknr, + (unsigned long long)vdesc->vd_vblocknr); + } + goto failed; + } + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return nmembs; + + failed: + list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + bh->b_private = NULL; + brelse(bh); + } + return ret; +} + +static inline int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_move_blocks); +} + +static ssize_t +nilfs_ioctl_do_delete_checkpoints(struct the_nilfs *nilfs, __u64 *posp, + int flags, void *buf, size_t size, + size_t nmembs) +{ + struct inode *cpfile = nilfs->ns_cpfile; + struct nilfs_period *periods = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + ret = nilfs_cpfile_delete_checkpoints( + cpfile, periods[i].p_start, periods[i].p_end); + if (ret < 0) + return ret; + } + return nmembs; +} + +static inline int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_delete_checkpoints); +} + +static ssize_t +nilfs_ioctl_do_free_vblocknrs(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + int ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs); + + return (ret < 0) ? 
ret : nmembs; +} + +static inline int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_free_vblocknrs); +} + +static ssize_t +nilfs_ioctl_do_mark_blocks_dirty(struct the_nilfs *nilfs, __u64 *posp, + int flags, void *buf, size_t size, + size_t nmembs) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap; + struct nilfs_bdesc *bdescs = buf; + int ret, i; + + for (i = 0; i < nmembs; i++) { + /* XXX: use macro or inline func to check liveness */ + ret = nilfs_bmap_lookup_at_level(bmap, + bdescs[i].bd_offset, + bdescs[i].bd_level + 1, + &bdescs[i].bd_blocknr); + if (ret < 0) { + if (ret != -ENOENT) + return ret; + bdescs[i].bd_blocknr = 0; + } + if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr) + /* skip dead block */ + continue; + if (bdescs[i].bd_level == 0) { + ret = nilfs_mdt_mark_block_dirty(dat, + bdescs[i].bd_offset); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } else { + ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, + bdescs[i].bd_level); + if (ret < 0) { + WARN_ON(ret == -ENOENT); + return ret; + } + } + } + return nmembs; +} + +static inline int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_mark_blocks_dirty); +} + +static ssize_t +nilfs_ioctl_do_free_segments(struct the_nilfs *nilfs, __u64 *posp, int flags, + void *buf, size_t size, size_t nmembs) +{ + struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs); + int ret; + + if (unlikely(!sbi)) + return -EROFS; + ret = nilfs_segctor_add_segments_to_be_freed( + NILFS_SC(sbi), buf, nmembs); + nilfs_put_writer(nilfs); + + return (ret < 0) ? 
ret : nmembs; +} + +static inline int nilfs_ioctl_free_segments(struct the_nilfs *nilfs, + struct nilfs_argv *argv, + int dir) +{ + return nilfs_ioctl_wrap_copy(nilfs, argv, dir, + nilfs_ioctl_do_free_segments); +} + +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, + void __user *argp) +{ + struct nilfs_argv argv[5]; + const char *msg; + int dir, ret; + + if (copy_from_user(argv, argp, sizeof(argv))) + return -EFAULT; + + dir = _IOC_WRITE; + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], dir); + if (ret < 0) { + msg = "cannot read source blocks"; + goto failed; + } + ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], dir); + if (ret < 0) { + /* + * can safely abort because checkpoints can be removed + * independently. + */ + msg = "cannot delete checkpoints"; + goto failed; + } + ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], dir); + if (ret < 0) { + /* + * can safely abort because DAT file is updated atomically + * using a copy-on-write technique. + */ + msg = "cannot delete virtual blocks from DAT file"; + goto failed; + } + ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], dir); + if (ret < 0) { + /* + * can safely abort because the operation is nondestructive. + */ + msg = "cannot mark copying blocks dirty"; + goto failed; + } + ret = nilfs_ioctl_free_segments(nilfs, &argv[4], dir); + if (ret < 0) { + /* + * can safely abort because this operation is atomic. 
+ */ + msg = "cannot set segments to be freed"; + goto failed; + } + return 0; + + failed: + nilfs_remove_all_gcinode(nilfs); + printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", + msg, ret); + return ret; +} + +static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + return nilfs_clean_segments(inode->i_sb, argp); +} + +static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, + unsigned int cmd, void __user *argp) +{ + __u64 cno; + int ret; + + ret = nilfs_construct_segment(inode->i_sb); + if (ret < 0) + return ret; + + if (argp != NULL) { + cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1; + if (copy_to_user(argp, &cno, sizeof(cno))) + return -EFAULT; + } + return 0; +} + +long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = filp->f_dentry->d_inode; + void __user *argp = (void * __user *)arg; + + switch (cmd) { + case NILFS_IOCTL_CHANGE_CPMODE: + return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); + case NILFS_IOCTL_DELETE_CHECKPOINT: + return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPINFO: + return nilfs_ioctl_get_cpinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_CPSTAT: + return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUINFO: + return nilfs_ioctl_get_suinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_SUSTAT: + return nilfs_ioctl_get_sustat(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_VINFO: + /* XXX: rename to ??? 
*/ + return nilfs_ioctl_get_vinfo(inode, filp, cmd, argp); + case NILFS_IOCTL_GET_BDESCS: + return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); + case NILFS_IOCTL_CLEAN_SEGMENTS: + return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); + case NILFS_IOCTL_SYNC: + return nilfs_ioctl_sync(inode, filp, cmd, argp); + default: + return -ENOTTY; + } +} diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c new file mode 100644 index 00000000000..47dd815433f --- /dev/null +++ b/fs/nilfs2/mdt.c @@ -0,0 +1,563 @@ +/* + * mdt.c - meta data file for NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/mm.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/swap.h> +#include "nilfs.h" +#include "segment.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1) + +#define INIT_UNUSED_INODE_FIELDS + +static int +nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, + struct buffer_head *bh, + void (*init_block)(struct inode *, + struct buffer_head *, void *)) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + void *kaddr; + int ret; + + /* Caller exclude read accesses using page lock */ + + /* set_buffer_new(bh); */ + bh->b_blocknr = 0; + + ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh); + if (unlikely(ret)) + return ret; + + set_buffer_mapped(bh); + + kaddr = kmap_atomic(bh->b_page, KM_USER0); + memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits); + if (init_block) + init_block(inode, bh, kaddr); + flush_dcache_page(bh->b_page); + kunmap_atomic(kaddr, KM_USER0); + + set_buffer_uptodate(bh); + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + return 0; +} + +static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, + struct buffer_head **out_bh, + void (*init_block)(struct inode *, + struct buffer_head *, + void *)) +{ + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; + struct nilfs_sb_info *writer = NULL; + struct super_block *sb = inode->i_sb; + struct nilfs_transaction_info ti; + struct buffer_head *bh; + int err; + + if (!sb) { + writer = nilfs_get_writer(nilfs); + if (!writer) { + err = -EROFS; + goto out; + } + sb = writer->s_super; + } + + nilfs_transaction_begin(sb, &ti, 0); + + err = 
-ENOMEM; + bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0); + if (unlikely(!bh)) + goto failed_unlock; + + err = -EEXIST; + if (buffer_uptodate(bh) || buffer_mapped(bh)) + goto failed_bh; +#if 0 + /* The uptodate flag is not protected by the page lock, but + the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + goto failed_bh; +#endif + + bh->b_bdev = nilfs->ns_bdev; + err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); + if (likely(!err)) { + get_bh(bh); + *out_bh = bh; + } + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + + failed_unlock: + if (likely(!err)) + err = nilfs_transaction_commit(sb); + else + nilfs_transaction_abort(sb); + if (writer) + nilfs_put_writer(nilfs); + out: + return err; +} + +static int +nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, + int mode, struct buffer_head **out_bh) +{ + struct buffer_head *bh; + unsigned long blknum = 0; + int ret = -ENOMEM; + + bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); + if (unlikely(!bh)) + goto failed; + + ret = -EEXIST; /* internal code */ + if (buffer_uptodate(bh)) + goto out; + + if (mode == READA) { + if (!trylock_buffer(bh)) { + ret = -EBUSY; + goto failed_bh; + } + } else /* mode == READ */ + lock_buffer(bh); + + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + goto out; + } + if (!buffer_mapped(bh)) { /* unused buffer */ + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, + &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; + } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = blknum; + set_buffer_mapped(bh); + } + + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(mode, bh); + ret = 0; + out: + get_bh(bh); + *out_bh = bh; + + failed_bh: + unlock_page(bh->b_page); + page_cache_release(bh->b_page); + brelse(bh); + failed: + return ret; +} + +static int nilfs_mdt_read_block(struct inode 
*inode, unsigned long block, + struct buffer_head **out_bh) +{ + struct buffer_head *first_bh, *bh; + unsigned long blkoff; + int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; + int err; + + err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); + if (err == -EEXIST) /* internal code */ + goto out; + + if (unlikely(err)) + goto failed; + + blkoff = block + 1; + for (i = 0; i < nr_ra_blocks; i++, blkoff++) { + err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + if (likely(!err || err == -EEXIST)) + brelse(bh); + else if (err != -EBUSY) + break; /* abort readahead if bmap lookup failed */ + + if (!buffer_locked(first_bh)) + goto out_no_wait; + } + + wait_on_buffer(first_bh); + + out_no_wait: + err = -EIO; + if (!buffer_uptodate(first_bh)) + goto failed_bh; + out: + *out_bh = first_bh; + return 0; + + failed_bh: + brelse(first_bh); + failed: + return err; +} + +/** + * nilfs_mdt_get_block - read or create a buffer on meta data file. + * @inode: inode of the meta data file + * @blkoff: block offset + * @create: create flag + * @init_block: initializer used for newly allocated block + * @out_bh: output of a pointer to the buffer_head + * + * nilfs_mdt_get_block() looks up the specified buffer and tries to create + * a new buffer if @create is not zero. On success, the returned buffer is + * assured to be either existing or formatted using a buffer lock on success. + * @out_bh is substituted only when zero is returned. + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. 
(the caller should call nilfs_error()) + * + * %-EROFS - Read only filesystem (for create mode) + */ +int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **out_bh) +{ + int ret; + + /* Should be rewritten with merging nilfs_mdt_read_block() */ + retry: + ret = nilfs_mdt_read_block(inode, blkoff, out_bh); + if (!create || ret != -ENOENT) + return ret; + + ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block); + if (unlikely(ret == -EEXIST)) { + /* create = 0; */ /* limit read-create loop retries */ + goto retry; + } + return ret; +} + +/** + * nilfs_mdt_delete_block - make a hole on the meta data file. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, zero is returned. + * On error, one of the following negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-EINVAL - bmap is broken. (the caller should call nilfs_error()) + */ +int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + int err; + + err = nilfs_bmap_delete(ii->i_bmap, block); + if (likely(!err)) { + nilfs_mdt_mark_dirty(inode); + nilfs_mdt_forget_block(inode, block); + } + return err; +} + +/** + * nilfs_mdt_forget_block - discard dirty state and try to remove the page + * @inode: inode of the meta data file + * @block: block offset + * + * nilfs_mdt_forget_block() clears a dirty flag of the specified buffer, and + * tries to release the page including the buffer from a page cache. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EBUSY - page has an active buffer. + * + * %-ENOENT - page cache has no page addressed by the offset. 
+ */ +int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) +{ + pgoff_t index = (pgoff_t)block >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + struct page *page; + unsigned long first_block; + int ret = 0; + int still_dirty; + + page = find_lock_page(inode->i_mapping, index); + if (!page) + return -ENOENT; + + wait_on_page_writeback(page); + + first_block = (unsigned long)index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + if (page_has_buffers(page)) { + struct buffer_head *bh; + + bh = nilfs_page_get_nth_block(page, block - first_block); + nilfs_forget_buffer(bh); + } + still_dirty = PageDirty(page); + unlock_page(page); + page_cache_release(page); + + if (still_dirty || + invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) + ret = -EBUSY; + return ret; +} + +/** + * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. + * @inode: inode of the meta data file + * @block: block offset + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - the specified block does not exist (hole block) + * + * %-EINVAL - bmap is broken. 
(the caller should call nilfs_error()) + */ +int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) +{ + struct buffer_head *bh; + int err; + + err = nilfs_mdt_read_block(inode, block, &bh); + if (unlikely(err)) + return err; + nilfs_mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(inode); + brelse(bh); + return 0; +} + +int nilfs_mdt_fetch_dirty(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) { + set_bit(NILFS_I_DIRTY, &ii->i_state); + return 1; + } + return test_bit(NILFS_I_DIRTY, &ii->i_state); +} + +static int +nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = container_of(page->mapping, + struct inode, i_data); + struct super_block *sb = inode->i_sb; + struct nilfs_sb_info *writer = NULL; + int err = 0; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); + + if (page->mapping->assoc_mapping) + return 0; /* Do not request flush for shadow page cache */ + if (!sb) { + writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); + if (!writer) + return -EROFS; + sb = writer->s_super; + } + + if (wbc->sync_mode == WB_SYNC_ALL) + err = nilfs_construct_segment(sb); + else if (wbc->for_reclaim) + nilfs_flush_segment(sb, inode->i_ino); + + if (writer) + nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + return err; +} + + +static struct address_space_operations def_mdt_aops = { + .writepage = nilfs_mdt_write_page, +}; + +static struct inode_operations def_mdt_iops; +static struct file_operations def_mdt_fops; + +/* + * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, + * ifile, or gcinodes. This allows the B-tree code and segment constructor + * to treat them like regular files, and this helps to simplify the + * implementation. 
+ * On the other hand, some of the pseudo inodes have an irregular point: + * They don't have valid inode->i_sb pointer because their lifetimes are + * longer than those of the super block structs; they may continue for + * several consecutive mounts/umounts. This would need discussions. + */ +struct inode * +nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + struct inode *inode = nilfs_alloc_inode(sb); + + if (!inode) + return NULL; + else { + struct address_space * const mapping = &inode->i_data; + struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS); + + if (!mi) { + nilfs_destroy_inode(inode); + return NULL; + } + mi->mi_nilfs = nilfs; + init_rwsem(&mi->mi_sem); + + inode->i_sb = sb; /* sb may be NULL for some meta data files */ + inode->i_blkbits = nilfs->ns_blocksize_bits; + inode->i_flags = 0; + atomic_set(&inode->i_count, 1); + inode->i_nlink = 1; + inode->i_ino = ino; + inode->i_mode = S_IFREG; + inode->i_private = mi; + +#ifdef INIT_UNUSED_INODE_FIELDS + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; +#ifdef CONFIG_QUOTA + memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); +#endif + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_rdev = 0; +#ifdef CONFIG_SECURITY + inode->i_security = NULL; +#endif + inode->dirtied_when = 0; + + INIT_LIST_HEAD(&inode->i_list); + INIT_LIST_HEAD(&inode->i_sb_list); + inode->i_state = 0; +#endif + + spin_lock_init(&inode->i_lock); + mutex_init(&inode->i_mutex); + init_rwsem(&inode->i_alloc_sem); + + mapping->host = NULL; /* instead of inode */ + mapping->flags = 0; + mapping_set_gfp_mask(mapping, gfp_mask); + mapping->assoc_mapping = NULL; + mapping->backing_dev_info = nilfs->ns_bdi; + + inode->i_mapping = mapping; + } + + return inode; +} + +struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, + ino_t ino, gfp_t gfp_mask) +{ + 
struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + + if (!inode) + return NULL; + + inode->i_op = &def_mdt_iops; + inode->i_fop = &def_mdt_fops; + inode->i_mapping->a_ops = &def_mdt_aops; + return inode; +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, + unsigned header_size) +{ + struct nilfs_mdt_info *mi = NILFS_MDT(inode); + + mi->mi_entry_size = entry_size; + mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size; + mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); +} + +void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow) +{ + shadow->i_mapping->assoc_mapping = orig->i_mapping; + NILFS_I(shadow)->i_btnode_cache.assoc_mapping = + &NILFS_I(orig)->i_btnode_cache; +} + +void nilfs_mdt_clear(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + + invalidate_mapping_pages(inode->i_mapping, 0, -1); + truncate_inode_pages(inode->i_mapping, 0); + + nilfs_bmap_clear(ii->i_bmap); + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +void nilfs_mdt_destroy(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); + nilfs_destroy_inode(inode); +} diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h new file mode 100644 index 00000000000..df683e0bca6 --- /dev/null +++ b/fs/nilfs2/mdt.h @@ -0,0 +1,125 @@ +/* + * mdt.h - NILFS meta data file prototype and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_MDT_H +#define _NILFS_MDT_H + +#include <linux/buffer_head.h> +#include <linux/blockgroup_lock.h> +#include "nilfs.h" +#include "page.h" + +/** + * struct nilfs_mdt_info - on-memory private data of meta data files + * @mi_nilfs: back pointer to the_nilfs struct + * @mi_sem: reader/writer semaphore for meta data operations + * @mi_bgl: per-blockgroup locking + * @mi_entry_size: size of an entry + * @mi_first_entry_offset: offset to the first entry + * @mi_entries_per_block: number of entries in a block + * @mi_blocks_per_group: number of blocks in a group + * @mi_blocks_per_desc_block: number of blocks per descriptor block + */ +struct nilfs_mdt_info { + struct the_nilfs *mi_nilfs; + struct rw_semaphore mi_sem; + struct blockgroup_lock *mi_bgl; + unsigned mi_entry_size; + unsigned mi_first_entry_offset; + unsigned long mi_entries_per_block; + unsigned long mi_blocks_per_group; + unsigned long mi_blocks_per_desc_block; +}; + +static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) +{ + return inode->i_private; +} + +static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + return sb ? 
NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs; +} + +/* Default GFP flags using highmem */ +#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) + +int nilfs_mdt_get_block(struct inode *, unsigned long, int, + void (*init_block)(struct inode *, + struct buffer_head *, void *), + struct buffer_head **); +int nilfs_mdt_delete_block(struct inode *, unsigned long); +int nilfs_mdt_forget_block(struct inode *, unsigned long); +int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); +int nilfs_mdt_fetch_dirty(struct inode *); + +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, + gfp_t); +struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, + ino_t, gfp_t); +void nilfs_mdt_destroy(struct inode *); +void nilfs_mdt_clear(struct inode *); +void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); +void nilfs_mdt_set_shadow(struct inode *, struct inode *); + + +#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) + +static inline void nilfs_mdt_mark_dirty(struct inode *inode) +{ + if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) + set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline void nilfs_mdt_clear_dirty(struct inode *inode) +{ + clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state); +} + +static inline __u64 nilfs_mdt_cno(struct inode *inode) +{ + return NILFS_MDT(inode)->mi_nilfs->ns_cno; +} + +#define nilfs_mdt_bgl_lock(inode, bg) \ + (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) + + +static inline int +nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh, + unsigned n) +{ + return nilfs_read_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n)); +} + +static inline void +nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh, + unsigned n) +{ + nilfs_write_inode_common( + inode, (struct nilfs_inode *)(bh->b_data + n), 1); +} + +#endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c 
b/fs/nilfs2/namei.c new file mode 100644 index 00000000000..df70dadb336 --- /dev/null +++ b/fs/nilfs2/namei.c @@ -0,0 +1,474 @@ +/* + * namei.c - NILFS pathname lookup operations. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>, + * Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/pagemap.h> +#include "nilfs.h" + + +static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) +{ + int err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + inode_dec_link_count(inode); + iput(inode); + return err; +} + +/* + * Methods themselves. 
+ */ + +static struct dentry * +nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode; + ino_t ino; + + if (dentry->d_name.len > NILFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ino = nilfs_inode_by_name(dir, dentry); + inode = NULL; + if (ino) { + inode = nilfs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return d_splice_alias(inode, dentry); +} + +struct dentry *nilfs_get_parent(struct dentry *child) +{ + unsigned long ino; + struct inode *inode; + struct dentry dotdot; + + dotdot.d_name.name = ".."; + dotdot.d_name.len = 2; + + ino = nilfs_inode_by_name(child->d_inode, &dotdot); + if (!ino) + return ERR_PTR(-ENOENT); + + inode = nilfs_iget(child->d_inode->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_obtain_alias(inode); +} + +/* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ +static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &nilfs_file_inode_operations; + inode->i_fop = &nilfs_file_operations; + inode->i_mapping->a_ops = &nilfs_aops; + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int +nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + inode = nilfs_new_inode(dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, inode->i_mode, rdev); + mark_inode_dirty(inode); + err = nilfs_add_nondir(dentry, inode); + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct nilfs_transaction_info ti; + struct super_block *sb = dir->i_sb; + unsigned l = strlen(symname)+1; + struct inode *inode; + int err; + + if (l > sb->s_blocksize) + return -ENAMETOOLONG; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out; + + /* slow symlink */ + inode->i_op = &nilfs_symlink_inode_operations; + inode->i_mapping->a_ops = &nilfs_aops; + err = page_symlink(inode, symname, l); + if (err) + goto out_fail; + + /* mark_inode_dirty(inode); */ + /* 
nilfs_new_inode() and page_symlink() do this */ + + err = nilfs_add_nondir(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + iput(inode); + goto out; +} + +static int nilfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + if (inode->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode_inc_link_count(inode); + atomic_inc(&inode->i_count); + + err = nilfs_add_nondir(dentry, inode); + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode; + struct nilfs_transaction_info ti; + int err; + + if (dir->i_nlink >= NILFS_LINK_MAX) + return -EMLINK; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 1); + if (err) + return err; + + inode_inc_link_count(dir); + + inode = nilfs_new_inode(dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_dir; + + inode->i_op = &nilfs_dir_inode_operations; + inode->i_fop = &nilfs_dir_operations; + inode->i_mapping->a_ops = &nilfs_aops; + + inode_inc_link_count(inode); + + err = nilfs_make_empty(inode, dir); + if (err) + goto out_fail; + + err = nilfs_add_link(dentry, inode); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; + +out_fail: + inode_dec_link_count(inode); + inode_dec_link_count(inode); + iput(inode); +out_dir: + inode_dec_link_count(dir); + goto out; +} + +static int nilfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct inode 
*inode; + struct nilfs_dir_entry *de; + struct page *page; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOENT; + de = nilfs_find_entry(dir, dentry, &page); + if (!de) + goto out; + + inode = dentry->d_inode; + err = -EIO; + if (le64_to_cpu(de->inode) != inode->i_ino) + goto out; + + if (!inode->i_nlink) { + nilfs_warning(inode->i_sb, __func__, + "deleting nonexistent file (%lu), %d\n", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + err = nilfs_delete_entry(de, page); + if (err) + goto out; + + inode->i_ctime = dir->i_ctime; + inode_dec_link_count(inode); + err = 0; +out: + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(dir->i_sb, &ti, 0); + if (err) + return err; + + err = -ENOTEMPTY; + if (nilfs_empty_dir(inode)) { + err = nilfs_unlink(dir, dentry); + if (!err) { + inode->i_size = 0; + inode_dec_link_count(inode); + inode_dec_link_count(dir); + } + } + if (!err) + err = nilfs_transaction_commit(dir->i_sb); + else + nilfs_transaction_abort(dir->i_sb); + + return err; +} + +static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct page *dir_page = NULL; + struct nilfs_dir_entry *dir_de = NULL; + struct page *old_page; + struct nilfs_dir_entry *old_de; + struct nilfs_transaction_info ti; + int err; + + err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); + if (unlikely(err)) + return err; + + err = -ENOENT; + old_de = nilfs_find_entry(old_dir, old_dentry, &old_page); + if (!old_de) + goto out; + + if 
(S_ISDIR(old_inode->i_mode)) { + err = -EIO; + dir_de = nilfs_dotdot(old_inode, &dir_page); + if (!dir_de) + goto out_old; + } + + if (new_inode) { + struct page *new_page; + struct nilfs_dir_entry *new_de; + + err = -ENOTEMPTY; + if (dir_de && !nilfs_empty_dir(new_inode)) + goto out_dir; + + err = -ENOENT; + new_de = nilfs_find_entry(new_dir, new_dentry, &new_page); + if (!new_de) + goto out_dir; + inode_inc_link_count(old_inode); + nilfs_set_link(new_dir, new_de, new_page, old_inode); + new_inode->i_ctime = CURRENT_TIME; + if (dir_de) + drop_nlink(new_inode); + inode_dec_link_count(new_inode); + } else { + if (dir_de) { + err = -EMLINK; + if (new_dir->i_nlink >= NILFS_LINK_MAX) + goto out_dir; + } + inode_inc_link_count(old_inode); + err = nilfs_add_link(new_dentry, old_inode); + if (err) { + inode_dec_link_count(old_inode); + goto out_dir; + } + if (dir_de) + inode_inc_link_count(new_dir); + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + * inode_dec_link_count() will mark the inode dirty. 
+ */ + old_inode->i_ctime = CURRENT_TIME; + + nilfs_delete_entry(old_de, old_page); + inode_dec_link_count(old_inode); + + if (dir_de) { + nilfs_set_link(old_inode, dir_de, dir_page, new_dir); + inode_dec_link_count(old_dir); + } + + err = nilfs_transaction_commit(old_dir->i_sb); + return err; + +out_dir: + if (dir_de) { + kunmap(dir_page); + page_cache_release(dir_page); + } +out_old: + kunmap(old_page); + page_cache_release(old_page); +out: + nilfs_transaction_abort(old_dir->i_sb); + return err; +} + +struct inode_operations nilfs_dir_inode_operations = { + .create = nilfs_create, + .lookup = nilfs_lookup, + .link = nilfs_link, + .unlink = nilfs_unlink, + .symlink = nilfs_symlink, + .mkdir = nilfs_mkdir, + .rmdir = nilfs_rmdir, + .mknod = nilfs_mknod, + .rename = nilfs_rename, + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_special_inode_operations = { + .setattr = nilfs_setattr, + .permission = nilfs_permission, +}; + +struct inode_operations nilfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, +}; diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h new file mode 100644 index 00000000000..3d0c18a16db --- /dev/null +++ b/fs/nilfs2/nilfs.h @@ -0,0 +1,313 @@ +/* + * nilfs.h - NILFS local header file. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net> + * Ryusuke Konishi <ryusuke@osrg.net> + */ + +#ifndef _NILFS_H +#define _NILFS_H + +#include <linux/kernel.h> +#include <linux/buffer_head.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/nilfs2_fs.h> +#include "the_nilfs.h" +#include "sb.h" +#include "bmap.h" +#include "bmap_union.h" + +/* + * nilfs inode data in memory + */ +struct nilfs_inode_info { + __u32 i_flags; + unsigned long i_state; /* Dynamic state flags */ + struct nilfs_bmap *i_bmap; + union nilfs_bmap_union i_bmap_union; + __u64 i_xattr; /* sector_t ??? */ + __u32 i_dir_start_lookup; + __u64 i_cno; /* check point number for GC inode */ + struct address_space i_btnode_cache; + struct list_head i_dirty; /* List for connecting dirty files */ + +#ifdef CONFIG_NILFS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. 
+ */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_NILFS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + struct buffer_head *i_bh; /* i_bh contains a new or dirty + disk inode */ + struct inode vfs_inode; +}; + +static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) +{ + return container_of(inode, struct nilfs_inode_info, vfs_inode); +} + +static inline struct nilfs_inode_info * +NILFS_BMAP_I(const struct nilfs_bmap *bmap) +{ + return container_of((union nilfs_bmap_union *)bmap, + struct nilfs_inode_info, + i_bmap_union); +} + +static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) +{ + struct nilfs_inode_info *ii = + container_of(btnc, struct nilfs_inode_info, i_btnode_cache); + return &ii->vfs_inode; +} + +static inline struct inode *NILFS_AS_I(struct address_space *mapping) +{ + return (mapping->host) ? : + container_of(mapping, struct inode, i_data); +} + +/* + * Dynamic state flags of NILFS on-memory inode (i_state) + */ +enum { + NILFS_I_NEW = 0, /* Inode is newly created */ + NILFS_I_DIRTY, /* The file is dirty */ + NILFS_I_QUEUED, /* inode is in dirty_files list */ + NILFS_I_BUSY, /* inode is grabbed by a segment + constructor */ + NILFS_I_COLLECTED, /* All dirty blocks are collected */ + NILFS_I_UPDATED, /* The file has been written back */ + NILFS_I_INODE_DIRTY, /* write_inode is requested */ + NILFS_I_BMAP, /* has bmap and btnode_cache */ + NILFS_I_GCINODE, /* inode for GC, on memory only */ + NILFS_I_GCDAT, /* shadow DAT, on memory only */ +}; + +/* + * Macros to check inode numbers + */ +#define NILFS_MDT_INO_BITS \ + ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ + 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ + 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) + +#define NILFS_SYS_INO_BITS \ + ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) + +#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino) + +#define NILFS_MDT_INODE(sb, ino) \ + 
((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) +#define NILFS_VALID_INODE(sb, ino) \ + ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) + +/** + * struct nilfs_transaction_info: context information for synchronization + * @ti_magic: Magic number + * @ti_save: Backup of journal_info field of task_struct + * @ti_flags: Flags + * @ti_count: Nest level + * @ti_garbage: List of inodes to be put when releasing semaphore + */ +struct nilfs_transaction_info { + u32 ti_magic; + void *ti_save; + /* This should never be used. If this happens, + one of the other filesystems has a bug. */ + unsigned short ti_flags; + unsigned short ti_count; + struct list_head ti_garbage; +}; + +/* ti_magic */ +#define NILFS_TI_MAGIC 0xd9e392fb + +/* ti_flags */ +#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ +#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the + end of transaction. */ +#define NILFS_TI_GC 0x0004 /* GC context */ +#define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ +#define NILFS_TI_WRITER 0x0010 /* Constructor context */ + + +int nilfs_transaction_begin(struct super_block *, + struct nilfs_transaction_info *, int); +int nilfs_transaction_commit(struct super_block *); +void nilfs_transaction_abort(struct super_block *); + +static inline void nilfs_set_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= flag; +} + +static inline int nilfs_test_transaction_flag(unsigned int flag) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC) + return 0; + return !!(ti->ti_flags & flag); +} + +static inline int nilfs_doing_gc(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_GC); +} + +static inline int nilfs_doing_construction(void) +{ + return nilfs_test_transaction_flag(NILFS_TI_WRITER); +} + +static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs) +{ + return 
nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat; +} + +/* + * function prototype + */ +#ifdef CONFIG_NILFS_POSIX_ACL +#error "NILFS: not yet supported POSIX ACL" +extern int nilfs_permission(struct inode *, int, struct nameidata *); +extern int nilfs_acl_chmod(struct inode *); +extern int nilfs_init_acl(struct inode *, struct inode *); +#else +#define nilfs_permission NULL + +static inline int nilfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int nilfs_init_acl(struct inode *inode, struct inode *dir) +{ + inode->i_mode &= ~current_umask(); + return 0; +} +#endif + +#define NILFS_ATIME_DISABLE + +/* dir.c */ +extern int nilfs_add_link(struct dentry *, struct inode *); +extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *); +extern int nilfs_make_empty(struct inode *, struct inode *); +extern struct nilfs_dir_entry * +nilfs_find_entry(struct inode *, struct dentry *, struct page **); +extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *); +extern int nilfs_empty_dir(struct inode *); +extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **); +extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *, + struct page *, struct inode *); + +/* file.c */ +extern int nilfs_sync_file(struct file *, struct dentry *, int); + +/* ioctl.c */ +long nilfs_ioctl(struct file *, unsigned int, unsigned long); +int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *); + +/* inode.c */ +extern struct inode *nilfs_new_inode(struct inode *, int); +extern void nilfs_free_inode(struct inode *); +extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern void nilfs_set_inode_flags(struct inode *); +extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *); +extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int); +extern struct inode *nilfs_iget(struct super_block *, unsigned long); +extern void nilfs_update_inode(struct 
inode *, struct buffer_head *); +extern void nilfs_truncate(struct inode *); +extern void nilfs_delete_inode(struct inode *); +extern int nilfs_setattr(struct dentry *, struct iattr *); +extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *, + struct buffer_head **); +extern int nilfs_inode_dirty(struct inode *); +extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *, + unsigned); +extern int nilfs_mark_inode_dirty(struct inode *); +extern void nilfs_dirty_inode(struct inode *); + +/* namei.c */ +extern struct dentry *nilfs_get_parent(struct dentry *); + +/* super.c */ +extern struct inode *nilfs_alloc_inode(struct super_block *); +extern void nilfs_destroy_inode(struct inode *); +extern void nilfs_error(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void nilfs_warning(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern struct nilfs_super_block * +nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); +extern int nilfs_store_magic_and_option(struct super_block *, + struct nilfs_super_block *, char *); +extern int nilfs_commit_super(struct nilfs_sb_info *, int); +extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); +extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); + +/* gcinode.c */ +int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64, + struct buffer_head **); +int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); +int nilfs_init_gccache(struct the_nilfs *); +void nilfs_destroy_gccache(struct the_nilfs *); +void nilfs_clear_gcinode(struct inode *); +struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64); +void nilfs_remove_all_gcinode(struct the_nilfs *); + +/* gcdat.c */ +int nilfs_init_gcdat_inode(struct the_nilfs *); +void 
nilfs_commit_gcdat_inode(struct the_nilfs *); +void nilfs_clear_gcdat_inode(struct the_nilfs *); + +/* + * Inodes and files operations + */ +extern struct file_operations nilfs_dir_operations; +extern struct inode_operations nilfs_file_inode_operations; +extern struct file_operations nilfs_file_operations; +extern struct address_space_operations nilfs_aops; +extern struct inode_operations nilfs_dir_inode_operations; +extern struct inode_operations nilfs_special_inode_operations; +extern struct inode_operations nilfs_symlink_inode_operations; + +/* + * filesystem type + */ +extern struct file_system_type nilfs_fs_type; + + +#endif /* _NILFS_H */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c new file mode 100644 index 00000000000..1bfbba9c0e9 --- /dev/null +++ b/fs/nilfs2/page.c @@ -0,0 +1,540 @@ +/* + * page.c - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. 
+ */ + +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/bitops.h> +#include <linux/page-flags.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "page.h" +#include "mdt.h" + + +#define NILFS_BUFFER_INHERENT_BITS \ + ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated)) + +static struct buffer_head * +__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, + int blkbits, unsigned long b_state) + +{ + unsigned long first_block; + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << blkbits, b_state); + + first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits); + bh = nilfs_page_get_nth_block(page, block - first_block); + + touch_buffer(bh); + wait_on_buffer(bh); + return bh; +} + +/* + * Since the page cache of B-tree node pages or data page cache of pseudo + * inodes does not have a valid mapping->host pointer, calling + * mark_buffer_dirty() for their buffers causes a NULL pointer dereference; + * it calls __mark_inode_dirty(NULL) through __set_page_dirty(). + * To avoid this problem, the old style mark_buffer_dirty() is used instead. 
+ */ +void nilfs_mark_buffer_dirty(struct buffer_head *bh) +{ + if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) + __set_page_dirty_nobuffers(bh->b_page); +} + +struct buffer_head *nilfs_grab_buffer(struct inode *inode, + struct address_space *mapping, + unsigned long blkoff, + unsigned long b_state) +{ + int blkbits = inode->i_blkbits; + pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits); + struct page *page, *opage; + struct buffer_head *bh, *obh; + + page = grab_cache_page(mapping, index); + if (unlikely(!page)) + return NULL; + + bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + if (unlikely(!bh)) { + unlock_page(page); + page_cache_release(page); + return NULL; + } + if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) { + /* + * Shadow page cache uses assoc_mapping to point to its original + * page cache. The following code tries the original cache + * if the given cache is a shadow and it didn't hit. + */ + opage = find_lock_page(mapping->assoc_mapping, index); + if (!opage) + return bh; + + obh = __nilfs_get_page_block(opage, blkoff, index, blkbits, + b_state); + if (buffer_uptodate(obh)) { + nilfs_copy_buffer(bh, obh); + if (buffer_dirty(obh)) { + nilfs_mark_buffer_dirty(bh); + if (!buffer_nilfs_node(bh) && NILFS_MDT(inode)) + nilfs_mdt_mark_dirty(inode); + } + } + brelse(obh); + unlock_page(opage); + page_cache_release(opage); + } + return bh; +} + +/** + * nilfs_forget_buffer - discard dirty state + * @bh: buffer head of the buffer to be discarded + */ +void nilfs_forget_buffer(struct buffer_head *bh) +{ + struct page *page = bh->b_page; + + lock_buffer(bh); + clear_buffer_nilfs_volatile(bh); + if (test_clear_buffer_dirty(bh) && nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + bh->b_blocknr = -1; + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + unlock_buffer(bh); + brelse(bh); +} + +/** + * 
nilfs_copy_buffer -- copy buffer data and flags + * @dbh: destination buffer + * @sbh: source buffer + */ +void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh) +{ + void *kaddr0, *kaddr1; + unsigned long bits; + struct page *spage = sbh->b_page, *dpage = dbh->b_page; + struct buffer_head *bh; + + kaddr0 = kmap_atomic(spage, KM_USER0); + kaddr1 = kmap_atomic(dpage, KM_USER1); + memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size); + kunmap_atomic(kaddr1, KM_USER1); + kunmap_atomic(kaddr0, KM_USER0); + + dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + + bh = dbh; + bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped)); + while ((bh = bh->b_this_page) != dbh) { + lock_buffer(bh); + bits &= bh->b_state; + unlock_buffer(bh); + } + if (bits & (1UL << BH_Uptodate)) + SetPageUptodate(dpage); + else + ClearPageUptodate(dpage); + if (bits & (1UL << BH_Mapped)) + SetPageMappedToDisk(dpage); + else + ClearPageMappedToDisk(dpage); +} + +/** + * nilfs_page_buffers_clean - check if a page has dirty buffers or not. + * @page: page to be checked + * + * nilfs_page_buffers_clean() returns zero if the page has dirty buffers. + * Otherwise, it returns non-zero value. 
+ */ +int nilfs_page_buffers_clean(struct page *page) +{ + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + if (buffer_dirty(bh)) + return 0; + bh = bh->b_this_page; + } while (bh != head); + return 1; +} + +void nilfs_page_bug(struct page *page) +{ + struct address_space *m; + unsigned long ino = 0; + + if (unlikely(!page)) { + printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); + return; + } + + m = page->mapping; + if (m) { + struct inode *inode = NILFS_AS_I(m); + if (inode != NULL) + ino = inode->i_ino; + } + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " + "mapping=%p ino=%lu\n", + page, atomic_read(&page->_count), + (unsigned long long)page->index, page->flags, m, ino); + + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + int i = 0; + + bh = head = page_buffers(page); + do { + printk(KERN_CRIT + " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n", + i++, bh, atomic_read(&bh->b_count), + (unsigned long long)bh->b_blocknr, bh->b_state); + bh = bh->b_this_page; + } while (bh != head); + } +} + +/** + * nilfs_alloc_private_page - allocate a private page with buffer heads + * + * Return Value: On success, a pointer to the allocated page is returned. + * On error, NULL is returned. 
+ */ +struct page *nilfs_alloc_private_page(struct block_device *bdev, int size, + unsigned long state) +{ + struct buffer_head *bh, *head, *tail; + struct page *page; + + page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */ + if (unlikely(!page)) + return NULL; + + lock_page(page); + head = alloc_page_buffers(page, size, 0); + if (unlikely(!head)) { + unlock_page(page); + __free_page(page); + return NULL; + } + + bh = head; + do { + bh->b_state = (1UL << BH_NILFS_Allocated) | state; + tail = bh; + bh->b_bdev = bdev; + bh = bh->b_this_page; + } while (bh); + + tail->b_this_page = head; + attach_page_buffers(page, head); + + return page; +} + +void nilfs_free_private_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + BUG_ON(page->mapping); + + if (page_has_buffers(page) && !try_to_free_buffers(page)) + NILFS_PAGE_BUG(page, "failed to free page"); + + unlock_page(page); + __free_page(page); +} + +/** + * nilfs_copy_page -- copy the page with buffers + * @dst: destination page + * @src: source page + * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * + * This function is for both data pages and btnode pages. The dirty flag + * should be handled by the caller. The page must not be under i/o. 
+ * Both src and dst page must be locked + */ +static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +{ + struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + unsigned long mask = NILFS_BUFFER_INHERENT_BITS; + + BUG_ON(PageWriteback(dst)); + + sbh = sbufs = page_buffers(src); + if (!page_has_buffers(dst)) + create_empty_buffers(dst, sbh->b_size, 0); + + if (copy_dirty) + mask |= (1UL << BH_Dirty); + + dbh = dbufs = page_buffers(dst); + do { + lock_buffer(sbh); + lock_buffer(dbh); + dbh->b_state = sbh->b_state & mask; + dbh->b_blocknr = sbh->b_blocknr; + dbh->b_bdev = sbh->b_bdev; + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); + + copy_highpage(dst, src); + + if (PageUptodate(src) && !PageUptodate(dst)) + SetPageUptodate(dst); + else if (!PageUptodate(src) && PageUptodate(dst)) + ClearPageUptodate(dst); + if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) + SetPageMappedToDisk(dst); + else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) + ClearPageMappedToDisk(dst); + + do { + unlock_buffer(sbh); + unlock_buffer(dbh); + sbh = sbh->b_this_page; + dbh = dbh->b_this_page; + } while (dbh != dbufs); +} + +int nilfs_copy_dirty_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i; + pgoff_t index = 0; + int err = 0; + + pagevec_init(&pvec, 0); +repeat: + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) + return 0; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + + lock_page(page); + if (unlikely(!PageDirty(page))) + NILFS_PAGE_BUG(page, "inconsistent dirty state"); + + dpage = grab_cache_page(dmap, page->index); + if (unlikely(!dpage)) { + /* No empty page is added to the page cache */ + err = -ENOMEM; + unlock_page(page); + break; + } + if (unlikely(!page_has_buffers(page))) + NILFS_PAGE_BUG(page, + "found empty page in dat page cache"); + + nilfs_copy_page(dpage, page, 1); + 
__set_page_dirty_nobuffers(dpage); + + unlock_page(dpage); + page_cache_release(dpage); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + if (likely(!err)) + goto repeat; + return err; +} + +/** + * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache + * @dmap: destination page cache + * @smap: source page cache + * + * No pages must be added to the cache during this process. + * This must be ensured by the caller. + */ +void nilfs_copy_back_pages(struct address_space *dmap, + struct address_space *smap) +{ + struct pagevec pvec; + unsigned int i, n; + pgoff_t index = 0; + int err; + + pagevec_init(&pvec, 0); +repeat: + n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + if (!n) + return; + index = pvec.pages[n - 1]->index + 1; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i], *dpage; + pgoff_t offset = page->index; + + lock_page(page); + dpage = find_lock_page(dmap, offset); + if (dpage) { + /* override existing page on the destination cache */ + WARN_ON(PageDirty(dpage)); + nilfs_copy_page(dpage, page, 0); + unlock_page(dpage); + page_cache_release(dpage); + } else { + struct page *page2; + + /* move the page to the destination cache */ + spin_lock_irq(&smap->tree_lock); + page2 = radix_tree_delete(&smap->page_tree, offset); + WARN_ON(page2 != page); + + smap->nrpages--; + spin_unlock_irq(&smap->tree_lock); + + spin_lock_irq(&dmap->tree_lock); + err = radix_tree_insert(&dmap->page_tree, offset, page); + if (unlikely(err < 0)) { + WARN_ON(err == -EEXIST); + page->mapping = NULL; + page_cache_release(page); /* for cache */ + } else { + page->mapping = dmap; + dmap->nrpages++; + if (PageDirty(page)) + radix_tree_tag_set(&dmap->page_tree, + offset, + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&dmap->tree_lock); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + + goto repeat; +} + +void nilfs_clear_dirty_pages(struct address_space *mapping) +{ + struct 
pagevec pvec; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + bh = head = page_buffers(page); + do { + lock_buffer(bh); + clear_buffer_dirty(bh); + clear_buffer_nilfs_volatile(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + unlock_buffer(bh); + bh = bh->b_this_page; + } while (bh != head); + + __nilfs_clear_page_dirty(page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +unsigned nilfs_page_count_clean_buffers(struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + struct buffer_head *bh, *head; + unsigned nc = 0; + + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + bh->b_size; + if (block_end > from && block_start < to && !buffer_dirty(bh)) + nc++; + } + return nc; +} + +/* + * NILFS2 needs clear_page_dirty() in the following two cases: + * + * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears + * page dirty flags when it copies back pages from the shadow cache + * (gcdat->{i_mapping,i_btnode_cache}) to its original cache + * (dat->{i_mapping,i_btnode_cache}). + * + * 2) Some B-tree operations like insertion or deletion may dispose buffers + * in dirty state, and this needs to cancel the dirty state of their pages. 
+ */ +int __nilfs_clear_page_dirty(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping) { + spin_lock_irq(&mapping->tree_lock); + if (test_bit(PG_dirty, &page->flags)) { + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&mapping->tree_lock); + return clear_page_dirty_for_io(page); + } + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + return TestClearPageDirty(page); +} diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h new file mode 100644 index 00000000000..8abca4d1c1f --- /dev/null +++ b/fs/nilfs2/page.h @@ -0,0 +1,76 @@ +/* + * page.h - buffer/page management specific to NILFS + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net>, + * Seiji Kihara <kihara@osrg.net>. 
+ */ + +#ifndef _NILFS_PAGE_H +#define _NILFS_PAGE_H + +#include <linux/buffer_head.h> +#include "nilfs.h" + +/* + * Extended buffer state bits + */ +enum { + BH_NILFS_Allocated = BH_PrivateStart, + BH_NILFS_Node, + BH_NILFS_Volatile, +}; + +BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ +BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ +BUFFER_FNS(NILFS_Volatile, nilfs_volatile) + + +void nilfs_mark_buffer_dirty(struct buffer_head *bh); +int __nilfs_clear_page_dirty(struct page *); + +struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, + unsigned long, unsigned long); +void nilfs_forget_buffer(struct buffer_head *); +void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); +int nilfs_page_buffers_clean(struct page *); +void nilfs_page_bug(struct page *); +struct page *nilfs_alloc_private_page(struct block_device *, int, + unsigned long); +void nilfs_free_private_page(struct page *); + +int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); +void nilfs_copy_back_pages(struct address_space *, struct address_space *); +void nilfs_clear_dirty_pages(struct address_space *); +unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); + +#define NILFS_PAGE_BUG(page, m, a...) \ + do { nilfs_page_bug(page); BUG(); } while (0) + +static inline struct buffer_head * +nilfs_page_get_nth_block(struct page *page, unsigned int count) +{ + struct buffer_head *bh = page_buffers(page); + + while (count-- > 0) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + +#endif /* _NILFS_PAGE_H */ diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c new file mode 100644 index 00000000000..4fc081e47d7 --- /dev/null +++ b/fs/nilfs2/recovery.c @@ -0,0 +1,917 @@ +/* + * recovery.c - NILFS recovery logic + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ + +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "sufile.h" +#include "page.h" +#include "seglist.h" +#include "segbuf.h" + +/* + * Segment check result + */ +enum { + NILFS_SEG_VALID, + NILFS_SEG_NO_SUPER_ROOT, + NILFS_SEG_FAIL_IO, + NILFS_SEG_FAIL_MAGIC, + NILFS_SEG_FAIL_SEQ, + NILFS_SEG_FAIL_CHECKSUM_SEGSUM, + NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT, + NILFS_SEG_FAIL_CHECKSUM_FULL, + NILFS_SEG_FAIL_CONSISTENCY, +}; + +/* work structure for recovery */ +struct nilfs_recovery_block { + ino_t ino; /* Inode number of the file that this block + belongs to */ + sector_t blocknr; /* block number */ + __u64 vblocknr; /* virtual block number */ + unsigned long blkoff; /* File offset of the data block (per block) */ + struct list_head list; +}; + + +static int nilfs_warn_segment_error(int err) +{ + switch (err) { + case NILFS_SEG_FAIL_IO: + printk(KERN_WARNING + "NILFS warning: I/O error on loading last segment\n"); + return -EIO; + case NILFS_SEG_FAIL_MAGIC: + printk(KERN_WARNING + "NILFS warning: Segment magic number invalid\n"); + break; + case NILFS_SEG_FAIL_SEQ: + printk(KERN_WARNING + 
"NILFS warning: Sequence number mismatch\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SEGSUM: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment summary\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: Checksum error in super root\n"); + break; + case NILFS_SEG_FAIL_CHECKSUM_FULL: + printk(KERN_WARNING + "NILFS warning: Checksum error in segment payload\n"); + break; + case NILFS_SEG_FAIL_CONSISTENCY: + printk(KERN_WARNING + "NILFS warning: Inconsistent segment\n"); + break; + case NILFS_SEG_NO_SUPER_ROOT: + printk(KERN_WARNING + "NILFS warning: No super root in the last segment\n"); + break; + } + return -EINVAL; +} + +static void store_segsum_info(struct nilfs_segsum_info *ssi, + struct nilfs_segment_summary *sum, + unsigned int blocksize) +{ + ssi->flags = le16_to_cpu(sum->ss_flags); + ssi->seg_seq = le64_to_cpu(sum->ss_seq); + ssi->ctime = le64_to_cpu(sum->ss_create); + ssi->next = le64_to_cpu(sum->ss_next); + ssi->nblocks = le32_to_cpu(sum->ss_nblocks); + ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo); + ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes); + + ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); + ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); +} + +/** + * calc_crc_cont - check CRC of blocks continuously + * @sbi: nilfs_sb_info + * @bhs: buffer head of start block + * @sum: place to store result + * @offset: offset bytes in the first block + * @check_bytes: number of bytes to be checked + * @start: DBN of start block + * @nblock: number of blocks to be checked + */ +static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, + u32 *sum, unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) +{ + unsigned long blocksize = sbi->s_super->s_blocksize; + unsigned long size; + u32 crc; + + BUG_ON(offset >= blocksize); + check_bytes -= offset; + size = min_t(u64, check_bytes, blocksize - offset); + crc = 
crc32_le(sbi->s_nilfs->ns_crc_seed, + (unsigned char *)bhs->b_data + offset, size); + if (--nblock > 0) { + do { + struct buffer_head *bh + = sb_bread(sbi->s_super, ++start); + if (!bh) + return -EIO; + check_bytes -= size; + size = min_t(u64, check_bytes, blocksize); + crc = crc32_le(crc, bh->b_data, size); + brelse(bh); + } while (--nblock > 0); + } + *sum = crc; + return 0; +} + +/** + * nilfs_read_super_root_block - read super root block + * @sb: super_block + * @sr_block: disk block number of the super root block + * @pbh: address of a buffer_head pointer to return super root buffer + * @check: CRC check flag + */ +int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, + struct buffer_head **pbh, int check) +{ + struct buffer_head *bh_sr; + struct nilfs_super_root *sr; + u32 crc; + int ret; + + *pbh = NULL; + bh_sr = sb_bread(sb, sr_block); + if (unlikely(!bh_sr)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + + sr = (struct nilfs_super_root *)bh_sr->b_data; + if (check) { + unsigned bytes = le16_to_cpu(sr->sr_bytes); + + if (bytes == 0 || bytes > sb->s_blocksize) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc, + sizeof(sr->sr_sum), bytes, sr_block, 1)) { + ret = NILFS_SEG_FAIL_IO; + goto failed_bh; + } + if (crc != le32_to_cpu(sr->sr_sum)) { + ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; + goto failed_bh; + } + } + *pbh = bh_sr; + return 0; + + failed_bh: + brelse(bh_sr); + + failed: + return nilfs_warn_segment_error(ret); +} + +/** + * load_segment_summary - read segment summary of the specified partial segment + * @sbi: nilfs_sb_info + * @pseg_start: start disk block number of partial segment + * @seg_seq: sequence number requested + * @ssi: pointer to nilfs_segsum_info struct to store information + * @full_check: full check flag + * (0: only checks segment summary CRC, 1: data CRC) + */ +static int +load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, + 
u64 seg_seq, struct nilfs_segsum_info *ssi, + int full_check) +{ + struct buffer_head *bh_sum; + struct nilfs_segment_summary *sum; + unsigned long offset, nblock; + u64 check_bytes; + u32 crc, crc_sum; + int ret = NILFS_SEG_FAIL_IO; + + bh_sum = sb_bread(sbi->s_super, pseg_start); + if (!bh_sum) + goto out; + + sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + /* Check consistency of segment summary */ + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) { + ret = NILFS_SEG_FAIL_MAGIC; + goto failed; + } + store_segsum_info(ssi, sum, sbi->s_super->s_blocksize); + if (seg_seq != ssi->seg_seq) { + ret = NILFS_SEG_FAIL_SEQ; + goto failed; + } + if (full_check) { + offset = sizeof(sum->ss_datasum); + check_bytes = + ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits); + nblock = ssi->nblocks; + crc_sum = le32_to_cpu(sum->ss_datasum); + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + } else { /* only checks segment summary */ + offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum); + check_bytes = ssi->sumbytes; + nblock = ssi->nsumblk; + crc_sum = le32_to_cpu(sum->ss_sumsum); + ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM; + } + + if (unlikely(nblock == 0 || + nblock > sbi->s_nilfs->ns_blocks_per_segment)) { + /* This limits the number of blocks read in the CRC check */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes, + pseg_start, nblock)) { + ret = NILFS_SEG_FAIL_IO; + goto failed; + } + if (crc == crc_sum) + ret = 0; + failed: + brelse(bh_sum); + out: + return ret; +} + +static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) +{ + void *ptr; + sector_t blocknr; + + BUG_ON((*pbh)->b_size < *offset); + if (bytes > (*pbh)->b_size - *offset) { + blocknr = (*pbh)->b_blocknr; + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + 1); + if (unlikely(!*pbh)) + return NULL; + *offset = 0; + } + ptr = (*pbh)->b_data + *offset; + *offset += bytes; + return 
ptr; +} + +static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes, + unsigned long count) +{ + unsigned int rest_item_in_current_block + = ((*pbh)->b_size - *offset) / bytes; + + if (count <= rest_item_in_current_block) { + *offset += bytes * count; + } else { + sector_t blocknr = (*pbh)->b_blocknr; + unsigned int nitem_per_block = (*pbh)->b_size / bytes; + unsigned int bcnt; + + count -= rest_item_in_current_block; + bcnt = DIV_ROUND_UP(count, nitem_per_block); + *offset = bytes * (count - (bcnt - 1) * nitem_per_block); + + brelse(*pbh); + *pbh = sb_bread(sb, blocknr + bcnt); + } +} + +static int +collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, + struct nilfs_segsum_info *ssi, + struct list_head *head) +{ + struct buffer_head *bh; + unsigned int offset; + unsigned long nfinfo = ssi->nfinfo; + sector_t blocknr = sum_blocknr + ssi->nsumblk; + ino_t ino; + int err = -EIO; + + if (!nfinfo) + return 0; + + bh = sb_bread(sbi->s_super, sum_blocknr); + if (unlikely(!bh)) + goto out; + + offset = le16_to_cpu( + ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes); + for (;;) { + unsigned long nblocks, ndatablk, nnodeblk; + struct nilfs_finfo *finfo; + + finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo)); + if (unlikely(!finfo)) + goto out; + + ino = le64_to_cpu(finfo->fi_ino); + nblocks = le32_to_cpu(finfo->fi_nblocks); + ndatablk = le32_to_cpu(finfo->fi_ndatablk); + nnodeblk = nblocks - ndatablk; + + while (ndatablk-- > 0) { + struct nilfs_recovery_block *rb; + struct nilfs_binfo_v *binfo; + + binfo = segsum_get(sbi->s_super, &bh, &offset, + sizeof(*binfo)); + if (unlikely(!binfo)) + goto out; + + rb = kmalloc(sizeof(*rb), GFP_NOFS); + if (unlikely(!rb)) { + err = -ENOMEM; + goto out; + } + rb->ino = ino; + rb->blocknr = blocknr++; + rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr); + rb->blkoff = le64_to_cpu(binfo->bi_blkoff); + /* INIT_LIST_HEAD(&rb->list); */ + 
list_add_tail(&rb->list, head); + } + if (--nfinfo == 0) + break; + blocknr += nnodeblk; /* always 0 for the data sync segments */ + segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64), + nnodeblk); + if (unlikely(!bh)) + goto out; + } + err = 0; + out: + brelse(bh); /* brelse(NULL) is just ignored */ + return err; +} + +static void dispose_recovery_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_recovery_block *rb + = list_entry(head->next, + struct nilfs_recovery_block, list); + list_del(&rb->list); + kfree(rb); + } +} + +void nilfs_dispose_segment_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct nilfs_segment_entry *ent + = list_entry(head->next, + struct nilfs_segment_entry, list); + list_del(&ent->list); + nilfs_free_segment_entry(ent); + } +} + +static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, + struct nilfs_recovery_info *ri) +{ + struct list_head *head = &ri->ri_used_segments; + struct nilfs_segment_entry *ent, *n; + struct inode *sufile = nilfs->ns_sufile; + __u64 segnum[4]; + int err; + int i; + + segnum[0] = nilfs->ns_segnum; + segnum[1] = nilfs->ns_nextnum; + segnum[2] = ri->ri_segnum; + segnum[3] = ri->ri_nextnum; + + /* + * Releasing the next segment of the latest super root. + * The next segment is invalidated by this recovery. + */ + err = nilfs_sufile_free(sufile, segnum[1]); + if (unlikely(err)) + goto failed; + + err = -ENOMEM; + for (i = 1; i < 4; i++) { + ent = nilfs_alloc_segment_entry(segnum[i]); + if (unlikely(!ent)) + goto failed; + list_add_tail(&ent->list, head); + } + + /* + * Collecting segments written after the latest super root. + * These are marked dirty to avoid being reallocated in the next write. 
+ */ + list_for_each_entry_safe(ent, n, head, list) { + if (ent->segnum != segnum[0]) { + err = nilfs_sufile_scrap(sufile, ent->segnum); + if (unlikely(err)) + goto failed; + } + list_del(&ent->list); + nilfs_free_segment_entry(ent); + } + + /* Allocate new segments for recovery */ + err = nilfs_sufile_alloc(sufile, &segnum[0]); + if (unlikely(err)) + goto failed; + + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq = ri->ri_seq + 2; + nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0]; + return 0; + + failed: + /* No need to recover sufile because it will be destroyed on error */ + return err; +} + +static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, + struct nilfs_recovery_block *rb, + struct page *page) +{ + struct buffer_head *bh_org; + void *kaddr; + + bh_org = sb_bread(sbi->s_super, rb->blocknr); + if (unlikely(!bh_org)) + return -EIO; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size); + kunmap_atomic(kaddr, KM_USER0); + brelse(bh_org); + return 0; +} + +static int recover_dsync_blocks(struct nilfs_sb_info *sbi, + struct list_head *head, + unsigned long *nr_salvaged_blocks) +{ + struct inode *inode; + struct nilfs_recovery_block *rb, *n; + unsigned blocksize = sbi->s_super->s_blocksize; + struct page *page; + loff_t pos; + int err = 0, err2 = 0; + + list_for_each_entry_safe(rb, n, head, list) { + inode = nilfs_iget(sbi->s_super, rb->ino); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto failed_inode; + } + + pos = rb->blkoff << inode->i_blkbits; + page = NULL; + err = block_write_begin(NULL, inode->i_mapping, pos, blocksize, + 0, &page, NULL, nilfs_get_block); + if (unlikely(err)) + goto failed_inode; + + err = nilfs_recovery_copy_block(sbi, rb, page); + if (unlikely(err)) + goto failed_page; + + err = nilfs_set_file_dirty(sbi, inode, 1); + if (unlikely(err)) + goto failed_page; + + block_write_end(NULL, inode->i_mapping, pos, blocksize, + blocksize, page, NULL); + + 
unlock_page(page); + page_cache_release(page); + + (*nr_salvaged_blocks)++; + goto next; + + failed_page: + unlock_page(page); + page_cache_release(page); + + failed_inode: + printk(KERN_WARNING + "NILFS warning: error recovering data block " + "(err=%d, ino=%lu, block-offset=%llu)\n", + err, rb->ino, (unsigned long long)rb->blkoff); + if (!err2) + err2 = err; + next: + iput(inode); /* iput(NULL) is just ignored */ + list_del_init(&rb->list); + kfree(rb); + } + return err2; +} + +/** + * nilfs_do_roll_forward - salvage logical segments newer than the latest + * checkpoint + * @sbi: nilfs_sb_info + * @nilfs: the_nilfs + * @ri: pointer to a nilfs_recovery_info + */ +static int nilfs_do_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start; + sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ + unsigned long nsalvaged_blocks = 0; + u64 seg_seq; + __u64 segnum, nextnum = 0; + int empty_seg = 0; + int err = 0, ret; + LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */ + enum { + RF_INIT_ST, + RF_DSYNC_ST, /* scanning data-sync segments */ + }; + int state = RF_INIT_ST; + + nilfs_attach_writer(nilfs, sbi); + pseg_start = ri->ri_lsegs_start; + seg_seq = ri->ri_lsegs_start_seq; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) { + err = -EIO; + goto failed; + } + goto strayed; + } + if (unlikely(NILFS_SEG_HAS_SR(&ssi))) + goto confused; + + /* Found a valid partial segment; do recovery actions */ + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + empty_seg = 0; + nilfs->ns_ctime = ssi.ctime; + if (!(ssi.flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = ssi.ctime; + + switch 
(state) { + case RF_INIT_ST: + if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi)) + goto try_next_pseg; + state = RF_DSYNC_ST; + /* Fall through */ + case RF_DSYNC_ST: + if (!NILFS_SEG_DSYNC(&ssi)) + goto confused; + + err = collect_blocks_from_segsum( + sbi, pseg_start, &ssi, &dsync_blocks); + if (unlikely(err)) + goto failed; + if (NILFS_SEG_LOGEND(&ssi)) { + err = recover_dsync_blocks( + sbi, &dsync_blocks, &nsalvaged_blocks); + if (unlikely(err)) + goto failed; + state = RF_INIT_ST; + } + break; /* Fall through to try_next_pseg */ + } + + try_next_pseg: + if (pseg_start == ri->ri_lsegs_end) + break; + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + if (pseg_start == ri->ri_lsegs_end) + break; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + break; + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + if (nsalvaged_blocks) { + printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", + sbi->s_super->s_id, nsalvaged_blocks); + ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; + } + out: + dispose_recovery_list(&dsync_blocks); + nilfs_detach_writer(sbi->s_nilfs, sbi); + return err; + + confused: + err = -EINVAL; + failed: + printk(KERN_ERR + "NILFS (device %s): Error roll-forwarding " + "(err=%d, pseg block=%llu). 
", + sbi->s_super->s_id, err, (unsigned long long)pseg_start); + goto out; +} + +static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct buffer_head *bh; + int err; + + if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) != + nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) + return; + + bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start); + BUG_ON(!bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_dirty(bh); + err = sync_dirty_buffer(bh); + if (unlikely(err)) + printk(KERN_WARNING + "NILFS warning: buffer sync write failed during " + "post-cleaning of recovery.\n"); + brelse(bh); +} + +/** + * nilfs_recover_logical_segments - salvage logical segments written after + * the latest super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. + * + * %-EINVAL - Inconsistent filesystem state. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. 
+ */ +int nilfs_recover_logical_segments(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + int err; + + if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) + return 0; + + err = nilfs_attach_checkpoint(sbi, ri->ri_cno); + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: error loading the latest checkpoint.\n"); + return err; + } + + err = nilfs_do_roll_forward(nilfs, sbi, ri); + if (unlikely(err)) + goto failed; + + if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { + err = nilfs_prepare_segment_for_recovery(nilfs, ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Error preparing segments for " + "recovery.\n"); + goto failed; + } + + err = nilfs_attach_segment_constructor(sbi); + if (unlikely(err)) + goto failed; + + set_nilfs_discontinued(nilfs); + err = nilfs_construct_segment(sbi->s_super); + nilfs_detach_segment_constructor(sbi); + + if (unlikely(err)) { + printk(KERN_ERR "NILFS: Oops! recovery failed. " + "(err=%d)\n", err); + goto failed; + } + + nilfs_finish_roll_forward(nilfs, sbi, ri); + } + + nilfs_detach_checkpoint(sbi); + return 0; + + failed: + nilfs_detach_checkpoint(sbi); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_dat); + return err; +} + +/** + * nilfs_search_super_root - search the latest valid super root + * @nilfs: the_nilfs + * @sbi: nilfs_sb_info + * @ri: pointer to a nilfs_recovery_info struct to store search results. + * + * nilfs_search_super_root() looks for the latest super-root from a partial + * segment pointed by the superblock. It sets up struct the_nilfs through + * this search. It fills nilfs_recovery_info (ri) required for recovery. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. 
+ * + * %-EINVAL - No valid segment found + * + * %-EIO - I/O error + */ +int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) +{ + struct nilfs_segsum_info ssi; + sector_t pseg_start, pseg_end, sr_pseg_start = 0; + sector_t seg_start, seg_end; /* range of full segment (block number) */ + u64 seg_seq; + __u64 segnum, nextnum = 0; + __u64 cno; + struct nilfs_segment_entry *ent; + LIST_HEAD(segments); + int empty_seg = 0, scan_newer = 0; + int ret; + + pseg_start = nilfs->ns_last_pseg; + seg_seq = nilfs->ns_last_seq; + cno = nilfs->ns_last_cno; + segnum = nilfs_get_segnum_of_block(nilfs, pseg_start); + + /* Calculate range of segment */ + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + + for (;;) { + /* Load segment summary */ + ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1); + if (ret) { + if (ret == NILFS_SEG_FAIL_IO) + goto failed; + goto strayed; + } + pseg_end = pseg_start + ssi.nblocks - 1; + if (unlikely(pseg_end > seg_end)) { + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto strayed; + } + + /* A valid partial segment */ + ri->ri_pseg_start = pseg_start; + ri->ri_seq = seg_seq; + ri->ri_segnum = segnum; + nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + ri->ri_nextnum = nextnum; + empty_seg = 0; + + if (!NILFS_SEG_HAS_SR(&ssi)) { + if (!scan_newer) { + /* This will never happen because a superblock + (last_segment) always points to a pseg + having a super root. */ + ret = NILFS_SEG_FAIL_CONSISTENCY; + goto failed; + } + if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { + ri->ri_lsegs_start = pseg_start; + ri->ri_lsegs_start_seq = seg_seq; + } + if (NILFS_SEG_LOGEND(&ssi)) + ri->ri_lsegs_end = pseg_start; + goto try_next_pseg; + } + + /* A valid super root was found. 
*/ + ri->ri_cno = cno++; + ri->ri_super_root = pseg_end; + ri->ri_lsegs_start = ri->ri_lsegs_end = 0; + + nilfs_dispose_segment_list(&segments); + nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start) + + ssi.nblocks - seg_start; + nilfs->ns_seg_seq = seg_seq; + nilfs->ns_segnum = segnum; + nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ + nilfs->ns_ctime = ssi.ctime; + nilfs->ns_nextnum = nextnum; + + if (scan_newer) + ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED; + else { + if (nilfs->ns_mount_state & NILFS_VALID_FS) + goto super_root_found; + scan_newer = 1; + } + + /* reset region for roll-forward */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + try_next_pseg: + /* Standing on a course, or met an inconsistent state */ + pseg_start += ssi.nblocks; + if (pseg_start < seg_end) + continue; + goto feed_segment; + + strayed: + /* Off the trail */ + if (!scan_newer) + /* + * This can happen if a checkpoint was written without + * barriers, or as a result of an I/O failure. + */ + goto failed; + + feed_segment: + /* Looking to the next full segment */ + if (empty_seg++) + goto super_root_found; /* found a valid super root */ + + ent = nilfs_alloc_segment_entry(segnum); + if (unlikely(!ent)) { + ret = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &segments); + + seg_seq++; + segnum = nextnum; + nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); + pseg_start = seg_start; + } + + super_root_found: + /* Updating pointers relating to the latest checkpoint */ + list_splice(&segments, ri->ri_used_segments.prev); + nilfs->ns_last_pseg = sr_pseg_start; + nilfs->ns_last_seq = nilfs->ns_seg_seq; + nilfs->ns_last_cno = ri->ri_cno; + return 0; + + failed: + nilfs_dispose_segment_list(&segments); + return (ret < 0) ? 
ret : nilfs_warn_segment_error(ret); +} diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h new file mode 100644 index 00000000000..adccd4fc654 --- /dev/null +++ b/fs/nilfs2/sb.h @@ -0,0 +1,102 @@ +/* + * sb.h - NILFS on-memory super block structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _NILFS_SB +#define _NILFS_SB + +#include <linux/types.h> +#include <linux/fs.h> + +/* + * Mount options + */ +struct nilfs_mount_options { + unsigned long mount_opt; + __u64 snapshot_cno; +}; + +struct the_nilfs; +struct nilfs_sc_info; + +/* + * NILFS super-block data in memory + */ +struct nilfs_sb_info { + /* Snapshot status */ + __u64 s_snapshot_cno; /* Checkpoint number */ + atomic_t s_inodes_count; + atomic_t s_blocks_count; /* Reserved (might be deleted) */ + + /* Mount options */ + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + + unsigned long s_interval; /* construction interval */ + unsigned long s_watermark; /* threshold of data amount + for the segment construction */ + + /* Fundamental members */ + struct super_block *s_super; /* reverse pointer to super_block */ + struct the_nilfs *s_nilfs; + struct list_head s_list; /* list head for 
nilfs->ns_supers */ + + /* Segment constructor */ + struct list_head s_dirty_files; /* dirty files list */ + struct nilfs_sc_info *s_sc_info; /* segment constructor info */ + spinlock_t s_inode_lock; /* Lock for the nilfs inode. + It covers s_dirty_files list */ + + /* Metadata files */ + struct inode *s_ifile; /* index file inode */ + + /* Inode allocator */ + spinlock_t s_next_gen_lock; + u32 s_next_generation; +}; + +static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi) +{ + return sbi->s_sc_info; +} + +/* + * Bit operations for the mount option + */ +#define nilfs_clear_opt(sbi, opt) \ + do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) +#define nilfs_set_opt(sbi, opt) \ + do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0) +#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt) +#define nilfs_write_opt(sbi, mask, opt) \ + do { (sbi)->s_mount_opt = \ + (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \ + NILFS_MOUNT_##opt); \ + } while (0) + +#endif /* _NILFS_SB */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c new file mode 100644 index 00000000000..1e68821b4a9 --- /dev/null +++ b/fs/nilfs2/segbuf.c @@ -0,0 +1,439 @@ +/* + * segbuf.c - NILFS segment buffer + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/crc32.h> +#include "page.h" +#include "segbuf.h" +#include "seglist.h" + + +static struct kmem_cache *nilfs_segbuf_cachep; + +static void nilfs_segbuf_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_segment_buffer)); +} + +int __init nilfs_init_segbuf_cache(void) +{ + nilfs_segbuf_cachep = + kmem_cache_create("nilfs2_segbuf_cache", + sizeof(struct nilfs_segment_buffer), + 0, SLAB_RECLAIM_ACCOUNT, + nilfs_segbuf_init_once); + + return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0; +} + +void nilfs_destroy_segbuf_cache(void) +{ + kmem_cache_destroy(nilfs_segbuf_cachep); +} + +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) +{ + struct nilfs_segment_buffer *segbuf; + + segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS); + if (unlikely(!segbuf)) + return NULL; + + segbuf->sb_super = sb; + INIT_LIST_HEAD(&segbuf->sb_list); + INIT_LIST_HEAD(&segbuf->sb_segsum_buffers); + INIT_LIST_HEAD(&segbuf->sb_payload_buffers); + return segbuf; +} + +void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf) +{ + kmem_cache_free(nilfs_segbuf_cachep, segbuf); +} + +void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum, + unsigned long offset, struct the_nilfs *nilfs) +{ + segbuf->sb_segnum = segnum; + nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start, + &segbuf->sb_fseg_end); + + segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset; + segbuf->sb_rest_blocks = + segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1; +} + +void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf, + __u64 nextnum, struct the_nilfs *nilfs) +{ + segbuf->sb_nextnum = nextnum; + 
segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum); +} + +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_segsum_buffer(segbuf, bh); + return 0; +} + +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = sb_getblk(segbuf->sb_super, + segbuf->sb_pseg_start + segbuf->sb_sum.nblocks); + if (unlikely(!bh)) + return -ENOMEM; + + nilfs_segbuf_add_payload_buffer(segbuf, bh); + *bhp = bh; + return 0; +} + +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, + time_t ctime) +{ + int err; + + segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0; + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + return err; + + segbuf->sb_sum.flags = flags; + segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary); + segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0; + segbuf->sb_sum.ctime = ctime; + + segbuf->sb_io_error = 0; + return 0; +} + +/* + * Set up the segment summary + */ +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf) +{ + struct nilfs_segment_summary *raw_sum; + struct buffer_head *bh_sum; + + bh_sum = list_entry(segbuf->sb_segsum_buffers.next, + struct buffer_head, b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data; + + raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC); + raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum)); + raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags); + raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq); + raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime); + raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next); + raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks); + raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo); + raw_sum->ss_sumbytes = 
cpu_to_le32(segbuf->sb_sum.sumbytes); + raw_sum->ss_pad = 0; +} + +/* + * CRC calculation routines + */ +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + unsigned long size, bytes = segbuf->sb_sum.sumbytes; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(seed, + (unsigned char *)raw_sum + + sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum), + size - (sizeof(raw_sum->ss_datasum) + + sizeof(raw_sum->ss_sumsum))); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + bytes -= size; + size = min_t(unsigned long, bytes, bh->b_size); + crc = crc32_le(crc, bh->b_data, size); + } + raw_sum->ss_sumsum = cpu_to_le32(crc); +} + +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf, + u32 seed) +{ + struct buffer_head *bh; + struct nilfs_segment_summary *raw_sum; + void *kaddr; + u32 crc; + + bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head, + b_assoc_buffers); + raw_sum = (struct nilfs_segment_summary *)bh->b_data; + crc = crc32_le(seed, + (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum), + bh->b_size - sizeof(raw_sum->ss_datasum)); + + list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers, + b_assoc_buffers) { + crc = crc32_le(crc, bh->b_data, bh->b_size); + } + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + kaddr = kmap_atomic(bh->b_page, KM_USER0); + crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + } + raw_sum->ss_datasum = cpu_to_le32(crc); +} + +void nilfs_release_buffers(struct list_head *list) +{ + struct buffer_head *bh, *n; + + list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { + list_del_init(&bh->b_assoc_buffers); + if 
(buffer_nilfs_allocated(bh)) { + struct page *clone_page = bh->b_page; + + /* remove clone page */ + brelse(bh); + page_cache_release(clone_page); /* for each bh */ + if (page_count(clone_page) <= 2) { + lock_page(clone_page); + nilfs_free_private_page(clone_page); + } + continue; + } + brelse(bh); + } +} + +/* + * BIO operations + */ +static void nilfs_end_bio_write(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct nilfs_write_info *wi = bio->bi_private; + + if (err == -EOPNOTSUPP) { + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + bio_put(bio); + /* to be detected by submit_seg_bio() */ + } + + if (!uptodate) + atomic_inc(&wi->err); + + bio_put(bio); + complete(&wi->bio_event); +} + +static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode) +{ + struct bio *bio = wi->bio; + int err; + + if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) { + wait_for_completion(&wi->bio_event); + wi->nbio--; + if (unlikely(atomic_read(&wi->err))) { + bio_put(bio); + err = -EIO; + goto failed; + } + } + + bio->bi_end_io = nilfs_end_bio_write; + bio->bi_private = wi; + bio_get(bio); + submit_bio(mode, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + bio_put(bio); + err = -EOPNOTSUPP; + goto failed; + } + wi->nbio++; + bio_put(bio); + + wi->bio = NULL; + wi->rest_blocks -= wi->end - wi->start; + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end; + return 0; + + failed: + wi->bio = NULL; + return err; +} + +/** + * nilfs_alloc_seg_bio - allocate a bio for writing segment. + * @sb: super block + * @start: beginning disk block number of this BIO. + * @nr_vecs: request size of page vector. + * + * alloc_seg_bio() allocates a new BIO structure and initialize it. + * + * Return Value: On success, pointer to the struct bio is returned. + * On error, NULL is returned. 
+ */ +static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, + int nr_vecs) +{ + struct bio *bio; + + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + if (bio == NULL) { + while (!bio && (nr_vecs >>= 1)) + bio = bio_alloc(GFP_NOWAIT, nr_vecs); + } + if (likely(bio)) { + bio->bi_bdev = sb->s_bdev; + bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9); + } + return bio; +} + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + wi->bio = NULL; + wi->rest_blocks = segbuf->sb_sum.nblocks; + wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev); + wi->nr_vecs = min(wi->max_pages, wi->rest_blocks); + wi->start = wi->end = 0; + wi->nbio = 0; + wi->blocknr = segbuf->sb_pseg_start; + + atomic_set(&wi->err, 0); + init_completion(&wi->bio_event); +} + +static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh, + int mode) +{ + int len, err; + + BUG_ON(wi->nr_vecs <= 0); + repeat: + if (!wi->bio) { + wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end, + wi->nr_vecs); + if (unlikely(!wi->bio)) + return -ENOMEM; + } + + len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh)); + if (len == bh->b_size) { + wi->end++; + return 0; + } + /* bio is FULL */ + err = nilfs_submit_seg_bio(wi, mode); + /* never submit current bh */ + if (likely(!err)) + goto repeat; + return err; +} + +int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + struct buffer_head *bh; + int res, rw = WRITE; + + list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { + res = nilfs_submit_bh(wi, bh, rw); + if (unlikely(res)) + goto failed_bio; + } + + if (wi->bio) { + /* + * Last BIO is always sent through the following + * submission. 
+ */ + rw |= (1 << BIO_RW_SYNCIO); + res = nilfs_submit_seg_bio(wi, rw); + if (unlikely(res)) + goto failed_bio; + } + + res = 0; + out: + return res; + + failed_bio: + atomic_inc(&wi->err); + goto out; +} + +/** + * nilfs_segbuf_wait - wait for completion of requested BIOs + * @wi: nilfs_write_info + * + * Return Value: On Success, 0 is returned. On Error, one of the following + * negative error code is returned. + * + * %-EIO - I/O error + */ +int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf, + struct nilfs_write_info *wi) +{ + int err = 0; + + if (!wi->nbio) + return 0; + + do { + wait_for_completion(&wi->bio_event); + } while (--wi->nbio > 0); + + if (unlikely(atomic_read(&wi->err) > 0)) { + printk(KERN_ERR "NILFS: IO error writing segment\n"); + err = -EIO; + segbuf->sb_io_error = 1; + } + return err; +} diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h new file mode 100644 index 00000000000..0c3076f4e59 --- /dev/null +++ b/fs/nilfs2/segbuf.h @@ -0,0 +1,201 @@ +/* + * segbuf.h - NILFS Segment buffer prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGBUF_H +#define _NILFS_SEGBUF_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> + +/** + * struct nilfs_segsum_info - On-memory segment summary + * @flags: Flags + * @nfinfo: Number of file information structures + * @nblocks: Number of blocks included in the partial segment + * @nsumblk: Number of summary blocks + * @sumbytes: Byte count of segment summary + * @nfileblk: Total number of file blocks + * @seg_seq: Segment sequence number + * @ctime: Creation time + * @next: Block number of the next full segment + */ +struct nilfs_segsum_info { + unsigned int flags; + unsigned long nfinfo; + unsigned long nblocks; + unsigned long nsumblk; + unsigned long sumbytes; + unsigned long nfileblk; + u64 seg_seq; + time_t ctime; + sector_t next; +}; + +/* macro for the flags */ +#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR) +#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN) +#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND) +#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT) +#define NILFS_SEG_SIMPLEX(sum) \ + (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \ + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) + +#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk) + +/** + * struct nilfs_segment_buffer - Segment buffer + * @sb_super: back pointer to a superblock struct + * @sb_list: List head to chain this structure + * @sb_sum: On-memory segment summary + * @sb_segnum: Index number of the full segment + * @sb_nextnum: Index number of the next full segment + * @sb_fseg_start: Start block number of the full segment + * @sb_fseg_end: End 
block number of the full segment + * @sb_pseg_start: Disk block number of partial segment + * @sb_rest_blocks: Number of residual blocks in the current segment + * @sb_segsum_buffers: List of buffers for segment summaries + * @sb_payload_buffers: List of buffers for segment payload + * @sb_io_error: I/O error status + */ +struct nilfs_segment_buffer { + struct super_block *sb_super; + struct list_head sb_list; + + /* Segment information */ + struct nilfs_segsum_info sb_sum; + __u64 sb_segnum; + __u64 sb_nextnum; + sector_t sb_fseg_start, sb_fseg_end; + sector_t sb_pseg_start; + unsigned sb_rest_blocks; + + /* Buffers */ + struct list_head sb_segsum_buffers; + struct list_head sb_payload_buffers; /* including super root */ + + /* io status */ + int sb_io_error; +}; + +#define NILFS_LIST_SEGBUF(head) \ + list_entry((head), struct nilfs_segment_buffer, sb_list) +#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next) +#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev) +#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev) +#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next) +#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head)) + +#define nilfs_for_each_segbuf_before(s, t, h) \ + for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \ + (s) = NILFS_NEXT_SEGBUF(s)) + +#define NILFS_SEGBUF_FIRST_BH(head) \ + (list_entry((head)->next, struct buffer_head, b_assoc_buffers)) +#define NILFS_SEGBUF_NEXT_BH(bh) \ + (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \ + b_assoc_buffers)) +#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head) + + +int __init nilfs_init_segbuf_cache(void); +void nilfs_destroy_segbuf_cache(void); +struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); +void nilfs_segbuf_free(struct nilfs_segment_buffer *); +void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, + struct the_nilfs *); +void 
nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, + struct the_nilfs *); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t); +int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); +int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, + struct buffer_head **); +void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); +void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32); +void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32); + /* Append @bh to the summary list; it counts as both a block and a summary block */ +static inline void +nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers); + segbuf->sb_sum.nblocks++; + segbuf->sb_sum.nsumblk++; +} + /* Append @bh to the payload list and count it as a block; no reference is taken here */ +static inline void +nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers); + segbuf->sb_sum.nblocks++; +} + /* Like nilfs_segbuf_add_payload_buffer(), but takes a reference on @bh and counts a file block */ +static inline void +nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf, + struct buffer_head *bh) +{ + get_bh(bh); + nilfs_segbuf_add_payload_buffer(segbuf, bh); + segbuf->sb_sum.nfileblk++; +} + +void nilfs_release_buffers(struct list_head *); + /* Hand both buffer lists of @segbuf to nilfs_release_buffers() */ +static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf) +{ + nilfs_release_buffers(&segbuf->sb_segsum_buffers); + nilfs_release_buffers(&segbuf->sb_payload_buffers); +} + +struct nilfs_write_info { + struct bio *bio; + int start, end; /* The region to be submitted */ + int rest_blocks; + int max_pages; + int nr_vecs; + sector_t blocknr; + + int nbio; + atomic_t err; + struct completion bio_event; + /* completion event of segment write */ + + /* + * The following fields must be set explicitly + */ + struct super_block *sb; + struct backing_dev_info *bdi; /* backing dev info */ + struct buffer_head *bh_sr; +}; + + +void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *);
+int nilfs_segbuf_write(struct nilfs_segment_buffer *, + struct nilfs_write_info *); +int nilfs_segbuf_wait(struct nilfs_segment_buffer *, + struct nilfs_write_info *); + +#endif /* _NILFS_SEGBUF_H */ diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h new file mode 100644 index 00000000000..d39df9144e9 --- /dev/null +++ b/fs/nilfs2/seglist.h @@ -0,0 +1,85 @@ +/* + * seglist.h - expedient structure and routines to handle list of segments + * (would be removed in a future release) + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGLIST_H +#define _NILFS_SEGLIST_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sufile.h" + +struct nilfs_segment_entry { + __u64 segnum; + +#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisionally.
+ It must be cancelled if + construction aborted */ + + unsigned flags; + struct list_head list; + struct buffer_head *bh_su; + struct nilfs_segment_usage *raw_su; +}; + + +void nilfs_dispose_segment_list(struct list_head *); + /* Allocate and initialize an entry for @segnum; returns NULL if kmalloc() fails */ +static inline struct nilfs_segment_entry * +nilfs_alloc_segment_entry(__u64 segnum) +{ + struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS); + + if (likely(ent)) { + ent->segnum = segnum; + ent->flags = 0; + ent->bh_su = NULL; + ent->raw_su = NULL; + INIT_LIST_HEAD(&ent->list); + } + return ent; +} + /* Look up the segment usage of @ent in @sufile, filling in raw_su and bh_su */ +static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + return nilfs_sufile_get_segment_usage(sufile, ent->segnum, + &ent->raw_su, &ent->bh_su); +} + /* Undo nilfs_open_segment_entry() and clear the cached pointers; does nothing if bh_su is NULL */ +static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent, + struct inode *sufile) +{ + if (!ent->bh_su) + return; + nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su); + ent->bh_su = NULL; + ent->raw_su = NULL; +} + +static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent) +{ + kfree(ent); +} + +#endif /* _NILFS_SEGLIST_H */ diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c new file mode 100644 index 00000000000..fb70ec3be20 --- /dev/null +++ b/fs/nilfs2/segment.c @@ -0,0 +1,2977 @@ +/* + * segment.c - NILFS segment constructor. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/pagemap.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/bio.h> +#include <linux/completion.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/crc32.h> +#include <linux/pagevec.h> +#include "nilfs.h" +#include "btnode.h" +#include "page.h" +#include "segment.h" +#include "sufile.h" +#include "cpfile.h" +#include "ifile.h" +#include "seglist.h" +#include "segbuf.h" + + +/* + * Segment constructor + */ +#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ + +#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments + appended in collection retry loop */ + +/* Construction mode */ +enum { + SC_LSEG_SR = 1, /* Make a logical segment having a super root */ + SC_LSEG_DSYNC, /* Flush data blocks of a given file and make + a logical segment without a super root */ + SC_FLUSH_FILE, /* Flush data files, leads to segment writes without + creating a checkpoint */ + SC_FLUSH_DAT, /* Flush DAT file. 
This also creates segments without + a checkpoint */ +}; + +/* Stage numbers of dirty block collection */ +enum { + NILFS_ST_INIT = 0, + NILFS_ST_GC, /* Collecting dirty blocks for GC */ + NILFS_ST_FILE, + NILFS_ST_IFILE, + NILFS_ST_CPFILE, + NILFS_ST_SUFILE, + NILFS_ST_DAT, + NILFS_ST_SR, /* Super root */ + NILFS_ST_DSYNC, /* Data sync blocks */ + NILFS_ST_DONE, +}; + +/* State flags of collection */ +#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */ +#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */ +#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED) + +/* Operations depending on the construction mode and file type */ +struct nilfs_sc_operations { + int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *, + struct inode *); + void (*write_data_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); + void (*write_node_binfo)(struct nilfs_sc_info *, + struct nilfs_segsum_pointer *, + union nilfs_binfo *); +}; + +/* + * Forward declarations and counter comparison helpers + */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *); +static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); +static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, + int); + /* Compare two __u32 counters by the sign of their difference taken as __s32 */ +#define nilfs_cnt32_gt(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(b) - (__s32)(a) < 0)) +#define nilfs_cnt32_ge(a, b) \ + (typecheck(__u32, a) && typecheck(__u32, b) && \ + ((__s32)(a) - (__s32)(b) >= 0)) +#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a) +#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a) + +/* + * Transaction + */ +static struct kmem_cache *nilfs_transaction_cachep; + +/** + * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info + * + *
nilfs_init_transaction_cache() creates a slab cache for the struct + * nilfs_transaction_info. + * + * Return Value: On success, it returns 0. On error, the following + * negative error code is returned. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_init_transaction_cache(void) +{ + nilfs_transaction_cachep = + kmem_cache_create("nilfs2_transaction_cache", + sizeof(struct nilfs_transaction_info), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0; +} + +/** + * nilfs_destroy_transaction_cache - destroy the cache for transaction info + * + * nilfs_destroy_transaction_cache() frees the slab cache for the struct + * nilfs_transaction_info. + */ +void nilfs_destroy_transaction_cache(void) +{ + kmem_cache_destroy(nilfs_transaction_cachep); +} + /* + * Hook a transaction info onto current->journal_info. Returns the new + * nesting count (> 0) if a NILFS transaction is already active, 0 if a + * fresh one was installed, or -ENOMEM if @ti is NULL and allocation fails. + */ +static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + void *save = NULL; + + if (cur_ti) { + if (cur_ti->ti_magic == NILFS_TI_MAGIC) + return ++cur_ti->ti_count; + else { + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). + */ + printk(KERN_WARNING + "NILFS warning: journal info from a different " + "FS\n"); + save = current->journal_info; + } + } + if (!ti) { + ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); + if (!ti) + return -ENOMEM; + ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC; + } else { + ti->ti_flags = 0; + } + ti->ti_count = 0; + ti->ti_save = save; + ti->ti_magic = NILFS_TI_MAGIC; + current->journal_info = ti; + return 0; +} + +/** + * nilfs_transaction_begin - start indivisible file operations. + * @sb: super block + * @ti: nilfs_transaction_info + * @vacancy_check: flags for vacancy rate checks + * + * nilfs_transaction_begin() acquires a reader/writer semaphore, called + * the segment semaphore, to make a segment construction and write tasks + * exclusive.
The function is used with nilfs_transaction_commit() in pairs. + * The region enclosed by these two functions can be nested. To avoid a + * deadlock, the semaphore is only acquired or released in the outermost call. + * + * This function allocates a nilfs_transaction_info struct to keep context + * information on it. It is initialized and hooked onto the current task in + * the outermost call. If a pre-allocated struct is given to @ti, it is used + * instead; otherwise a new struct is assigned from a slab. + * + * When @vacancy_check flag is set, this function will check the amount of + * free space, and will fail with %-ENOSPC if the disk is nearly full. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-ENOSPC - No space left on device + */ +int nilfs_transaction_begin(struct super_block *sb, + struct nilfs_transaction_info *ti, + int vacancy_check) +{ + struct nilfs_sb_info *sbi; + struct the_nilfs *nilfs; + int ret = nilfs_prepare_segment_lock(ti); + + if (unlikely(ret < 0)) + return ret; + if (ret > 0) /* nested call: nothing more to acquire */ + return 0; + /* outermost call: take the segment semaphore for reading */ + sbi = NILFS_SB(sb); + nilfs = sbi->s_nilfs; + down_read(&nilfs->ns_segctor_sem); + if (vacancy_check && nilfs_near_disk_full(nilfs)) { + up_read(&nilfs->ns_segctor_sem); + ret = -ENOSPC; + goto failed; + } + return 0; + /* undo what nilfs_prepare_segment_lock() installed */ + failed: + ti = current->journal_info; + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return ret; +} + +/** + * nilfs_transaction_commit - commit indivisible file operations. + * @sb: super block + * + * nilfs_transaction_commit() releases the read semaphore which is + * acquired by nilfs_transaction_begin(). This is only performed + * in outermost call of this function. If a commit flag is set, + * nilfs_transaction_commit() sets a timer to start the segment + * constructor.
If a sync flag is set, it starts construction + * directly. + * + * Return Value: 0 for a nested call. In the outermost call, the result of + * nilfs_construct_segment() if the sync flag is set, and 0 otherwise. + */ +int nilfs_transaction_commit(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_sb_info *sbi; + struct nilfs_sc_info *sci; + int err = 0; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + ti->ti_flags |= NILFS_TI_COMMIT; + if (ti->ti_count > 0) { + ti->ti_count--; + return 0; + } + sbi = NILFS_SB(sb); + sci = NILFS_SC(sbi); + if (sci != NULL) { + if (ti->ti_flags & NILFS_TI_COMMIT) + nilfs_segctor_start_timer(sci); + if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > + sci->sc_watermark) + nilfs_segctor_do_flush(sci, 0); + } + up_read(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + + if (ti->ti_flags & NILFS_TI_SYNC) + err = nilfs_construct_segment(sb); + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); + return err; +} + /* + * Leave a transaction without setting the commit flag: a nested call only + * drops the nesting count; the outermost call releases the read semaphore, + * restores the saved journal_info and frees a dynamically allocated ti. + */ +void nilfs_transaction_abort(struct super_block *sb) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + if (ti->ti_count > 0) { + ti->ti_count--; + return; + } + up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); + + current->journal_info = ti->ti_save; + if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) + kmem_cache_free(nilfs_transaction_cachep, ti); +} + /* + * If a flush is requested, swap the read lock for the write lock and run + * an immediate flush with the writer flag set on the current transaction. + */ +void nilfs_relax_pressure_in_lock(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!sci || !sci->sc_flush_request) + return; + + set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); + up_read(&nilfs->ns_segctor_sem); + + down_write(&nilfs->ns_segctor_sem); + if (sci->sc_flush_request && + test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) { + struct nilfs_transaction_info *ti = current->journal_info; + + ti->ti_flags |= NILFS_TI_WRITER; + nilfs_segctor_do_immediate_flush(sci); + ti->ti_flags &= ~NILFS_TI_WRITER; + } +
downgrade_write(&nilfs->ns_segctor_sem); +} + /* + * Install @ti as a writer transaction and take the segment semaphore for + * writing. While NILFS_SC_PRIOR_FLUSH is set, run an immediate flush, + * drop the semaphore, yield and try again. + */ +static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, + struct nilfs_transaction_info *ti, + int gcflag) +{ + struct nilfs_transaction_info *cur_ti = current->journal_info; + + WARN_ON(cur_ti); + ti->ti_flags = NILFS_TI_WRITER; + ti->ti_count = 0; + ti->ti_save = cur_ti; + ti->ti_magic = NILFS_TI_MAGIC; + INIT_LIST_HEAD(&ti->ti_garbage); + current->journal_info = ti; + + for (;;) { + down_write(&sbi->s_nilfs->ns_segctor_sem); + if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) + break; + + nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + yield(); + } + if (gcflag) + ti->ti_flags |= NILFS_TI_GC; +} + /* Release the write lock, restore journal_info and dispose of ti_garbage if it is not empty */ +static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti = current->journal_info; + + BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); + BUG_ON(ti->ti_count > 0); + + up_write(&sbi->s_nilfs->ns_segctor_sem); + current->journal_info = ti->ti_save; + if (!list_empty(&ti->ti_garbage)) + nilfs_dispose_list(sbi, &ti->ti_garbage, 0); +} + /* + * Return the address of the next @bytes of summary space at @ssp and + * advance @ssp past them. An entry that would cross the end of the block + * starts at offset 0 of the next summary buffer instead. + */ +static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + unsigned bytes) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + unsigned blocksize = sci->sc_super->s_blocksize; + void *p; + + if (unlikely(ssp->offset + bytes > blocksize)) { + ssp->offset = 0; + BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh, + &segbuf->sb_segsum_buffers)); + ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh); + } + p = ssp->bh->b_data + ssp->offset; + ssp->offset += bytes; + return p; +} + +/** + * nilfs_segctor_reset_segment_buffer - reset the current segment buffer + * @sci: nilfs_sc_info + */ +static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + struct buffer_head *sumbh; + unsigned sumbytes; + unsigned flags = 0; + int err; + + if (nilfs_doing_gc()) + flags = NILFS_SS_GC;
+ err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime); + if (unlikely(err)) + return err; + + sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers); + sumbytes = segbuf->sb_sum.sumbytes; + sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes; + sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; + return 0; +} + /* + * Account the blocks of the current segment buffer and switch to the next + * one; returns -E2BIG if the current buffer is the last in sc_segbufs. + */ +static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) +{ + sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; + if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) + return -E2BIG; /* The current segment is filled up + (internal code) */ + sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); + return nilfs_segctor_reset_segment_buffer(sci); +} + /* Append the super root block, moving to the next segment buffer first if this one has no room */ +static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) +{ + struct nilfs_segment_buffer *segbuf = sci->sc_curseg; + int err; + + if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) { + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + segbuf = sci->sc_curseg; + } + err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root); + if (likely(!err)) + segbuf->sb_sum.flags |= NILFS_SS_SR; + return err; +} + +/* + * Functions for making segment summary and payloads + */ +static int nilfs_segctor_segsum_block_required( + struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, + unsigned binfo_size) +{ + unsigned blocksize = sci->sc_super->s_blocksize; + /* The sizes of finfo and binfo are small enough relative to blocksize */ + + return ssp->offset + binfo_size + + (!sci->sc_blk_cnt ?
sizeof(struct nilfs_finfo) : 0) > + blocksize; +} + /* Start a new finfo: count it and point sc_binfo_ptr just past the space reserved for it */ +static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + sci->sc_curseg->sb_sum.nfinfo++; + sci->sc_binfo_ptr = sci->sc_finfo_ptr; + nilfs_segctor_map_segsum_entry( + sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); + /* the call above only reserves the finfo; nilfs_segctor_end_finfo() fills it in */ + if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); + /* skip finfo */ +} + /* + * Fill in the finfo of the file collected so far and reset the per-file + * block counters; does nothing if no block has been added for the file. + */ +static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, + struct inode *inode) +{ + struct nilfs_finfo *finfo; + struct nilfs_inode_info *ii; + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_blk_cnt == 0) + return; + + ii = NILFS_I(inode); + finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr, + sizeof(*finfo)); + finfo->fi_ino = cpu_to_le64(inode->i_ino); + finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt); + finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt); + finfo->fi_cno = cpu_to_le64(ii->i_cno); + + segbuf = sci->sc_curseg; + segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset + + sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1); + sci->sc_finfo_ptr = sci->sc_binfo_ptr; + sci->sc_blk_cnt = sci->sc_datablk_cnt = 0; +} + /* + * Register @bh as a block of @inode in the current segment buffer and + * reserve @binfo_size bytes of summary space for it. + */ +static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode, + unsigned binfo_size) +{ + struct nilfs_segment_buffer *segbuf; + int required, err = 0; + + retry: + segbuf = sci->sc_curseg; + required = nilfs_segctor_segsum_block_required( + sci, &sci->sc_binfo_ptr, binfo_size); + if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) { + nilfs_segctor_end_finfo(sci, inode); + err = nilfs_segctor_feed_segment(sci); + if (err) + return err; + goto retry; + } + if (unlikely(required)) { + err = nilfs_segbuf_extend_segsum(segbuf); + if (unlikely(err)) + goto failed; + } + if (sci->sc_blk_cnt == 0) + nilfs_segctor_begin_finfo(sci, inode); + + nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr,
binfo_size); + /* Substitution to vblocknr is delayed until update_blocknr() */ + nilfs_segbuf_add_file_buffer(segbuf, bh); + sci->sc_blk_cnt++; + failed: + return err; +} + /* Report -EINVAL as a broken bmap and convert it to -EIO; other codes pass through unchanged */ +static int nilfs_handle_bmap_error(int err, const char *fname, + struct inode *inode, struct super_block *sb) +{ + if (err == -EINVAL) { + nilfs_error(sb, fname, "broken bmap (inode=%lu)\n", + inode->i_ino); + err = -EIO; + } + return err; +} + +/* + * Callback functions that enumerate, mark, and collect dirty blocks + */ +static int nilfs_collect_file_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_v)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + /* Node pass: only propagates through the bmap; @bh itself is not added to the segment here */ +static int nilfs_collect_file_node(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + return 0; +} + /* Bmap pass: add @bh with a __le64 of summary space; warns if @bh is not dirty */ +static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, + struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); +} + +static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*binfo_v)); + *binfo_v = binfo->bi_v; +} + +static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *vblocknr = nilfs_segctor_map_segsum_entry( + sci, ssp, sizeof(*vblocknr)); + *vblocknr = binfo->bi_v.bi_vblocknr; +} + +struct
nilfs_sc_operations nilfs_sc_file_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_file_bmap, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = nilfs_write_file_node_binfo, +}; + /* DAT data pass: reserves a __le64 of summary space per data block */ +static int nilfs_collect_dat_data(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + int err; + + err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); + if (unlikely(err < 0)) + return nilfs_handle_bmap_error(err, __func__, inode, + sci->sc_super); + + err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64)); + if (!err) + sci->sc_datablk_cnt++; + return err; +} + /* DAT bmap pass: reserves a struct nilfs_binfo_dat of summary space per block */ +static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci, + struct buffer_head *bh, struct inode *inode) +{ + WARN_ON(!buffer_dirty(bh)); + return nilfs_segctor_add_file_block(sci, bh, inode, + sizeof(struct nilfs_binfo_dat)); +} + +static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp, + sizeof(*blkoff)); + *blkoff = binfo->bi_dat.bi_blkoff; +} + +static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, + struct nilfs_segsum_pointer *ssp, + union nilfs_binfo *binfo) +{ + struct nilfs_binfo_dat *binfo_dat = + nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat)); + *binfo_dat = binfo->bi_dat; +} + /* DAT operations; node collection reuses the regular-file callback */ +struct nilfs_sc_operations nilfs_sc_dat_ops = { + .collect_data = nilfs_collect_dat_data, + .collect_node = nilfs_collect_file_node, + .collect_bmap = nilfs_collect_dat_bmap, + .write_data_binfo = nilfs_write_dat_data_binfo, + .write_node_binfo = nilfs_write_dat_node_binfo, +}; + /* Data-sync operations: only the data callbacks are set, the node and bmap ones are NULL */ +struct nilfs_sc_operations nilfs_sc_dsync_ops = { + .collect_data = nilfs_collect_file_data, + .collect_node = NULL, + .collect_bmap = NULL, + .write_data_binfo = nilfs_write_file_data_binfo, + .write_node_binfo = NULL, +}; + +static size_t
nilfs_lookup_dirty_data_buffers(struct inode *inode, + struct list_head *listp, + size_t nlimit, + loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t index = 0, last = ULONG_MAX; + size_t ndirties = 0; + int i; + + if (unlikely(start != 0 || end != LLONG_MAX)) { + /* + * A valid range is given for sync-ing data pages. The + * range is rounded to per-page; extra dirty buffers + * may be included if blocksize < pagesize. + */ + index = start >> PAGE_SHIFT; + last = end >> PAGE_SHIFT; + } + pagevec_init(&pvec, 0); + repeat: + if (unlikely(index > last) || + !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + min_t(pgoff_t, last - index, + PAGEVEC_SIZE - 1) + 1)) + return ndirties; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct buffer_head *bh, *head; + struct page *page = pvec.pages[i]; + + if (unlikely(page->index > last)) + break; + + if (mapping->host) { + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, + 1 << inode->i_blkbits, 0); + unlock_page(page); + } + + bh = head = page_buffers(page); + do { + if (!buffer_dirty(bh)) + continue; /* jumps to the loop condition, which advances bh */ + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, listp); + ndirties++; + if (unlikely(ndirties >= nlimit)) { + /* limit reached: stop and report the count so far */ + pagevec_release(&pvec); + cond_resched(); + return ndirties; + } + } while (bh = bh->b_this_page, bh != head); + } + pagevec_release(&pvec); + cond_resched(); + goto repeat; +} + /* Queue every dirty buffer of the inode's btnode cache on @listp, taking a reference on each */ +static void nilfs_lookup_dirty_node_buffers(struct inode *inode, + struct list_head *listp) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + struct address_space *mapping = &ii->i_btnode_cache; + struct pagevec pvec; + struct buffer_head *bh, *head; + unsigned int i; + pgoff_t index = 0; + + pagevec_init(&pvec, 0); + + while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, + PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + bh = head = page_buffers(pvec.pages[i]); + do { + if (buffer_dirty(bh))
{ + get_bh(bh); + list_add_tail(&bh->b_assoc_buffers, + listp); + } + bh = bh->b_this_page; + } while (bh != head); + } + pagevec_release(&pvec); + cond_resched(); + } +} + /* + * Drain @head in batches of up to SC_N_INODEVEC inodes. Unless @force is + * set, inodes that are still NILFS_I_DIRTY are requeued on s_dirty_files; + * the others are released with iput() after s_inode_lock is dropped. + */ +static void nilfs_dispose_list(struct nilfs_sb_info *sbi, + struct list_head *head, int force) +{ + struct nilfs_inode_info *ii, *n; + struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; + unsigned nv = 0; + + while (!list_empty(head)) { + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, head, i_dirty) { + list_del_init(&ii->i_dirty); + if (force) { + if (unlikely(ii->i_bh)) { + brelse(ii->i_bh); + ii->i_bh = NULL; + } + } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { + set_bit(NILFS_I_QUEUED, &ii->i_state); + list_add_tail(&ii->i_dirty, + &sbi->s_dirty_files); + continue; + } + ivec[nv++] = ii; + if (nv == SC_N_INODEVEC) + break; + } + spin_unlock(&sbi->s_inode_lock); + + for (pii = ivec; nv > 0; pii++, nv--) + iput(&(*pii)->vfs_inode); + } +} + /* Count dirty metadata files; the DAT is only examined if another one is dirty or GC is running */ +static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int ret = 0; + + if (nilfs_mdt_fetch_dirty(sbi->s_ifile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile)) + ret++; + if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile)) + ret++; + if (ret || nilfs_doing_gc()) + if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs))) + ret++; + return ret; +} + +static int nilfs_segctor_clean(struct nilfs_sc_info *sci) +{ + return list_empty(&sci->sc_dirty_files) && + !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) && + list_empty(&sci->sc_cleaning_segments) && + (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes)); +} + /* Returns 1 if nothing is left to write and 0 otherwise; sets NILFS_SC_DIRTY if metadata is dirty */ +static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int ret = 0; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + spin_lock(&sbi->s_inode_lock); + if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) + ret++; + + spin_unlock(&sbi->s_inode_lock); + return ret; +} + +static void
nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear_dirty(sbi->s_ifile); + nilfs_mdt_clear_dirty(nilfs->ns_cpfile); + nilfs_mdt_clear_dirty(nilfs->ns_sufile); + nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs)); +} + /* Get the checkpoint entry for ns_cno (passing 1 as the create argument) and mark it dirty */ +static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) +{ + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + /* XXX: this interface will be changed */ + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, + &raw_cp, &bh_cp); + if (likely(!err)) { + /* This duplicates work done in cpfile, but it is + needed so that the checkpoint is collected even if it + was not newly created */ + nilfs_mdt_mark_buffer_dirty(bh_cp); + nilfs_mdt_mark_dirty(nilfs->ns_cpfile); + nilfs_cpfile_put_checkpoint( + nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + } else + WARN_ON(err == -EINVAL || err == -ENOENT); + + return err; +} + /* Fill in the fields of the existing checkpoint entry for ns_cno */ +static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct buffer_head *bh_cp; + struct nilfs_checkpoint *raw_cp; + int err; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0, + &raw_cp, &bh_cp); + if (unlikely(err)) { + WARN_ON(err == -EINVAL || err == -ENOENT); + goto failed_ibh; + } + raw_cp->cp_snapshot_list.ssl_next = 0; + raw_cp->cp_snapshot_list.ssl_prev = 0; + raw_cp->cp_inodes_count = + cpu_to_le64(atomic_read(&sbi->s_inodes_count)); + raw_cp->cp_blocks_count = + cpu_to_le64(atomic_read(&sbi->s_blocks_count)); + raw_cp->cp_nblk_inc = + cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc); + raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime); + raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno); + /* the minor flag is set only when NILFS_SC_HAVE_DELTA is not set */ + if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + nilfs_checkpoint_clear_minor(raw_cp); + else +
nilfs_checkpoint_set_minor(raw_cp); + + nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1); + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); + return 0; + + failed_ibh: + return err; +} + /* If NILFS_I_BMAP is set, write the bmap of @ii into its raw inode mapped from @ifile */ +static void nilfs_fill_in_file_bmap(struct inode *ifile, + struct nilfs_inode_info *ii) + +{ + struct buffer_head *ibh; + struct nilfs_inode *raw_inode; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) { + ibh = ii->i_bh; + BUG_ON(!ibh); + raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino, + ibh); + nilfs_bmap_write(ii->i_bmap, raw_inode); + nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh); + } +} + /* Apply nilfs_fill_in_file_bmap() to every inode on sc_dirty_files and mark it collected */ +static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci, + struct inode *ifile) +{ + struct nilfs_inode_info *ii; + + list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) { + nilfs_fill_in_file_bmap(ifile, ii); + set_bit(NILFS_I_COLLECTED, &ii->i_state); + } +} + +/* + * CRC calculation routines + */ +static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed) +{ + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + u32 crc; + /* skip the first sizeof(sr_sum) bytes and checksum the rest of NILFS_SR_BYTES */ + crc = crc32_le(seed, + (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), + NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + raw_sr->sr_sum = cpu_to_le32(crc); +} + /* Fill in the CRCs of the super root, if there is one, and of every segment buffer */ +static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci, + u32 seed) +{ + struct nilfs_segment_buffer *segbuf; + + if (sci->sc_super_root) + nilfs_fill_in_super_root_crc(sci->sc_super_root, seed); + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + nilfs_segbuf_fill_in_segsum_crc(segbuf, seed); + nilfs_segbuf_fill_in_data_crc(segbuf, seed); + } +} + +static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, + struct the_nilfs *nilfs) +{ + struct buffer_head *bh_sr = sci->sc_super_root; + struct nilfs_super_root *raw_sr = + (struct nilfs_super_root *)bh_sr->b_data; + unsigned isz = nilfs->ns_inode_size; + + raw_sr->sr_bytes =
cpu_to_le16(NILFS_SR_BYTES);
+	raw_sr->sr_nongc_ctime
+		= cpu_to_le64(nilfs_doing_gc() ?
+			      nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
+	raw_sr->sr_flags = 0;
+
+	nilfs_mdt_write_inode_direct(
+		nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
+	nilfs_mdt_write_inode_direct(
+		nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
+	nilfs_mdt_write_inode_direct(
+		nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
+}
+/*
+ * nilfs_redirty_inodes - clear NILFS_I_COLLECTED on every inode of @head.
+ */
+static void nilfs_redirty_inodes(struct list_head *head)
+{
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, head, i_dirty) {
+		if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
+			clear_bit(NILFS_I_COLLECTED, &ii->i_state);
+	}
+}
+/*
+ * nilfs_drop_collected_inodes - for every inode of @head that has
+ * NILFS_I_COLLECTED set, clear that bit and NILFS_I_INODE_DIRTY and set
+ * NILFS_I_UPDATED.  Inodes without NILFS_I_COLLECTED are left untouched.
+ */
+static void nilfs_drop_collected_inodes(struct list_head *head)
+{
+	struct nilfs_inode_info *ii;
+
+	list_for_each_entry(ii, head, i_dirty) {
+		if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
+			continue;
+
+		clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+		set_bit(NILFS_I_UPDATED, &ii->i_state);
+	}
+}
+/*
+ * nilfs_segctor_cancel_free_segments - undo
+ * nilfs_segctor_prepare_free_segments().
+ *
+ * Walks sci->sc_cleaning_segments from the head and cancels the free
+ * operation of each entry flagged NILFS_SLH_FREED; the walk stops at the
+ * first entry that does not carry the flag.
+ */
+static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
+					       struct inode *sufile)
+
+{
+	struct list_head *head = &sci->sc_cleaning_segments;
+	struct nilfs_segment_entry *ent;
+	int err;
+
+	list_for_each_entry(ent, head, list) {
+		if (!(ent->flags & NILFS_SLH_FREED))
+			break;
+		err = nilfs_sufile_cancel_free(sufile, ent->segnum);
+		WARN_ON(err); /* should not happen */
+		ent->flags &= ~NILFS_SLH_FREED;
+	}
+}
+/*
+ * nilfs_segctor_prepare_free_segments - call nilfs_sufile_free() for each
+ * entry of sci->sc_cleaning_segments and flag it NILFS_SLH_FREED.
+ *
+ * Returns 0 on success.  On the first failure the error is returned at
+ * once; entries handled so far keep their NILFS_SLH_FREED flag.
+ */
+static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
+					       struct inode *sufile)
+{
+	struct list_head *head = &sci->sc_cleaning_segments;
+	struct nilfs_segment_entry *ent;
+	int err;
+
+	list_for_each_entry(ent, head, list) {
+		err = nilfs_sufile_free(sufile, ent->segnum);
+		if (unlikely(err))
+			return err;
+		ent->flags |= NILFS_SLH_FREED;
+	}
+	return 0;
+}
+/*
+ * nilfs_segctor_commit_free_segments - dispose of the list of segments
+ * that were being cleaned (sci->sc_cleaning_segments).
+ */
+static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
+{
+	nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
+}
+/*
+ * nilfs_segctor_apply_buffers - pass every buffer on @listp to @collect
+ * and release it.
+ *
+ * Each buffer is unlinked from @listp before @collect is called.  If
+ * @collect is NULL, or as soon as it returns an error, the remaining
+ * buffers are unlinked and released without being collected.  Returns 0
+ * or the error returned by @collect.
+ */
+static int nilfs_segctor_apply_buffers(struct 
nilfs_sc_info *sci,
+				       struct inode *inode,
+				       struct list_head *listp,
+				       int (*collect)(struct nilfs_sc_info *,
+						      struct buffer_head *,
+						      struct inode *))
+{
+	struct buffer_head *bh, *n;
+	int err = 0;
+
+	if (collect) {
+		list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
+			list_del_init(&bh->b_assoc_buffers);
+			err = collect(sci, bh, inode);
+			brelse(bh);
+			if (unlikely(err))
+				goto dispose_buffers;
+		}
+		return 0;
+	}
+
+ dispose_buffers:
+	while (!list_empty(listp)) {	/* drop whatever was not collected */
+		bh = list_entry(listp->next, struct buffer_head,
+				b_assoc_buffers);
+		list_del_init(&bh->b_assoc_buffers);
+		brelse(bh);
+	}
+	return err;
+}
+/*
+ * nilfs_segctor_buffer_rest - number of blocks still available in the
+ * segment buffers, given what this construction has used so far.
+ */
+static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
+{
+	/* Remaining number of blocks within segment buffer */
+	return sci->sc_segbuf_nblocks -
+		(sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
+}
+/*
+ * nilfs_segctor_scan_file - collect the dirty data, node and bmap buffers
+ * of @inode using the callbacks in @sc_ops.
+ *
+ * Data buffers are skipped when NILFS_CF_NODE is already set in
+ * sci->sc_stage.flags; the flag is set once the data buffers have been
+ * collected and cleared again after the whole file has been finished.
+ * Returns 0 on success or the error of the failing collect callback.
+ */
+static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
+				   struct inode *inode,
+				   struct nilfs_sc_operations *sc_ops)
+{
+	LIST_HEAD(data_buffers);
+	LIST_HEAD(node_buffers);
+	int err;
+
+	if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
+		size_t n, rest = nilfs_segctor_buffer_rest(sci);
+
+		/* look up one buffer more than fits to detect overflow */
+		n = nilfs_lookup_dirty_data_buffers(
+			inode, &data_buffers, rest + 1, 0, LLONG_MAX);
+		if (n > rest) {
+			err = nilfs_segctor_apply_buffers(
+				sci, inode, &data_buffers,
+				sc_ops->collect_data);
+			BUG_ON(!err); /* always receive -E2BIG or true error */
+			goto break_or_fail;
+		}
+	}
+	nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
+
+	if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
+		err = nilfs_segctor_apply_buffers(
+			sci, inode, &data_buffers, sc_ops->collect_data);
+		if (unlikely(err)) {
+			/* dispose node list */
+			nilfs_segctor_apply_buffers(
+				sci, inode, &node_buffers, NULL);
+			goto break_or_fail;
+		}
+		sci->sc_stage.flags |= NILFS_CF_NODE;
+	}
+	/* Collect node */
+	err = nilfs_segctor_apply_buffers(
+		sci, inode, &node_buffers, sc_ops->collect_node);
+	if (unlikely(err))
+		goto break_or_fail;
+
+	
nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
+	err = nilfs_segctor_apply_buffers(
+		sci, inode, &node_buffers, sc_ops->collect_bmap);
+	if (unlikely(err))
+		goto break_or_fail;
+
+	nilfs_segctor_end_finfo(sci, inode);
+	sci->sc_stage.flags &= ~NILFS_CF_NODE;
+
+ break_or_fail:
+	return err;
+}
+/*
+ * nilfs_segctor_scan_file_dsync - collect only the dirty data buffers of
+ * @inode that lie in the range sci->sc_dsync_start..sci->sc_dsync_end.
+ *
+ * Returns 0 on success or the error of nilfs_segctor_apply_buffers().
+ */
+static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
+					 struct inode *inode)
+{
+	LIST_HEAD(data_buffers);
+	size_t n, rest = nilfs_segctor_buffer_rest(sci);
+	int err;
+
+	n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
+					    sci->sc_dsync_start,
+					    sci->sc_dsync_end);
+
+	err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
+					  nilfs_collect_file_data);
+	if (!err) {
+		nilfs_segctor_end_finfo(sci, inode);
+		BUG_ON(n > rest);
+		/* always receive -E2BIG or true error if n > rest */
+	}
+	return err;
+}
+/*
+ * nilfs_segctor_collect_blocks - collect the blocks to be written, stage
+ * by stage.
+ *
+ * The current stage is kept in sci->sc_stage.scnt, so a call continues
+ * from the stage (and list position) at which it finds that state.  @mode
+ * selects shortcuts: SC_LSEG_DSYNC jumps to the data-sync stage,
+ * SC_FLUSH_DAT to the DAT stage, and SC_FLUSH_FILE stops after the file
+ * stage.  Returns 0 or the error of the failing step.
+ */
+static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	struct list_head *head;
+	struct nilfs_inode_info *ii;
+	int err = 0;
+
+	switch (sci->sc_stage.scnt) {
+	case NILFS_ST_INIT:
+		/* Pre-processes */
+		sci->sc_stage.flags = 0;
+
+		if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
+			sci->sc_nblk_inc = 0;
+			sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
+			if (mode == SC_LSEG_DSYNC) {
+				sci->sc_stage.scnt = NILFS_ST_DSYNC;
+				goto dsync_mode;
+			}
+		}
+
+		sci->sc_stage.dirty_file_ptr = NULL;
+		sci->sc_stage.gc_inode_ptr = NULL;
+		if (mode == SC_FLUSH_DAT) {
+			sci->sc_stage.scnt = NILFS_ST_DAT;
+			goto dat_stage;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_GC:
+		if (nilfs_doing_gc()) {
+			head = &sci->sc_gc_inodes;
+			ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
+						head, i_dirty);
+			list_for_each_entry_continue(ii, head, i_dirty) {
+				err = nilfs_segctor_scan_file(
+					sci, &ii->vfs_inode,
+					&nilfs_sc_file_ops);
+				if (unlikely(err)) {
+					sci->sc_stage.gc_inode_ptr = list_entry(
+						
ii->i_dirty.prev,
+						struct nilfs_inode_info,
+						i_dirty); /* resume at this inode next time */
+					goto break_or_fail;
+				}
+				set_bit(NILFS_I_COLLECTED, &ii->i_state);
+			}
+			sci->sc_stage.gc_inode_ptr = NULL;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_FILE:
+		head = &sci->sc_dirty_files;
+		ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
+					i_dirty);
+		list_for_each_entry_continue(ii, head, i_dirty) {
+			clear_bit(NILFS_I_DIRTY, &ii->i_state);
+
+			err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
+						      &nilfs_sc_file_ops);
+			if (unlikely(err)) {
+				/* remember the previous entry so that the
+				   walk resumes at this inode next time */
+				sci->sc_stage.dirty_file_ptr =
+					list_entry(ii->i_dirty.prev,
+						   struct nilfs_inode_info,
+						   i_dirty);
+				goto break_or_fail;
+			}
+			/* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
+			/* XXX: required ? */
+		}
+		sci->sc_stage.dirty_file_ptr = NULL;
+		if (mode == SC_FLUSH_FILE) {	/* file-only flush ends here */
+			sci->sc_stage.scnt = NILFS_ST_DONE;
+			return 0;
+		}
+		sci->sc_stage.scnt++;
+		sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
+		/* Fall through */
+	case NILFS_ST_IFILE:
+		err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;
+		/* Creating a checkpoint */
+		err = nilfs_segctor_create_checkpoint(sci);
+		if (unlikely(err))
+			break;
+		/* Fall through */
+	case NILFS_ST_CPFILE:
+		err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_SUFILE:
+		err = nilfs_segctor_prepare_free_segments(sci,
+							  nilfs->ns_sufile);
+		if (unlikely(err))
+			break;
+		err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
+					      &nilfs_sc_file_ops);
+		if (unlikely(err))
+			break;
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_DAT:
+ dat_stage:
+		err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
+					      &nilfs_sc_dat_ops);
+		if (unlikely(err))
+			break;
+		if (mode == SC_FLUSH_DAT) {	/* DAT-only flush ends here */
+			sci->sc_stage.scnt = NILFS_ST_DONE;
+			return 0;
+		}
+		sci->sc_stage.scnt++;  /* Fall through */
+	case NILFS_ST_SR:
+		if (mode 
== SC_LSEG_SR) {
+			/* Appending a super root */
+			err = nilfs_segctor_add_super_root(sci);
+			if (unlikely(err))
+				break;
+		}
+		/* End of a logical segment */
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
+		sci->sc_stage.scnt = NILFS_ST_DONE;
+		return 0;
+	case NILFS_ST_DSYNC:
+ dsync_mode:
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
+		ii = sci->sc_dsync_inode;
+		if (!test_bit(NILFS_I_BUSY, &ii->i_state))
+			break;
+
+		err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
+		if (unlikely(err))
+			break;
+		sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
+		sci->sc_stage.scnt = NILFS_ST_DONE;
+		return 0;
+	case NILFS_ST_DONE:
+		return 0;
+	default:
+		BUG();
+	}
+
+ break_or_fail:
+	return err;
+}
+/*
+ * nilfs_touch_segusage - mark the segment usage entry of @segnum, and the
+ * sufile itself, dirty.
+ *
+ * Returns 0 on success or the error of nilfs_sufile_get_segment_usage().
+ */
+static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
+{
+	struct buffer_head *bh_su;
+	struct nilfs_segment_usage *raw_su;
+	int err;
+
+	err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
+	if (unlikely(err))
+		return err;
+	nilfs_mdt_mark_buffer_dirty(bh_su);
+	nilfs_mdt_mark_dirty(sufile);
+	nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
+	return 0;
+}
+/*
+ * nilfs_segctor_begin_construction - set up the first segment buffer.
+ *
+ * Allocates a segment buffer if sci->sc_segbufs is empty, maps it to the
+ * current write position (moving on to the next segment when fewer than
+ * NILFS_PSEG_MIN_BLOCKS blocks are left), marks its segment usage dirty,
+ * decides the next segment number and frees every segment buffer that
+ * follows the first one.  Returns 0 or a negative error code.
+ */
+static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
+					    struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf, *n;
+	__u64 nextnum;
+	int err;
+
+	if (list_empty(&sci->sc_segbufs)) {
+		segbuf = nilfs_segbuf_new(sci->sc_super);
+		if (unlikely(!segbuf))
+			return -ENOMEM;
+		list_add(&segbuf->sb_list, &sci->sc_segbufs);
+	} else
+		segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+
+	nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
+			 nilfs);
+
+	if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
+		nilfs_shift_to_next_segment(nilfs);
+		nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
+	}
+	sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
+
+	err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
+	if (unlikely(err))
+		return err;
+
+	if (nilfs->ns_segnum == nilfs->ns_nextnum) {
+		/* Start from the head of a 
new full segment */
+		err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
+		if (unlikely(err))
+			return err;
+	} else
+		nextnum = nilfs->ns_nextnum;
+
+	segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
+	nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
+
+	/* truncating segment buffers */
+	list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
+					  sb_list) {
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_free(segbuf);
+	}
+	return 0;
+}
+/*
+ * nilfs_segctor_extend_segments - append @nadd segment buffers to
+ * sci->sc_segbufs.
+ *
+ * Each new buffer is mapped to the head of the segment recorded as "next"
+ * in its predecessor, and a further segment is allocated from the sufile
+ * to become its own "next".  If an allocation fails, the buffers created
+ * by this call are released and the segments allocated for them are freed
+ * again.  Returns 0 or a negative error code.
+ */
+static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
+					 struct the_nilfs *nilfs, int nadd)
+{
+	struct nilfs_segment_buffer *segbuf, *prev, *n;
+	struct inode *sufile = nilfs->ns_sufile;
+	__u64 nextnextnum;
+	LIST_HEAD(list);
+	int err, ret, i;
+
+	prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+	/*
+	 * Since the segment specified with nextnum might be allocated during
+	 * the previous construction, the buffer including its segusage may
+	 * not be dirty.  The following call ensures that the buffer is dirty
+	 * and will pin the buffer on memory until the sufile is written. 
+	 */
+	err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
+	if (unlikely(err))
+		return err;
+
+	for (i = 0; i < nadd; i++) {
+		/* extend segment info */
+		err = -ENOMEM;
+		segbuf = nilfs_segbuf_new(sci->sc_super);
+		if (unlikely(!segbuf))
+			goto failed;
+
+		/* map this buffer to region of segment on-disk */
+		nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
+		sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
+
+		/* allocate the next next full segment */
+		err = nilfs_sufile_alloc(sufile, &nextnextnum);
+		if (unlikely(err))
+			goto failed_segbuf;
+
+		segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
+		nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
+
+		list_add_tail(&segbuf->sb_list, &list);
+		prev = segbuf;
+	}
+	list_splice(&list, sci->sc_segbufs.prev); /* append the new buffers */
+	return 0;
+
+ failed_segbuf:
+	nilfs_segbuf_free(segbuf);	/* not yet on the local list */
+ failed:
+	list_for_each_entry_safe(segbuf, n, &list, sb_list) {
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_free(segbuf);
+	}
+	return err;
+}
+/*
+ * nilfs_segctor_free_incomplete_segments - give back the segments that
+ * were reserved for a construction which did not complete.
+ *
+ * The "next" segment of each segment buffer is freed (for the first
+ * buffer only if it differs from nilfs->ns_nextnum), and the first buffer
+ * found with sb_io_error set is handled according to the cases noted in
+ * the body.
+ */
+static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
+						   struct the_nilfs *nilfs)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int ret, done = 0;
+
+	segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
+		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+	}
+	if (segbuf->sb_io_error) {
+		/* Case 1: The first segment failed */
+		if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
+			/* Case 1a:  Partial segment appended into an existing
+			   segment */
+			nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
+						segbuf->sb_fseg_end);
+		else /* Case 1b:  New full segment */
+			set_nilfs_discontinued(nilfs);
+		done++;
+	}
+
+	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
+		ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
+		WARN_ON(ret); /* never fails */
+		if (!done 
&& segbuf->sb_io_error) {
+			if (segbuf->sb_segnum != nilfs->ns_nextnum)
+				/* Case 2: extended segment (!= next) failed */
+				nilfs_sufile_set_error(nilfs->ns_sufile,
+						       segbuf->sb_segnum);
+			done++;
+		}
+	}
+}
+/*
+ * nilfs_segctor_clear_segment_buffers - clear every segment buffer on
+ * sci->sc_segbufs and forget the super root buffer.
+ */
+static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
+		nilfs_segbuf_clear(segbuf);
+	sci->sc_super_root = NULL;
+}
+/*
+ * nilfs_segctor_destroy_segment_buffers - unlink and free every segment
+ * buffer on sci->sc_segbufs.
+ */
+static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf;
+
+	while (!list_empty(&sci->sc_segbufs)) {
+		segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+		list_del_init(&segbuf->sb_list);
+		nilfs_segbuf_free(segbuf);
+	}
+	/* sci->sc_curseg = NULL; */
+}
+/*
+ * nilfs_segctor_end_construction - finish one construction pass.
+ *
+ * If @err is nonzero, the reserved segments are released and the pending
+ * segment frees are cancelled first.  The segment buffers are cleared in
+ * either case.
+ */
+static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
+					   struct the_nilfs *nilfs, int err)
+{
+	if (unlikely(err)) {
+		nilfs_segctor_free_incomplete_segments(sci, nilfs);
+		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+	}
+	nilfs_segctor_clear_segment_buffers(sci);
+}
+/*
+ * nilfs_segctor_update_segusage - for every segment buffer, store the
+ * modification time (sci->sc_seg_ctime) and the block count in the
+ * segment usage entry of its segment.
+ *
+ * The block count is the number of blocks in this partial segment plus
+ * the offset of the partial segment within the full segment.
+ */
+static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
+					  struct inode *sufile)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct buffer_head *bh_su;
+	struct nilfs_segment_usage *raw_su;
+	unsigned long live_blocks;
+	int ret;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
+						     &raw_su, &bh_su);
+		WARN_ON(ret); /* always succeed because bh_su is dirty */
+		live_blocks = segbuf->sb_sum.nblocks +
+			(segbuf->sb_pseg_start - segbuf->sb_fseg_start);
+		raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
+		raw_su->su_nblocks = cpu_to_le32(live_blocks);
+		nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
+					       bh_su);
+	}
+}
+/*
+ * nilfs_segctor_cancel_segusage - revert the block counts written by
+ * nilfs_segctor_update_segusage().
+ *
+ * The first segment gets back the offset of the partial segment within
+ * the full segment as its block count; all following segments are set
+ * to 0.
+ */
+static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
+					  struct inode *sufile)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct buffer_head *bh_su;
+	struct nilfs_segment_usage *raw_su;
+	
int ret;
+
+	segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+	ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
+					     &raw_su, &bh_su);
+	WARN_ON(ret); /* always succeed because bh_su is dirty */
+	raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
+					 segbuf->sb_fseg_start);
+	nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
+
+	list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
+		ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
+						     &raw_su, &bh_su);
+		WARN_ON(ret); /* always succeed */
+		raw_su->su_nblocks = 0;
+		nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
+					       bh_su);
+	}
+}
+/*
+ * nilfs_segctor_truncate_segments - drop every segment buffer that
+ * follows @last on sci->sc_segbufs.
+ *
+ * For each dropped buffer the available block count is reduced, the
+ * segment reserved as its "next" is freed in @sufile, and the buffer is
+ * released.
+ */
+static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
+					    struct nilfs_segment_buffer *last,
+					    struct inode *sufile)
+{
+	struct nilfs_segment_buffer *segbuf = last, *n;
+	int ret;
+
+	list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
+					  sb_list) {
+		list_del_init(&segbuf->sb_list);
+		sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
+		ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
+		WARN_ON(ret);
+		nilfs_segbuf_free(segbuf);
+	}
+}
+
+/*
+ * nilfs_segctor_collect - collect blocks, adding segment buffers and
+ * retrying while the collection does not fit.
+ *
+ * On -E2BIG in SC_LSEG_SR mode, at stage NILFS_ST_CPFILE or later, the
+ * collection state is rolled back to the stage saved on entry, more
+ * segment buffers are added (the number doubles per retry, up to
+ * SC_MAX_SEGDELTA) and the collection is redone.  In all other -E2BIG
+ * cases the partial result is accepted.  Unused trailing segment buffers
+ * are dropped before returning 0.  Other errors are returned as they are.
+ */
+static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
+				 struct the_nilfs *nilfs, int mode)
+{
+	struct nilfs_cstage prev_stage = sci->sc_stage;
+	int err, nadd = 1;
+
+	/* Collection retry loop */
+	for (;;) {
+		sci->sc_super_root = NULL;
+		sci->sc_nblk_this_inc = 0;
+		sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
+
+		err = nilfs_segctor_reset_segment_buffer(sci);
+		if (unlikely(err))
+			goto failed;
+
+		err = nilfs_segctor_collect_blocks(sci, mode);
+		sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
+		if (!err)
+			break;
+
+		if (unlikely(err != -E2BIG))
+			goto failed;
+
+		/* The current segment is filled up */
+		if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+			break;
+
+		nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
+		nilfs_segctor_clear_segment_buffers(sci);
+
+		err = nilfs_segctor_extend_segments(sci, 
nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
+		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
+		sci->sc_stage = prev_stage;	/* redo from the saved stage */
+	}
+	nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
+	return 0;
+
+ failed:
+	return err;
+}
+/*
+ * nilfs_list_replace_buffer - put @new_bh in the place of @old_bh on the
+ * b_assoc_buffers list.  @new_bh must not be on a list (BUG otherwise).
+ */
+static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
+				      struct buffer_head *new_bh)
+{
+	BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
+
+	list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
+	/* The caller must release old_bh */
+}
+/*
+ * nilfs_segctor_update_payload_blocknr - assign disk block numbers to the
+ * payload buffers of @segbuf and write their binfo entries into the
+ * segment summary.
+ *
+ * Block numbers start right after the summary blocks of the partial
+ * segment.  The walk stops at the super root buffer or when all finfo
+ * entries have been consumed.  Returns 0 on success or the value of
+ * nilfs_handle_bmap_error() when nilfs_bmap_assign() fails.
+ */
+static int
+nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
+				     struct nilfs_segment_buffer *segbuf,
+				     int mode)
+{
+	struct inode *inode = NULL;
+	sector_t blocknr;
+	unsigned long nfinfo = segbuf->sb_sum.nfinfo;
+	unsigned long nblocks = 0, ndatablk = 0;
+	struct nilfs_sc_operations *sc_op = NULL;
+	struct nilfs_segsum_pointer ssp;
+	struct nilfs_finfo *finfo = NULL;
+	union nilfs_binfo binfo;
+	struct buffer_head *bh, *bh_org;
+	ino_t ino = 0;
+	int err = 0;
+
+	if (!nfinfo)
+		goto out;
+
+	blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
+	ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
+	ssp.offset = sizeof(struct nilfs_segment_summary);
+
+	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
+		if (bh == sci->sc_super_root)
+			break;
+		if (!finfo) {	/* first block of the next file */
+			finfo = nilfs_segctor_map_segsum_entry(
+				sci, &ssp, sizeof(*finfo));
+			ino = le64_to_cpu(finfo->fi_ino);
+			nblocks = le32_to_cpu(finfo->fi_nblocks);
+			ndatablk = le32_to_cpu(finfo->fi_ndatablk);
+
+			if (buffer_nilfs_node(bh))
+				inode = NILFS_BTNC_I(bh->b_page->mapping);
+			else
+				inode = NILFS_AS_I(bh->b_page->mapping);
+
+			if (mode == SC_LSEG_DSYNC)
+				sc_op = &nilfs_sc_dsync_ops;
+			else if (ino == NILFS_DAT_INO)
+				sc_op = &nilfs_sc_dat_ops;
+			else /* file blocks */
+				sc_op = &nilfs_sc_file_ops;
+		}
+		bh_org = bh;
+		get_bh(bh_org);
+		err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
+					&binfo);
+		if (bh != bh_org)
+			
nilfs_list_replace_buffer(bh_org, bh);
+		brelse(bh_org);
+		if (unlikely(err))
+			goto failed_bmap;
+
+		if (ndatablk > 0)
+			sc_op->write_data_binfo(sci, &ssp, &binfo);
+		else
+			sc_op->write_node_binfo(sci, &ssp, &binfo);
+
+		blocknr++;
+		if (--nblocks == 0) {	/* this file is finished */
+			finfo = NULL;
+			if (--nfinfo == 0)
+				break;
+		} else if (ndatablk > 0)
+			ndatablk--;
+	}
+ out:
+	return 0;
+
+ failed_bmap:
+	err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
+	return err;
+}
+/*
+ * nilfs_segctor_assign - assign block numbers to the payload of every
+ * segment buffer and fill in each segment summary.
+ *
+ * Returns 0 on success or the first error reported by
+ * nilfs_segctor_update_payload_blocknr().
+ */
+static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
+{
+	struct nilfs_segment_buffer *segbuf;
+	int err;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
+		if (unlikely(err))
+			return err;
+		nilfs_segbuf_fill_in_segsum(segbuf);
+	}
+	return 0;
+}
+/*
+ * nilfs_copy_replace_page_buffers - copy the listed buffers of @page into
+ * a privately allocated page and put the copies on the list instead.
+ *
+ * Every buffer of @page that is on a b_assoc_buffers list is copied into
+ * the matching buffer of the clone page; the copy takes its place on that
+ * list and the original buffer is appended to @out.  The clone page is
+ * marked as under writeback and unlocked.  Returns 0 on success or
+ * -ENOMEM if the clone page cannot be allocated.
+ */
+static int
+nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
+{
+	struct page *clone_page;
+	struct buffer_head *bh, *head, *bh2;
+	void *kaddr;
+
+	bh = head = page_buffers(page);
+
+	clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
+	if (unlikely(!clone_page))
+		return -ENOMEM;
+
+	bh2 = page_buffers(clone_page);
+	kaddr = kmap_atomic(page, KM_USER0);
+	do {
+		if (list_empty(&bh->b_assoc_buffers))
+			continue;
+		get_bh(bh2);
+		page_cache_get(clone_page); /* for each bh */
+		memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
+		bh2->b_blocknr = bh->b_blocknr;
+		list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
+		list_add_tail(&bh->b_assoc_buffers, out);
+	} while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	if (!TestSetPageWriteback(clone_page))
+		inc_zone_page_state(clone_page, NR_WRITEBACK);
+	unlock_page(clone_page);
+
+	return 0;
+}
+/*
+ * nilfs_test_page_to_be_frozen - decide whether the buffers of @page have
+ * to be copied before being written.
+ *
+ * Returns 0 for pages without a mapping or host and for directory pages.
+ * Returns 1 (after clearing PageChecked) if the page is mapped, otherwise
+ * the value of PageChecked().
+ */
+static int nilfs_test_page_to_be_frozen(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
+		return 0;
+
+	if (page_mapped(page)) 
{
+		ClearPageChecked(page);
+		return 1;
+	}
+	return PageChecked(page);
+}
+/*
+ * nilfs_begin_page_io - put @page into the writeback state and, if it has
+ * to be frozen, replace its listed buffers with private copies (the
+ * originals are collected on @out).
+ *
+ * A NULL page or a page already under writeback is ignored.  Returns 0 or
+ * the error of nilfs_copy_replace_page_buffers().
+ */
+static int nilfs_begin_page_io(struct page *page, struct list_head *out)
+{
+	if (!page || PageWriteback(page))
+		/* For split b-tree node pages, this function may be called
+		   twice.  We ignore the 2nd or later calls by this check. */
+		return 0;
+
+	lock_page(page);
+	clear_page_dirty_for_io(page);
+	set_page_writeback(page);
+	unlock_page(page);
+
+	if (nilfs_test_page_to_be_frozen(page)) {
+		int err = nilfs_copy_replace_page_buffers(page, out);
+		if (unlikely(err))
+			return err;
+	}
+	return 0;
+}
+/*
+ * nilfs_segctor_prepare_write - mark all pages of the segment buffers as
+ * under writeback.
+ *
+ * Pages of segment summary buffers and of the super root buffer
+ * (bd_page) are switched to writeback directly; pages of the other
+ * payload buffers (fs_page) go through nilfs_begin_page_io().  A page is
+ * processed when the walk moves on to a buffer of a different page, and
+ * the last page of each kind is processed after the loops.  On failure
+ * the page that could not be prepared is stored in *@failed_page and the
+ * error is returned; otherwise *@failed_page is NULL and 0 is returned.
+ */
+static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
+				       struct page **failed_page)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+	struct list_head *list = &sci->sc_copied_buffers;
+	int err;
+
+	*failed_page = NULL;
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			if (bh->b_page != bd_page) {
+				if (bd_page) {
+					lock_page(bd_page);
+					clear_page_dirty_for_io(bd_page);
+					set_page_writeback(bd_page);
+					unlock_page(bd_page);
+				}
+				bd_page = bh->b_page;
+			}
+		}
+
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			if (bh == sci->sc_super_root) {
+				if (bh->b_page != bd_page) {
+					lock_page(bd_page);
+					clear_page_dirty_for_io(bd_page);
+					set_page_writeback(bd_page);
+					unlock_page(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				err = nilfs_begin_page_io(fs_page, list);
+				if (unlikely(err)) {
+					*failed_page = fs_page;
+					goto out;
+				}
+				fs_page = bh->b_page;
+			}
+		}
+	}
+	if (bd_page) {
+		lock_page(bd_page);
+		clear_page_dirty_for_io(bd_page);
+		set_page_writeback(bd_page);
+		unlock_page(bd_page);
+	}
+	err = nilfs_begin_page_io(fs_page, list);
+	if (unlikely(err))
+		*failed_page = fs_page;
+ out:
+	return err;
+}
+/*
+ * nilfs_segctor_write - submit every segment buffer for writing and wait
+ * for its completion, one buffer at a time.
+ *
+ * Returns 0 on success and a nonzero value as soon as the write or the
+ * wait of a segment buffer fails.
+ */
+static int nilfs_segctor_write(struct nilfs_sc_info 
*sci, + struct backing_dev_info *bdi) +{ + struct nilfs_segment_buffer *segbuf; + struct nilfs_write_info wi; + int err, res; + + wi.sb = sci->sc_super; + wi.bh_sr = sci->sc_super_root; + wi.bdi = bdi; + + list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { + nilfs_segbuf_prepare_write(segbuf, &wi); + err = nilfs_segbuf_write(segbuf, &wi); + + res = nilfs_segbuf_wait(segbuf, &wi); + err = unlikely(err) ? : res; + if (unlikely(err)) + return err; + } + return 0; +} + +static int nilfs_page_has_uncleared_buffer(struct page *page) +{ + struct buffer_head *head, *bh; + + head = bh = page_buffers(page); + do { + if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers)) + return 1; + bh = bh->b_this_page; + } while (bh != head); + return 0; +} + +static void __nilfs_end_page_io(struct page *page, int err) +{ + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); + } + + if (buffer_nilfs_allocated(page_buffers(page))) { + if (TestClearPageWriteback(page)) + dec_zone_page_state(page, NR_WRITEBACK); + } else + end_page_writeback(page); +} + +static void nilfs_end_page_io(struct page *page, int err) +{ + if (!page) + return; + + if (buffer_nilfs_node(page_buffers(page)) && + nilfs_page_has_uncleared_buffer(page)) + /* For b-tree node pages, this function may be called twice + or more because they might be split in a segment. + This check assures that cleanup has been done for all + buffers in a split btnode page. 
*/
+		return;
+
+	__nilfs_end_page_io(page, err);
+}
+/*
+ * nilfs_clear_copied_buffers - release the original buffers that were
+ * replaced by private copies and finish the I/O of their pages.
+ *
+ * For each page with a buffer on @list, every listed buffer of that page
+ * is unlinked and released; if @err is 0 it is also marked up to date and
+ * its dirty and nilfs_volatile flags are cleared.  The page is then
+ * passed to __nilfs_end_page_io() with @err.
+ */
+static void nilfs_clear_copied_buffers(struct list_head *list, int err)
+{
+	struct buffer_head *bh, *head;
+	struct page *page;
+
+	while (!list_empty(list)) {
+		bh = list_entry(list->next, struct buffer_head,
+				b_assoc_buffers);
+		page = bh->b_page;
+		page_cache_get(page);
+		head = bh = page_buffers(page);
+		do {
+			if (!list_empty(&bh->b_assoc_buffers)) {
+				list_del_init(&bh->b_assoc_buffers);
+				if (!err) {
+					set_buffer_uptodate(bh);
+					clear_buffer_dirty(bh);
+					clear_buffer_nilfs_volatile(bh);
+				}
+				brelse(bh); /* for b_assoc_buffers */
+			}
+		} while ((bh = bh->b_this_page) != head);
+
+		__nilfs_end_page_io(page, err);
+		page_cache_release(page);
+	}
+}
+/*
+ * nilfs_segctor_abort_write - end the writeback of the pages that
+ * nilfs_segctor_prepare_write() had started, after a failure.
+ *
+ * The pages are visited in the same order as in
+ * nilfs_segctor_prepare_write().  The walk over the payload pages stops
+ * once @failed_page has been handled; the copied buffers are cleaned up
+ * in every case.
+ */
+static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
+				      struct page *failed_page, int err)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			if (bh->b_page != bd_page) {
+				if (bd_page)
+					end_page_writeback(bd_page);
+				bd_page = bh->b_page;
+			}
+		}
+
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			if (bh == sci->sc_super_root) {
+				if (bh->b_page != bd_page) {
+					end_page_writeback(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				nilfs_end_page_io(fs_page, err);
+				if (unlikely(fs_page == failed_page))
+					goto done;
+				fs_page = bh->b_page;
+			}
+		}
+	}
+	if (bd_page)
+		end_page_writeback(bd_page);
+
+	nilfs_end_page_io(fs_page, err);
+ done:
+	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
+}
+/*
+ * nilfs_set_next_segment - record in @nilfs the write position that
+ * follows @segbuf: segment number, next segment number, offset within
+ * the segment, sequence number and creation time.
+ */
+static void nilfs_set_next_segment(struct the_nilfs *nilfs,
+				   struct nilfs_segment_buffer *segbuf)
+{
+	nilfs->ns_segnum = segbuf->sb_segnum;
+	nilfs->ns_nextnum = segbuf->sb_nextnum;
+	nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
+		+ segbuf->sb_sum.nblocks;
+	
nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
+	nilfs->ns_ctime = segbuf->sb_sum.ctime;
+}
+/*
+ * nilfs_segctor_complete_write - clean up after all segment buffers have
+ * been written successfully.
+ *
+ * Marks the written buffers up to date and clean, ends the writeback of
+ * their pages, tracks whether a logical segment is left unclosed, drops
+ * the collected inodes, advances the write position in the_nilfs and,
+ * when a super root was written, records the last segment and advances
+ * the checkpoint number.
+ */
+static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
+{
+	struct nilfs_segment_buffer *segbuf;
+	struct page *bd_page = NULL, *fs_page = NULL;
+	struct nilfs_sb_info *sbi = sci->sc_sbi;
+	struct the_nilfs *nilfs = sbi->s_nilfs;
+	int update_sr = (sci->sc_super_root != NULL);
+
+	list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
+		struct buffer_head *bh;
+
+		list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+				    b_assoc_buffers) {
+			set_buffer_uptodate(bh);
+			clear_buffer_dirty(bh);
+			if (bh->b_page != bd_page) {
+				if (bd_page)
+					end_page_writeback(bd_page);
+				bd_page = bh->b_page;
+			}
+		}
+		/*
+		 * We assume that the buffers which belong to the same page
+		 * continue over the buffer list.
+		 * Under this assumption, the last BHs of pages are
+		 * identifiable by the discontinuity of bh->b_page
+		 * (page != fs_page).
+		 *
+		 * For B-tree node blocks, however, this assumption is not
+		 * guaranteed.  The cleanup code of B-tree node pages needs
+		 * special care.
+		 */
+		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+				    b_assoc_buffers) {
+			set_buffer_uptodate(bh);
+			clear_buffer_dirty(bh);
+			clear_buffer_nilfs_volatile(bh);
+			if (bh == sci->sc_super_root) {
+				if (bh->b_page != bd_page) {
+					end_page_writeback(bd_page);
+					bd_page = bh->b_page;
+				}
+				break;
+			}
+			if (bh->b_page != fs_page) {
+				nilfs_end_page_io(fs_page, 0);
+				fs_page = bh->b_page;
+			}
+		}
+
+		if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
+			if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
+				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
+				sci->sc_lseg_stime = jiffies;
+			}
+			if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
+				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
+		}
+	}
+	/*
+	 * Since pages may continue over multiple segment buffers,
+	 * end of the last page must be checked outside of the loop. 
+	 */
+	if (bd_page)
+		end_page_writeback(bd_page);
+
+	nilfs_end_page_io(fs_page, 0);
+
+	nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
+
+	nilfs_drop_collected_inodes(&sci->sc_dirty_files);
+
+	if (nilfs_doing_gc()) {
+		nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
+		if (update_sr)
+			nilfs_commit_gcdat_inode(nilfs);
+	} else
+		nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
+
+	sci->sc_nblk_inc += sci->sc_nblk_this_inc;
+
+	segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
+	nilfs_set_next_segment(nilfs, segbuf);
+
+	if (update_sr) {
+		/* the checkpoint number is advanced after being recorded */
+		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
+				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
+		sbi->s_super->s_dirt = 1;
+
+		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
+		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
+		set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+	} else
+		clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
+}
+/*
+ * nilfs_segctor_check_in_files - move the inodes on sbi->s_dirty_files to
+ * sci->sc_dirty_files.
+ *
+ * Each inode gets its inode block buffer attached if it has none (the
+ * inode lock is dropped while the block is read, and the list walk is
+ * restarted afterwards), its i_cno set to the current checkpoint number,
+ * NILFS_I_QUEUED cleared and NILFS_I_BUSY set.  Returns 0 on success or
+ * the error of nilfs_ifile_get_inode_block(); in the error case the inode
+ * lock has already been released.
+ */
+static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
+					struct nilfs_sb_info *sbi)
+{
+	struct nilfs_inode_info *ii, *n;
+	__u64 cno = sbi->s_nilfs->ns_cno;
+
+	spin_lock(&sbi->s_inode_lock);
+ retry:
+	list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
+		if (!ii->i_bh) {
+			struct buffer_head *ibh;
+			int err;
+
+			spin_unlock(&sbi->s_inode_lock);
+			err = nilfs_ifile_get_inode_block(
+				sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
+			if (unlikely(err)) {
+				nilfs_warning(sbi->s_super, __func__,
+					      "failed to get inode block.\n");
+				return err;
+			}
+			nilfs_mdt_mark_buffer_dirty(ibh);
+			nilfs_mdt_mark_dirty(sbi->s_ifile);
+			spin_lock(&sbi->s_inode_lock);
+			if (likely(!ii->i_bh))
+				ii->i_bh = ibh;
+			else
+				brelse(ibh); /* i_bh was set meanwhile */
+			goto retry;
+		}
+		ii->i_cno = cno;
+
+		clear_bit(NILFS_I_QUEUED, &ii->i_state);
+		set_bit(NILFS_I_BUSY, &ii->i_state);
+		list_del(&ii->i_dirty);
+		list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
+	}
+	spin_unlock(&sbi->s_inode_lock);
+
+	NILFS_I(sbi->s_ifile)->i_cno = cno;
+
+	return 0;
+}
+/*
+ * nilfs_segctor_check_out_files - release the inodes on
+ * sci->sc_dirty_files that have been written and were not dirtied again.
+ *
+ * Such inodes lose NILFS_I_BUSY and their inode block buffer and are
+ * moved to the garbage list of the current transaction.  The other
+ * inodes stay on the list and only get their i_cno refreshed.
+ */
+static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
+					  
struct nilfs_sb_info *sbi) +{ + struct nilfs_transaction_info *ti = current->journal_info; + struct nilfs_inode_info *ii, *n; + __u64 cno = sbi->s_nilfs->ns_cno; + + spin_lock(&sbi->s_inode_lock); + list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { + if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || + test_bit(NILFS_I_DIRTY, &ii->i_state)) { + /* The current checkpoint number (=nilfs->ns_cno) is + changed between check-in and check-out only if the + super root is written out. So, we can update i_cno + for the inodes that remain in the dirty list. */ + ii->i_cno = cno; + continue; + } + clear_bit(NILFS_I_BUSY, &ii->i_state); + brelse(ii->i_bh); + ii->i_bh = NULL; + list_del(&ii->i_dirty); + list_add_tail(&ii->i_dirty, &ti->ti_garbage); + } + spin_unlock(&sbi->s_inode_lock); +} + +/* + * Main procedure of segment constructor + */ +static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + struct page *failed_page; + int err, has_sr = 0; + + sci->sc_stage.scnt = NILFS_ST_INIT; + + err = nilfs_segctor_check_in_files(sci, sbi); + if (unlikely(err)) + goto out; + + if (nilfs_test_metadata_dirty(sbi)) + set_bit(NILFS_SC_DIRTY, &sci->sc_flags); + + if (nilfs_segctor_clean(sci)) + goto out; + + do { + sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK; + + err = nilfs_segctor_begin_construction(sci, nilfs); + if (unlikely(err)) + goto out; + + /* Update time stamp */ + sci->sc_seg_ctime = get_seconds(); + + err = nilfs_segctor_collect(sci, nilfs, mode); + if (unlikely(err)) + goto failed; + + has_sr = (sci->sc_super_root != NULL); + + /* Avoid empty segment */ + if (sci->sc_stage.scnt == NILFS_ST_DONE && + NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { + nilfs_segctor_end_construction(sci, nilfs, 1); + goto out; + } + + err = nilfs_segctor_assign(sci, mode); + if (unlikely(err)) + goto failed; + + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + 
nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile); + + if (has_sr) { + err = nilfs_segctor_fill_in_checkpoint(sci); + if (unlikely(err)) + goto failed_to_make_up; + + nilfs_segctor_fill_in_super_root(sci, nilfs); + } + nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); + + /* Write partial segments */ + err = nilfs_segctor_prepare_write(sci, &failed_page); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed); + + err = nilfs_segctor_write(sci, nilfs->ns_bdi); + if (unlikely(err)) + goto failed_to_write; + + nilfs_segctor_complete_write(sci); + + /* Commit segments */ + if (has_sr) { + nilfs_segctor_commit_free_segments(sci); + nilfs_segctor_clear_metadata_dirty(sci); + } + + nilfs_segctor_end_construction(sci, nilfs, 0); + + } while (sci->sc_stage.scnt != NILFS_ST_DONE); + + out: + nilfs_segctor_destroy_segment_buffers(sci); + nilfs_segctor_check_out_files(sci, sbi); + return err; + + failed_to_write: + nilfs_segctor_abort_write(sci, failed_page, err); + nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile); + + failed_to_make_up: + if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED) + nilfs_redirty_inodes(&sci->sc_dirty_files); + + failed: + if (nilfs_doing_gc()) + nilfs_redirty_inodes(&sci->sc_gc_inodes); + nilfs_segctor_end_construction(sci, nilfs, err); + goto out; +} + +/** + * nilfs_segctor_start_timer - set timer of background write + * @sci: nilfs_sc_info + * + * If the timer has already been set, it ignores the new request. + * This function MUST be called within a section locking the segment + * semaphore. 
+ */ +static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci) +{ + spin_lock(&sci->sc_state_lock); + if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) { + sci->sc_timer->expires = jiffies + sci->sc_interval; + add_timer(sci->sc_timer); + sci->sc_state |= NILFS_SEGCTOR_COMMIT; + } + spin_unlock(&sci->sc_state_lock); +} + +static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) +{ + spin_lock(&sci->sc_state_lock); + if (!(sci->sc_flush_request & (1 << bn))) { + unsigned long prev_req = sci->sc_flush_request; + + sci->sc_flush_request |= (1 << bn); + if (!prev_req) + wake_up(&sci->sc_wait_daemon); + } + spin_unlock(&sci->sc_state_lock); +} + +/** + * nilfs_flush_segment - trigger a segment construction for resource control + * @sb: super block + * @ino: inode number of the file to be flushed out. + */ +void nilfs_flush_segment(struct super_block *sb, ino_t ino) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + + if (!sci || nilfs_doing_construction()) + return; + nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? 
ino : 0); + /* assign bit 0 to data files */ +} + +int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci, + __u64 *segnum, size_t nsegs) +{ + struct nilfs_segment_entry *ent; + struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; + struct inode *sufile = nilfs->ns_sufile; + LIST_HEAD(list); + __u64 *pnum; + size_t i; + int err; + + for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) { + ent = nilfs_alloc_segment_entry(*pnum); + if (unlikely(!ent)) { + err = -ENOMEM; + goto failed; + } + list_add_tail(&ent->list, &list); + + err = nilfs_open_segment_entry(ent, sufile); + if (unlikely(err)) + goto failed; + + if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su))) + printk(KERN_WARNING "NILFS: unused segment is " + "requested to be cleaned (segnum=%llu)\n", + (unsigned long long)ent->segnum); + nilfs_close_segment_entry(ent, sufile); + } + list_splice(&list, sci->sc_cleaning_segments.prev); + return 0; + + failed: + nilfs_dispose_segment_list(&list); + return err; +} + +void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci) +{ + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); +} + +struct nilfs_segctor_wait_request { + wait_queue_t wq; + __u32 seq; + int err; + atomic_t done; +}; + +static int nilfs_segctor_sync(struct nilfs_sc_info *sci) +{ + struct nilfs_segctor_wait_request wait_req; + int err = 0; + + spin_lock(&sci->sc_state_lock); + init_wait(&wait_req.wq); + wait_req.err = 0; + atomic_set(&wait_req.done, 0); + wait_req.seq = ++sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + init_waitqueue_entry(&wait_req.wq, current); + add_wait_queue(&sci->sc_wait_request, &wait_req.wq); + set_current_state(TASK_INTERRUPTIBLE); + wake_up(&sci->sc_wait_daemon); + + for (;;) { + if (atomic_read(&wait_req.done)) { + err = wait_req.err; + break; + } + if (!signal_pending(current)) { + schedule(); + continue; + } + err = -ERESTARTSYS; + break; + } + finish_wait(&sci->sc_wait_request, &wait_req.wq); + return err; +} + +static void 
nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err) +{ + struct nilfs_segctor_wait_request *wrq, *n; + unsigned long flags; + + spin_lock_irqsave(&sci->sc_wait_request.lock, flags); + list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list, + wq.task_list) { + if (!atomic_read(&wrq->done) && + nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) { + wrq->err = err; + atomic_set(&wrq->done, 1); + } + if (atomic_read(&wrq->done)) { + wrq->wq.func(&wrq->wq, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags); +} + +/** + * nilfs_construct_segment - construct a logical segment + * @sb: super block + * + * Return Value: On success, 0 is returned. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_construct_segment(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_transaction_info *ti; + int err; + + if (!sci) + return -EROFS; + + /* A call inside transactions causes a deadlock. */ + BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC); + + err = nilfs_segctor_sync(sci); + return err; +} + +/** + * nilfs_construct_dsync_segment - construct a data-only logical segment + * @sb: super block + * @inode: inode whose data blocks should be written out + * @start: start byte offset + * @end: end byte offset (inclusive) + * + * Return Value: On success, 0 is returned. On errors, one of the following + * negative error code is returned. + * + * %-EROFS - Read only filesystem. + * + * %-EIO - I/O error + * + * %-ENOSPC - No space left on device (only in a panic state). + * + * %-ERESTARTSYS - Interrupted. + * + * %-ENOMEM - Insufficient memory available. 
+ */ +int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, + loff_t start, loff_t end) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct nilfs_inode_info *ii; + struct nilfs_transaction_info ti; + int err = 0; + + if (!sci) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 0); + + ii = NILFS_I(inode); + if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + nilfs_test_opt(sbi, STRICT_ORDER) || + test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + nilfs_discontinued(sbi->s_nilfs)) { + nilfs_transaction_unlock(sbi); + err = nilfs_segctor_sync(sci); + return err; + } + + spin_lock(&sbi->s_inode_lock); + if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && + !test_bit(NILFS_I_BUSY, &ii->i_state)) { + spin_unlock(&sbi->s_inode_lock); + nilfs_transaction_unlock(sbi); + return 0; + } + spin_unlock(&sbi->s_inode_lock); + sci->sc_dsync_inode = ii; + sci->sc_dsync_start = start; + sci->sc_dsync_end = end; + + err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + + nilfs_transaction_unlock(sbi); + return err; +} + +struct nilfs_segctor_req { + int mode; + __u32 seq_accepted; + int sc_err; /* construction failure */ + int sb_err; /* super block writeback failure */ +}; + +#define FLUSH_FILE_BIT (0x1) /* data file only */ +#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */ + +static void nilfs_segctor_accept(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + req->sc_err = req->sb_err = 0; + spin_lock(&sci->sc_state_lock); + req->seq_accepted = sci->sc_seq_request; + spin_unlock(&sci->sc_state_lock); + + if (sci->sc_timer) + del_timer_sync(sci->sc_timer); +} + +static void nilfs_segctor_notify(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + /* Clear requests (even when the construction failed) */ + spin_lock(&sci->sc_state_lock); + + sci->sc_state &= ~NILFS_SEGCTOR_COMMIT; + + if (req->mode == SC_LSEG_SR) { + sci->sc_seq_done = req->seq_accepted; + 
nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); + sci->sc_flush_request = 0; + } else if (req->mode == SC_FLUSH_FILE) + sci->sc_flush_request &= ~FLUSH_FILE_BIT; + else if (req->mode == SC_FLUSH_DAT) + sci->sc_flush_request &= ~FLUSH_DAT_BIT; + + spin_unlock(&sci->sc_state_lock); +} + +static int nilfs_segctor_construct(struct nilfs_sc_info *sci, + struct nilfs_segctor_req *req) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + if (nilfs_discontinued(nilfs)) + req->mode = SC_LSEG_SR; + if (!nilfs_segctor_confirm(sci)) { + err = nilfs_segctor_do_construct(sci, req->mode); + req->sc_err = err; + } + if (likely(!err)) { + if (req->mode != SC_FLUSH_DAT) + atomic_set(&nilfs->ns_ndirtyblks, 0); + if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && + nilfs_discontinued(nilfs)) { + down_write(&nilfs->ns_sem); + req->sb_err = nilfs_commit_super(sbi, 0); + up_write(&nilfs->ns_sem); + } + } + return err; +} + +static void nilfs_construction_timeout(unsigned long data) +{ + struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); +} + +static void +nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head) +{ + struct nilfs_inode_info *ii, *n; + + list_for_each_entry_safe(ii, n, head, i_dirty) { + if (!test_bit(NILFS_I_UPDATED, &ii->i_state)) + continue; + hlist_del_init(&ii->vfs_inode.i_hash); + list_del_init(&ii->i_dirty); + nilfs_clear_gcinode(&ii->vfs_inode); + } +} + +int nilfs_clean_segments(struct super_block *sb, void __user *argp) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_sc_info *sci = NILFS_SC(sbi); + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + int err; + + if (unlikely(!sci)) + return -EROFS; + + nilfs_transaction_lock(sbi, &ti, 1); + + err = nilfs_init_gcdat_inode(nilfs); + if (unlikely(err)) + goto out_unlock; + err = 
nilfs_ioctl_prepare_clean_segments(nilfs, argp); + if (unlikely(err)) + goto out_unlock; + + list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev); + + for (;;) { + nilfs_segctor_accept(sci, &req); + err = nilfs_segctor_construct(sci, &req); + nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes); + nilfs_segctor_notify(sci, &req); + + if (likely(!err)) + break; + + nilfs_warning(sb, __func__, + "segment construction failed. (err=%d)", err); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(sci->sc_interval); + } + + out_unlock: + nilfs_clear_gcdat_inode(nilfs); + nilfs_transaction_unlock(sbi); + return err; +} + +static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = mode }; + + nilfs_transaction_lock(sbi, &ti, 0); + + nilfs_segctor_accept(sci, &req); + nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + + /* + * Unclosed segment should be retried. We do this using sc_timer. + * Timeout of sc_timer will invoke complete construction which leads + * to close the current logical segment. + */ + if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) + nilfs_segctor_start_timer(sci); + + nilfs_transaction_unlock(sbi); +} + +static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) +{ + int mode = 0; + int err; + + spin_lock(&sci->sc_state_lock); + mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ? + SC_FLUSH_DAT : SC_FLUSH_FILE; + spin_unlock(&sci->sc_state_lock); + + if (mode) { + err = nilfs_segctor_do_construct(sci, mode); + + spin_lock(&sci->sc_state_lock); + sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ? 
+ ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT; + spin_unlock(&sci->sc_state_lock); + } + clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags); +} + +static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci) +{ + if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || + time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) { + if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT)) + return SC_FLUSH_FILE; + else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT)) + return SC_FLUSH_DAT; + } + return SC_LSEG_SR; +} + +/** + * nilfs_segctor_thread - main loop of the segment constructor thread. + * @arg: pointer to a struct nilfs_sc_info. + * + * nilfs_segctor_thread() initializes a timer and serves as a daemon + * to execute segment constructions. + */ +static int nilfs_segctor_thread(void *arg) +{ + struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; + struct timer_list timer; + int timeout = 0; + + init_timer(&timer); + timer.data = (unsigned long)current; + timer.function = nilfs_construction_timeout; + sci->sc_timer = &timer; + + /* start sync. */ + sci->sc_task = current; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */ + printk(KERN_INFO + "segctord starting. 
Construction interval = %lu seconds, " + "CP frequency < %lu seconds\n", + sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ); + + spin_lock(&sci->sc_state_lock); + loop: + for (;;) { + int mode; + + if (sci->sc_state & NILFS_SEGCTOR_QUIT) + goto end_thread; + + if (timeout || sci->sc_seq_request != sci->sc_seq_done) + mode = SC_LSEG_SR; + else if (!sci->sc_flush_request) + break; + else + mode = nilfs_segctor_flush_mode(sci); + + spin_unlock(&sci->sc_state_lock); + nilfs_segctor_thread_construct(sci, mode); + spin_lock(&sci->sc_state_lock); + timeout = 0; + } + + + if (freezing(current)) { + spin_unlock(&sci->sc_state_lock); + refrigerator(); + spin_lock(&sci->sc_state_lock); + } else { + DEFINE_WAIT(wait); + int should_sleep = 1; + + prepare_to_wait(&sci->sc_wait_daemon, &wait, + TASK_INTERRUPTIBLE); + + if (sci->sc_seq_request != sci->sc_seq_done) + should_sleep = 0; + else if (sci->sc_flush_request) + should_sleep = 0; + else if (sci->sc_state & NILFS_SEGCTOR_COMMIT) + should_sleep = time_before(jiffies, + sci->sc_timer->expires); + + if (should_sleep) { + spin_unlock(&sci->sc_state_lock); + schedule(); + spin_lock(&sci->sc_state_lock); + } + finish_wait(&sci->sc_wait_daemon, &wait); + timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && + time_after_eq(jiffies, sci->sc_timer->expires)); + } + goto loop; + + end_thread: + spin_unlock(&sci->sc_state_lock); + del_timer_sync(sci->sc_timer); + sci->sc_timer = NULL; + + /* end sync. 
*/ + sci->sc_task = NULL; + wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */ + return 0; +} + +static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci) +{ + struct task_struct *t; + + t = kthread_run(nilfs_segctor_thread, sci, "segctord"); + if (IS_ERR(t)) { + int err = PTR_ERR(t); + + printk(KERN_ERR "NILFS: error %d creating segctord thread\n", + err); + return err; + } + wait_event(sci->sc_wait_task, sci->sc_task != NULL); + return 0; +} + +static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci) +{ + sci->sc_state |= NILFS_SEGCTOR_QUIT; + + while (sci->sc_task) { + wake_up(&sci->sc_wait_daemon); + spin_unlock(&sci->sc_state_lock); + wait_event(sci->sc_wait_task, sci->sc_task == NULL); + spin_lock(&sci->sc_state_lock); + } +} + +static int nilfs_segctor_init(struct nilfs_sc_info *sci) +{ + sci->sc_seq_done = sci->sc_seq_request; + + return nilfs_segctor_start_thread(sci); +} + +/* + * Setup & clean-up functions + */ +static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi) +{ + struct nilfs_sc_info *sci; + + sci = kzalloc(sizeof(*sci), GFP_KERNEL); + if (!sci) + return NULL; + + sci->sc_sbi = sbi; + sci->sc_super = sbi->s_super; + + init_waitqueue_head(&sci->sc_wait_request); + init_waitqueue_head(&sci->sc_wait_daemon); + init_waitqueue_head(&sci->sc_wait_task); + spin_lock_init(&sci->sc_state_lock); + INIT_LIST_HEAD(&sci->sc_dirty_files); + INIT_LIST_HEAD(&sci->sc_segbufs); + INIT_LIST_HEAD(&sci->sc_gc_inodes); + INIT_LIST_HEAD(&sci->sc_cleaning_segments); + INIT_LIST_HEAD(&sci->sc_copied_buffers); + + sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; + sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; + sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; + + if (sbi->s_interval) + sci->sc_interval = sbi->s_interval; + if (sbi->s_watermark) + sci->sc_watermark = sbi->s_watermark; + return sci; +} + +static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) +{ + int ret, retrycount = NILFS_SC_CLEANUP_RETRY; + 
+ /* The segctord thread was stopped and its timer was removed. + But some tasks remain. */ + do { + struct nilfs_sb_info *sbi = sci->sc_sbi; + struct nilfs_transaction_info ti; + struct nilfs_segctor_req req = { .mode = SC_LSEG_SR }; + + nilfs_transaction_lock(sbi, &ti, 0); + nilfs_segctor_accept(sci, &req); + ret = nilfs_segctor_construct(sci, &req); + nilfs_segctor_notify(sci, &req); + nilfs_transaction_unlock(sbi); + + } while (ret && retrycount-- > 0); +} + +/** + * nilfs_segctor_destroy - destroy the segment constructor. + * @sci: nilfs_sc_info + * + * nilfs_segctor_destroy() kills the segctord thread and frees + * the nilfs_sc_info struct. + * Caller must hold the segment semaphore. + */ +static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) +{ + struct nilfs_sb_info *sbi = sci->sc_sbi; + int flag; + + up_write(&sbi->s_nilfs->ns_segctor_sem); + + spin_lock(&sci->sc_state_lock); + nilfs_segctor_kill_thread(sci); + flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request + || sci->sc_seq_request != sci->sc_seq_done); + spin_unlock(&sci->sc_state_lock); + + if (flag || nilfs_segctor_confirm(sci)) + nilfs_segctor_write_out(sci); + + WARN_ON(!list_empty(&sci->sc_copied_buffers)); + + if (!list_empty(&sci->sc_dirty_files)) { + nilfs_warning(sbi->s_super, __func__, + "dirty file(s) after the final construction\n"); + nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); + } + + if (!list_empty(&sci->sc_cleaning_segments)) + nilfs_dispose_segment_list(&sci->sc_cleaning_segments); + + WARN_ON(!list_empty(&sci->sc_segbufs)); + + down_write(&sbi->s_nilfs->ns_segctor_sem); + + kfree(sci); +} + +/** + * nilfs_attach_segment_constructor - attach a segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, + * initializes it, and starts the segment constructor. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error code is returned. 
+ * + * %-ENOMEM - Insufficient memory available. + */ +int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + + /* Each field of nilfs_segctor is cleared through the initialization + of super-block info */ + sbi->s_sc_info = nilfs_segctor_new(sbi); + if (!sbi->s_sc_info) + return -ENOMEM; + + nilfs_attach_writer(nilfs, sbi); + err = nilfs_segctor_init(NILFS_SC(sbi)); + if (err) { + nilfs_detach_writer(nilfs, sbi); + kfree(sbi->s_sc_info); + sbi->s_sc_info = NULL; + } + return err; +} + +/** + * nilfs_detach_segment_constructor - destroy the segment constructor + * @sbi: nilfs_sb_info + * + * nilfs_detach_segment_constructor() kills the segment constructor daemon, + * frees the struct nilfs_sc_info, and destroy the dirty file list. + */ +void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + LIST_HEAD(garbage_list); + + down_write(&nilfs->ns_segctor_sem); + if (NILFS_SC(sbi)) { + nilfs_segctor_destroy(NILFS_SC(sbi)); + sbi->s_sc_info = NULL; + } + + /* Force to free the list of dirty files */ + spin_lock(&sbi->s_inode_lock); + if (!list_empty(&sbi->s_dirty_files)) { + list_splice_init(&sbi->s_dirty_files, &garbage_list); + nilfs_warning(sbi->s_super, __func__, + "Non empty dirty list after the last " + "segment construction\n"); + } + spin_unlock(&sbi->s_inode_lock); + up_write(&nilfs->ns_segctor_sem); + + nilfs_dispose_list(sbi, &garbage_list, 1); + nilfs_detach_writer(nilfs, sbi); +} diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h new file mode 100644 index 00000000000..a98fc1ed0bb --- /dev/null +++ b/fs/nilfs2/segment.h @@ -0,0 +1,243 @@ +/* + * segment.h - NILFS Segment constructor prototypes and definitions + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ +#ifndef _NILFS_SEGMENT_H +#define _NILFS_SEGMENT_H + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "sb.h" + +/** + * struct nilfs_recovery_info - Recovery information + * @ri_need_recovery: Recovery status + * @ri_super_root: Block number of the last super root + * @ri_cno: Number of the last checkpoint + * @ri_lsegs_start: Region for roll-forwarding (start block number) + * @ri_lsegs_end: Region for roll-forwarding (end block number) + * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start + * @ri_used_segments: List of segments to be marked active + * @ri_pseg_start: Block number of the last partial segment + * @ri_seq: Sequence number on the last partial segment + * @ri_segnum: Segment number on the last partial segment + * @ri_nextnum: Next segment number on the last partial segment + */ +struct nilfs_recovery_info { + int ri_need_recovery; + sector_t ri_super_root; + __u64 ri_cno; + + sector_t ri_lsegs_start; + sector_t ri_lsegs_end; + u64 ri_lsegs_start_seq; + struct list_head ri_used_segments; + sector_t ri_pseg_start; + u64 ri_seq; + __u64 ri_segnum; + __u64 ri_nextnum; +}; + +/* 
ri_need_recovery */ +#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */ +#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */ + +/** + * struct nilfs_cstage - Context of collection stage + * @scnt: Stage count + * @flags: State flags + * @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file + * @gc_inode_ptr: Pointer on the list of gc-inodes + */ +struct nilfs_cstage { + int scnt; + unsigned flags; + struct nilfs_inode_info *dirty_file_ptr; + struct nilfs_inode_info *gc_inode_ptr; +}; + +struct nilfs_segment_buffer; + +struct nilfs_segsum_pointer { + struct buffer_head *bh; + unsigned offset; /* offset in bytes */ +}; + +/** + * struct nilfs_sc_info - Segment constructor information + * @sc_super: Back pointer to super_block struct + * @sc_sbi: Back pointer to nilfs_sb_info struct + * @sc_nblk_inc: Block count of current generation + * @sc_dirty_files: List of files to be written + * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_cleaning_segments: List of segments to be freed through construction + * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data + * @sc_dsync_inode: inode whose data pages are written for a sync operation + * @sc_dsync_start: start byte offset of data pages + * @sc_dsync_end: end byte offset of data pages (inclusive) + * @sc_segbufs: List of segment buffers + * @sc_segbuf_nblocks: Number of available blocks in segment buffers. 
+ * @sc_curseg: Current segment buffer + * @sc_super_root: Pointer to the super root buffer + * @sc_stage: Collection stage + * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary + * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary + * @sc_blk_cnt: Block count of a file + * @sc_datablk_cnt: Data block count of a file + * @sc_nblk_this_inc: Number of blocks included in the current logical segment + * @sc_seg_ctime: Creation time + * @sc_flags: Internal flags + * @sc_state_lock: spinlock for sc_state and so on + * @sc_state: Segctord state flags + * @sc_flush_request: inode bitmap of metadata files to be flushed + * @sc_wait_request: Client request queue + * @sc_wait_daemon: Daemon wait queue + * @sc_wait_task: Start/end wait queue to control segctord task + * @sc_seq_request: Request counter + * @sc_seq_done: Completion counter + * @sc_sync: Request of explicit sync operation + * @sc_interval: Timeout value of background construction + * @sc_mjcp_freq: Frequency of creating checkpoints + * @sc_lseg_stime: Start time of the latest logical segment + * @sc_watermark: Watermark for the number of dirty buffers + * @sc_timer: Timer for segctord + * @sc_task: current thread of segctord + */ +struct nilfs_sc_info { + struct super_block *sc_super; + struct nilfs_sb_info *sc_sbi; + + unsigned long sc_nblk_inc; + + struct list_head sc_dirty_files; + struct list_head sc_gc_inodes; + struct list_head sc_cleaning_segments; + struct list_head sc_copied_buffers; + + struct nilfs_inode_info *sc_dsync_inode; + loff_t sc_dsync_start; + loff_t sc_dsync_end; + + /* Segment buffers */ + struct list_head sc_segbufs; + unsigned long sc_segbuf_nblocks; + struct nilfs_segment_buffer *sc_curseg; + struct buffer_head *sc_super_root; + + struct nilfs_cstage sc_stage; + + struct nilfs_segsum_pointer sc_finfo_ptr; + struct nilfs_segsum_pointer sc_binfo_ptr; + unsigned long sc_blk_cnt; + unsigned long sc_datablk_cnt; + unsigned long sc_nblk_this_inc; 
+ time_t sc_seg_ctime; + + unsigned long sc_flags; + + spinlock_t sc_state_lock; + unsigned long sc_state; + unsigned long sc_flush_request; + + wait_queue_head_t sc_wait_request; + wait_queue_head_t sc_wait_daemon; + wait_queue_head_t sc_wait_task; + + __u32 sc_seq_request; + __u32 sc_seq_done; + + int sc_sync; + unsigned long sc_interval; + unsigned long sc_mjcp_freq; + unsigned long sc_lseg_stime; /* in 1/HZ seconds */ + unsigned long sc_watermark; + + struct timer_list *sc_timer; + struct task_struct *sc_task; +}; + +/* sc_flags */ +enum { + NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ + NILFS_SC_UNCLOSED, /* Logical segment is not closed */ + NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ + NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a + checkpoint */ + NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files + other than DAT, cpfile, sufile, or files + moved by GC */ +}; + +/* sc_state */ +#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */ +#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */ + +/* + * Constant parameters + */ +#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when + destroying segctord */ + +/* + * Default values of timeout, in seconds. + */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. + It triggers construction of a + logical segment with a super root */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root + creation */ + +/* + * The default threshold amount of data, in block counts. 
+ */ +#define NILFS_SC_DEFAULT_WATERMARK 3600 + + +/* segment.c */ +extern int nilfs_init_transaction_cache(void); +extern void nilfs_destroy_transaction_cache(void); +extern void nilfs_relax_pressure_in_lock(struct super_block *); + +extern int nilfs_construct_segment(struct super_block *); +extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, + loff_t, loff_t); +extern void nilfs_flush_segment(struct super_block *, ino_t); +extern int nilfs_clean_segments(struct super_block *, void __user *); + +extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *, + __u64 *, size_t); +extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *); + +extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); +extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); + +/* recovery.c */ +extern int nilfs_read_super_root_block(struct super_block *, sector_t, + struct buffer_head **, int); +extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, + struct nilfs_recovery_info *); +extern int nilfs_recover_logical_segments(struct the_nilfs *, + struct nilfs_sb_info *, + struct nilfs_recovery_info *); + +#endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c new file mode 100644 index 00000000000..98e68677f04 --- /dev/null +++ b/fs/nilfs2/sufile.c @@ -0,0 +1,558 @@ +/* + * sufile.c - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/errno.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" +#include "sufile.h" + + +static inline unsigned long +nilfs_sufile_segment_usages_per_block(const struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_entries_per_block; +} + +static unsigned long +nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); + return (unsigned long)t; +} + +static unsigned long +nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) +{ + __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); +} + +static unsigned long +nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr, + __u64 max) +{ + return min_t(unsigned long, + nilfs_sufile_segment_usages_per_block(sufile) - + nilfs_sufile_get_offset(sufile, curr), + max - curr + 1); +} + +static inline struct nilfs_sufile_header * +nilfs_sufile_block_get_header(const struct inode *sufile, + struct buffer_head *bh, + void *kaddr) +{ + return kaddr + bh_offset(bh); +} + +static struct nilfs_segment_usage * +nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum, + struct buffer_head *bh, void *kaddr) +{ + return kaddr + bh_offset(bh) + + nilfs_sufile_get_offset(sufile, segnum) * + NILFS_MDT(sufile)->mi_entry_size; +} + +static inline int nilfs_sufile_get_header_block(struct inode *sufile, + struct buffer_head **bhp) +{ + return 
nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp); +} + +static inline int +nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, + int create, struct buffer_head **bhp) +{ + return nilfs_mdt_get_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum), + create, NULL, bhp); +} + +static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, + u64 ncleanadd, u64 ndirtyadd) +{ + struct nilfs_sufile_header *header; + void *kaddr; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + le64_add_cpu(&header->sh_ncleansegs, ncleanadd); + le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); +} + +int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)) +{ + struct buffer_head *header_bh, *bh; + int ret; + + if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) { + printk(KERN_WARNING "%s: invalid segment number: %llu\n", + __func__, (unsigned long long)segnum); + return -EINVAL; + } + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, create, &bh); + if (!ret) { + dofunc(sufile, segnum, header_bh, bh); + brelse(bh); + } + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_alloc - allocate a segment + * @sufile: inode of segment usage file + * @segnump: pointer to segment number + * + * Description: nilfs_sufile_alloc() allocates a clean segment. + * + * Return Value: On success, 0 is returned and the segment number of the + * allocated segment is stored in the place pointed by @segnump. On error, one + * of the following negative error codes is returned. + * + * %-EIO - I/O error. 
+ * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - No clean segment left. + */ +int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) +{ + struct buffer_head *header_bh, *su_bh; + struct nilfs_sufile_header *header; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + __u64 segnum, maxsegnum, last_alloc; + void *kaddr; + unsigned long nsegments, ncleansegs, nsus; + int ret, i, j; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + ncleansegs = le64_to_cpu(header->sh_ncleansegs); + last_alloc = le64_to_cpu(header->sh_last_alloc); + kunmap_atomic(kaddr, KM_USER0); + + nsegments = nilfs_sufile_get_nsegments(sufile); + segnum = last_alloc + 1; + maxsegnum = nsegments - 1; + for (i = 0; i < nsegments; i += nsus) { + if (segnum >= nsegments) { + /* wrap around */ + segnum = 0; + maxsegnum = last_alloc; + } + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, + &su_bh); + if (ret < 0) + goto out_header; + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + + nsus = nilfs_sufile_segment_usages_in_block( + sufile, segnum, maxsegnum); + for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) { + if (!nilfs_segment_usage_clean(su)) + continue; + /* found a clean segment */ + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header( + sufile, header_bh, kaddr); + le64_add_cpu(&header->sh_ncleansegs, -1); + le64_add_cpu(&header->sh_ndirtysegs, 1); + header->sh_last_alloc = cpu_to_le64(segnum); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_mdt_mark_buffer_dirty(header_bh); + nilfs_mdt_mark_buffer_dirty(su_bh); + 
nilfs_mdt_mark_dirty(sufile); + brelse(su_bh); + *segnump = segnum; + goto out_header; + } + + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + + /* no segments left */ + ret = -ENOSPC; + + out_header: + brelse(header_bh); + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (unlikely(!nilfs_segment_usage_clean(su))) { + printk(KERN_WARNING "%s: segment %llu must be clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + nilfs_segment_usage_set_dirty(su); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, -1, 1); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int clean, dirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) && + su->su_nblocks == cpu_to_le32(0)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + clean = nilfs_segment_usage_clean(su); + dirty = nilfs_segment_usage_dirty(su); + + /* make the segment garbage */ + su->su_lastmod = cpu_to_le64(0); + su->su_nblocks = cpu_to_le32(0); + su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); + kunmap_atomic(kaddr, KM_USER0); + + nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 
0 : 1); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int sudirty; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_clean(su)) { + printk(KERN_WARNING "%s: segment %llu is already clean\n", + __func__, (unsigned long long)segnum); + kunmap_atomic(kaddr, KM_USER0); + return; + } + WARN_ON(nilfs_segment_usage_error(su)); + WARN_ON(!nilfs_segment_usage_dirty(su)); + + sudirty = nilfs_segment_usage_dirty(su); + nilfs_segment_usage_set_clean(su); + kunmap_atomic(kaddr, KM_USER0); + nilfs_mdt_mark_buffer_dirty(su_bh); + + nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_get_segment_usage - get a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @sup: pointer to segment usage + * @bhp: pointer to buffer head + * + * Description: nilfs_sufile_get_segment_usage() acquires the segment usage + * specified by @segnum. + * + * Return Value: On success, 0 is returned, and the segment usage and the + * buffer head of the buffer on which the segment usage is located are stored + * in the place pointed by @sup and @bhp, respectively. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number. 
+ */ +int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum, + struct nilfs_segment_usage **sup, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + struct nilfs_segment_usage *su; + void *kaddr; + int ret; + + /* segnum is 0 origin, so nsegments itself is already out of range */ + if (segnum >= nilfs_sufile_get_nsegments(sufile)) + return -EINVAL; + down_write(&NILFS_MDT(sufile)->mi_sem); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh); + if (ret < 0) + goto out_sem; + kaddr = kmap(bh->b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap(bh->b_page); + brelse(bh); + ret = -EINVAL; + goto out_sem; + } + + if (sup != NULL) + *sup = su; + *bhp = bh; + + out_sem: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_put_segment_usage - put a segment usage + * @sufile: inode of segment usage file + * @segnum: segment number + * @bh: buffer head + * + * Description: nilfs_sufile_put_segment_usage() releases the segment usage + * specified by @segnum. @bh must be the buffer head which has been returned + * by a previous call to nilfs_sufile_get_segment_usage() with @segnum. + */ +void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum, + struct buffer_head *bh) +{ + kunmap(bh->b_page); + brelse(bh); +} + +/** + * nilfs_sufile_get_stat - get segment usage statistics + * @sufile: inode of segment usage file + * @sustat: pointer to a structure of segment usage statistics + * + * Description: nilfs_sufile_get_stat() returns information about segment + * usage. + * + * Return Value: On success, 0 is returned, and segment usage information is + * stored in the place pointed by @sustat. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available.
+ */ +int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) +{ + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + int ret; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out_sem; + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr); + sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile); + sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs); + sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs); + sustat->ss_ctime = nilfs->ns_ctime; + sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime; + spin_lock(&nilfs->ns_last_segment_lock); + sustat->ss_prot_seq = nilfs->ns_prot_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + kunmap_atomic(kaddr, KM_USER0); + brelse(header_bh); + + out_sem: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** + * nilfs_sufile_get_ncleansegs - get the number of clean segments + * @sufile: inode of segment usage file + * @nsegsp: pointer to the number of clean segments + * + * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean + * segments. + * + * Return Value: On success, 0 is returned and the number of clean segments is + * stored in the place pointed by @nsegsp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available.
+ */ +int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp) +{ + struct nilfs_sustat sustat; + int ret; + + ret = nilfs_sufile_get_stat(sufile, &sustat); + if (ret == 0) + *nsegsp = sustat.ss_ncleansegs; + return ret; +} + +void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, + struct buffer_head *header_bh, + struct buffer_head *su_bh) +{ + struct nilfs_segment_usage *su; + void *kaddr; + int suclean; + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr); + if (nilfs_segment_usage_error(su)) { + kunmap_atomic(kaddr, KM_USER0); + return; + } + suclean = nilfs_segment_usage_clean(su); + nilfs_segment_usage_set_error(su); + kunmap_atomic(kaddr, KM_USER0); + + if (suclean) + nilfs_sufile_mod_counter(header_bh, -1, 0); + nilfs_mdt_mark_buffer_dirty(su_bh); + nilfs_mdt_mark_dirty(sufile); +} + +/** + * nilfs_sufile_get_suinfo - get usage information of consecutive segments + * @sufile: inode of segment usage file + * @segnum: segment number to start looking + * @si: array of suinfo + * @nsi: size of suinfo array + * + * Description: nilfs_sufile_get_suinfo() fills @si with the usage of up to + * @nsi segments starting at @segnum. Entries whose segment usage block + * does not exist (a hole) are zero-filled. + * + * NOTE(review): the entry count is derived from nsegments - @segnum, which + * wraps if @segnum is past the last segment - confirm callers check it. + * + * Return Value: On success, the number of entries stored in @si is returned. + * On error, one of the following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available.
+ */ +ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, + struct nilfs_suinfo *si, size_t nsi) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs; + void *kaddr; + unsigned long nsegs, segusages_per_block; + ssize_t n; + int ret, i, j; + + down_read(&NILFS_MDT(sufile)->mi_sem); + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs = min_t(unsigned long, + nilfs_sufile_get_nsegments(sufile) - segnum, + nsi); + for (i = 0; i < nsegs; i += n, segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + nsegs - i); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out; + /* hole: no block for these entries, report them zeroed */ + memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n); + continue; + } + + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + for (j = 0; j < n; j++, su = (void *)su + susz) { + si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod); + si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks); + si[i + j].sui_flags = le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE); + if (nilfs_segment_is_active(nilfs, segnum + j)) + si[i + j].sui_flags |= + (1UL << NILFS_SEGMENT_USAGE_ACTIVE); + } + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + } + ret = nsegs; + + out: + up_read(&NILFS_MDT(sufile)->mi_sem); + return ret; +} diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h new file mode 100644 index 00000000000..a2e2efd4ade --- /dev/null +++ b/fs/nilfs2/sufile.h @@ -0,0 +1,125 @@ +/* + * sufile.h - NILFS segment usage file. + * + * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Koji Sato <koji@osrg.net>. + */ + +#ifndef _NILFS_SUFILE_H +#define _NILFS_SUFILE_H + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/nilfs2_fs.h> +#include "mdt.h" + +#define NILFS_SUFILE_GFP NILFS_MDT_GFP + +static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) +{ + return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments; +} + +int nilfs_sufile_alloc(struct inode *, __u64 *); +int nilfs_sufile_get_segment_usage(struct inode *, __u64, + struct nilfs_segment_usage **, + struct buffer_head **); +void nilfs_sufile_put_segment_usage(struct inode *, __u64, + struct buffer_head *); +int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); +int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *); +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *, + size_t); + +int nilfs_sufile_update(struct inode *, __u64, int, + void (*dofunc)(struct inode *, __u64, + struct buffer_head *, + struct buffer_head *)); +void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_scrap(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_free(struct inode *, __u64, struct 
buffer_head *, + struct buffer_head *); +void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, + struct buffer_head *); + +/** + * nilfs_sufile_cancel_free - cancel the freeing of a segment + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_cancel_free() marks a clean segment dirty again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +static inline int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_cancel_free); +} + +/** + * nilfs_sufile_scrap - make a segment garbage + * @sufile: inode of segment usage file + * @segnum: segment number to be made garbage + */ +static inline int nilfs_sufile_scrap(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 1, nilfs_sufile_do_scrap); +} + +/** + * nilfs_sufile_free - free segment + * @sufile: inode of segment usage file + * @segnum: segment number to be freed + */ +static inline int nilfs_sufile_free(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, nilfs_sufile_do_free); +} + +/** + * nilfs_sufile_set_error - mark a segment as erroneous + * @sufile: inode of segment usage file + * @segnum: segment number + * + * Description: nilfs_sufile_set_error() marks the segment specified by + * @segnum as erroneous. The error segment will never be used again. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid segment usage number.
+ */ +static inline int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum) +{ + return nilfs_sufile_update(sufile, segnum, 0, + nilfs_sufile_do_set_error); +} + +#endif /* _NILFS_SUFILE_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c new file mode 100644 index 00000000000..6989b03e97a --- /dev/null +++ b/fs/nilfs2/super.c @@ -0,0 +1,1326 @@ +/* + * super.c - NILFS module and super block management. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + */ +/* + * linux/fs/ext2/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/random.h> +#include <linux/crc32.h> +#include <linux/smp_lock.h> +#include <linux/vfs.h> +#include <linux/writeback.h> +#include <linux/kobject.h> +#include <linux/exportfs.h> +#include "nilfs.h" +#include "mdt.h" +#include "alloc.h" +#include "page.h" +#include "cpfile.h" +#include "ifile.h" +#include "dat.h" +#include "segment.h" +#include "segbuf.h" + +MODULE_AUTHOR("NTT Corp."); +MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " + "(NILFS)"); +MODULE_LICENSE("GPL"); + +static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags); + +/** + * nilfs_error() - report failure condition on a filesystem + * + * nilfs_error() sets an ERROR_FS flag on the superblock as well as + * reporting an error message. It should be called when NILFS detects + * incoherences or defects of meta data on disk. As for sustainable + * errors such as a single-shot I/O error, nilfs_warning() or the printk() + * function should be used instead. + * + * The segment constructor must not call this function because it can + * kill itself. + */ +void nilfs_error(struct super_block *sb, const char *function, + const char *fmt, ...) 
+{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + va_list args; + + va_start(args, fmt); + printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (!(sb->s_flags & MS_RDONLY)) { + struct the_nilfs *nilfs = sbi->s_nilfs; + + if (!nilfs_test_opt(sbi, ERRORS_CONT)) + nilfs_detach_segment_constructor(sbi); + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + nilfs->ns_sbp[0]->s_state |= + cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sbi, 1); + } + up_write(&nilfs->ns_sem); + + if (nilfs_test_opt(sbi, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + } + + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + panic("NILFS (device %s): panic forced after error\n", + sb->s_id); +} + +void nilfs_warning(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk(KERN_WARNING "NILFS warning (device %s): %s: ", + sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + +static struct kmem_cache *nilfs_inode_cachep; + +struct inode *nilfs_alloc_inode(struct super_block *sb) +{ + struct nilfs_inode_info *ii; + + ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS); + if (!ii) + return NULL; + ii->i_bh = NULL; + ii->i_state = 0; + ii->vfs_inode.i_version = 1; + nilfs_btnode_cache_init(&ii->i_btnode_cache); + return &ii->vfs_inode; +} + +void nilfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); +} + +static void init_once(void *obj) +{ + struct nilfs_inode_info *ii = obj; + + INIT_LIST_HEAD(&ii->i_dirty); +#ifdef CONFIG_NILFS_XATTR + init_rwsem(&ii->xattr_sem); +#endif + nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + inode_init_once(&ii->vfs_inode); +} + +static int nilfs_init_inode_cache(void) +{ 
+ nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", + sizeof(struct nilfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once); + + return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0; +} + +static inline void nilfs_destroy_inode_cache(void) +{ + kmem_cache_destroy(nilfs_inode_cachep); +} + +static void nilfs_clear_inode(struct inode *inode) +{ + struct nilfs_inode_info *ii = NILFS_I(inode); + +#ifdef CONFIG_NILFS_POSIX_ACL + if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_acl); + ii->i_acl = NILFS_ACL_NOT_CACHED; + } + if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) { + posix_acl_release(ii->i_default_acl); + ii->i_default_acl = NILFS_ACL_NOT_CACHED; + } +#endif + /* + * Free resources allocated in nilfs_read_inode(), here. + */ + BUG_ON(!list_empty(&ii->i_dirty)); + brelse(ii->i_bh); + ii->i_bh = NULL; + + if (test_bit(NILFS_I_BMAP, &ii->i_state)) + nilfs_bmap_clear(ii->i_bmap); + + nilfs_btnode_cache_clear(&ii->i_btnode_cache); +} + +static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err; + int barrier_done = 0; + + if (nilfs_test_opt(sbi, BARRIER)) { + set_buffer_ordered(nilfs->ns_sbh[0]); + barrier_done = 1; + } + retry: + set_buffer_dirty(nilfs->ns_sbh[0]); + err = sync_dirty_buffer(nilfs->ns_sbh[0]); + if (err == -EOPNOTSUPP && barrier_done) { + nilfs_warning(sbi->s_super, __func__, + "barrier-based sync failed. " + "disabling barriers\n"); + nilfs_clear_opt(sbi, BARRIER); + barrier_done = 0; + clear_buffer_ordered(nilfs->ns_sbh[0]); + goto retry; + } + if (unlikely(err)) { + printk(KERN_ERR + "NILFS: unable to write superblock (err=%d)\n", err); + if (err == -EIO && nilfs->ns_sbh[1]) { + nilfs_fall_back_super_block(nilfs); + goto retry; + } + } else { + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + + /* + * The latest segment becomes trailable from the position + * written in superblock. 
+ */ + clear_nilfs_discontinued(nilfs); + + /* update GC protection for recent segments */ + if (nilfs->ns_sbh[1]) { + sbp = NULL; + if (dupsb) { + set_buffer_dirty(nilfs->ns_sbh[1]); + if (!sync_dirty_buffer(nilfs->ns_sbh[1])) + sbp = nilfs->ns_sbp[1]; + } + } + if (sbp) { + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + } + + return err; +} + +int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + sector_t nfreeblocks; + time_t t; + int err; + + /* nilfs->ns_sem must be locked by the caller. */ + if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) { + if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC) + nilfs_swap_super_block(nilfs); + else { + printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", + sbi->s_super->s_id); + return -EIO; + } + } + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: failed to count free blocks\n"); + return err; + } + spin_lock(&nilfs->ns_last_segment_lock); + sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); + + t = get_seconds(); + nilfs->ns_sbwtime[0] = t; + sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks); + sbp[0]->s_wtime = cpu_to_le64(t); + sbp[0]->s_sum = 0; /* cleared before the CRC below is computed */ + sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[0], + nilfs->ns_sbsize)); + if (dupsb && sbp[1]) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + nilfs->ns_sbwtime[1] = t; + } + sbi->s_super->s_dirt = 0; + return nilfs_sync_super(sbi, dupsb); +} + +static void nilfs_put_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_detach_segment_constructor(sbi); + + if
(!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } + + nilfs_detach_checkpoint(sbi); + put_nilfs(sbi->s_nilfs); + sbi->s_super = NULL; + sb->s_fs_info = NULL; + kfree(sbi); +} + +/** + * nilfs_write_super - write super block(s) of NILFS + * @sb: super_block + * + * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and + * clears s_dirt. This function is called in the section protected by + * lock_super(). + * + * The s_dirt flag is managed by each filesystem and we protect it by ns_sem + * of the struct the_nilfs. Lock order must be as follows: + * + * 1. lock_super() + * 2. down_write(&nilfs->ns_sem) + * + * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer + * of the super block (nilfs->ns_sbp[]). + * + * In most cases, VFS functions call lock_super() before calling these + * methods. So we must be careful not to bring on deadlocks when using + * lock_super(); see generic_shutdown_super(), write_super(), and so on. + * + * Note that order of lock_kernel() and lock_super() depends on contexts + * of VFS. We should also note that lock_kernel() can be used in its + * protective section and only the outermost one has an effect. 
+ */ +static void nilfs_write_super(struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + + down_write(&nilfs->ns_sem); + if (!(sb->s_flags & MS_RDONLY)) { + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 t = get_seconds(); + int dupsb; + + if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && + t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { + up_write(&nilfs->ns_sem); + return; + } + dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; + nilfs_commit_super(sbi, dupsb); + } + sb->s_dirt = 0; + up_write(&nilfs->ns_sem); +} + +static int nilfs_sync_fs(struct super_block *sb, int wait) +{ + int err = 0; + + /* This function is called when super block should be written back */ + if (wait) + err = nilfs_construct_segment(sb); + return err; +} + +int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_checkpoint *raw_cp; + struct buffer_head *bh_cp; + int err; + + down_write(&nilfs->ns_sem); + list_add(&sbi->s_list, &nilfs->ns_supers); + up_write(&nilfs->ns_sem); + + sbi->s_ifile = nilfs_mdt_new( + nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + if (!sbi->s_ifile) + return -ENOMEM; + + err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size); + if (unlikely(err)) + goto failed; + + err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, + &bh_cp); + if (unlikely(err)) { + if (err == -ENOENT || err == -EINVAL) { + printk(KERN_ERR + "NILFS: Invalid checkpoint " + "(checkpoint number=%llu)\n", + (unsigned long long)cno); + err = -EINVAL; + } + goto failed; + } + err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode); + if (unlikely(err)) + goto failed_bh; + atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count)); + atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count)); + + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + return 
0; + + failed_bh: + nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp); + failed: + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); + + return err; +} + +void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + + nilfs_mdt_clear(sbi->s_ifile); + nilfs_mdt_destroy(sbi->s_ifile); + sbi->s_ifile = NULL; + down_write(&nilfs->ns_sem); + list_del_init(&sbi->s_list); + up_write(&nilfs->ns_sem); +} + +static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + int err = 0; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + nilfs->ns_mount_state |= NILFS_VALID_FS; + err = nilfs_commit_super(sbi, 1); + if (likely(!err)) + printk(KERN_INFO "NILFS: recovery complete.\n"); + } + up_write(&nilfs->ns_sem); + return err; +} + +static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct the_nilfs *nilfs = sbi->s_nilfs; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + unsigned long long blocks; + unsigned long overhead; + unsigned long nrsvblocks; + sector_t nfreeblocks; + int err; + + /* + * Compute all of the segment blocks + * + * The blocks before first segment and after last segment + * are excluded. + */ + blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments + - nilfs->ns_first_data_block; + nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment; + + /* + * Compute the overhead + * + * When distributing meta data blocks outside semgent structure, + * We must count them as the overhead. 
+ */ + overhead = 0; + + err = nilfs_count_free_blocks(nilfs, &nfreeblocks); + if (unlikely(err)) + return err; + + buf->f_type = NILFS_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = blocks - overhead; + buf->f_bfree = nfreeblocks; + buf->f_bavail = (buf->f_bfree >= nrsvblocks) ? + (buf->f_bfree - nrsvblocks) : 0; + buf->f_files = atomic_read(&sbi->s_inodes_count); + buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */ + buf->f_namelen = NILFS_NAME_LEN; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + + return 0; +} + +static struct super_operations nilfs_sops = { + .alloc_inode = nilfs_alloc_inode, + .destroy_inode = nilfs_destroy_inode, + .dirty_inode = nilfs_dirty_inode, + /* .write_inode = nilfs_write_inode, */ + /* .put_inode = nilfs_put_inode, */ + /* .drop_inode = nilfs_drop_inode, */ + .delete_inode = nilfs_delete_inode, + .put_super = nilfs_put_super, + .write_super = nilfs_write_super, + .sync_fs = nilfs_sync_fs, + /* .write_super_lockfs */ + /* .unlockfs */ + .statfs = nilfs_statfs, + .remount_fs = nilfs_remount, + .clear_inode = nilfs_clear_inode, + /* .umount_begin */ + /* .show_options */ +}; + +static struct inode * +nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO && + ino != NILFS_SKETCH_INO) + return ERR_PTR(-ESTALE); + + inode = nilfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry * +nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + nilfs_nfs_get_inode); +} + +static struct dentry * +nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, + int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + 
nilfs_nfs_get_inode); +} + +static struct export_operations nilfs_export_ops = { + .fh_to_dentry = nilfs_fh_to_dentry, + .fh_to_parent = nilfs_fh_to_parent, + .get_parent = nilfs_get_parent, +}; + +enum { + Opt_err_cont, Opt_err_panic, Opt_err_ro, + Opt_barrier, Opt_snapshot, Opt_order, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_err_cont, "errors=continue"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier=%s"}, + {Opt_snapshot, "cp=%u"}, + {Opt_order, "order=%s"}, + {Opt_err, NULL} +}; + +static int match_bool(substring_t *s, int *result) +{ + int len = s->to - s->from; + + if (strncmp(s->from, "on", len) == 0) + *result = 1; + else if (strncmp(s->from, "off", len) == 0) + *result = 0; + else + return 1; + return 0; +} + +static int parse_options(char *options, struct super_block *sb) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + if (match_bool(&args[0], &option)) + return 0; + if (option) + nilfs_set_opt(sbi, BARRIER); + else + nilfs_clear_opt(sbi, BARRIER); + break; + case Opt_order: + if (strcmp(args[0].from, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(sbi, STRICT_ORDER); + else if (strcmp(args[0].from, "strict") == 0) + /* Strict in-order semantics */ + nilfs_set_opt(sbi, STRICT_ORDER); + else + return 0; + break; + case Opt_err_panic: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); + break; + case Opt_err_ro: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); + break; + case Opt_err_cont: + nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); + break; + case Opt_snapshot: + if (match_int(&args[0], &option) || option <= 0) + return 0; + if (!(sb->s_flags & MS_RDONLY)) + return 0; + sbi->s_snapshot_cno = option; + nilfs_set_opt(sbi, 
SNAPSHOT); + break; + default: + printk(KERN_ERR + "NILFS: Unrecognized mount option \"%s\"\n", p); + return 0; + } + } + return 1; +} + +static inline void +nilfs_set_default_options(struct nilfs_sb_info *sbi, + struct nilfs_super_block *sbp) +{ + sbi->s_mount_opt = + NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER; +} + +static int nilfs_setup_super(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); + int mnt_count = le16_to_cpu(sbp->s_mnt_count); + + /* nilfs->sem must be locked by the caller. */ + if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) { + printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n"); + } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) { + printk(KERN_WARNING + "NILFS warning: mounting fs with errors\n"); +#if 0 + } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) { + printk(KERN_WARNING + "NILFS warning: maximal mount count reached\n"); +#endif + } + if (!max_mnt_count) + sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); + sbp->s_mtime = cpu_to_le64(get_seconds()); + return nilfs_commit_super(sbi, 1); +} + +struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, + u64 pos, int blocksize, + struct buffer_head **pbh) +{ + unsigned long long sb_index = pos; + unsigned long offset; + + offset = do_div(sb_index, blocksize); + *pbh = sb_bread(sb, sb_index); + if (!*pbh) + return NULL; + return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); +} + +int nilfs_store_magic_and_option(struct super_block *sb, + struct nilfs_super_block *sbp, + char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + sb->s_magic = le16_to_cpu(sbp->s_magic); + + /* FS independent flags */ +#ifdef NILFS_ATIME_DISABLE + sb->s_flags |= MS_NOATIME; +#endif + + 
nilfs_set_default_options(sbi, sbp); + + sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); + sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); + sbi->s_interval = le32_to_cpu(sbp->s_c_interval); + sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); + + return !parse_options(data, sb) ? -EINVAL : 0 ; +} + +/** + * nilfs_fill_super() - initialize a super block instance + * @sb: super_block + * @data: mount options + * @silent: silent mode flag + * @nilfs: the_nilfs struct + * + * This function is called exclusively by bd_mount_mutex. + * So, the recovery process is protected from other simultaneous mounts. + */ +static int +nilfs_fill_super(struct super_block *sb, void *data, int silent, + struct the_nilfs *nilfs) +{ + struct nilfs_sb_info *sbi; + struct inode *root; + __u64 cno; + int err; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + + get_nilfs(nilfs); + sbi->s_nilfs = nilfs; + sbi->s_super = sb; + + err = init_nilfs(nilfs, sbi, (char *)data); + if (err) + goto failed_sbi; + + spin_lock_init(&sbi->s_inode_lock); + INIT_LIST_HEAD(&sbi->s_dirty_files); + INIT_LIST_HEAD(&sbi->s_list); + + /* + * Following initialization is overlapped because + * nilfs_sb_info structure has been cleared at the beginning. + * But we reserve them to keep our interest and make ready + * for the future change. 
+ */ + get_random_bytes(&sbi->s_next_generation, + sizeof(sbi->s_next_generation)); + spin_lock_init(&sbi->s_next_gen_lock); + + sb->s_op = &nilfs_sops; + sb->s_export_op = &nilfs_export_ops; + sb->s_root = NULL; + sb->s_time_gran = 1; + + if (!nilfs_loaded(nilfs)) { + err = load_nilfs(nilfs, sbi); + if (err) + goto failed_sbi; + } + cno = nilfs_last_cno(nilfs); + + if (sb->s_flags & MS_RDONLY) { + if (nilfs_test_opt(sbi, SNAPSHOT)) { + err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, + sbi->s_snapshot_cno); + if (err < 0) + goto failed_sbi; + if (!err) { + printk(KERN_ERR + "NILFS: The specified checkpoint is " + "not a snapshot " + "(checkpoint number=%llu).\n", + (unsigned long long)sbi->s_snapshot_cno); + err = -EINVAL; + goto failed_sbi; + } + cno = sbi->s_snapshot_cno; + } else + /* Read-only mount */ + sbi->s_snapshot_cno = cno; + } + + err = nilfs_attach_checkpoint(sbi, cno); + if (err) { + printk(KERN_ERR "NILFS: error loading a checkpoint" + " (checkpoint number=%llu).\n", (unsigned long long)cno); + goto failed_sbi; + } + + if (!(sb->s_flags & MS_RDONLY)) { + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto failed_checkpoint; + } + + root = nilfs_iget(sb, NILFS_ROOT_INO); + if (IS_ERR(root)) { + printk(KERN_ERR "NILFS: get root inode failed\n"); + err = PTR_ERR(root); + goto failed_segctor; + } + if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + iput(root); + printk(KERN_ERR "NILFS: corrupt root inode.\n"); + err = -EINVAL; + goto failed_segctor; + } + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + iput(root); + printk(KERN_ERR "NILFS: get root dentry failed\n"); + err = -ENOMEM; + goto failed_segctor; + } + + if (!(sb->s_flags & MS_RDONLY)) { + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + } + + err = nilfs_mark_recovery_complete(sbi); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: recovery failed.\n"); + goto failed_root; + } + + return 0; + + failed_root: + 
dput(sb->s_root); + sb->s_root = NULL; + + failed_segctor: + nilfs_detach_segment_constructor(sbi); + + failed_checkpoint: + nilfs_detach_checkpoint(sbi); + + failed_sbi: + put_nilfs(nilfs); + sb->s_fs_info = NULL; + kfree(sbi); + return err; +} + +static int nilfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct nilfs_sb_info *sbi = NILFS_SB(sb); + struct nilfs_super_block *sbp; + struct the_nilfs *nilfs = sbi->s_nilfs; + unsigned long old_sb_flags; + struct nilfs_mount_options old_opts; + int err; + + old_sb_flags = sb->s_flags; + old_opts.mount_opt = sbi->s_mount_opt; + old_opts.snapshot_cno = sbi->s_snapshot_cno; + + if (!parse_options(data, sb)) { + err = -EINVAL; + goto restore_opts; + } + sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + + if ((*flags & MS_RDONLY) && + sbi->s_snapshot_cno != old_opts.snapshot_cno) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount to a different snapshot. \n", + sb->s_id); + err = -EINVAL; + goto restore_opts; + } + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + goto out; + if (*flags & MS_RDONLY) { + /* Shutting down the segment constructor */ + nilfs_detach_segment_constructor(sbi); + sb->s_flags |= MS_RDONLY; + + sbi->s_snapshot_cno = nilfs_last_cno(nilfs); + /* nilfs_set_opt(sbi, SNAPSHOT); */ + + /* + * Remounting a valid RW partition RDONLY, so set + * the RDONLY flag and then mark the partition as valid again. + */ + down_write(&nilfs->ns_sem); + sbp = nilfs->ns_sbp[0]; + if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) && + (nilfs->ns_mount_state & NILFS_VALID_FS)) + sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); + sbp->s_mtime = cpu_to_le64(get_seconds()); + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + } else { + /* + * Mounting a RDONLY partition read-write, so reread and + * store the current valid flag. (It may have been changed + * by fsck since we originally mounted the partition.) 
+ */ + down(&sb->s_bdev->bd_mount_sem); + /* Check existing RW-mount */ + if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because a RW-mount exists.\n", + sb->s_id); + err = -EBUSY; + goto rw_remount_failed; + } + if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount because the current RO-mount is not " + "the latest one.\n", + sb->s_id); + err = -EINVAL; + goto rw_remount_failed; + } + sb->s_flags &= ~MS_RDONLY; + nilfs_clear_opt(sbi, SNAPSHOT); + sbi->s_snapshot_cno = 0; + + err = nilfs_attach_segment_constructor(sbi); + if (err) + goto rw_remount_failed; + + down_write(&nilfs->ns_sem); + nilfs_setup_super(sbi); + up_write(&nilfs->ns_sem); + + up(&sb->s_bdev->bd_mount_sem); + } + out: + return 0; + + rw_remount_failed: + up(&sb->s_bdev->bd_mount_sem); + restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.mount_opt; + sbi->s_snapshot_cno = old_opts.snapshot_cno; + return err; +} + +struct nilfs_super_data { + struct block_device *bdev; + __u64 cno; + int flags; +}; + +/** + * nilfs_identify - pre-read mount options needed to identify mount instance + * @data: mount options + * @sd: nilfs_super_data + */ +static int nilfs_identify(char *data, struct nilfs_super_data *sd) +{ + char *p, *options = data; + substring_t args[MAX_OPT_ARGS]; + int option, token; + int ret = 0; + + do { + p = strsep(&options, ","); + if (p != NULL && *p) { + token = match_token(p, tokens, args); + if (token == Opt_snapshot) { + if (!(sd->flags & MS_RDONLY)) + ret++; + else { + ret = match_int(&args[0], &option); + if (!ret) { + if (option > 0) + sd->cno = option; + else + ret++; + } + } + } + if (ret) + printk(KERN_ERR + "NILFS: invalid mount option: %s\n", p); + } + if (!options) + break; + BUG_ON(options == data); + *(options - 1) = ','; + } while (!ret); + return ret; +} + +static int nilfs_set_bdev_super(struct super_block *s, void 
*data) +{ + struct nilfs_super_data *sd = data; + + s->s_bdev = sd->bdev; + s->s_dev = s->s_bdev->bd_dev; + return 0; +} + +static int nilfs_test_bdev_super(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + + return s->s_bdev == sd->bdev; +} + +static int nilfs_test_bdev_super2(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + + if (!((s->s_flags | sd->flags) & MS_RDONLY)) + return 1; /* Reuse an old R/W-mode super_block */ + + if (s->s_flags & sd->flags & MS_RDONLY) { + if (down_read_trylock(&s->s_umount)) { + ret = s->s_root && + (sd->cno == NILFS_SB(s)->s_snapshot_cno); + up_read(&s->s_umount); + /* + * This path is locked with sb_lock by sget(). + * So, drop_super() causes deadlock. + */ + return ret; + } + } + return 0; +} + +static int +nilfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + struct nilfs_super_data sd; + struct super_block *s, *s2; + struct the_nilfs *nilfs = NULL; + int err, need_to_close = 1; + + sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); + if (IS_ERR(sd.bdev)) + return PTR_ERR(sd.bdev); + + /* + * To get mount instance using sget() vfs-routine, NILFS needs + * much more information than normal filesystems to identify mount + * instance. For snapshot mounts, not only a mount type (ro-mount + * or rw-mount) but also a checkpoint number is required. + * The results are passed in sget() using nilfs_super_data. + */ + sd.cno = 0; + sd.flags = flags; + if (nilfs_identify((char *)data, &sd)) { + err = -EINVAL; + goto failed; + } + + /* + * once the super is inserted into the list by sget, s_umount + * will protect the lockfs code from trying to start a snapshot + * while we are mounting + */ + down(&sd.bdev->bd_mount_sem); + if (!sd.cno && + (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { + err = (err < 0) ? 
: -EBUSY; + goto failed_unlock; + } + + /* + * Phase-1: search any existent instance and get the_nilfs + */ + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); + if (IS_ERR(s)) + goto error_s; + + if (!s->s_root) { + err = -ENOMEM; + nilfs = alloc_nilfs(sd.bdev); + if (!nilfs) + goto cancel_new; + } else { + struct nilfs_sb_info *sbi = NILFS_SB(s); + + /* + * s_umount protects super_block from unmount process; + * It covers pointers of nilfs_sb_info and the_nilfs. + */ + nilfs = sbi->s_nilfs; + get_nilfs(nilfs); + up_write(&s->s_umount); + + /* + * Phase-2: search specified snapshot or R/W mode super_block + */ + if (!sd.cno) + /* trying to get the latest checkpoint. */ + sd.cno = nilfs_last_cno(nilfs); + + s2 = sget(fs_type, nilfs_test_bdev_super2, + nilfs_set_bdev_super, &sd); + deactivate_super(s); + /* + * Although deactivate_super() invokes close_bdev_exclusive() at + * kill_block_super(). Here, s is an existent mount; we need + * one more close_bdev_exclusive() call. 
+ */ + s = s2; + if (IS_ERR(s)) + goto error_s; + } + + if (!s->s_root) { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + sb_set_blocksize(s, block_size(sd.bdev)); + + err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs); + if (err) + goto cancel_new; + + s->s_flags |= MS_ACTIVE; + need_to_close = 0; + } else if (!(s->s_flags & MS_RDONLY)) { + err = -EBUSY; + } + + up(&sd.bdev->bd_mount_sem); + put_nilfs(nilfs); + if (need_to_close) + close_bdev_exclusive(sd.bdev, flags); + simple_set_mnt(mnt, s); + return 0; + + error_s: + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + close_bdev_exclusive(sd.bdev, flags); + return PTR_ERR(s); + + failed_unlock: + up(&sd.bdev->bd_mount_sem); + failed: + close_bdev_exclusive(sd.bdev, flags); + + return err; + + cancel_new: + /* Abandoning the newly allocated superblock */ + up(&sd.bdev->bd_mount_sem); + if (nilfs) + put_nilfs(nilfs); + up_write(&s->s_umount); + deactivate_super(s); + /* + * deactivate_super() invokes close_bdev_exclusive(). + * We must finish all post-cleaning before this call; + * put_nilfs() and unlocking bd_mount_sem need the block device. + */ + return err; +} + +static int nilfs_test_bdev_super3(struct super_block *s, void *data) +{ + struct nilfs_super_data *sd = data; + int ret; + + if (s->s_bdev != sd->bdev) + return 0; + if (down_read_trylock(&s->s_umount)) { + ret = (s->s_flags & MS_RDONLY) && s->s_root && + nilfs_test_opt(NILFS_SB(s), SNAPSHOT); + up_read(&s->s_umount); + if (ret) + return 0; /* ignore snapshot mounts */ + } + return !((sd->flags ^ s->s_flags) & MS_RDONLY); +} + +static int __false_bdev_super(struct super_block *s, void *data) +{ +#if 0 /* XXX: workaround for lock debug. This is not good idea */ + up_write(&s->s_umount); +#endif + return -EFAULT; +} + +/** + * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. 
+ * fs_type: filesystem type + * bdev: block device + * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount) + * res: pointer to an integer to store result + * + * This function must be called within a section protected by bd_mount_mutex. + */ +static int test_exclusive_mount(struct file_system_type *fs_type, + struct block_device *bdev, int flags) +{ + struct super_block *s; + struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; + + s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); + if (IS_ERR(s)) { + if (PTR_ERR(s) != -EFAULT) + return PTR_ERR(s); + return 0; /* Not found */ + } + up_write(&s->s_umount); + deactivate_super(s); + return 1; /* Found */ +} + +struct file_system_type nilfs_fs_type = { + .owner = THIS_MODULE, + .name = "nilfs2", + .get_sb = nilfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static int __init init_nilfs_fs(void) +{ + int err; + + err = nilfs_init_inode_cache(); + if (err) + goto failed; + + err = nilfs_init_transaction_cache(); + if (err) + goto failed_inode_cache; + + err = nilfs_init_segbuf_cache(); + if (err) + goto failed_transaction_cache; + + err = nilfs_btree_path_cache_init(); + if (err) + goto failed_segbuf_cache; + + err = register_filesystem(&nilfs_fs_type); + if (err) + goto failed_btree_path_cache; + + return 0; + + failed_btree_path_cache: + nilfs_btree_path_cache_destroy(); + + failed_segbuf_cache: + nilfs_destroy_segbuf_cache(); + + failed_transaction_cache: + nilfs_destroy_transaction_cache(); + + failed_inode_cache: + nilfs_destroy_inode_cache(); + + failed: + return err; +} + +static void __exit exit_nilfs_fs(void) +{ + nilfs_destroy_segbuf_cache(); + nilfs_destroy_transaction_cache(); + nilfs_destroy_inode_cache(); + nilfs_btree_path_cache_destroy(); + unregister_filesystem(&nilfs_fs_type); +} + +module_init(init_nilfs_fs) +module_exit(exit_nilfs_fs) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c new file mode 100644 index 
00000000000..7f65b3be4aa --- /dev/null +++ b/fs/nilfs2/the_nilfs.c @@ -0,0 +1,641 @@ +/* + * the_nilfs.c - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/crc32.h> +#include "nilfs.h" +#include "segment.h" +#include "alloc.h" +#include "cpfile.h" +#include "sufile.h" +#include "dat.h" +#include "seglist.h" +#include "segbuf.h" + +void nilfs_set_last_segment(struct the_nilfs *nilfs, + sector_t start_blocknr, u64 seq, __u64 cno) +{ + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_last_pseg = start_blocknr; + nilfs->ns_last_seq = seq; + nilfs->ns_last_cno = cno; + spin_unlock(&nilfs->ns_last_segment_lock); +} + +/** + * alloc_nilfs - allocate the_nilfs structure + * @bdev: block device to which the_nilfs is related + * + * alloc_nilfs() allocates memory for the_nilfs and + * initializes its reference count and locks. + * + * Return Value: On success, pointer to the_nilfs is returned. + * On error, NULL is returned. 
+ */ +struct the_nilfs *alloc_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs; + + nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL); + if (!nilfs) + return NULL; + + nilfs->ns_bdev = bdev; + atomic_set(&nilfs->ns_count, 1); + atomic_set(&nilfs->ns_writer_refcount, -1); + atomic_set(&nilfs->ns_ndirtyblks, 0); + init_rwsem(&nilfs->ns_sem); + mutex_init(&nilfs->ns_writer_mutex); + INIT_LIST_HEAD(&nilfs->ns_supers); + spin_lock_init(&nilfs->ns_last_segment_lock); + nilfs->ns_gc_inodes_h = NULL; + init_rwsem(&nilfs->ns_segctor_sem); + + return nilfs; +} + +/** + * put_nilfs - release a reference to the_nilfs + * @nilfs: the_nilfs structure to be released + * + * put_nilfs() decrements a reference counter of the_nilfs. + * If the reference count reaches zero, the_nilfs is freed. + */ +void put_nilfs(struct the_nilfs *nilfs) +{ + if (!atomic_dec_and_test(&nilfs->ns_count)) + return; + /* + * Increment of ns_count never occur below because the caller + * of get_nilfs() holds at least one reference to the_nilfs. + * Thus its exclusion control is not required here. + */ + might_sleep(); + if (nilfs_loaded(nilfs)) { + nilfs_mdt_clear(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_clear(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_clear(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_dat); + /* XXX: how and when to clear nilfs->ns_gc_dat? 
*/ + nilfs_mdt_destroy(nilfs->ns_gc_dat); + } + if (nilfs_init(nilfs)) { + nilfs_destroy_gccache(nilfs); + brelse(nilfs->ns_sbh[0]); + brelse(nilfs->ns_sbh[1]); + } + kfree(nilfs); +} + +static int nilfs_load_super_root(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, sector_t sr_block) +{ + static struct lock_class_key dat_lock_key; + struct buffer_head *bh_sr; + struct nilfs_super_root *raw_sr; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + unsigned dat_entry_size, segment_usage_size, checkpoint_size; + unsigned inode_size; + int err; + + err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); + if (unlikely(err)) + return err; + + down_read(&nilfs->ns_sem); + dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size); + checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size); + segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size); + up_read(&nilfs->ns_sem); + + inode_size = nilfs->ns_inode_size; + + err = -ENOMEM; + nilfs->ns_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_dat)) + goto failed; + + nilfs->ns_gc_dat = nilfs_mdt_new( + nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + if (unlikely(!nilfs->ns_gc_dat)) + goto failed_dat; + + nilfs->ns_cpfile = nilfs_mdt_new( + nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + if (unlikely(!nilfs->ns_cpfile)) + goto failed_gc_dat; + + nilfs->ns_sufile = nilfs_mdt_new( + nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + if (unlikely(!nilfs->ns_sufile)) + goto failed_cpfile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size); + if (unlikely(err)) + goto failed_sufile; + + lockdep_set_class(&NILFS_MDT(nilfs->ns_dat)->mi_sem, &dat_lock_key); + lockdep_set_class(&NILFS_MDT(nilfs->ns_gc_dat)->mi_sem, &dat_lock_key); + + nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat); + 
nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size, + sizeof(struct nilfs_cpfile_header)); + nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size, + sizeof(struct nilfs_sufile_header)); + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + err = nilfs_mdt_read_inode_direct( + nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size)); + if (unlikely(err)) + goto failed_sufile; + + raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime); + + failed: + brelse(bh_sr); + return err; + + failed_sufile: + nilfs_mdt_destroy(nilfs->ns_sufile); + + failed_cpfile: + nilfs_mdt_destroy(nilfs->ns_cpfile); + + failed_gc_dat: + nilfs_mdt_destroy(nilfs->ns_gc_dat); + + failed_dat: + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; +} + +static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri) +{ + memset(ri, 0, sizeof(*ri)); + INIT_LIST_HEAD(&ri->ri_used_segments); +} + +static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) +{ + nilfs_dispose_segment_list(&ri->ri_used_segments); +} + +/** + * load_nilfs - load and recover the nilfs + * @nilfs: the_nilfs structure to be released + * @sbi: nilfs_sb_info used to recover past segment + * + * load_nilfs() searches and load the latest super root, + * attaches the last segment, and does recovery if needed. + * The caller must call this exclusively for simultaneous mounts. 
+ */ +int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + struct nilfs_recovery_info ri; + unsigned int s_flags = sbi->s_super->s_flags; + int really_read_only = bdev_read_only(nilfs->ns_bdev); + unsigned valid_fs; + int err = 0; + + nilfs_init_recovery_info(&ri); + + down_write(&nilfs->ns_sem); + valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); + up_write(&nilfs->ns_sem); + + if (!valid_fs && (s_flags & MS_RDONLY)) { + printk(KERN_INFO "NILFS: INFO: recovery " + "required for readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "NILFS: write access " + "unavailable, cannot proceed.\n"); + err = -EROFS; + goto failed; + } + printk(KERN_INFO "NILFS: write access will " + "be enabled during recovery.\n"); + sbi->s_super->s_flags &= ~MS_RDONLY; + } + + err = nilfs_search_super_root(nilfs, sbi, &ri); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + } + + err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: error loading super root.\n"); + goto failed; + } + + if (!valid_fs) { + err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + if (unlikely(err)) { + nilfs_mdt_destroy(nilfs->ns_cpfile); + nilfs_mdt_destroy(nilfs->ns_sufile); + nilfs_mdt_destroy(nilfs->ns_dat); + goto failed; + } + if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED) + sbi->s_super->s_dirt = 1; + } + + set_nilfs_loaded(nilfs); + + failed: + nilfs_clear_recovery_info(&ri); + sbi->s_super->s_flags = s_flags; + return err; +} + +static unsigned long long nilfs_max_size(unsigned int blkbits) +{ + unsigned int max_bits; + unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */ + + max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */ + if (max_bits < 64) + res = min_t(unsigned long long, res, (1ULL << max_bits) - 1); + return res; +} + +static int nilfs_store_disk_layout(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + if 
(le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) { + printk(KERN_ERR "NILFS: revision mismatch " + "(superblock rev.=%d.%d, current rev.=%d.%d). " + "Please check the version of mkfs.nilfs.\n", + le32_to_cpu(sbp->s_rev_level), + le16_to_cpu(sbp->s_minor_rev_level), + NILFS_CURRENT_REV, NILFS_MINOR_REV); + return -EINVAL; + } + nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes); + if (nilfs->ns_sbsize > BLOCK_SIZE) + return -EINVAL; + + nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size); + nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino); + + nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment); + if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) { + printk(KERN_ERR "NILFS: too short segment. \n"); + return -EINVAL; + } + + nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); + nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments); + nilfs->ns_r_segments_percentage = + le32_to_cpu(sbp->s_r_segments_percentage); + nilfs->ns_nrsvsegs = + max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nilfs->ns_nsegments * + nilfs->ns_r_segments_percentage, 100)); + nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); + return 0; +} + +static int nilfs_valid_sb(struct nilfs_super_block *sbp) +{ + static unsigned char sum[4]; + const int sumoff = offsetof(struct nilfs_super_block, s_sum); + size_t bytes; + u32 crc; + + if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) + return 0; + bytes = le16_to_cpu(sbp->s_bytes); + if (bytes > BLOCK_SIZE) + return 0; + crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, + sumoff); + crc = crc32_le(crc, sum, 4); + crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4, + bytes - sumoff - 4); + return crc == le32_to_cpu(sbp->s_sum); +} + +static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset) +{ + return offset < ((le64_to_cpu(sbp->s_nsegments) * + le32_to_cpu(sbp->s_blocks_per_segment)) << + (le32_to_cpu(sbp->s_log_block_size) + 10)); +} + +static void 
nilfs_release_super_block(struct the_nilfs *nilfs) +{ + int i; + + for (i = 0; i < 2; i++) { + if (nilfs->ns_sbp[i]) { + brelse(nilfs->ns_sbh[i]); + nilfs->ns_sbh[i] = NULL; + nilfs->ns_sbp[i] = NULL; + } + } +} + +void nilfs_fall_back_super_block(struct the_nilfs *nilfs) +{ + brelse(nilfs->ns_sbh[0]); + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = NULL; + nilfs->ns_sbp[1] = NULL; +} + +void nilfs_swap_super_block(struct the_nilfs *nilfs) +{ + struct buffer_head *tsbh = nilfs->ns_sbh[0]; + struct nilfs_super_block *tsbp = nilfs->ns_sbp[0]; + + nilfs->ns_sbh[0] = nilfs->ns_sbh[1]; + nilfs->ns_sbp[0] = nilfs->ns_sbp[1]; + nilfs->ns_sbh[1] = tsbh; + nilfs->ns_sbp[1] = tsbp; +} + +static int nilfs_load_super_block(struct the_nilfs *nilfs, + struct super_block *sb, int blocksize, + struct nilfs_super_block **sbpp) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + struct buffer_head **sbh = nilfs->ns_sbh; + u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + int valid[2], swp = 0; + + sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, + &sbh[0]); + sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]); + + if (!sbp[0]) { + if (!sbp[1]) { + printk(KERN_ERR "NILFS: unable to read superblock\n"); + return -EIO; + } + printk(KERN_WARNING + "NILFS warning: unable to read primary superblock\n"); + } else if (!sbp[1]) + printk(KERN_WARNING + "NILFS warning: unable to read secondary superblock\n"); + + valid[0] = nilfs_valid_sb(sbp[0]); + valid[1] = nilfs_valid_sb(sbp[1]); + swp = valid[1] && + (!valid[0] || + le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime)); + + if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) { + brelse(sbh[1]); + sbh[1] = NULL; + sbp[1] = NULL; + swp = 0; + } + if (!valid[swp]) { + nilfs_release_super_block(nilfs); + printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n", + sb->s_id); + return -EINVAL; + } + + if (swp) { + 
printk(KERN_WARNING "NILFS warning: broken superblock. " + "using spare superblock.\n"); + nilfs_swap_super_block(nilfs); + } + + nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); + nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; + nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + *sbpp = sbp[0]; + return 0; +} + +/** + * init_nilfs - initialize a NILFS instance. + * @nilfs: the_nilfs structure + * @sbi: nilfs_sb_info + * @sb: super block + * @data: mount options + * + * init_nilfs() performs common initialization per block device (e.g. + * reading the super block, getting disk layout information, initializing + * shared fields in the_nilfs). It takes on some portion of the jobs + * typically done by a fill_super() routine. This division arises from + * the nature that multiple NILFS instances may be simultaneously + * mounted on a device. + * For multiple mounts on the same device, only the first mount + * invokes these tasks. + * + * Return Value: On success, 0 is returned. On error, a negative error + * code is returned. 
+ */ +int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) +{ + struct super_block *sb = sbi->s_super; + struct nilfs_super_block *sbp; + struct backing_dev_info *bdi; + int blocksize; + int err; + + down_write(&nilfs->ns_sem); + if (nilfs_init(nilfs)) { + /* Load values from existing the_nilfs */ + sbp = nilfs->ns_sbp[0]; + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto out; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize && + !sb_set_blocksize(sb, blocksize)) { + printk(KERN_ERR "NILFS: blocksize %d unfit to device\n", + blocksize); + err = -EINVAL; + } + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + goto out; + } + + blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "NILFS: unable to set blocksize\n"); + err = -EINVAL; + goto out; + } + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + + err = nilfs_store_magic_and_option(sb, sbp, data); + if (err) + goto failed_sbh; + + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (sb->s_blocksize != blocksize) { + int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + + if (blocksize < hw_blocksize) { + printk(KERN_ERR + "NILFS: blocksize %d too small for device " + "(sector-size = %d).\n", + blocksize, hw_blocksize); + err = -EINVAL; + goto failed_sbh; + } + nilfs_release_super_block(nilfs); + sb_set_blocksize(sb, blocksize); + + err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); + if (err) + goto out; + /* not failed_sbh; sbh is released automatically + when reloading fails. 
*/ + } + nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + + err = nilfs_store_disk_layout(nilfs, sbp); + if (err) + goto failed_sbh; + + sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits); + + nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); + + bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; + if (!bdi) + bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; + nilfs->ns_bdi = bdi ? : &default_backing_dev_info; + + /* Finding last segment */ + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + err = -EINVAL; + goto failed_sbh; + } + /* Dummy values */ + nilfs->ns_free_segments_count = + nilfs->ns_nsegments - (nilfs->ns_segnum + 1); + + /* Initialize gcinode cache */ + err = nilfs_init_gccache(nilfs); + if (err) + goto failed_sbh; + + set_nilfs_init(nilfs); + err = 0; + out: + up_write(&nilfs->ns_sem); + return err; + + failed_sbh: + nilfs_release_super_block(nilfs); + goto out; +} + +int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks) +{ + struct inode *dat = nilfs_dat_inode(nilfs); + unsigned long ncleansegs; + int err; + + down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs); + up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */ + if (likely(!err)) + *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment; + return err; +} + +int nilfs_near_disk_full(struct the_nilfs *nilfs) +{ + struct inode *sufile = nilfs->ns_sufile; + unsigned long ncleansegs, nincsegs; + int ret; + + ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs); + if (likely(!ret)) { + nincsegs = atomic_read(&nilfs->ns_ndirtyblks) 
/ + nilfs->ns_blocks_per_segment + 1; + if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs) + ret++; + } + return ret; +} + +int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, + int snapshot_mount) +{ + struct nilfs_sb_info *sbi; + int ret = 0; + + down_read(&nilfs->ns_sem); + if (cno == 0 || cno > nilfs->ns_cno) + goto out_unlock; + + list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { + if (sbi->s_snapshot_cno == cno && + (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) { + /* exclude read-only mounts */ + ret++; + break; + } + } + /* for protecting recent checkpoints */ + if (cno >= nilfs_last_cno(nilfs)) + ret++; + + out_unlock: + up_read(&nilfs->ns_sem); + return ret; +} diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h new file mode 100644 index 00000000000..30fe58778d0 --- /dev/null +++ b/fs/nilfs2/the_nilfs.h @@ -0,0 +1,298 @@ +/* + * the_nilfs.h - the_nilfs shared structure. + * + * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Written by Ryusuke Konishi <ryusuke@osrg.net> + * + */ + +#ifndef _THE_NILFS_H +#define _THE_NILFS_H + +#include <linux/types.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include "sb.h" + +/* the_nilfs struct */ +enum { + THE_NILFS_INIT = 0, /* Information from super_block is set */ + THE_NILFS_LOADED, /* Roll-back/roll-forward has done and + the latest checkpoint was loaded */ + THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ +}; + +/** + * struct the_nilfs - struct to supervise multiple nilfs mount points + * @ns_flags: flags + * @ns_count: reference count + * @ns_bdev: block device + * @ns_bdi: backing dev info + * @ns_writer: back pointer to writable nilfs_sb_info + * @ns_sem: semaphore for shared states + * @ns_writer_mutex: mutex protecting ns_writer attach/detach + * @ns_writer_refcount: number of referrers on ns_writer + * @ns_sbh: buffer heads of on-disk super blocks + * @ns_sbp: pointers to super block data + * @ns_sbwtime: previous write time of super blocks + * @ns_sbsize: size of valid data in super block + * @ns_supers: list of nilfs super block structs + * @ns_seg_seq: segment sequence counter + * @ns_segnum: index number of the latest full segment. 
+ * @ns_nextnum: index number of the full segment index to be used next + * @ns_pseg_offset: offset of next partial segment in the current full segment + * @ns_cno: next checkpoint number + * @ns_ctime: write time of the last segment + * @ns_nongc_ctime: write time of the last segment not for cleaner operation + * @ns_ndirtyblks: Number of dirty data blocks + * @ns_last_segment_lock: lock protecting fields for the latest segment + * @ns_last_pseg: start block number of the latest segment + * @ns_last_seq: sequence value of the latest segment + * @ns_last_cno: checkpoint number of the latest segment + * @ns_prot_seq: least sequence number of segments which must not be reclaimed + * @ns_free_segments_count: counter of free segments + * @ns_segctor_sem: segment constructor semaphore + * @ns_dat: DAT file inode + * @ns_cpfile: checkpoint file inode + * @ns_sufile: segusage file inode + * @ns_gc_dat: shadow inode of the DAT file inode for GC + * @ns_gc_inodes: dummy inodes to keep live blocks + * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks + * @ns_blocksize_bits: bit length of block size + * @ns_nsegments: number of segments in filesystem + * @ns_blocks_per_segment: number of blocks per segment + * @ns_r_segments_percentage: reserved segments percentage + * @ns_nrsvsegs: number of reserved segments + * @ns_first_data_block: block number of first data block + * @ns_inode_size: size of on-disk inode + * @ns_first_ino: first not-special inode number + * @ns_crc_seed: seed value of CRC32 calculation + */ +struct the_nilfs { + unsigned long ns_flags; + atomic_t ns_count; + + struct block_device *ns_bdev; + struct backing_dev_info *ns_bdi; + struct nilfs_sb_info *ns_writer; + struct rw_semaphore ns_sem; + struct mutex ns_writer_mutex; + atomic_t ns_writer_refcount; + + /* + * used for + * - loading the latest checkpoint exclusively. + * - allocating a new full segment. 
+ * - protecting s_dirt in the super_block struct + * (see nilfs_write_super) and the following fields. + */ + struct buffer_head *ns_sbh[2]; + struct nilfs_super_block *ns_sbp[2]; + time_t ns_sbwtime[2]; + unsigned ns_sbsize; + unsigned ns_mount_state; + struct list_head ns_supers; + + /* + * Following fields are dedicated to a writable FS-instance. + * Except for the period seeking checkpoint, code outside the segment + * constructor must lock a segment semaphore while accessing these + * fields. + * The writable FS-instance is sole during a lifetime of the_nilfs. + */ + u64 ns_seg_seq; + __u64 ns_segnum; + __u64 ns_nextnum; + unsigned long ns_pseg_offset; + __u64 ns_cno; + time_t ns_ctime; + time_t ns_nongc_ctime; + atomic_t ns_ndirtyblks; + + /* + * The following fields hold information on the latest partial segment + * written to disk with a super root. These fields are protected by + * ns_last_segment_lock. + */ + spinlock_t ns_last_segment_lock; + sector_t ns_last_pseg; + u64 ns_last_seq; + __u64 ns_last_cno; + u64 ns_prot_seq; + unsigned long ns_free_segments_count; + + struct rw_semaphore ns_segctor_sem; + + /* + * Following fields are lock free except for the period before + * the_nilfs is initialized. 
+ */ + struct inode *ns_dat; + struct inode *ns_cpfile; + struct inode *ns_sufile; + struct inode *ns_gc_dat; + + /* GC inode list and hash table head */ + struct list_head ns_gc_inodes; + struct hlist_head *ns_gc_inodes_h; + + /* Disk layout information (static) */ + unsigned int ns_blocksize_bits; + unsigned long ns_nsegments; + unsigned long ns_blocks_per_segment; + unsigned long ns_r_segments_percentage; + unsigned long ns_nrsvsegs; + unsigned long ns_first_data_block; + int ns_inode_size; + int ns_first_ino; + u32 ns_crc_seed; +}; + +#define NILFS_GCINODE_HASH_BITS 8 +#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS) + +#define THE_NILFS_FNS(bit, name) \ +static inline void set_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} \ +static inline int nilfs_##name(struct the_nilfs *nilfs) \ +{ \ + return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \ +} + +THE_NILFS_FNS(INIT, init) +THE_NILFS_FNS(LOADED, loaded) +THE_NILFS_FNS(DISCONTINUED, discontinued) + +/* Minimum interval of periodical update of superblocks (in seconds) */ +#define NILFS_SB_FREQ 10 +#define NILFS_ALTSB_FREQ 60 /* spare superblock */ + +void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); +struct the_nilfs *alloc_nilfs(struct block_device *); +void put_nilfs(struct the_nilfs *); +int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); +int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); +int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); +int nilfs_near_disk_full(struct the_nilfs *); +void nilfs_fall_back_super_block(struct the_nilfs *); +void nilfs_swap_super_block(struct the_nilfs *); + + +static inline void get_nilfs(struct the_nilfs *nilfs) +{ + /* Caller must have at least one reference of 
the_nilfs. */ + atomic_inc(&nilfs->ns_count); +} + +static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) +{ + if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) + mutex_lock(&nilfs->ns_writer_mutex); + return nilfs->ns_writer; +} + +static inline void nilfs_put_writer(struct the_nilfs *nilfs) +{ + if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + nilfs->ns_writer = sbi; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) +{ + mutex_lock(&nilfs->ns_writer_mutex); + if (sbi == nilfs->ns_writer) + nilfs->ns_writer = NULL; + mutex_unlock(&nilfs->ns_writer_mutex); +} + +static inline void +nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, + sector_t *seg_start, sector_t *seg_end) +{ + *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum; + *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1; + if (segnum == 0) + *seg_start = nilfs->ns_first_data_block; +} + +static inline sector_t +nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum) +{ + return (segnum == 0) ? 
nilfs->ns_first_data_block : + (sector_t)nilfs->ns_blocks_per_segment * segnum; +} + +static inline __u64 +nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr) +{ + sector_t segnum = blocknr; + + sector_div(segnum, nilfs->ns_blocks_per_segment); + return segnum; +} + +static inline void +nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start, + sector_t seg_end) +{ + /* terminate the current full segment (used in case of I/O-error) */ + nilfs->ns_pseg_offset = seg_end - seg_start + 1; +} + +static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs) +{ + /* move forward with a full segment */ + nilfs->ns_segnum = nilfs->ns_nextnum; + nilfs->ns_pseg_offset = 0; + nilfs->ns_seg_seq++; +} + +static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs) +{ + __u64 cno; + + spin_lock(&nilfs->ns_last_segment_lock); + cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + return cno; +} + +static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) +{ + return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; +} + +#endif /* _THE_NILFS_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a5887df2cd8..8672b953603 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1926,7 +1926,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, out->f_path.dentry->d_name.len, out->f_path.dentry->d_name.name); - inode_double_lock(inode, pipe->inode); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { @@ -1941,12 +1941,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, goto out_unlock; } + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); out_unlock: ocfs2_rw_unlock(inode, 1); out: - inode_double_unlock(inode, pipe->inode); + mutex_unlock(&inode->i_mutex); 
mlog_exit(ret); return ret; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b0ae0be4801..39e4ad4f59f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct file *file = vma->vm_file; int flags = vma->vm_flags; unsigned long ino = 0; + unsigned long long pgoff = 0; dev_t dev = 0; int len; @@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", @@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); /* diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 863464d5519..64a72e2e765 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -126,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct file *file; dev_t dev = 0; int flags, len; + unsigned long long pgoff = 0; flags = vma->vm_flags; file = vma->vm_file; @@ -134,6 +135,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; + pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; } seq_printf(m, @@ -144,7 +146,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 
'S' : 's' : 'p', - (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, + pgoff, MAJOR(dev), MINOR(dev), ino, &len); if (file) { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a404fb88e45..3a6b193d844 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) save_mount_options(sb, data); fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; if (!fsi) { err = -ENOMEM; goto fail; } - sb->s_fs_info = fsi; err = ramfs_parse_options(data, &fsi->mount_opts); if (err) goto fail; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = RAMFS_MAGIC; - sb->s_op = &ramfs_ops; - sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RAMFS_MAGIC; + sb->s_op = &ramfs_ops; + sb->s_time_gran = 1; + inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); if (!inode) { err = -ENOMEM; @@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) } root = d_alloc_root(inode); + sb->s_root = root; if (!root) { err = -ENOMEM; goto fail; } - sb->s_root = root; + return 0; fail: kfree(fsi); + sb->s_fs_info = NULL; iput(inode); return err; } diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index 1a17020f9fa..ce2d6bcc626 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -1,6 +1,6 @@ config ROMFS_FS tristate "ROM file system support" - depends on BLOCK + depends on BLOCK || MTD ---help--- This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for @@ -14,3 +14,49 @@ config ROMFS_FS If you don't know whether you need it, then you don't need it: answer N. 
+ +# +# Select the backing stores to be supported +# +choice + prompt "RomFS backing stores" + depends on ROMFS_FS + default ROMFS_BACKED_BY_BLOCK + help + Select the backing stores to be supported. + +config ROMFS_BACKED_BY_BLOCK + bool "Block device-backed ROM file system support" + depends on BLOCK + help + This permits ROMFS to use block devices buffered through the page + cache as the medium from which to retrieve data. It does not allow + direct mapping of the medium. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_MTD + bool "MTD-backed ROM file system support" + depends on MTD=y || (ROMFS_FS=m && MTD) + help + This permits ROMFS to use MTD based devices directly, without the + intercession of the block layer (which may have been disabled). It + also allows direct mapping of MTD devices through romfs files under + NOMMU conditions if the underlying device is directly addressable by + the CPU. + + If unsure, answer Y. + +config ROMFS_BACKED_BY_BOTH + bool "Both the above" + depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD)) +endchoice + + +config ROMFS_ON_BLOCK + bool + default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH + +config ROMFS_ON_MTD + bool + default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile index c95b21cf49a..420beb7d495 100644 --- a/fs/romfs/Makefile +++ b/fs/romfs/Makefile @@ -1,7 +1,12 @@ # -# Makefile for the linux romfs filesystem routines. +# Makefile for the linux RomFS filesystem routines. 
# obj-$(CONFIG_ROMFS_FS) += romfs.o -romfs-objs := inode.o +romfs-y := storage.o super.o + +ifneq ($(CONFIG_MMU),y) +romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o +endif + diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c deleted file mode 100644 index 98a232f7196..00000000000 --- a/fs/romfs/inode.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * ROMFS file system, Linux implementation - * - * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu> - * - * Using parts of the minix filesystem - * Copyright (C) 1991, 1992 Linus Torvalds - * - * and parts of the affs filesystem additionally - * Copyright (C) 1993 Ray Burr - * Copyright (C) 1996 Hans-Joachim Widmaier - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes - * Changed for 2.1.19 modules - * Jan 1997 Initial release - * Jun 1997 2.1.43+ changes - * Proper page locking in readpage - * Changed to work with 2.1.45+ fs - * Jul 1997 Fixed follow_link - * 2.1.47 - * lookup shouldn't return -ENOENT - * from Horst von Brand: - * fail on wrong checksum - * double unlock_super was possible - * correct namelen for statfs - * spotted by Bill Hawes: - * readlink shouldn't iput() - * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() - * exposed a problem in readdir - * 2.1.107 code-freeze spellchecker run - * Aug 1998 2.1.118+ VFS changes - * Sep 1998 2.1.122 another VFS change (follow_link) - * Apr 1999 2.2.7 no more EBADF checking in - * lookup/readdir, use ERR_PTR - * Jun 1999 2.3.6 d_alloc_root use changed - * 2.3.9 clean up usage of ENOENT/negative - * dentries in lookup - * clean up page flags setting - * (error, uptodate, locking) in - * in readpage - * use init_special_inode for - * fifos/sockets (and streamline) in - * read_inode, fix _ops table order - * Aug 1999 2.3.16 __initfunc() => __init 
change - * Oct 1999 2.3.24 page->owner hack obsoleted - * Nov 1999 2.3.27 2.3.25+ page->offset => index change - */ - -/* todo: - * - see Documentation/filesystems/romfs.txt - * - use allocated, not stack memory for file names? - * - considering write access... - * - network (tftp) files? - * - merge back some _op tables - */ - -/* - * Sorry about some optimizations and for some goto's. I just wanted - * to squeeze some more bytes out of this code.. :) - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/romfs_fs.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/smp_lock.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> - -#include <asm/uaccess.h> - -struct romfs_inode_info { - unsigned long i_metasize; /* size of non-data area */ - unsigned long i_dataoffset; /* from the start of fs */ - struct inode vfs_inode; -}; - -static struct inode *romfs_iget(struct super_block *, unsigned long); - -/* instead of private superblock data */ -static inline unsigned long romfs_maxsize(struct super_block *sb) -{ - return (unsigned long)sb->s_fs_info; -} - -static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) -{ - return container_of(inode, struct romfs_inode_info, vfs_inode); -} - -static __u32 -romfs_checksum(void *data, int size) -{ - __u32 sum; - __be32 *ptr; - - sum = 0; ptr = data; - size>>=2; - while (size>0) { - sum += be32_to_cpu(*ptr++); - size--; - } - return sum; -} - -static const struct super_operations romfs_ops; - -static int romfs_fill_super(struct super_block *s, void *data, int silent) -{ - struct buffer_head *bh; - struct romfs_super_block *rsb; - struct inode *root; - int sz, ret = -EINVAL; - - /* I would parse the options here, but there are none.. :) */ - - sb_set_blocksize(s, ROMBSIZE); - s->s_maxbytes = 0xFFFFFFFF; - - bh = sb_bread(s, 0); - if (!bh) { - /* XXX merge with other printk? 
*/ - printk ("romfs: unable to read superblock\n"); - goto outnobh; - } - - rsb = (struct romfs_super_block *)bh->b_data; - sz = be32_to_cpu(rsb->size); - if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 - || sz < ROMFH_SIZE) { - if (!silent) - printk ("VFS: Can't find a romfs filesystem on dev " - "%s.\n", s->s_id); - goto out; - } - if (romfs_checksum(rsb, min_t(int, sz, 512))) { - printk ("romfs: bad initial checksum on dev " - "%s.\n", s->s_id); - goto out; - } - - s->s_magic = ROMFS_MAGIC; - s->s_fs_info = (void *)(long)sz; - - s->s_flags |= MS_RDONLY; - - /* Find the start of the fs */ - sz = (ROMFH_SIZE + - strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD) - & ROMFH_MASK; - - s->s_op = &romfs_ops; - root = romfs_iget(s, sz); - if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; - } - - ret = -ENOMEM; - s->s_root = d_alloc_root(root); - if (!s->s_root) - goto outiput; - - brelse(bh); - return 0; - -outiput: - iput(root); -out: - brelse(bh); -outnobh: - return ret; -} - -/* That's simple too. 
*/ - -static int -romfs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - buf->f_type = ROMFS_MAGIC; - buf->f_bsize = ROMBSIZE; - buf->f_bfree = buf->f_bavail = buf->f_ffree; - buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS; - buf->f_namelen = ROMFS_MAXFN; - return 0; -} - -/* some helper routines */ - -static int -romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize) - return -1; - - /* strnlen is almost always valid */ - if (count > maxsize || offset+count > maxsize) - count = maxsize-offset; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize); - brelse(bh); - - if (res < maxsize) - return res; /* found all of it */ - - while (res < count) { - offset += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - avail = strnlen(bh->b_data, maxsize); - res += avail; - brelse(bh); - if (avail < maxsize) - return res; - } - return res; -} - -static int -romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count) -{ - struct buffer_head *bh; - unsigned long avail, maxsize, res; - - maxsize = romfs_maxsize(i->i_sb); - if (offset >= maxsize || count > maxsize || offset+count>maxsize) - return -1; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - return -1; /* error */ - - avail = ROMBSIZE - (offset & ROMBMASK); - maxsize = min_t(unsigned long, count, avail); - memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize); - brelse(bh); - - res = maxsize; /* all of it */ - - while (res < count) { - offset += maxsize; - dest += maxsize; - - bh = sb_bread(i->i_sb, offset>>ROMBSBITS); - if (!bh) - 
return -1; - maxsize = min_t(unsigned long, count - res, ROMBSIZE); - memcpy(dest, bh->b_data, maxsize); - brelse(bh); - res += maxsize; - } - return res; -} - -static unsigned char romfs_dtype_table[] = { - DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO -}; - -static int -romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) -{ - struct inode *i = filp->f_path.dentry->d_inode; - struct romfs_inode ri; - unsigned long offset, maxoff; - int j, ino, nextfh; - int stored = 0; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ - - lock_kernel(); - - maxoff = romfs_maxsize(i->i_sb); - - offset = filp->f_pos; - if (!offset) { - offset = i->i_ino & ROMFH_MASK; - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - /* Not really failsafe, but we are read-only... */ - for(;;) { - if (!offset || offset >= maxoff) { - offset = maxoff; - filp->f_pos = offset; - goto out; - } - filp->f_pos = offset; - - /* Fetch inode info */ - if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0) - goto out; - - j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (j < 0) - goto out; - - fsname[j]=0; - romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j); - - ino = offset; - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) - ino = be32_to_cpu(ri.spec); - if (filldir(dirent, fsname, j, offset, ino, - romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) { - goto out; - } - stored++; - offset = nextfh & ROMFH_MASK; - } -out: - unlock_kernel(); - return stored; -} - -static struct dentry * -romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) -{ - unsigned long offset, maxoff; - long res; - int fslen; - struct inode *inode = NULL; - char fsname[ROMFS_MAXFN]; /* XXX dynamic? 
*/ - struct romfs_inode ri; - const char *name; /* got from dentry */ - int len; - - res = -EACCES; /* placeholder for "no data here" */ - offset = dir->i_ino & ROMFH_MASK; - lock_kernel(); - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - maxoff = romfs_maxsize(dir->i_sb); - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - /* OK, now find the file whose name is in "dentry" in the - * directory specified by "dir". */ - - name = dentry->d_name.name; - len = dentry->d_name.len; - - for(;;) { - if (!offset || offset >= maxoff) - goto success; /* negative success */ - if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0) - goto error; - - /* try to match the first 16 bytes of name */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE); - if (len < ROMFH_SIZE) { - if (len == fslen) { - /* both are shorter, and same size */ - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp (name, fsname, len) == 0) - break; - } - } else if (fslen >= ROMFH_SIZE) { - /* both are longer; XXX optimize max size */ - fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1); - if (len == fslen) { - romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1); - if (strncmp(name, fsname, len) == 0) - break; - } - } - /* next entry */ - offset = be32_to_cpu(ri.next) & ROMFH_MASK; - } - - /* Hard link handling */ - if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) - offset = be32_to_cpu(ri.spec) & ROMFH_MASK; - - inode = romfs_iget(dir->i_sb, offset); - if (IS_ERR(inode)) { - res = PTR_ERR(inode); - goto error; - } - -success: - d_add(dentry, inode); - res = 0; -error: - unlock_kernel(); - return ERR_PTR(res); -} - -/* - * Ok, we do readpage, to be able to execute programs. Unfortunately, - * we can't use bmap, since we may have looser alignments. 
- */ - -static int -romfs_readpage(struct file *file, struct page * page) -{ - struct inode *inode = page->mapping->host; - loff_t offset, size; - unsigned long filled; - void *buf; - int result = -EIO; - - page_cache_get(page); - lock_kernel(); - buf = kmap(page); - if (!buf) - goto err_out; - - /* 32 bit warning -- but not for us :) */ - offset = page_offset(page); - size = i_size_read(inode); - filled = 0; - result = 0; - if (offset < size) { - unsigned long readlen; - - size -= offset; - readlen = size > PAGE_SIZE ? PAGE_SIZE : size; - - filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen); - - if (filled != readlen) { - SetPageError(page); - filled = 0; - result = -EIO; - } - } - - if (filled < PAGE_SIZE) - memset(buf + filled, 0, PAGE_SIZE-filled); - - if (!result) - SetPageUptodate(page); - flush_dcache_page(page); - - unlock_page(page); - - kunmap(page); -err_out: - page_cache_release(page); - unlock_kernel(); - - return result; -} - -/* Mapping from our types to the kernel */ - -static const struct address_space_operations romfs_aops = { - .readpage = romfs_readpage -}; - -static const struct file_operations romfs_dir_operations = { - .read = generic_read_dir, - .readdir = romfs_readdir, -}; - -static const struct inode_operations romfs_dir_inode_operations = { - .lookup = romfs_lookup, -}; - -static mode_t romfs_modemap[] = -{ - 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777, - S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644 -}; - -static struct inode * -romfs_iget(struct super_block *sb, unsigned long ino) -{ - int nextfh, ret; - struct romfs_inode ri; - struct inode *i; - - ino &= ROMFH_MASK; - i = iget_locked(sb, ino); - if (!i) - return ERR_PTR(-ENOMEM); - if (!(i->i_state & I_NEW)) - return i; - - i->i_mode = 0; - - /* Loop for finding the real hard link */ - for(;;) { - if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) { - printk(KERN_ERR "romfs: read error for inode 0x%lx\n", - ino); - iget_failed(i); - return 
ERR_PTR(-EIO); - } - /* XXX: do romfs_checksum here too (with name) */ - - nextfh = be32_to_cpu(ri.next); - if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) - break; - - ino = be32_to_cpu(ri.spec) & ROMFH_MASK; - } - - i->i_nlink = 1; /* Hard to decide.. */ - i->i_size = be32_to_cpu(ri.size); - i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; - i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; - - /* Precalculate the data offset */ - ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); - if (ret >= 0) - ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; - else - ino = 0; - - ROMFS_I(i)->i_metasize = ino; - ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); - - /* Compute permissions */ - ino = romfs_modemap[nextfh & ROMFH_TYPE]; - /* only "normal" files have ops */ - switch (nextfh & ROMFH_TYPE) { - case 1: - i->i_size = ROMFS_I(i)->i_metasize; - i->i_op = &romfs_dir_inode_operations; - i->i_fop = &romfs_dir_operations; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 2: - i->i_fop = &generic_ro_fops; - i->i_data.a_ops = &romfs_aops; - if (nextfh & ROMFH_EXEC) - ino |= S_IXUGO; - i->i_mode = ino; - break; - case 3: - i->i_op = &page_symlink_inode_operations; - i->i_data.a_ops = &romfs_aops; - i->i_mode = ino | S_IRWXUGO; - break; - default: - /* depending on MBZ for sock/fifos */ - nextfh = be32_to_cpu(ri.spec); - init_special_inode(i, ino, - MKDEV(nextfh>>16,nextfh&0xffff)); - } - unlock_new_inode(i); - return i; -} - -static struct kmem_cache * romfs_inode_cachep; - -static struct inode *romfs_alloc_inode(struct super_block *sb) -{ - struct romfs_inode_info *ei; - ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); - if (!ei) - return NULL; - return &ei->vfs_inode; -} - -static void romfs_destroy_inode(struct inode *inode) -{ - kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); -} - -static void init_once(void *foo) -{ - struct romfs_inode_info *ei = foo; - - inode_init_once(&ei->vfs_inode); -} - 
-static int init_inodecache(void) -{ - romfs_inode_cachep = kmem_cache_create("romfs_inode_cache", - sizeof(struct romfs_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once); - if (romfs_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(romfs_inode_cachep); -} - -static int romfs_remount(struct super_block *sb, int *flags, char *data) -{ - *flags |= MS_RDONLY; - return 0; -} - -static const struct super_operations romfs_ops = { - .alloc_inode = romfs_alloc_inode, - .destroy_inode = romfs_destroy_inode, - .statfs = romfs_statfs, - .remount_fs = romfs_remount, -}; - -static int romfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super, - mnt); -} - -static struct file_system_type romfs_fs_type = { - .owner = THIS_MODULE, - .name = "romfs", - .get_sb = romfs_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - -static int __init init_romfs_fs(void) -{ - int err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&romfs_fs_type); - if (err) - goto out; - return 0; -out: - destroy_inodecache(); -out1: - return err; -} - -static void __exit exit_romfs_fs(void) -{ - unregister_filesystem(&romfs_fs_type); - destroy_inodecache(); -} - -/* Yes, works even as a module... :) */ - -module_init(init_romfs_fs) -module_exit(exit_romfs_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h new file mode 100644 index 00000000000..06044a9dc62 --- /dev/null +++ b/fs/romfs/internal.h @@ -0,0 +1,47 @@ +/* RomFS internal definitions + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/romfs_fs.h> + +struct romfs_inode_info { + struct inode vfs_inode; + unsigned long i_metasize; /* size of non-data area */ + unsigned long i_dataoffset; /* from the start of fs */ +}; + +static inline size_t romfs_maxsize(struct super_block *sb) +{ + return (size_t) (unsigned long) sb->s_fs_info; +} + +static inline struct romfs_inode_info *ROMFS_I(struct inode *inode) +{ + return container_of(inode, struct romfs_inode_info, vfs_inode); +} + +/* + * mmap-nommu.c + */ +#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD) +extern const struct file_operations romfs_ro_fops; +#else +#define romfs_ro_fops generic_ro_fops +#endif + +/* + * storage.c + */ +extern int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen); +extern ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen); +extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c new file mode 100644 index 00000000000..f0511e81696 --- /dev/null +++ b/fs/romfs/mmap-nommu.c @@ -0,0 +1,75 @@ +/* NOMMU mmap support for RomFS on MTD devices + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/mm.h> +#include <linux/mtd/super.h> +#include "internal.h" + +/* + * try to determine where a shared mapping can be made + * - only supported for NOMMU at the moment (MMU can't doesn't copy private + * mappings) + * - attempts to map through to the underlying MTD device + */ +static unsigned long romfs_get_unmapped_area(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + struct inode *inode = file->f_mapping->host; + struct mtd_info *mtd = inode->i_sb->s_mtd; + unsigned long isize, offset; + + if (!mtd) + goto cant_map_directly; + + isize = i_size_read(inode); + offset = pgoff << PAGE_SHIFT; + if (offset > isize || len > isize || offset > isize - len) + return (unsigned long) -EINVAL; + + /* we need to call down to the MTD layer to do the actual mapping */ + if (mtd->get_unmapped_area) { + if (addr != 0) + return (unsigned long) -EINVAL; + + if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT)) + return (unsigned long) -EINVAL; + + offset += ROMFS_I(inode)->i_dataoffset; + if (offset > mtd->size - len) + return (unsigned long) -EINVAL; + + return mtd->get_unmapped_area(mtd, len, offset, flags); + } + +cant_map_directly: + return (unsigned long) -ENOSYS; +} + +/* + * permit a R/O mapping to be made directly through onto an MTD device if + * possible + */ +static int romfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 
0 : -ENOSYS; +} + +const struct file_operations romfs_ro_fops = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .mmap = romfs_mmap, + .get_unmapped_area = romfs_get_unmapped_area, +}; diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c new file mode 100644 index 00000000000..7e3e1e12a08 --- /dev/null +++ b/fs/romfs/storage.c @@ -0,0 +1,261 @@ +/* RomFS storage access routines + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/fs.h> +#include <linux/mtd/super.h> +#include <linux/buffer_head.h> +#include "internal.h" + +#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK) +#error no ROMFS backing store interface configured +#endif + +#ifdef CONFIG_ROMFS_ON_MTD +#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__)) + +/* + * read data from an romfs image on an MTD device + */ +static int romfs_mtd_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t rlen; + int ret; + + ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf); + return (ret < 0 || rlen != buflen) ? 
-EIO : 0; +} + +/* + * determine the length of a string in a romfs image on an MTD device + */ +static ssize_t romfs_mtd_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + ssize_t n = 0; + size_t segment; + u_char buf[16], *p; + size_t len; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (maxlen > 0) { + segment = min_t(size_t, maxlen, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + p = memchr(buf, 0, len); + if (p) + return n + (p - buf); + maxlen -= len; + pos += len; + n += len; + } + + return n; +} + +/* + * compare a string to one in a romfs image on MTD + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + u_char buf[16]; + size_t len, segment; + int ret; + + /* scan the string up to 16 bytes at a time */ + while (size > 0) { + segment = min_t(size_t, size, 16); + ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); + if (ret < 0) + return ret; + if (memcmp(buf, str, len) != 0) + return 0; + size -= len; + pos += len; + str += len; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_MTD */ + +#ifdef CONFIG_ROMFS_ON_BLOCK +/* + * read data from an romfs image on a block device + */ +static int romfs_blk_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + + /* copy the string up to blocksize bytes at a time */ + while (buflen > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, buflen, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + memcpy(buf, bh->b_data + offset, segment); + brelse(bh); + buflen -= segment; + pos += segment; + } + + return 0; +} + +/* + * determine the length of a string in romfs on a block device + */ +static ssize_t romfs_blk_strnlen(struct super_block *sb, + unsigned long pos, size_t limit) +{ + struct 
buffer_head *bh; + unsigned long offset; + ssize_t n = 0; + size_t segment; + u_char *buf, *p; + + /* scan the string up to blocksize bytes at a time */ + while (limit > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, limit, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + buf = bh->b_data + offset; + p = memchr(buf, 0, segment); + brelse(bh); + if (p) + return n + (p - buf); + limit -= segment; + pos += segment; + n += segment; + } + + return n; +} + +/* + * compare a string to one in a romfs image on a block device + * - return 1 if matched, 0 if differ, -ve if error + */ +static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + struct buffer_head *bh; + unsigned long offset; + size_t segment; + bool x; + + /* scan the string up to 16 bytes at a time */ + while (size > 0) { + offset = pos & (ROMBSIZE - 1); + segment = min_t(size_t, size, ROMBSIZE - offset); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + x = (memcmp(bh->b_data + offset, str, segment) != 0); + brelse(bh); + if (x) + return 0; + size -= segment; + pos += segment; + str += segment; + } + + return 1; +} +#endif /* CONFIG_ROMFS_ON_BLOCK */ + +/* + * read data from the romfs image + */ +int romfs_dev_read(struct super_block *sb, unsigned long pos, + void *buf, size_t buflen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (buflen > limit - pos) + buflen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_read(sb, pos, buf, buflen); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_read(sb, pos, buf, buflen); +#endif + return -EIO; +} + +/* + * determine the length of a string in romfs + */ +ssize_t romfs_dev_strnlen(struct super_block *sb, + unsigned long pos, size_t maxlen) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (maxlen > limit - pos) 
+ maxlen = limit - pos; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strnlen(sb, pos, limit); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strnlen(sb, pos, limit); +#endif + return -EIO; +} + +/* + * compare a string to one in romfs + * - return 1 if matched, 0 if differ, -ve if error + */ +int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) +{ + size_t limit; + + limit = romfs_maxsize(sb); + if (pos >= limit) + return -EIO; + if (size > ROMFS_MAXFN) + return -ENAMETOOLONG; + if (size > limit - pos) + return -EIO; + +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) + return romfs_mtd_strncmp(sb, pos, str, size); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (sb->s_bdev) + return romfs_blk_strncmp(sb, pos, str, size); +#endif + return -EIO; +} diff --git a/fs/romfs/super.c b/fs/romfs/super.c new file mode 100644 index 00000000000..10ca7d984a8 --- /dev/null +++ b/fs/romfs/super.c @@ -0,0 +1,653 @@ +/* Block- or MTD-based romfs + * + * Copyright © 2007 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * Derived from: ROMFS file system, Linux implementation + * + * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu> + * + * Using parts of the minix filesystem + * Copyright © 1991, 1992 Linus Torvalds + * + * and parts of the affs filesystem additionally + * Copyright © 1993 Ray Burr + * Copyright © 1996 Hans-Joachim Widmaier + * + * Changes + * Changed for 2.1.19 modules + * Jan 1997 Initial release + * Jun 1997 2.1.43+ changes + * Proper page locking in readpage + * Changed to work with 2.1.45+ fs + * Jul 1997 Fixed follow_link + * 2.1.47 + * lookup shouldn't return -ENOENT + * from Horst von Brand: + * fail on wrong checksum + * double unlock_super was possible + * correct namelen for statfs + * spotted by Bill Hawes: + * readlink shouldn't iput() + * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir() + * exposed a problem in readdir + * 2.1.107 code-freeze spellchecker run + * Aug 1998 2.1.118+ VFS changes + * Sep 1998 2.1.122 another VFS change (follow_link) + * Apr 1999 2.2.7 no more EBADF checking in + * lookup/readdir, use ERR_PTR + * Jun 1999 2.3.6 d_alloc_root use changed + * 2.3.9 clean up usage of ENOENT/negative + * dentries in lookup + * clean up page flags setting + * (error, uptodate, locking) in + * in readpage + * use init_special_inode for + * fifos/sockets (and streamline) in + * read_inode, fix _ops table order + * Aug 1999 2.3.16 __initfunc() => __init change + * Oct 1999 2.3.24 page->owner hack obsoleted + * Nov 1999 2.3.27 2.3.25+ page->offset => index change + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/parser.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/mtd/super.h> +#include <linux/ctype.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uaccess.h> +#include "internal.h" + +static struct kmem_cache *romfs_inode_cachep; + +static const umode_t romfs_modemap[8] = { + 0, /* hard link */ + S_IFDIR | 0644, /* directory */ + S_IFREG | 0644, /* regular file */ + S_IFLNK | 0777, /* symlink */ + S_IFBLK | 0600, /* blockdev */ + S_IFCHR | 0600, /* chardev */ + S_IFSOCK | 0644, /* socket */ + S_IFIFO | 0644 /* FIFO */ +}; + +static const unsigned char romfs_dtype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO +}; + +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos); + +/* + * read a page worth of data from the image + */ +static int romfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t offset, size; + unsigned long fillsize, pos; + void *buf; + int ret; + + buf = kmap(page); + if (!buf) + return -ENOMEM; + + /* 32 bit warning -- but not for us :) */ + offset = page_offset(page); + size = i_size_read(inode); + fillsize = 0; + ret = 0; + if (offset < size) { + size -= offset; + fillsize = size > PAGE_SIZE ? 
PAGE_SIZE : size; + + pos = ROMFS_I(inode)->i_dataoffset + offset; + + ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize); + if (ret < 0) { + SetPageError(page); + fillsize = 0; + ret = -EIO; + } + } + + if (fillsize < PAGE_SIZE) + memset(buf + fillsize, 0, PAGE_SIZE - fillsize); + if (ret == 0) + SetPageUptodate(page); + + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + return ret; +} + +static const struct address_space_operations romfs_aops = { + .readpage = romfs_readpage +}; + +/* + * read the entries from a directory + */ +static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *i = filp->f_dentry->d_inode; + struct romfs_inode ri; + unsigned long offset, maxoff; + int j, ino, nextfh; + int stored = 0; + char fsname[ROMFS_MAXFN]; /* XXX dynamic? */ + int ret; + + maxoff = romfs_maxsize(i->i_sb); + + offset = filp->f_pos; + if (!offset) { + offset = i->i_ino & ROMFH_MASK; + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* Not really failsafe, but we are read-only... 
*/ + for (;;) { + if (!offset || offset >= maxoff) { + offset = maxoff; + filp->f_pos = offset; + goto out; + } + filp->f_pos = offset; + + /* Fetch inode info */ + ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto out; + + j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE, + sizeof(fsname) - 1); + if (j < 0) + goto out; + + ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j); + if (ret < 0) + goto out; + fsname[j] = '\0'; + + ino = offset; + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) == ROMFH_HRD) + ino = be32_to_cpu(ri.spec); + if (filldir(dirent, fsname, j, offset, ino, + romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) + goto out; + + stored++; + offset = nextfh & ROMFH_MASK; + } + +out: + return stored; +} + +/* + * look up an entry in a directory + */ +static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + unsigned long offset, maxoff; + struct inode *inode; + struct romfs_inode ri; + const char *name; /* got from dentry */ + int len, ret; + + offset = dir->i_ino & ROMFH_MASK; + ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE); + if (ret < 0) + goto error; + + /* search all the file entries in the list starting from the one + * pointed to by the directory's special data */ + maxoff = romfs_maxsize(dir->i_sb); + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + name = dentry->d_name.name; + len = dentry->d_name.len; + + for (;;) { + if (!offset || offset >= maxoff) + goto out0; + + ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* try to match the first 16 bytes of name */ + ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); + if (ret < 0) + goto error; + if (ret == 1) + break; + + /* next entry */ + offset = be32_to_cpu(ri.next) & ROMFH_MASK; + } + + /* Hard link handling */ + if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD) + offset = be32_to_cpu(ri.spec) & ROMFH_MASK; + + inode = 
romfs_iget(dir->i_sb, offset); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto error; + } + goto outi; + + /* + * it's a bit funky, _lookup needs to return an error code + * (negative) or a NULL, both as a dentry. ENOENT should not + * be returned, instead we need to create a negative dentry by + * d_add(dentry, NULL); and return 0 as no error. + * (Although as I see, it only matters on writable file + * systems). + */ +out0: + inode = NULL; +outi: + d_add(dentry, inode); + ret = 0; +error: + return ERR_PTR(ret); +} + +static const struct file_operations romfs_dir_operations = { + .read = generic_read_dir, + .readdir = romfs_readdir, +}; + +static struct inode_operations romfs_dir_inode_operations = { + .lookup = romfs_lookup, +}; + +/* + * get a romfs inode based on its position in the image (which doubles as the + * inode number) + */ +static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) +{ + struct romfs_inode_info *inode; + struct romfs_inode ri; + struct inode *i; + unsigned long nlen; + unsigned nextfh, ret; + umode_t mode; + + /* we might have to traverse a chain of "hard link" file entries to get + * to the actual file */ + for (;;) { + ret = romfs_dev_read(sb, pos, &ri, sizeof(ri)); + if (ret < 0) + goto error; + + /* XXX: do romfs_checksum here too (with name) */ + + nextfh = be32_to_cpu(ri.next); + if ((nextfh & ROMFH_TYPE) != ROMFH_HRD) + break; + + pos = be32_to_cpu(ri.spec) & ROMFH_MASK; + } + + /* determine the length of the filename */ + nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN); + if (IS_ERR_VALUE(nlen)) + goto eio; + + /* get an inode for this image position */ + i = iget_locked(sb, pos); + if (!i) + return ERR_PTR(-ENOMEM); + + if (!(i->i_state & I_NEW)) + return i; + + /* precalculate the data offset */ + inode = ROMFS_I(i); + inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK; + inode->i_dataoffset = pos + inode->i_metasize; + + i->i_nlink = 1; /* Hard to decide.. 
*/ + i->i_size = be32_to_cpu(ri.size); + i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; + i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + + /* set up mode and ops */ + mode = romfs_modemap[nextfh & ROMFH_TYPE]; + + switch (nextfh & ROMFH_TYPE) { + case ROMFH_DIR: + i->i_size = ROMFS_I(i)->i_metasize; + i->i_op = &romfs_dir_inode_operations; + i->i_fop = &romfs_dir_operations; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_REG: + i->i_fop = &romfs_ro_fops; + i->i_data.a_ops = &romfs_aops; + if (i->i_sb->s_mtd) + i->i_data.backing_dev_info = + i->i_sb->s_mtd->backing_dev_info; + if (nextfh & ROMFH_EXEC) + mode |= S_IXUGO; + break; + case ROMFH_SYM: + i->i_op = &page_symlink_inode_operations; + i->i_data.a_ops = &romfs_aops; + mode |= S_IRWXUGO; + break; + default: + /* depending on MBZ for sock/fifos */ + nextfh = be32_to_cpu(ri.spec); + init_special_inode(i, mode, MKDEV(nextfh >> 16, + nextfh & 0xffff)); + break; + } + + i->i_mode = mode; + + unlock_new_inode(i); + return i; + +eio: + ret = -EIO; +error: + printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + return ERR_PTR(ret); +} + +/* + * allocate a new inode + */ +static struct inode *romfs_alloc_inode(struct super_block *sb) +{ + struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); + return inode ? 
&inode->vfs_inode : NULL; +} + +/* + * return a spent inode to the slab cache + */ +static void romfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); +} + +/* + * get filesystem statistics + */ +static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + + buf->f_type = ROMFS_MAGIC; + buf->f_namelen = ROMFS_MAXFN; + buf->f_bsize = ROMBSIZE; + buf->f_bfree = buf->f_bavail = buf->f_ffree; + buf->f_blocks = + (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS; + buf->f_fsid.val[0] = (u32)id; + buf->f_fsid.val[1] = (u32)(id >> 32); + return 0; +} + +/* + * remounting must involve read-only + */ +static int romfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + +static const struct super_operations romfs_super_ops = { + .alloc_inode = romfs_alloc_inode, + .destroy_inode = romfs_destroy_inode, + .statfs = romfs_statfs, + .remount_fs = romfs_remount, +}; + +/* + * checksum check on part of a romfs filesystem + */ +static __u32 romfs_checksum(const void *data, int size) +{ + const __be32 *ptr = data; + __u32 sum; + + sum = 0; + size >>= 2; + while (size > 0) { + sum += be32_to_cpu(*ptr++); + size--; + } + return sum; +} + +/* + * fill in the superblock + */ +static int romfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct romfs_super_block *rsb; + struct inode *root; + unsigned long pos, img_size; + const char *storage; + size_t len; + int ret; + +#ifdef CONFIG_BLOCK + if (!sb->s_mtd) { + sb_set_blocksize(sb, ROMBSIZE); + } else { + sb->s_blocksize = ROMBSIZE; + sb->s_blocksize_bits = blksize_bits(ROMBSIZE); + } +#endif + + sb->s_maxbytes = 0xFFFFFFFF; + sb->s_magic = ROMFS_MAGIC; + sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_op = &romfs_super_ops; + + /* read the image superblock and check it */ + rsb = kmalloc(512, GFP_KERNEL); + if (!rsb) + 
return -ENOMEM; + + sb->s_fs_info = (void *) 512; + ret = romfs_dev_read(sb, 0, rsb, 512); + if (ret < 0) + goto error_rsb; + + img_size = be32_to_cpu(rsb->size); + + if (sb->s_mtd && img_size > sb->s_mtd->size) + goto error_rsb_inval; + + sb->s_fs_info = (void *) img_size; + + if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || + img_size < ROMFH_SIZE) { + if (!silent) + printk(KERN_WARNING "VFS:" + " Can't find a romfs filesystem on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { + printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", + sb->s_id); + goto error_rsb_inval; + } + + storage = sb->s_mtd ? "MTD" : "the block layer"; + + len = strnlen(rsb->name, ROMFS_MAXFN); + if (!silent) + printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); + + kfree(rsb); + rsb = NULL; + + /* find the root directory */ + pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK; + + root = romfs_iget(sb, pos); + if (!root) + goto error; + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) + goto error_i; + + return 0; + +error_i: + iput(root); +error: + return -EINVAL; +error_rsb_inval: + ret = -EINVAL; +error_rsb: + return ret; +} + +/* + * get a superblock for mounting + */ +static int romfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + int ret = -EINVAL; + +#ifdef CONFIG_ROMFS_ON_MTD + ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, + mnt); +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if (ret == -EINVAL) + ret = get_sb_bdev(fs_type, flags, dev_name, data, + romfs_fill_super, mnt); +#endif + return ret; +} + +/* + * destroy a romfs superblock in the appropriate manner + */ +static void romfs_kill_sb(struct super_block *sb) +{ +#ifdef CONFIG_ROMFS_ON_MTD + if (sb->s_mtd) { + kill_mtd_super(sb); + return; + } +#endif +#ifdef CONFIG_ROMFS_ON_BLOCK + if 
(sb->s_bdev) { + kill_block_super(sb); + return; + } +#endif +} + +static struct file_system_type romfs_fs_type = { + .owner = THIS_MODULE, + .name = "romfs", + .get_sb = romfs_get_sb, + .kill_sb = romfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * inode storage initialiser + */ +static void romfs_i_init_once(void *_inode) +{ + struct romfs_inode_info *inode = _inode; + + inode_init_once(&inode->vfs_inode); +} + +/* + * romfs module initialisation + */ +static int __init init_romfs_fs(void) +{ + int ret; + + printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + + romfs_inode_cachep = + kmem_cache_create("romfs_i", + sizeof(struct romfs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + romfs_i_init_once); + + if (!romfs_inode_cachep) { + printk(KERN_ERR + "ROMFS error: Failed to initialise inode cache\n"); + return -ENOMEM; + } + ret = register_filesystem(&romfs_fs_type); + if (ret) { + printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + goto error_register; + } + return 0; + +error_register: + kmem_cache_destroy(romfs_inode_cachep); + return ret; +} + +/* + * romfs module removal + */ +static void __exit exit_romfs_fs(void) +{ + unregister_filesystem(&romfs_fs_type); + kmem_cache_destroy(romfs_inode_cachep); +} + +module_init(init_romfs_fs); +module_exit(exit_romfs_fs); + +MODULE_DESCRIPTION("Direct-MTD Capable RomFS"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */ diff --git a/fs/splice.c b/fs/splice.c index dd727d43e5b..c18aa7e03e2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -737,10 +737,19 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, * ->write_end. Most of the time, these expect i_mutex to * be held. Since this may result in an ABBA deadlock with * pipe->inode, we have to order lock acquiry here. 
+ * + * Outer lock must be inode->i_mutex, as pipe_wait() will + * release and reacquire pipe->inode->i_mutex, AND inode must + * never be a pipe. */ - inode_double_lock(inode, pipe->inode); + WARN_ON(S_ISFIFO(inode->i_mode)); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = __splice_from_pipe(pipe, &sd, actor); - inode_double_unlock(inode, pipe->inode); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + mutex_unlock(&inode->i_mutex); return ret; } @@ -831,11 +840,17 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - inode_double_lock(inode, pipe->inode); + WARN_ON(S_ISFIFO(inode->i_mode)); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); ret = file_remove_suid(out); - if (likely(!ret)) + if (likely(!ret)) { + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - inode_double_unlock(inode, pipe->inode); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + } + mutex_unlock(&inode->i_mutex); if (ret > 0) { unsigned long nr_pages; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 69e971d5ddc..2b1b8fe5e03 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -40,6 +40,7 @@ #include <linux/dcache.h> #include <linux/exportfs.h> #include <linux/zlib.h> +#include <linux/slab.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/super.c b/fs/super.c index 77cb4ec919b..786fe7d7279 100644 --- a/fs/super.c +++ b/fs/super.c @@ -771,6 +771,46 @@ void kill_litter_super(struct super_block *sb) EXPORT_SYMBOL(kill_litter_super); +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +int get_sb_ns(struct file_system_type *fs_type, int flags, void 
*data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + up_write(&sb->s_umount); + deactivate_super(sb); + return err; + } + + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + return 0; +} + +EXPORT_SYMBOL(get_sb_ns); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index f393620890e..af1914462f0 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c) } /** - * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. * @c: UBIFS file-system description object * - * This function calculates and returns the number of eraseblocks which should - * be kept for index usage. + * This function calculates and returns the number of LEBs which should be kept + * for index usage. */ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) { - int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; + int idx_lebs; long long idx_size; idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; - /* And make sure we have thrice the index size of space reserved */ - idx_size = idx_size + (idx_size << 1); - + idx_size += idx_size << 1; /* * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' * pair, nor similarly the two variables for the new index size, so we * have to do this costly 64-bit division on fast-path. 
*/ - idx_size += eff_leb_size - 1; - idx_lebs = div_u64(idx_size, eff_leb_size); + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); /* * The index head is not available for the in-the-gaps method, so add an * extra LEB to compensate. @@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c) * do_budget_space - reserve flash space for index and data growth. * @c: UBIFS file-system description object * - * This function makes sure UBIFS has enough free eraseblocks for index growth - * and data. + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. * * When budgeting index space, UBIFS reserves thrice as many LEBs as the index * would take if it was consolidated and written to the flash. This guarantees * that the "in-the-gaps" commit method always succeeds and UBIFS will always * be able to commit dirty index. So this function basically adds amount of * budgeted index space to the size of the current index, multiplies this by 3, - * and makes sure this does not exceed the amount of free eraseblocks. + * and makes sure this does not exceed the amount of free LEBs. * * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might * be large, because UBIFS does not do any index consolidation as long as * there is free space. IOW, the index may take a lot of LEBs, but the LEBs * will contain a lot of dirt. - * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be - * consolidated to take up to @c->min_idx_lebs LEBs. + * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->min_idx_lebs LEBs. * * This function returns zero in case of success, and %-ENOSPC in case of * failure. @@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free) * This function calculates amount of free space to report to user-space. 
* * Because UBIFS may introduce substantial overhead (the index, node headers, - * alignment, wastage at the end of eraseblocks, etc), it cannot report real - * amount of free flash space it has (well, because not all dirty space is - * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, - * it would bread user expectations about what free space is. Users seem to - * accustomed to assume that if the file-system reports N bytes of free space, - * they would be able to fit a file of N bytes to the FS. This almost works for + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * bread user expectations about what free space is. Users seem to accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for * traditional file-systems, because they have way less overhead than UBIFS. * So, to keep users happy, UBIFS tries to take the overhead into account. */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index e975bd82f38..ce2cd834361 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) "bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - printk("%c", dent->name[i]); + printk(KERN_CONT "%c", dent->name[i]); } - printk("\n"); + printk(KERN_CONT "\n"); break; } @@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) /* * Make sure the last key in our znode is less or - * equivalent than the the key in zbranch which goes + * equivalent than the key in the zbranch which goes * after our pointing zbranch. 
*/ cmp = keys_cmp(c, max, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0ff89fe71e5..6d34dc7e33e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, struct ubifs_inode *ui = ubifs_inode(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; struct page *page; ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); @@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (!PageUptodate(page)) { /* The page is not loaded from the flash */ - if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { /* * We change whole page so no need to load it. But we * have to set the @PG_checked flag to make the further @@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, * the media. */ SetPageChecked(page); - else { + skipped_read = 1; + } else { err = do_readpage(page); if (err) { unlock_page(page); @@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, if (unlikely(err)) { ubifs_assert(err == -ENOSPC); /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* * Budgeting failed which means it would have to force * write-back but didn't, because we set the @fast flag in the * request. Write-back cannot be done now, while we have the @@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len) * whole index and correct all inode sizes, which is long an unacceptable. * * To prevent situations like this, UBIFS writes pages back only if they are - * within last synchronized inode size, i.e. 
the the size which has been + * within the last synchronized inode size, i.e. the size which has been * written to the flash media last time. Otherwise, UBIFS forces inode * write-back, thus making sure the on-flash inode contains current inode size, * and then keeps writing pages back. diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 717d79c97c5..1d54383d126 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * ubifs_find_free_space - find a data LEB with free space. * @c: the UBIFS file-system description object * @min_space: minimum amount of required free space - * @free: contains amount of free space in the LEB on exit + * @offs: contains offset of where free space starts on exit * @squeeze: whether to try to find space in a non-empty LEB first * * This function looks for an LEB with at least @min_space bytes of free space. @@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c, * failed to find a LEB with @min_space bytes of free space and other a negative * error codes in case of failure. */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze) { const struct ubifs_lprops *lprops; @@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, spin_unlock(&c->space_lock); } - *free = lprops->free; + *offs = c->leb_size - lprops->free; ubifs_release_lprops(c); - if (*free == c->leb_size) { + if (*offs == 0) { /* * Ensure that empty LEBs have been unmapped. They may not have * been, for example, because of an unclean unmount. 
Also @@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, return err; } - dbg_find("found LEB %d, free %d", lnum, *free); - ubifs_assert(*free >= min_space); + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); + ubifs_assert(*offs <= c->leb_size - min_space); return lnum; out: diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index a711d33b3d3..f0f5f15d384 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -47,7 +47,7 @@ * have to waste large pieces of free space at the end of LEB B, because nodes * from LEB A would not fit. And the worst situation is when all nodes are of * maximum size. So dark watermark is the amount of free + dirty space in LEB - * which are guaranteed to be reclaimable. If LEB has less space, the GC migh + * which are guaranteed to be reclaimable. If LEB has less space, the GC might * be unable to reclaim it. So, LEBs with free + dirty greater than dark * watermark are "good" LEBs from GC's point of few. The other LEBs are not so * good, and GC takes extra care when moving them. @@ -57,14 +57,6 @@ #include "ubifs.h" /* - * GC tries to optimize the way it fit nodes to available space, and it sorts - * nodes a little. The below constants are watermarks which define "large", - * "medium", and "small" nodes. - */ -#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) -#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ - -/* * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector * may move. @@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c) } /** - * joinup - bring data nodes for an inode together. - * @c: UBIFS file-system description object - * @sleb: describes scanned LEB - * @inum: inode number - * @blk: block number - * @data: list to which to add data nodes + * list_sort - sort a list. 
+ * @priv: private data, passed to @cmp + * @head: the list to sort + * @cmp: the elements comparison function * - * This function looks at the first few nodes in the scanned LEB @sleb and adds - * them to @data if they are data nodes from @inum and have a larger block - * number than @blk. This function returns %0 on success and a negative error - * code on failure. + * This function has been implemented by Mark J Roberts <mjr@znex.org>. It + * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted + * in ascending order. + * + * The comparison function @cmp is supposed to return a negative value if @a is + * than @b, and a positive value if @a is greater than @b. If @a and @b are + * equivalent, then it does not matter what this function returns. */ -static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, - unsigned int blk, struct list_head *data) +static void list_sort(void *priv, struct list_head *head, + int (*cmp)(void *priv, struct list_head *a, + struct list_head *b)) { - int err, cnt = 6, lnum = sleb->lnum, offs; - struct ubifs_scan_node *snod, *tmp; - union ubifs_key *key; + struct list_head *p, *q, *e, *list, *tail, *oldhead; + int insize, nmerges, psize, qsize, i; + + if (list_empty(head)) + return; + + list = head->next; + list_del(head); + insize = 1; + for (;;) { + p = oldhead = list; + list = tail = NULL; + nmerges = 0; + + while (p) { + nmerges++; + q = p; + psize = 0; + for (i = 0; i < insize; i++) { + psize++; + q = q->next == oldhead ? 
NULL : q->next; + if (!q) + break; + } - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { - key = &snod->key; - if (key_inum(c, key) == inum && - key_type(c, key) == UBIFS_DATA_KEY && - key_block(c, key) > blk) { - offs = snod->offs; - err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); - if (err < 0) - return err; - list_del(&snod->list); - if (err) { - list_add_tail(&snod->list, data); - blk = key_block(c, key); - } else - kfree(snod); - cnt = 6; - } else if (--cnt == 0) + qsize = insize; + while (psize > 0 || (qsize > 0 && q)) { + if (!psize) { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } else if (!qsize || !q) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else if (cmp(priv, p, q) <= 0) { + e = p; + p = p->next; + psize--; + if (p == oldhead) + p = NULL; + } else { + e = q; + q = q->next; + qsize--; + if (q == oldhead) + q = NULL; + } + if (tail) + tail->next = e; + else + list = e; + e->prev = tail; + tail = e; + } + p = q; + } + + tail->next = list; + list->prev = tail; + + if (nmerges <= 1) break; + + insize *= 2; } - return 0; + + head->next = list; + head->prev = list->prev; + list->prev->next = head; + list->prev = head; } /** - * move_nodes - move nodes. + * data_nodes_cmp - compare 2 data nodes. + * @priv: UBIFS file-system description object + * @a: first data node + * @a: second data node + * + * This function compares data nodes @a and @b. Returns %1 if @a has greater + * inode or block number, and %-1 otherwise. 
+ */ +int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); + ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); + + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + unsigned int blka = key_block(c, &sa->key); + unsigned int blkb = key_block(c, &sb->key); + + if (blka <= blkb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/* + * nondata_nodes_cmp - compare 2 non-data nodes. + * @priv: UBIFS file-system description object + * @a: first node + * @a: second node + * + * This function compares nodes @a and @b. It makes sure that inode nodes go + * first and sorted by length in descending order. Directory entry nodes go + * after inode nodes and are sorted in ascending hash valuer order. 
+ */ +int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + int typea, typeb; + ino_t inuma, inumb; + struct ubifs_info *c = priv; + struct ubifs_scan_node *sa, *sb; + + cond_resched(); + sa = list_entry(a, struct ubifs_scan_node, list); + sb = list_entry(b, struct ubifs_scan_node, list); + typea = key_type(c, &sa->key); + typeb = key_type(c, &sb->key); + ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); + + /* Inodes go before directory entries */ + if (typea == UBIFS_INO_KEY) { + if (typeb == UBIFS_INO_KEY) + return sb->len - sa->len; + return -1; + } + if (typeb == UBIFS_INO_KEY) + return 1; + + ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); + inuma = key_inum(c, &sa->key); + inumb = key_inum(c, &sb->key); + + if (inuma == inumb) { + uint32_t hasha = key_hash(c, &sa->key); + uint32_t hashb = key_hash(c, &sb->key); + + if (hasha <= hashb) + return -1; + } else if (inuma <= inumb) + return -1; + + return 1; +} + +/** + * sort_nodes - sort nodes for GC. * @c: UBIFS file-system description object - * @sleb: describes nodes to move + * @sleb: describes nodes to sort and contains the result on exit + * @nondata: contains non-data nodes on exit + * @min: minimum node size is returned here * - * This function moves valid nodes from data LEB described by @sleb to the GC - * journal head. The obsolete nodes are dropped. + * This function sorts the list of inodes to garbage collect. First of all, it + * kills obsolete nodes and separates data and non-data nodes to the + * @sleb->nodes and @nondata lists correspondingly. 
+ * + * Data nodes are then sorted in block number order - this is important for + * bulk-read; data nodes with lower inode number go before data nodes with + * higher inode number, and data nodes with lower block number go before data + * nodes with higher block number; * - * When moving nodes we have to deal with classical bin-packing problem: the - * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", - * where the nodes in the @sleb->nodes list are the elements which should be - * fit optimally to the bins. This function uses the "first fit decreasing" - * strategy, although it does not really sort the nodes but just split them on - * 3 classes - large, medium, and small, so they are roughly sorted. + * Non-data nodes are sorted as follows. + * o First go inode nodes - they are sorted in descending length order. + * o Then go directory entry nodes - they are sorted in hash order, which + * should supposedly optimize 'readdir()'. Direntry nodes with lower parent + * inode number go before direntry nodes with higher parent inode number, + * and direntry nodes with lower name hash values go before direntry nodes + * with higher name hash values. * - * This function returns zero in case of success, %-EAGAIN if commit is - * required, and other negative error codes in case of other failures. + * This function returns zero in case of success and a negative error code in + * case of failure. 
*/ -static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct list_head *nondata, int *min) { struct ubifs_scan_node *snod, *tmp; - struct list_head data, large, medium, small; - struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; - int avail, err, min = INT_MAX; - unsigned int blk = 0; - ino_t inum = 0; - INIT_LIST_HEAD(&data); - INIT_LIST_HEAD(&large); - INIT_LIST_HEAD(&medium); - INIT_LIST_HEAD(&small); + *min = INT_MAX; - while (!list_empty(&sleb->nodes)) { - struct list_head *lst = sleb->nodes.next; - - snod = list_entry(lst, struct ubifs_scan_node, list); + /* Separate data nodes and non-data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + int err; ubifs_assert(snod->type != UBIFS_IDX_NODE); ubifs_assert(snod->type != UBIFS_REF_NODE); @@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, snod->offs, 0); if (err < 0) - goto out; + return err; - list_del(lst); if (!err) { /* The node is obsolete, remove it from the list */ + list_del(&snod->list); kfree(snod); continue; } - /* - * Sort the list of nodes so that data nodes go first, large - * nodes go second, and small nodes go last. - */ - if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { - if (inum != key_inum(c, &snod->key)) { - if (inum) { - /* - * Try to move data nodes from the same - * inode together. 
- */ - err = joinup(c, sleb, inum, blk, &data); - if (err) - goto out; - } - inum = key_inum(c, &snod->key); - blk = key_block(c, &snod->key); - } - list_add_tail(lst, &data); - } else if (snod->len > MEDIUM_NODE_WM) - list_add_tail(lst, &large); - else if (snod->len > SMALL_NODE_WM) - list_add_tail(lst, &medium); - else - list_add_tail(lst, &small); - - /* And find the smallest node */ - if (snod->len < min) - min = snod->len; + if (snod->len < *min) + *min = snod->len; + + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) + list_move_tail(&snod->list, nondata); } - /* - * Join the tree lists so that we'd have one roughly sorted list - * ('large' will be the head of the joined list). - */ - list_splice(&data, &large); - list_splice(&medium, large.prev); - list_splice(&small, large.prev); + /* Sort data and non-data nodes */ + list_sort(c, &sleb->nodes, &data_nodes_cmp); + list_sort(c, nondata, &nondata_nodes_cmp); + return 0; +} + +/** + * move_node - move a node. + * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * @snod: the mode to move + * @wbuf: write-buffer to move node to + * + * This function moves node @snod to @wbuf, changes TNC correspondingly, and + * destroys @snod. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) +{ + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; + + cond_resched(); + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); + if (err) + return err; + + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, + snod->offs, new_lnum, new_offs, + snod->len); + list_del(&snod->list); + kfree(snod); + return err; +} + +/** + * move_nodes - move nodes. 
+ * @c: UBIFS file-system description object + * @sleb: describes the LEB to move nodes from + * + * This function moves valid nodes from data LEB described by @sleb to the GC + * journal head. This function returns zero in case of success, %-EAGAIN if + * commit is required, and other negative error codes in case of other + * failures. + */ +static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) +{ + int err, min; + LIST_HEAD(nondata); + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; if (wbuf->lnum == -1) { /* @@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) */ err = switch_gc_head(c); if (err) - goto out; + return err; } + err = sort_nodes(c, sleb, &nondata, &min); + if (err) + goto out; + /* Write nodes to their new location. Use the first-fit strategy */ while (1) { - avail = c->leb_size - wbuf->offs - wbuf->used; - list_for_each_entry_safe(snod, tmp, &large, list) { - int new_lnum, new_offs; + int avail; + struct ubifs_scan_node *snod, *tmp; + + /* Move data nodes */ + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; + if (snod->len > avail) + /* + * Do not skip data nodes in order to optimize + * bulk-read. + */ + break; + + err = move_node(c, sleb, snod, wbuf); + if (err) + goto out; + } + /* Move non-data nodes */ + list_for_each_entry_safe(snod, tmp, &nondata, list) { + avail = c->leb_size - wbuf->offs - wbuf->used; if (avail < min) break; - if (snod->len > avail) - /* This node does not fit */ + if (snod->len > avail) { + /* + * Keep going only if this is an inode with + * some data. Otherwise stop and switch the GC + * head. IOW, we assume that data-less inode + * nodes and direntry nodes are roughly of the + * same size. 
+ */ + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || + snod->len == UBIFS_INO_NODE_SZ) + break; continue; + } - cond_resched(); - - new_lnum = wbuf->lnum; - new_offs = wbuf->offs + wbuf->used; - err = ubifs_wbuf_write_nolock(wbuf, snod->node, - snod->len); + err = move_node(c, sleb, snod, wbuf); if (err) goto out; - err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, - snod->offs, new_lnum, new_offs, - snod->len); - if (err) - goto out; - - avail = c->leb_size - wbuf->offs - wbuf->used; - list_del(&snod->list); - kfree(snod); } - if (list_empty(&large)) + if (list_empty(&sleb->nodes) && list_empty(&nondata)) break; /* @@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) return 0; out: - list_for_each_entry_safe(snod, tmp, &large, list) { - list_del(&snod->list); - kfree(snod); - } + list_splice_tail(&nondata, &sleb->nodes); return err; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index a11ca0958a2..64b5f3a309f 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) */ static int reserve_space(struct ubifs_info *c, int jhead, int len) { - int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; /* @@ -139,10 +139,9 @@ again: * Write buffer wasn't seek'ed or there is no enough space - look for an * LEB with some empty space. */ - lnum = ubifs_find_free_space(c, len, &free, squeeze); + lnum = ubifs_find_free_space(c, len, &offs, squeeze); if (lnum >= 0) { /* Found an LEB, add it to the journal head */ - offs = c->leb_size - free; err = ubifs_add_bud_to_log(c, jhead, lnum, offs); if (err) goto out_return; @@ -1366,7 +1365,7 @@ out_ro: * @host: host inode * * This function writes the updated version of an extended attribute inode and - * the host inode tho the journal (to the base head). 
The host inode is written + * the host inode to the journal (to the base head). The host inode is written * after the extended attribute inode in order to guarantee that the extended * attribute will be flushed when the inode is synchronized by 'fsync()' and * consequently, the write-buffer is synchronized. This function returns zero diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index efb3430a258..5fa27ea031b 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k) * @c: UBIFS file-system description object * @key: the key to get hash from */ -static inline int key_hash(const struct ubifs_info *c, - const union ubifs_key *key) +static inline uint32_t key_hash(const struct ubifs_info *c, + const union ubifs_key *key) { return key->u32[1] & UBIFS_S_KEY_HASH_MASK; } @@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c, * @c: UBIFS file-system description object * @k: the key to get hash from */ -static inline int key_hash_flash(const struct ubifs_info *c, const void *k) +static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) { const union ubifs_key *key = k; diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 3e0aa736755..56e33772a1e 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) } /* - * Make sure the the amount of space in buds will not exceed + * Make sure the amount of space in buds will not exceed the * 'c->max_bud_bytes' limit, because we want to guarantee mount time * limits. 
* @@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c) bud->jhead, c->leb_size - bud->start, c->cmt_bud_bytes); rb_erase(p1, &c->buds); - list_del(&bud->list); /* * If the commit does not finish, the recovery will need * to replay the journal, in which case the old buds @@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c) * commit i.e. do not allow them to be garbage * collected. */ - list_add(&bud->list, &c->old_buds); + list_move(&bud->list, &c->old_buds); } } spin_unlock(&c->buds_lock); diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 3216a1f277f..8cbfb824802 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c) while (offs + len > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->lsave_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c) if (offs + c->ltab_sz > c->leb_size) { alen = ALIGN(offs, c->min_io_size); upd_ltab(c, lnum, c->leb_size - alen, alen - offs); - dbg_chk_lpt_sz(c, 2, alen - offs); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = alloc_lpt_leb(c, &lnum); if (err) goto no_space; @@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c) alen, UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 4, alen - wlen); } - dbg_chk_lpt_sz(c, 2, 0); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; - from = 0; + offs = from = 0; ubifs_assert(lnum 
>= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c) UBI_SHORTTERM); if (err) return err; - dbg_chk_lpt_sz(c, 2, alen - wlen); + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); err = realloc_lpt_leb(c, &lnum); if (err) goto no_space; - offs = 0; + offs = from = 0; ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last); err = ubifs_leb_unmap(c, lnum); @@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) /** * dbg_chk_lpt_sz - check LPT does not write more than LPT size. * @c: the UBIFS file-system description object - * @action: action + * @action: what to do * @len: length written * * This function returns %0 on success and a negative error code on failure. + * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. 
+ * o %4 - wasted @len bytes; */ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) { @@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) lnum, offs); err = ubifs_unpack_nnode(c, buf, &nnode); for (i = 0; i < UBIFS_LPT_FANOUT; i++) { - printk("%d:%d", nnode.nbranch[i].lnum, + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, nnode.nbranch[i].offs); if (i != UBIFS_LPT_FANOUT - 1) - printk(", "); + printk(KERN_CONT ", "); } - printk("\n"); + printk(KERN_CONT "\n"); break; } case UBIFS_LPT_LTAB: diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 90acac603e6..10662975d2e 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, * @lnum: LEB number of the LEB from which @buf was read * @offs: offset from which @buf was read * - * This function scans @buf for more nodes and returns %0 is a node is found and - * %1 if no more nodes are found. + * This function ensures that the corrupted node at @offs is the last thing + * written to a LEB. This function returns %1 if more data is not found and + * %0 if more data is found. 
*/ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, int lnum, int offs) { - int skip, next_offs = 0; + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); - if (len > UBIFS_DATA_NODE_SZ) { - struct ubifs_ch *ch = buf; - int dlen = le32_to_cpu(ch->len); - - if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && - dlen <= UBIFS_MAX_DATA_NODE_SZ) - /* The corrupt node looks like a data node */ - next_offs = ALIGN(offs + dlen, 8); - } - - if (c->min_io_size == 1) - skip = 8; - else - skip = ALIGN(offs + 1, c->min_io_size) - offs; - - offs += skip; - buf += skip; - len -= skip; - while (len > 8) { - struct ubifs_ch *ch = buf; - uint32_t magic = le32_to_cpu(ch->magic); - int ret; - - if (magic == UBIFS_NODE_MAGIC) { - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); - if (ret == SCANNED_A_NODE || ret > 0) { - /* - * There is a small chance this is just data in - * a data node, so check that possibility. e.g. - * this is part of a file that itself contains - * a UBIFS image. - */ - if (next_offs && offs + le32_to_cpu(ch->len) <= - next_offs) - continue; - dbg_rcvry("unexpected node at %d:%d", lnum, - offs); - return 0; - } - } - offs += 8; - buf += 8; - len -= 8; + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. 
+ */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; } - return 1; + /* Now we know the corrupt node's length we can skip over it */ + skip = ALIGN(offs + dlen, c->min_io_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; } /** diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index ce42a7b0ca5..11cc80125a4 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) dirty -= c->leb_size - lp->free; /* * If the replay order was perfect the dirty space would now be - * zero. The order is not perfect because the the journal heads + * zero. The order is not perfect because the journal heads * race with each other. This is not a problem but is does mean * that the dirty space may temporarily exceed c->leb_size * during the replay. diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index e070c643d1b..57085e43320 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (tmp64 > DEFAULT_MAX_RP_SIZE) tmp64 = DEFAULT_MAX_RP_SIZE; sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); kfree(sup); @@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c) if (IS_ERR(sup)) return PTR_ERR(sup); + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + /* * The software supports all previous versions but not future versions, * due to the unavailability of time-travelling equipment. 
*/ - c->fmt_version = le32_to_cpu(sup->fmt_version); if (c->fmt_version > UBIFS_FORMAT_VERSION) { - ubifs_err("on-flash format version is %d, but software only " - "supports up to version %d", c->fmt_version, - UBIFS_FORMAT_VERSION); - err = -EINVAL; - goto out; + struct super_block *sb = c->vfs_sb; + int mounting_ro = sb->s_flags & MS_RDONLY; + + ubifs_assert(!c->ro_media || mounting_ro); + if (!mounting_ro || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but " + "software only supports up to version " + "w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. + */ + c->rw_incompat = 1; } if (c->fmt_version < 3) { @@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c) c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; c->main_first = c->leb_cnt - c->main_lebs; - c->report_rp_size = ubifs_reported_space(c, c->rp_size); err = validate_sb(c, sup); out: diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index e7bab52a141..02feb59cefc 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention) * Move this one to the end of the list to provide some * fairness. 
*/ - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); mutex_unlock(&c->umount_mutex); if (freed >= nr) break; @@ -263,8 +262,7 @@ static int kick_a_thread(void) } if (i == 1) { - list_del(&c->infos_list); - list_add_tail(&c->infos_list, &ubifs_infos); + list_move_tail(&c->infos_list, &ubifs_infos); spin_unlock(&ubifs_infos_lock); ubifs_request_bg_commit(c); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index c5c98355459..faa44f90608 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { - seq_printf(s, ",compr="); - seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); } return 0; @@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c) if (err) return err; + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; return 0; } @@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c) long long tmp64; c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* * Calculate total amount of FS blocks. This number is not used @@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_cbuf; /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c) else { c->need_recovery = 0; ubifs_msg("recovery completed"); - /* GC LEB has to be empty and taken at this point */ - ubifs_assert(c->lst.taken_empty_lebs == 1); + /* + * GC LEB has to be empty and taken at this point. 
But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); } } else - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); err = dbg_check_filesystem(c); if (err) @@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c) x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("media format: %d (latest is %d)", - c->fmt_version, UBIFS_FORMAT_VERSION); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); ubifs_msg("reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); @@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c) { int err, lnum; + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software " + "only supports up to version w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + mutex_lock(&c->umount_mutex); dbg_save_space_info(c); c->remounting_rw = 1; @@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) ubifs_create_buds_lists(c); /* Create background thread */ - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; @@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) c->bu.buf = NULL; } - ubifs_assert(c->lst.taken_empty_lebs == 1); + ubifs_assert(c->lst.taken_empty_lebs > 0); return 0; } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 
fa28a84c6a1..f249f7b0d65 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, * splitting in the middle of the colliding sequence. Also, when * removing the leftmost key, we would have to correct the key of the * parent node, which would introduce additional complications. Namely, - * if we changed the the leftmost key of the parent znode, the garbage + * if we changed the leftmost key of the parent znode, the garbage * collector would be unable to find it (GC is doing this when GC'ing * indexing LEBs). Although we already have an additional RB-tree where * we save such changed znodes (see 'ins_clr_old_idx_znode()') until diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index b25fc36cf72..3eee07e0c49 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -36,9 +36,31 @@ /* UBIFS node magic number (must not have the padding byte first or last) */ #define UBIFS_NODE_MAGIC 0x06101831 -/* UBIFS on-flash format version */ +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format is changing. If this happens, UBIFS is will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ #define UBIFS_FORMAT_VERSION 4 +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. 
With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. + */ +#define UBIFS_RO_COMPAT_VERSION 0 + /* Minimum logical eraseblock size in bytes */ #define UBIFS_MIN_LEB_SZ (15*1024) @@ -53,7 +75,7 @@ /* * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes - * shorter than uncompressed data length, UBIFS preferes to leave this data + * shorter than uncompressed data length, UBIFS prefers to leave this data * node uncompress, because it'll be read faster. */ #define UBIFS_MIN_COMPRESS_DIFF 64 @@ -586,6 +608,7 @@ struct ubifs_pad_node { * @padding2: reserved for future, zeroes * @time_gran: time granularity in nanoseconds * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version */ struct ubifs_sb_node { struct ubifs_ch ch; @@ -612,7 +635,8 @@ struct ubifs_sb_node { __le64 rp_size; __le32 time_gran; __u8 uuid[16]; - __u8 padding2[3972]; + __le32 ro_compat_version; + __u8 padding2[3968]; } __attribute__ ((packed)); /** diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 039a68bee29..0a8341e1408 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -934,6 +934,7 @@ struct ubifs_debug_info; * by @commit_sem * @cnt_lock: protects @highest_inum and @max_sqnum counters * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version * @uuid: UUID from super block * * @lhead_lnum: log head logical eraseblock number @@ -966,6 +967,7 @@ struct ubifs_debug_info; * recovery) * @bulk_read: enable bulk-reads * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible * * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and * @calc_idx_sz @@ -1015,6 +1017,8 @@ struct ubifs_debug_info; * @min_io_shift: number of bits in @min_io_size minus one * @leb_size: logical eraseblock size in bytes * @half_leb_size: half LEB size + * @idx_leb_size: 
how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) * @leb_cnt: count of logical eraseblocks * @max_leb_cnt: maximum count of logical eraseblocks * @old_leb_cnt: count of logical eraseblocks before re-size @@ -1132,8 +1136,8 @@ struct ubifs_debug_info; * previous commit start * @uncat_list: list of un-categorized LEBs * @empty_list: list of empty LEBs - * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) - * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) * @freeable_cnt: number of freeable LEBs in @freeable_list * * @ltab_lnum: LEB number of LPT's own lprops table @@ -1177,6 +1181,7 @@ struct ubifs_info { unsigned long long cmt_no; spinlock_t cnt_lock; int fmt_version; + int ro_compat_version; unsigned char uuid[16]; int lhead_lnum; @@ -1205,6 +1210,7 @@ struct ubifs_info { unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; + unsigned int rw_incompat:1; struct mutex tnc_mutex; struct ubifs_zbranch zroot; @@ -1253,6 +1259,7 @@ struct ubifs_info { int min_io_shift; int leb_size; int half_leb_size; + int idx_leb_size; int leb_cnt; int max_leb_cnt; int old_leb_cnt; @@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ -int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, int squeeze); int ubifs_find_free_leb_for_idx(struct ubifs_info *c); int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c13f67300fe..7ec89fc05b2 100644 --- 
a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -153,23 +153,6 @@ xfs_find_bdev_for_inode( } /* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. If we are asked to wait, - * flush the workqueue. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend, - int wait) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - queue_work(xfsdatad_workqueue, &ioend->io_work); - if (wait) - flush_workqueue(xfsdatad_workqueue); - } -} - -/* * We're now finished for good with this ioend structure. * Update the page state via the associated buffer_heads, * release holds on the inode and bio, and finally free @@ -310,6 +293,27 @@ xfs_end_bio_read( } /* + * Schedule IO completion handling on a xfsdatad if this was + * the final hold on this ioend. If we are asked to wait, + * flush the workqueue. + */ +STATIC void +xfs_finish_ioend( + xfs_ioend_t *ioend, + int wait) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct workqueue_struct *wq = xfsdatad_workqueue; + if (ioend->io_work.func == xfs_end_bio_unwritten) + wq = xfsconvertd_workqueue; + + queue_work(wq, &ioend->io_work); + if (wait) + flush_workqueue(wq); + } +} + +/* * Allocate and initialise an IO completion structure. * We need to track unwritten extent write completion here initially. 
* We'll need to extend this for updating the ondisk inode size later diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 1dd52884975..221b3e66cee 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -19,6 +19,7 @@ #define __XFS_AOPS_H__ extern struct workqueue_struct *xfsdatad_workqueue; +extern struct workqueue_struct *xfsconvertd_workqueue; extern mempool_t *xfs_ioend_pool; /* diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index aa1016bb913..e28800a9f2b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -51,6 +51,7 @@ static struct shrinker xfs_buf_shake = { static struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; +struct workqueue_struct *xfsconvertd_workqueue; #ifdef XFS_BUF_TRACE void @@ -1775,6 +1776,7 @@ xfs_flush_buftarg( xfs_buf_t *bp, *n; int pincount = 0; + xfs_buf_runall_queues(xfsconvertd_workqueue); xfs_buf_runall_queues(xfsdatad_workqueue); xfs_buf_runall_queues(xfslogd_workqueue); @@ -1831,9 +1833,15 @@ xfs_buf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; + xfsconvertd_workqueue = create_workqueue("xfsconvertd"); + if (!xfsconvertd_workqueue) + goto out_destroy_xfsdatad_workqueue; + register_shrinker(&xfs_buf_shake); return 0; + out_destroy_xfsdatad_workqueue: + destroy_workqueue(xfsdatad_workqueue); out_destroy_xfslogd_workqueue: destroy_workqueue(xfslogd_workqueue); out_free_buf_zone: @@ -1849,6 +1857,7 @@ void xfs_buf_terminate(void) { unregister_shrinker(&xfs_buf_shake); + destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 5aeb7777696..08be36d7326 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -74,14 +74,14 @@ xfs_flush_pages( if (mapping_tagged(mapping, 
PAGECACHE_TAG_DIRTY)) { xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_fdatawrite(mapping); - if (flags & XFS_B_ASYNC) - return -ret; - ret2 = filemap_fdatawait(mapping); - if (!ret) - ret = ret2; + ret = -filemap_fdatawrite(mapping); } - return -ret; + if (flags & XFS_B_ASYNC) + return ret; + ret2 = xfs_wait_on_pages(ip, first, last); + if (!ret) + ret = ret2; + return ret; } int diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7e90daa0d1d..9142192ccbe 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -751,10 +751,26 @@ start: goto relock; } } else { + int enospc = 0; + ssize_t ret2 = 0; + +write_retry: xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, *offset, ioflags); - ret = generic_file_buffered_write(iocb, iovp, segs, + ret2 = generic_file_buffered_write(iocb, iovp, segs, pos, offset, count, ret); + /* + * if we just got an ENOSPC, flush the inode now we + * aren't holding any page locks and retry *once* + */ + if (ret2 == -ENOSPC && !enospc) { + error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE); + if (error) + goto out_unlock_internal; + enospc = 1; + goto write_retry; + } + ret = ret2; } current->backing_dev_info = NULL; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a608e72fa40..f7ba76633c2 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -62,12 +62,6 @@ xfs_sync_inodes_ag( uint32_t first_index = 0; int error = 0; int last_error = 0; - int fflag = XFS_B_ASYNC; - - if (flags & SYNC_DELWRI) - fflag = XFS_B_DELWRI; - if (flags & SYNC_WAIT) - fflag = 0; /* synchronous overrides all */ do { struct inode *inode; @@ -128,11 +122,23 @@ xfs_sync_inodes_ag( * If we have to flush data or wait for I/O completion * we need to hold the iolock. 
*/ - if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) { - xfs_ilock(ip, XFS_IOLOCK_SHARED); - lock_flags |= XFS_IOLOCK_SHARED; - error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE); - if (flags & SYNC_IOWAIT) + if (flags & SYNC_DELWRI) { + if (VN_DIRTY(inode)) { + if (flags & SYNC_TRYLOCK) { + if (xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) + lock_flags |= XFS_IOLOCK_SHARED; + } else { + xfs_ilock(ip, XFS_IOLOCK_SHARED); + lock_flags |= XFS_IOLOCK_SHARED; + } + if (lock_flags & XFS_IOLOCK_SHARED) { + error = xfs_flush_pages(ip, 0, -1, + (flags & SYNC_WAIT) ? 0 + : XFS_B_ASYNC, + FI_NONE); + } + } + if (VN_CACHED(inode) && (flags & SYNC_IOWAIT)) xfs_ioend_wait(ip); } xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -398,15 +404,17 @@ STATIC void xfs_syncd_queue_work( struct xfs_mount *mp, void *data, - void (*syncer)(struct xfs_mount *, void *)) + void (*syncer)(struct xfs_mount *, void *), + struct completion *completion) { - struct bhv_vfs_sync_work *work; + struct xfs_sync_work *work; - work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP); + work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP); INIT_LIST_HEAD(&work->w_list); work->w_syncer = syncer; work->w_data = data; work->w_mount = mp; + work->w_completion = completion; spin_lock(&mp->m_sync_lock); list_add_tail(&work->w_list, &mp->m_sync_list); spin_unlock(&mp->m_sync_lock); @@ -420,49 +428,26 @@ xfs_syncd_queue_work( * heads, looking about for more room... */ STATIC void -xfs_flush_inode_work( - struct xfs_mount *mp, - void *arg) -{ - struct inode *inode = arg; - filemap_flush(inode->i_mapping); - iput(inode); -} - -void -xfs_flush_inode( - xfs_inode_t *ip) -{ - struct inode *inode = VFS_I(ip); - - igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work); - delay(msecs_to_jiffies(500)); -} - -/* - * This is the "bigger hammer" version of xfs_flush_inode_work... - * (IOW, "If at first you don't succeed, use a Bigger Hammer"). 
- */ -STATIC void -xfs_flush_device_work( +xfs_flush_inodes_work( struct xfs_mount *mp, void *arg) { struct inode *inode = arg; - sync_blockdev(mp->m_super->s_bdev); + xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK); + xfs_sync_inodes(mp, SYNC_DELWRI | SYNC_TRYLOCK | SYNC_IOWAIT); iput(inode); } void -xfs_flush_device( +xfs_flush_inodes( xfs_inode_t *ip) { struct inode *inode = VFS_I(ip); + DECLARE_COMPLETION_ONSTACK(completion); igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work); - delay(msecs_to_jiffies(500)); + xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); + wait_for_completion(&completion); xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); } @@ -497,7 +482,7 @@ xfssyncd( { struct xfs_mount *mp = arg; long timeleft; - bhv_vfs_sync_work_t *work, *n; + xfs_sync_work_t *work, *n; LIST_HEAD (tmp); set_freezable(); @@ -532,6 +517,8 @@ xfssyncd( list_del(&work->w_list); if (work == &mp->m_sync_work) continue; + if (work->w_completion) + complete(work->w_completion); kmem_free(work); } } @@ -545,6 +532,7 @@ xfs_syncd_init( { mp->m_sync_work.w_syncer = xfs_sync_worker; mp->m_sync_work.w_mount = mp; + mp->m_sync_work.w_completion = NULL; mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd"); if (IS_ERR(mp->m_sync_task)) return -PTR_ERR(mp->m_sync_task); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 04f058c848a..308d5bf6dfb 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -21,18 +21,20 @@ struct xfs_mount; struct xfs_perag; -typedef struct bhv_vfs_sync_work { +typedef struct xfs_sync_work { struct list_head w_list; struct xfs_mount *w_mount; void *w_data; /* syncer routine argument */ void (*w_syncer)(struct xfs_mount *, void *); -} bhv_vfs_sync_work_t; + struct completion *w_completion; +} xfs_sync_work_t; #define SYNC_ATTR 0x0001 /* sync attributes */ #define SYNC_DELWRI 0x0002 /* look at delayed writes */ #define SYNC_WAIT 
0x0004 /* wait for i/o to complete */ #define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */ #define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */ +#define SYNC_TRYLOCK 0x0020 /* only try to lock inodes */ int xfs_syncd_init(struct xfs_mount *mp); void xfs_syncd_stop(struct xfs_mount *mp); @@ -43,8 +45,7 @@ int xfs_sync_fsdata(struct xfs_mount *mp, int flags); int xfs_quiesce_data(struct xfs_mount *mp); void xfs_quiesce_attr(struct xfs_mount *mp); -void xfs_flush_inode(struct xfs_inode *ip); -void xfs_flush_device(struct xfs_inode *ip); +void xfs_flush_inodes(struct xfs_inode *ip); int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 478e587087f..89b81eedce6 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -69,15 +69,6 @@ xfs_inode_alloc( ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - /* - * initialise the VFS inode here to get failures - * out of the way early. - */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_zone_free(xfs_inode_zone, ip); - return NULL; - } - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -113,6 +104,20 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif + /* + * Now initialise the VFS inode. We do this after the xfs_inode + * initialisation as internal failures will result in ->destroy_inode + * being called and that will pass down through the reclaim path and + * free the XFS inode. This path requires the XFS inode to already be + * initialised. Hence if this call fails, the xfs_inode has already + * been freed and we should not reference it at all in the error + * handling. 
+ */ + if (!inode_init_always(mp->m_super, VFS_I(ip))) + return NULL; + + /* prevent anyone from using this yet */ + VFS_I(ip)->i_state = I_NEW|I_LOCK; return ip; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 08ce72316bf..5aaa2d7ec15 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -338,38 +338,6 @@ xfs_iomap_eof_align_last_fsb( } STATIC int -xfs_flush_space( - xfs_inode_t *ip, - int *fsynced, - int *ioflags) -{ - switch (*fsynced) { - case 0: - if (ip->i_delayed_blks) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inode(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 1; - } else { - *ioflags |= BMAPI_SYNC; - *fsynced = 2; - } - return 0; - case 1: - *fsynced = 2; - *ioflags |= BMAPI_SYNC; - return 0; - case 2: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_device(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 3; - return 0; - } - return 1; -} - -STATIC int xfs_cmn_err_fsblock_zero( xfs_inode_t *ip, xfs_bmbt_irec_t *imap) @@ -538,15 +506,9 @@ error_out: } /* - * If the caller is doing a write at the end of the file, - * then extend the allocation out to the file system's write - * iosize. We clean up any extra space left over when the - * file is closed in xfs_inactive(). - * - * For sync writes, we are flushing delayed allocate space to - * try to make additional space available for allocation near - * the filesystem full boundary - preallocation hurts in that - * situation, of course. + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). 
*/ STATIC int xfs_iomap_eof_want_preallocate( @@ -565,7 +527,7 @@ xfs_iomap_eof_want_preallocate( int n, error, imaps; *prealloc = 0; - if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size) + if ((offset + count) <= ip->i_size) return 0; /* @@ -611,7 +573,7 @@ xfs_iomap_write_delay( xfs_extlen_t extsz; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, fsynced = 0; + int prealloc, flushed = 0; int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -627,12 +589,12 @@ xfs_iomap_write_delay( extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); -retry: error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, ioflag, imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; +retry: if (prealloc) { aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); @@ -659,15 +621,22 @@ retry: /* * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space - flush delalloc, and retry.. + * then we must have run out of space - flush all other inodes with + * delalloc blocks and retry without EOF preallocation. 
*/ if (nimaps == 0) { xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, ip, offset, count); - if (xfs_flush_space(ip, &fsynced, &ioflag)) + if (flushed) return XFS_ERROR(ENOSPC); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_flush_inodes(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + + flushed = 1; error = 0; + prealloc = 0; goto retry; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index a1cc1322fc0..fdcf7b82747 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -40,8 +40,7 @@ typedef enum { BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ - BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */ - BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */ + BMAPI_TRYLOCK = (1 << 7), /* non-blocking request */ } bmapi_flags_t; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f76c6d7cea2..3750f04ede0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -562,9 +562,8 @@ xfs_log_mount( } mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); - if (!mp->m_log) { - cmn_err(CE_WARN, "XFS: Log allocation failed: No memory!"); - error = ENOMEM; + if (IS_ERR(mp->m_log)) { + error = -PTR_ERR(mp->m_log); goto out; } @@ -1180,10 +1179,13 @@ xlog_alloc_log(xfs_mount_t *mp, xfs_buf_t *bp; int i; int iclogsize; + int error = ENOMEM; log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); - if (!log) - return NULL; + if (!log) { + xlog_warn("XFS: Log allocation failed: No memory!"); + goto out; + } log->l_mp = mp; log->l_targ = log_target; @@ -1201,19 +1203,35 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_grant_reserve_cycle = 1; log->l_grant_write_cycle = 1; + error = EFSCORRUPTED; if (xfs_sb_version_hassector(&mp->m_sb)) { log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; - ASSERT(log->l_sectbb_log <= mp->m_sectbb_log); + if (log->l_sectbb_log < 0 || + log->l_sectbb_log > mp->m_sectbb_log) { + xlog_warn("XFS: Log sector size (0x%x) 
out of range.", + log->l_sectbb_log); + goto out_free_log; + } + /* for larger sector sizes, must have v2 or external log */ - ASSERT(log->l_sectbb_log == 0 || - log->l_logBBstart == 0 || - xfs_sb_version_haslogv2(&mp->m_sb)); - ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT); + if (log->l_sectbb_log != 0 && + (log->l_logBBstart != 0 && + !xfs_sb_version_haslogv2(&mp->m_sb))) { + xlog_warn("XFS: log sector size (0x%x) invalid " + "for configuration.", log->l_sectbb_log); + goto out_free_log; + } + if (mp->m_sb.sb_logsectlog < BBSHIFT) { + xlog_warn("XFS: Log sector log (0x%x) too small.", + mp->m_sb.sb_logsectlog); + goto out_free_log; + } } log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; xlog_get_iclog_buffer_size(mp, log); + error = ENOMEM; bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); if (!bp) goto out_free_log; @@ -1313,7 +1331,8 @@ out_free_iclog: xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); - return NULL; +out: + return ERR_PTR(-error); } /* xlog_alloc_log */ @@ -2541,18 +2560,19 @@ redo: xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_grant_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_reserve_headq, tic); @@ -2631,7 +2651,7 @@ xlog_regrant_write_log_space(xlog_t *log, * for more free space, otherwise try to get some space for * this transaction. 
*/ - + need_bytes = tic->t_unit_res; if ((ntic = log->l_write_headq)) { free_bytes = xlog_space_left(log, log->l_grant_write_cycle, log->l_grant_write_bytes); @@ -2651,26 +2671,25 @@ xlog_regrant_write_log_space(xlog_t *log, xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already * off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 1"); - xlog_grant_push_ail(log->l_mp, tic->t_unit_res); - spin_lock(&log->l_grant_lock); } } - need_bytes = tic->t_unit_res; - redo: if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; @@ -2680,19 +2699,20 @@ redo: if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) xlog_ins_ticketq(&log->l_write_headq, tic); + spin_unlock(&log->l_grant_lock); + xlog_grant_push_ail(log->l_mp, need_bytes); + spin_lock(&log->l_grant_lock); + XFS_STATS_INC(xs_sleep_logspace); sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s); /* If we're shutting down, this tic is already off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_lock(&log->l_grant_lock); + spin_lock(&log->l_grant_lock); + if (XLOG_FORCED_SHUTDOWN(log)) goto error_return; - } xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - spin_lock(&log->l_grant_lock); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) xlog_del_ticketq(&log->l_write_headq, tic); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7af44adffc8..d6a64392f98 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -313,7 +313,7 @@ typedef struct xfs_mount { #endif 
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct task_struct *m_sync_task; /* generalised sync thread */ - bhv_vfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ + xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ struct list_head m_sync_list; /* sync thread work item list */ spinlock_t m_sync_lock; /* work item list lock */ int m_sync_seq; /* sync thread generation no. */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 7394c7af5de..19cf90a9c76 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1457,6 +1457,13 @@ xfs_create( error = xfs_trans_reserve(tp, resblks, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); if (error == ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(dp); + error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, + XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); + } + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ resblks = 0; error = xfs_trans_reserve(tp, 0, log_res, 0, XFS_TRANS_PERM_LOG_RES, log_count); |