diff options
Diffstat (limited to 'fs')
360 files changed, 36656 insertions, 28171 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 6a4ad4bb7a5..e54be705835 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -29,6 +29,7 @@ config EXT2_FS_XATTR config EXT2_FS_POSIX_ACL bool "Ext2 POSIX Access Control Lists" depends on EXT2_FS_XATTR + select FS_POSIX_ACL help Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. @@ -50,6 +51,23 @@ config EXT2_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. +config EXT2_FS_XIP + bool "Ext2 execute in place support" + depends on EXT2_FS + help + Execute in place can be used on memory-backed block devices. If you + enable this option, you can select to mount block devices which are + capable of this feature without using the page cache. + + If you do not use a block device that is capable of using this, + or if unsure, say N. + +config FS_XIP +# execute in place + bool + depends on EXT2_FS_XIP + default y + config EXT3_FS tristate "Ext3 journalling file system support" help @@ -97,6 +115,7 @@ config EXT3_FS_XATTR config EXT3_FS_POSIX_ACL bool "Ext3 POSIX Access Control Lists" depends on EXT3_FS_XATTR + select FS_POSIX_ACL help Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. @@ -224,6 +243,7 @@ config REISERFS_FS_XATTR config REISERFS_FS_POSIX_ACL bool "ReiserFS POSIX Access Control Lists" depends on REISERFS_FS_XATTR + select FS_POSIX_ACL help Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. @@ -257,6 +277,7 @@ config JFS_FS config JFS_POSIX_ACL bool "JFS POSIX Access Control Lists" depends on JFS_FS + select FS_POSIX_ACL help Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. @@ -301,8 +322,7 @@ config FS_POSIX_ACL # Never use this symbol for ifdefs. 
# bool - depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL || NFSD_V4 - default y + default n source "fs/xfs/Kconfig" @@ -339,6 +359,22 @@ config ROMFS_FS If you don't know whether you need it, then you don't need it: answer N. +config INOTIFY + bool "Inotify file change notification support" + default y + ---help--- + Say Y here to enable inotify support and the associated system + calls. Inotify is a file change notification system and a + replacement for dnotify. Inotify fixes numerous shortcomings in + dnotify and introduces several new features. It allows monitoring + of both files and directories via a single open fd. Other features + include multiple file events, one-shot support, and unmount + notification. + + For more information, see Documentation/filesystems/inotify.txt + + If unsure, say Y. + config QUOTA bool "Quota support" help @@ -717,6 +753,12 @@ config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU +config PROC_VMCORE + bool "/proc/vmcore support (EXPERIMENTAL)" + depends on PROC_FS && EMBEDDED && EXPERIMENTAL && CRASH_DUMP + help + Exports the dump image of crashed kernel in ELF format. + config SYSFS bool "sysfs file system support" if EMBEDDED default y @@ -741,56 +783,6 @@ config SYSFS Designers of embedded systems may wish to say N here to conserve space. -config DEVFS_FS - bool "/dev file system support (OBSOLETE)" - depends on EXPERIMENTAL - help - This is support for devfs, a virtual file system (like /proc) which - provides the file system interface to device drivers, normally found - in /dev. Devfs does not depend on major and minor number - allocations. Device drivers register entries in /dev which then - appear automatically, which means that the system administrator does - not have to create character and block special device files in the - /dev directory using the mknod command (or MAKEDEV script) anymore. - - This is work in progress. 
If you want to use this, you *must* read - the material in <file:Documentation/filesystems/devfs/>, especially - the file README there. - - Note that devfs no longer manages /dev/pts! If you are using UNIX98 - ptys, you will also need to mount the /dev/pts filesystem (devpts). - - Note that devfs has been obsoleted by udev, - <http://www.kernel.org/pub/linux/utils/kernel/hotplug/>. - It has been stripped down to a bare minimum and is only provided for - legacy installations that use its naming scheme which is - unfortunately different from the names normal Linux installations - use. - - If unsure, say N. - -config DEVFS_MOUNT - bool "Automatically mount at boot" - depends on DEVFS_FS - help - This option appears if you have CONFIG_DEVFS_FS enabled. Setting - this to 'Y' will make the kernel automatically mount devfs onto /dev - when the system is booted, before the init thread is started. - You can override this with the "devfs=nomount" boot option. - - If unsure, say N. - -config DEVFS_DEBUG - bool "Debug devfs" - depends on DEVFS_FS - help - If you say Y here, then the /dev file system code will generate - debugging messages. See the file - <file:Documentation/filesystems/devfs/boot-options> for more - details. - - If unsure, say N. - config DEVPTS_FS_XATTR bool "/dev/pts Extended Attributes" depends on UNIX98_PTYS @@ -1063,26 +1055,18 @@ config JFFS2_FS_DEBUG If reporting bugs, please try to have available a full dump of the messages at debug level 1 while the misbehaviour was occurring. -config JFFS2_FS_NAND - bool "JFFS2 support for NAND flash" +config JFFS2_FS_WRITEBUFFER + bool "JFFS2 write-buffering support" depends on JFFS2_FS - default n + default y help - This enables the support for NAND flash in JFFS2. NAND is a newer - type of flash chip design than the traditional NOR flash, with - higher density but a handful of characteristics which make it more - interesting for the file system to use. + This enables the write-buffering support in JFFS2. 
- Say 'N' unless you have NAND flash. - -config JFFS2_FS_NOR_ECC - bool "JFFS2 support for ECC'd NOR flash (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL - default n - help - This enables the experimental support for NOR flash with transparent - ECC for JFFS2. This type of flash chip is not common, however it is - available from ST Microelectronics. + This functionality is required to support JFFS2 on the following + types of flash devices: + - NAND flash + - NOR flash with transparent ECC + - DataFlash config JFFS2_COMPRESSION_OPTIONS bool "Advanced compression options for JFFS2" @@ -1318,6 +1302,7 @@ config NFS_FS depends on INET select LOCKD select SUNRPC + select NFS_ACL_SUPPORT if NFS_V3_ACL help If you are connected to some other (usually local) Unix computer (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing @@ -1360,6 +1345,16 @@ config NFS_V3 If unsure, say Y. +config NFS_V3_ACL + bool "Provide client support for the NFSv3 ACL protocol extension" + depends on NFS_V3 + help + Implement the NFSv3 ACL protocol extension for manipulating POSIX + Access Control Lists. The server should also be compiled with + the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option. + + If unsure, say N. + config NFS_V4 bool "Provide NFSv4 client support (EXPERIMENTAL)" depends on NFS_FS && EXPERIMENTAL @@ -1403,6 +1398,7 @@ config NFSD select LOCKD select SUNRPC select EXPORTFS + select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain @@ -1426,6 +1422,10 @@ config NFSD To compile the NFS server support as a module, choose M here: the module will be called nfsd. If unsure, say N. 
+config NFSD_V2_ACL + bool + depends on NFSD + config NFSD_V3 bool "Provide NFSv3 server support" depends on NFSD @@ -1433,10 +1433,23 @@ config NFSD_V3 If you would like to include the NFSv3 server as well as the NFSv2 server, say Y here. If unsure, say Y. +config NFSD_V3_ACL + bool "Provide server support for the NFSv3 ACL protocol extension" + depends on NFSD_V3 + select NFSD_V2_ACL + help + Implement the NFSv3 ACL protocol extension for manipulating POSIX + Access Control Lists on exported file systems. NFS clients should + be compiled with the NFSv3 ACL protocol extension; see the + CONFIG_NFS_V3_ACL option. If unsure, say N. + config NFSD_V4 bool "Provide NFSv4 server support (EXPERIMENTAL)" depends on NFSD_V3 && EXPERIMENTAL select NFSD_TCP + select CRYPTO_MD5 + select CRYPTO + select FS_POSIX_ACL help If you would like to include the NFSv4 server as well as the NFSv2 and NFSv3 servers, say Y here. This feature is experimental, and @@ -1477,6 +1490,15 @@ config LOCKD_V4 config EXPORTFS tristate +config NFS_ACL_SUPPORT + tristate + select FS_POSIX_ACL + +config NFS_COMMON + bool + depends on NFSD || NFS_FS + default y + config SUNRPC tristate diff --git a/fs/Makefile b/fs/Makefile index 443f2bc56cc..cf95eb894fd 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,9 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ + ioprio.o +obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_COMPAT) += compat.o @@ -31,6 +33,7 @@ obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o +obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index 
6fc88ae8ad9..7ac07d0d47b 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -116,7 +116,7 @@ static int kafsasyncd(void *arg) remove_wait_queue(&kafsasyncd_sleepq, &myself); set_current_state(TASK_RUNNING); - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ afs_discard_my_signals(); diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c index 86e710dd057..65bc05ab818 100644 --- a/fs/afs/kafstimod.c +++ b/fs/afs/kafstimod.c @@ -91,7 +91,7 @@ static int kafstimod(void *arg) complete_and_exit(&kafstimod_dead, 0); } - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ afs_discard_my_signals(); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index bfc28abe1cb..31ee06590de 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -30,7 +30,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); static int afs_mntpt_open(struct inode *inode, struct file *file); -static int afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd); +static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd); struct file_operations afs_mntpt_file_operations = { .open = afs_mntpt_open, @@ -233,7 +233,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) /* * follow a link from a mountpoint directory, thus causing it to be mounted */ -static int afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) { struct vfsmount *newmnt; struct dentry *old_dentry; @@ -249,7 +249,7 @@ static int afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) newmnt = afs_mntpt_do_automount(dentry); if (IS_ERR(newmnt)) { path_release(nd); - return PTR_ERR(newmnt); + return (void *)newmnt; } old_dentry = nd->dentry; @@ -267,7 +267,7 @@ static int afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) } kleave(" = %d", err); - return err; + return 
ERR_PTR(err); } /* end afs_mntpt_follow_link() */ /*****************************************************************************/ @@ -58,6 +58,7 @@ static DEFINE_SPINLOCK(fput_lock); static LIST_HEAD(fput_head); static void aio_kick_handler(void *); +static void aio_queue_work(struct kioctx *); /* aio_setup * Creates the slab caches used by the aio routines, panic on @@ -747,6 +748,14 @@ out: * has already been kicked */ if (kiocbIsKicked(iocb)) { __queue_kicked_iocb(iocb); + + /* + * __queue_kicked_iocb will always return 1 here, because + * iocb->ki_run_list is empty at this point so it should + * be safe to unconditionally queue the context into the + * work queue. + */ + aio_queue_work(ctx); } } return ret; diff --git a/fs/attr.c b/fs/attr.c index c3c76fe7834..b1796fb9e52 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -10,7 +10,7 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/smp_lock.h> -#include <linux/dnotify.h> +#include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/quotaops.h> #include <linux/security.h> @@ -107,31 +107,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr) out: return error; } - EXPORT_SYMBOL(inode_setattr); -int setattr_mask(unsigned int ia_valid) -{ - unsigned long dn_mask = 0; - - if (ia_valid & ATTR_UID) - dn_mask |= DN_ATTRIB; - if (ia_valid & ATTR_GID) - dn_mask |= DN_ATTRIB; - if (ia_valid & ATTR_SIZE) - dn_mask |= DN_MODIFY; - /* both times implies a utime(s) call */ - if ((ia_valid & (ATTR_ATIME|ATTR_MTIME)) == (ATTR_ATIME|ATTR_MTIME)) - dn_mask |= DN_ATTRIB; - else if (ia_valid & ATTR_ATIME) - dn_mask |= DN_ACCESS; - else if (ia_valid & ATTR_MTIME) - dn_mask |= DN_MODIFY; - if (ia_valid & ATTR_MODE) - dn_mask |= DN_ATTRIB; - return dn_mask; -} - int notify_change(struct dentry * dentry, struct iattr * attr) { struct inode *inode = dentry->d_inode; @@ -197,11 +174,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr) if (ia_valid & ATTR_SIZE) 
up_write(&dentry->d_inode->i_alloc_sem); - if (!error) { - unsigned long dn_mask = setattr_mask(ia_valid); - if (dn_mask) - dnotify_parent(dentry, dn_mask); - } + if (!error) + fsnotify_change(dentry, ia_valid); + return error; } diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c index f028396f138..52e8772b066 100644 --- a/fs/autofs/symlink.c +++ b/fs/autofs/symlink.c @@ -12,11 +12,12 @@ #include "autofs_i.h" -static int autofs_follow_link(struct dentry *dentry, struct nameidata *nd) +/* Nothing to release.. */ +static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd) { char *s=((struct autofs_symlink *)dentry->d_inode->u.generic_ip)->data; nd_set_link(nd, s); - return 0; + return NULL; } struct inode_operations autofs_symlink_inode_operations = { diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index c7b2b889018..fca83e28edc 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -92,6 +92,7 @@ struct autofs_wait_queue { struct autofs_sb_info { u32 magic; + struct dentry *root; struct file *pipe; pid_t oz_pgrp; int catatonic; @@ -185,6 +186,19 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); +static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) +{ + int res = 0; + + while (d_mountpoint(*dentry)) { + int followed = follow_down(mnt, dentry); + if (!followed) + break; + res = 1; + } + return res; +} + static inline int simple_positive(struct dentry *dentry) { return dentry->d_inode && !d_unhashed(dentry); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 500425e24fb..feb6ac427d0 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -56,12 +56,9 @@ static int autofs4_check_mount(struct vfsmount *mnt, struct dentry *dentry) mntget(mnt); dget(dentry); - if (!follow_down(&mnt, &dentry)) + if (!autofs4_follow_mount(&mnt, 
&dentry)) goto done; - while (d_mountpoint(dentry) && follow_down(&mnt, &dentry)) - ; - /* This is an autofs submount, we can't expire it */ if (is_autofs4_dentry(dentry)) goto done; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 4bb14cc6804..0a3c05d1016 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/parser.h> #include <linux/bitops.h> +#include <linux/smp_lock.h> #include "autofs_i.h" #include <linux/module.h> @@ -76,6 +77,66 @@ void autofs4_free_ino(struct autofs_info *ino) kfree(ino); } +/* + * Deal with the infamous "Busy inodes after umount ..." message. + * + * Clean up the dentry tree. This happens with autofs if the user + * space program goes away due to a SIGKILL, SIGSEGV etc. + */ +static void autofs4_force_release(struct autofs_sb_info *sbi) +{ + struct dentry *this_parent = sbi->root; + struct list_head *next; + + spin_lock(&dcache_lock); +repeat: + next = this_parent->d_subdirs.next; +resume: + while (next != &this_parent->d_subdirs) { + struct dentry *dentry = list_entry(next, struct dentry, d_child); + + /* Negative dentry - don`t care */ + if (!simple_positive(dentry)) { + next = next->next; + continue; + } + + if (!list_empty(&dentry->d_subdirs)) { + this_parent = dentry; + goto repeat; + } + + next = next->next; + spin_unlock(&dcache_lock); + + DPRINTK("dentry %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + + dput(dentry); + spin_lock(&dcache_lock); + } + + if (this_parent != sbi->root) { + struct dentry *dentry = this_parent; + + next = this_parent->d_child.next; + this_parent = this_parent->d_parent; + spin_unlock(&dcache_lock); + DPRINTK("parent dentry %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + dput(dentry); + spin_lock(&dcache_lock); + goto resume; + } + spin_unlock(&dcache_lock); + + dput(sbi->root); + sbi->root = NULL; + shrink_dcache_sb(sbi->sb); + + return; +} + static void autofs4_put_super(struct super_block 
*sb) { struct autofs_sb_info *sbi = autofs4_sbi(sb); @@ -85,6 +146,10 @@ static void autofs4_put_super(struct super_block *sb) if ( !sbi->catatonic ) autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */ + /* Clean up and release dangling references */ + if (sbi) + autofs4_force_release(sbi); + kfree(sbi); DPRINTK("shutting down"); @@ -199,6 +264,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; + sbi->root = NULL; sbi->catatonic = 0; sbi->exp_timeout = 0; sbi->oz_pgrp = process_group(current); @@ -267,6 +333,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->pipe = pipe; /* + * Take a reference to the root dentry so we get a chance to + * clean up the dentry tree on umount. + * See autofs4_force_release. + */ + sbi->root = dget(root); + + /* * Success! Install the root dentry now to indicate completion. */ s->s_root = root; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 3765c047f15..2a771ec6695 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -205,7 +205,11 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) struct vfsmount *fp_mnt = mntget(mnt); struct dentry *fp_dentry = dget(dentry); - while (follow_down(&fp_mnt, &fp_dentry) && d_mountpoint(fp_dentry)); + if (!autofs4_follow_mount(&fp_mnt, &fp_dentry)) { + dput(fp_dentry); + mntput(fp_mnt); + return -ENOENT; + } fp = dentry_open(fp_dentry, fp_mnt, file->f_flags); status = PTR_ERR(fp); @@ -302,7 +306,14 @@ static int try_to_fill_dentry(struct dentry *dentry, DPRINTK("expire done status=%d", status); - return 0; + /* + * If the directory still exists the mount request must + * continue otherwise it can't be followed at the right + * time during the walk. 
+ */ + status = d_invalidate(dentry); + if (status != -EBUSY) + return 0; } DPRINTK("dentry=%p %.*s ino=%p", diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index c265a66edf0..2ea2c98fd84 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,11 +12,11 @@ #include "autofs_i.h" -static int autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { struct autofs_info *ino = autofs4_dentry_ino(dentry); nd_set_link(nd, (char *)ino->u.symlink); - return 0; + return NULL; } struct inode_operations autofs4_symlink_inode_operations = { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 5a40d36e5a5..3df86285a1c 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -191,6 +191,13 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, } if ( !wq ) { + /* Can't wait for an expire if there's no mount */ + if (notify == NFY_NONE && !d_mountpoint(dentry)) { + kfree(name); + up(&sbi->wq_sem); + return -ENOENT; + } + /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); if ( !wq ) { @@ -224,8 +231,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, int type = (notify == NFY_MOUNT ? 
autofs_ptype_missing : autofs_ptype_expire_multi); - DPRINTK(("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify)); + DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); /* autofs4_notify_daemon() may block */ autofs4_notify_daemon(sbi, wq, type); diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index de5bb280a82..e0a6025f1d0 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -41,8 +41,8 @@ static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static int befs_init_inodecache(void); static void befs_destroy_inodecache(void); -static int befs_follow_link(struct dentry *, struct nameidata *); -static void befs_put_link(struct dentry *, struct nameidata *); +static void *befs_follow_link(struct dentry *, struct nameidata *); +static void befs_put_link(struct dentry *, struct nameidata *, void *); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -461,7 +461,7 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. 
*/ -static int +static void * befs_follow_link(struct dentry *dentry, struct nameidata *nd) { befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); @@ -487,10 +487,10 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) } nd_set_link(nd, link); - return 0; + return NULL; } -static void befs_put_link(struct dentry *dentry, struct nameidata *nd) +static void befs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 009b8920c1f..dd9baabaf01 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -316,6 +316,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); current->mm->free_area_cache = current->mm->mmap_base; + current->mm->cached_hole_size = 0; set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8f6b6b7617..7976a238f0a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -775,6 +775,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) change some of these later */ set_mm_counter(current->mm, rss, 0); current->mm->free_area_cache = current->mm->mmap_base; + current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) { @@ -53,7 +53,7 @@ struct biovec_slab { */ #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] = { +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), }; #undef BV @@ -249,19 +249,16 @@ inline void __bio_clone(struct bio *bio, struct bio *bio_src) { request_queue_t *q = bdev_get_queue(bio_src->bi_bdev); - memcpy(bio->bi_io_vec, bio_src->bi_io_vec, bio_src->bi_max_vecs 
* sizeof(struct bio_vec)); + memcpy(bio->bi_io_vec, bio_src->bi_io_vec, + bio_src->bi_max_vecs * sizeof(struct bio_vec)); bio->bi_sector = bio_src->bi_sector; bio->bi_bdev = bio_src->bi_bdev; bio->bi_flags |= 1 << BIO_CLONED; bio->bi_rw = bio_src->bi_rw; - - /* - * notes -- maybe just leave bi_idx alone. assume identical mapping - * for the clone - */ bio->bi_vcnt = bio_src->bi_vcnt; bio->bi_size = bio_src->bi_size; + bio->bi_idx = bio_src->bi_idx; bio_phys_segments(q, bio); bio_hw_segments(q, bio); } diff --git a/fs/block_dev.c b/fs/block_dev.c index c0cbd1bc1a0..e0df94c37b7 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -777,8 +777,7 @@ static ssize_t blkdev_file_aio_write(struct kiocb *iocb, const char __user *buf, return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); } -static int block_ioctl(struct inode *inode, struct file *file, unsigned cmd, - unsigned long arg) +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { return blkdev_ioctl(file->f_mapping->host, file, cmd, arg); } @@ -803,7 +802,7 @@ struct file_operations def_blk_fops = { .aio_write = blkdev_file_aio_write, .mmap = generic_file_mmap, .fsync = block_fsync, - .ioctl = block_ioctl, + .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif diff --git a/fs/buffer.c b/fs/buffer.c index 7e9e409feaa..6a25d7df89b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -278,7 +278,7 @@ EXPORT_SYMBOL(thaw_bdev); */ static void do_sync(unsigned long wait) { - wakeup_bdflush(0); + wakeup_pdflush(0); sync_inodes(0); /* All mappings, inodes and their blockdevs */ DQUOT_SYNC(NULL); sync_supers(); /* Write the superblocks */ @@ -331,7 +331,7 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) return ret; } -asmlinkage long sys_fsync(unsigned int fd) +static long do_fsync(unsigned int fd, int datasync) { struct file * file; struct address_space *mapping; @@ -342,14 +342,14 @@ asmlinkage long sys_fsync(unsigned 
int fd) if (!file) goto out; - mapping = file->f_mapping; - ret = -EINVAL; if (!file->f_op || !file->f_op->fsync) { /* Why? We can still call filemap_fdatawrite */ goto out_putf; } + mapping = file->f_mapping; + current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(mapping); @@ -358,7 +358,7 @@ asmlinkage long sys_fsync(unsigned int fd) * which could cause livelocks in fsync_buffers_list */ down(&mapping->host->i_sem); - err = file->f_op->fsync(file, file->f_dentry, 0); + err = file->f_op->fsync(file, file->f_dentry, datasync); if (!ret) ret = err; up(&mapping->host->i_sem); @@ -373,39 +373,14 @@ out: return ret; } -asmlinkage long sys_fdatasync(unsigned int fd) +asmlinkage long sys_fsync(unsigned int fd) { - struct file * file; - struct address_space *mapping; - int ret, err; - - ret = -EBADF; - file = fget(fd); - if (!file) - goto out; - - ret = -EINVAL; - if (!file->f_op || !file->f_op->fsync) - goto out_putf; - - mapping = file->f_mapping; - - current->flags |= PF_SYNCWRITE; - ret = filemap_fdatawrite(mapping); - down(&mapping->host->i_sem); - err = file->f_op->fsync(file, file->f_dentry, 1); - if (!ret) - ret = err; - up(&mapping->host->i_sem); - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; - current->flags &= ~PF_SYNCWRITE; + return do_fsync(fd, 0); +} -out_putf: - fput(file); -out: - return ret; +asmlinkage long sys_fdatasync(unsigned int fd) +{ + return do_fsync(fd, 1); } /* @@ -522,13 +497,13 @@ static void free_more_memory(void) struct zone **zones; pg_data_t *pgdat; - wakeup_bdflush(1024); + wakeup_pdflush(1024); yield(); for_each_pgdat(pgdat) { zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones; if (*zones) - try_to_free_pages(zones, GFP_NOFS, 0); + try_to_free_pages(zones, GFP_NOFS); } } @@ -538,8 +513,8 @@ static void free_more_memory(void) */ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { - static DEFINE_SPINLOCK(page_uptodate_lock); unsigned long flags; + struct buffer_head *first; struct 
buffer_head *tmp; struct page *page; int page_uptodate = 1; @@ -561,7 +536,9 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. */ - spin_lock_irqsave(&page_uptodate_lock, flags); + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -574,7 +551,8 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); /* * If none of the buffers had errors and they are all @@ -586,7 +564,8 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); return; } @@ -597,8 +576,8 @@ still_busy: void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; - static DEFINE_SPINLOCK(page_uptodate_lock); unsigned long flags; + struct buffer_head *first; struct buffer_head *tmp; struct page *page; @@ -619,7 +598,10 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) SetPageError(page); } - spin_lock_irqsave(&page_uptodate_lock, flags); + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + clear_buffer_async_write(bh); unlock_buffer(bh); tmp = bh->b_this_page; @@ -630,12 +612,14 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); end_page_writeback(page); return; still_busy: - 
spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); return; } @@ -1951,7 +1935,6 @@ static int __block_prepare_write(struct inode *inode, struct page *page, if (err) break; if (buffer_new(bh)) { - clear_buffer_new(bh); unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); if (PageUptodate(page)) { @@ -1993,9 +1976,14 @@ static int __block_prepare_write(struct inode *inode, struct page *page, if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (!err) - return err; - + if (!err) { + bh = head; + do { + if (buffer_new(bh)) + clear_buffer_new(bh); + } while ((bh = bh->b_this_page) != head); + return 0; + } /* Error case: */ /* * Zero out any newly allocated blocks to avoid exposing stale diff --git a/fs/char_dev.c b/fs/char_dev.c index c1e3537909f..3b1b1eefdbb 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -56,10 +56,21 @@ int get_chrdev_list(char *page) down(&chrdevs_lock); for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) { - for (cd = chrdevs[i]; cd; cd = cd->next) + for (cd = chrdevs[i]; cd; cd = cd->next) { + /* + * if the current name, plus the 5 extra characters + * in the device line for this entry + * would run us off the page, we're done + */ + if ((len+strlen(cd->name) + 5) >= PAGE_SIZE) + goto page_full; + + len += sprintf(page+len, "%3d %s\n", cd->major, cd->name); + } } +page_full: up(&chrdevs_lock); return len; @@ -139,7 +150,7 @@ __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); - up(&chrdevs_lock); + down(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && @@ -266,8 +277,9 @@ static struct kobject *cdev_get(struct cdev *p) void cdev_put(struct cdev *p) { if (p) { + struct module *owner = p->owner; kobject_put(&p->kobj); - module_put(p->owner); + module_put(owner); } } diff --git a/fs/cifs/CHANGES 
b/fs/cifs/CHANGES index dab4774ee7b..3196d4c4eed 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,9 @@ +Version 1.35 +------------ +Add writepage performance improvements. Fix path name conversions +for long filenames on mounts which were done with "mapchars" mount option +specified. + Version 1.34 ------------ Fix error mapping of the TOO_MANY_LINKS (hardlinks) case. diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 78af5850c55..1fd21f66f24 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -83,8 +83,8 @@ extern int cifs_dir_notify(struct file *, unsigned long arg); extern struct dentry_operations cifs_dentry_ops; /* Functions related to symlinks */ -extern int cifs_follow_link(struct dentry *direntry, struct nameidata *nd); -extern void cifs_put_link(struct dentry *direntry, struct nameidata *nd); +extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); +extern void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *); extern int cifs_readlink(struct dentry *direntry, char __user *buffer, int buflen); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3c628bf667a..0db0b313d71 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2602,6 +2602,9 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, if(name_len < PATH_MAX) { memcpy(pSMB->ResumeFileName, psrch_inf->presume_name, name_len); byte_count += name_len; + /* 14 byte parm len above enough for 2 byte null terminator */ + pSMB->ResumeFileName[name_len] = 0; + pSMB->ResumeFileName[name_len+1] = 0; } else { rc = -EINVAL; goto FNext2_err_exit; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 30ab70ce554..3497125189d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -643,7 +643,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) netfid, length, pfLock->fl_start, numUnlock, numLock, lockType, wait_flag); - if (rc == 0 && (pfLock->fl_flags & 
FL_POSIX)) + if (pfLock->fl_flags & FL_POSIX) posix_lock_file_wait(file, pfLock); FreeXid(xid); return rc; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index bde0fabfece..ab925ef4f86 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -92,7 +92,7 @@ cifs_hl_exit: return rc; } -int +void * cifs_follow_link(struct dentry *direntry, struct nameidata *nd) { struct inode *inode = direntry->d_inode; @@ -148,7 +148,7 @@ out: out_no_free: FreeXid(xid); nd_set_link(nd, target_path); - return 0; + return NULL; /* No cookie */ } int @@ -330,7 +330,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen) return rc; } -void cifs_put_link(struct dentry *direntry, struct nameidata *nd) +void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie) { char *p = nd_get_link(nd); if (!IS_ERR(p)) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 072b4ee8c53..20ae4153f79 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -611,6 +611,7 @@ cifsConvertToUCS(__le16 * target, const char *source, int maxlen, src_char = source[i]; switch (src_char) { case 0: + target[j] = 0; goto ctoUCS_out; case ':': target[j] = cpu_to_le16(UNI_COLON); diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index ef001a9313e..3d1cce3653b 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -61,7 +61,7 @@ unsigned long coda_timeout = 30; /* .. 
secs, then signals will dequeue */ struct venus_comm coda_comms[MAX_CODADEVS]; -static struct class_simple *coda_psdev_class; +static struct class *coda_psdev_class; /* * Device operations @@ -363,14 +363,14 @@ static int init_coda_psdev(void) CODA_PSDEV_MAJOR); return -EIO; } - coda_psdev_class = class_simple_create(THIS_MODULE, "coda"); + coda_psdev_class = class_create(THIS_MODULE, "coda"); if (IS_ERR(coda_psdev_class)) { err = PTR_ERR(coda_psdev_class); goto out_chrdev; } devfs_mk_dir ("coda"); for (i = 0; i < MAX_CODADEVS; i++) { - class_simple_device_add(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR,i), + class_device_create(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR,i), NULL, "cfs%d", i); err = devfs_mk_cdev(MKDEV(CODA_PSDEV_MAJOR, i), S_IFCHR|S_IRUSR|S_IWUSR, "coda/%d", i); @@ -382,8 +382,8 @@ static int init_coda_psdev(void) out_class: for (i = 0; i < MAX_CODADEVS; i++) - class_simple_device_remove(MKDEV(CODA_PSDEV_MAJOR, i)); - class_simple_destroy(coda_psdev_class); + class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); + class_destroy(coda_psdev_class); out_chrdev: unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); out: @@ -425,10 +425,10 @@ static int __init init_coda(void) return 0; out: for (i = 0; i < MAX_CODADEVS; i++) { - class_simple_device_remove(MKDEV(CODA_PSDEV_MAJOR, i)); + class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); devfs_remove("coda/%d", i); } - class_simple_destroy(coda_psdev_class); + class_destroy(coda_psdev_class); devfs_remove("coda"); unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); coda_sysctl_clean(); @@ -447,10 +447,10 @@ static void __exit exit_coda(void) printk("coda: failed to unregister filesystem\n"); } for (i = 0; i < MAX_CODADEVS; i++) { - class_simple_device_remove(MKDEV(CODA_PSDEV_MAJOR, i)); + class_device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i)); devfs_remove("coda/%d", i); } - class_simple_destroy(coda_psdev_class); + class_destroy(coda_psdev_class); devfs_remove("coda"); 
unregister_chrdev(CODA_PSDEV_MAJOR, "coda"); coda_sysctl_clean(); diff --git a/fs/compat.c b/fs/compat.c index 728cd836538..6b06b6bae35 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -37,7 +37,7 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/dirent.h> -#include <linux/dnotify.h> +#include <linux/fsnotify.h> #include <linux/highuid.h> #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> @@ -1307,9 +1307,13 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, out: if (iov != iovstack) kfree(iov); - if ((ret + (type == READ)) > 0) - dnotify_parent(file->f_dentry, - (type == READ) ? DN_ACCESS : DN_MODIFY); + if ((ret + (type == READ)) > 0) { + struct dentry *dentry = file->f_dentry; + if (type == READ) + fsnotify_access(dentry); + else + fsnotify_modify(dentry); + } return ret; } diff --git a/fs/dcache.c b/fs/dcache.c index 3aa8a7e980d..a15a2e1f552 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -19,6 +19,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> +#include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/smp_lock.h> @@ -101,6 +102,7 @@ static inline void dentry_iput(struct dentry * dentry) list_del_init(&dentry->d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else @@ -1165,13 +1167,16 @@ out: void d_delete(struct dentry * dentry) { + int isdir = 0; /* * Are we the only user? 
*/ spin_lock(&dcache_lock); spin_lock(&dentry->d_lock); + isdir = S_ISDIR(dentry->d_inode->i_mode); if (atomic_read(&dentry->d_count) == 1) { dentry_iput(dentry); + fsnotify_nameremove(dentry, isdir); return; } @@ -1180,6 +1185,8 @@ void d_delete(struct dentry * dentry) spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + + fsnotify_nameremove(dentry, isdir); } static void __d_rehash(struct dentry * entry, struct hlist_head *list) diff --git a/fs/dcookies.c b/fs/dcookies.c index 581aac959cd..02aa0ddc582 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -94,12 +94,10 @@ static struct dcookie_struct * alloc_dcookie(struct dentry * dentry, if (!dcs) return NULL; - atomic_inc(&dentry->d_count); - atomic_inc(&vfsmnt->mnt_count); dentry->d_cookie = dcs; - dcs->dentry = dentry; - dcs->vfsmnt = vfsmnt; + dcs->dentry = dget(dentry); + dcs->vfsmnt = mntget(vfsmnt); hash_dcookie(dcs); return dcs; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 548556ff250..efc97d9b786 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -45,44 +45,15 @@ struct file_operations debugfs_file_operations = { .open = default_open, }; -#define simple_type(type, format, temptype, strtolfn) \ -static ssize_t read_file_##type(struct file *file, char __user *user_buf, \ - size_t count, loff_t *ppos) \ -{ \ - char buf[32]; \ - type *val = file->private_data; \ - \ - snprintf(buf, sizeof(buf), format "\n", *val); \ - return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));\ -} \ -static ssize_t write_file_##type(struct file *file, const char __user *user_buf,\ - size_t count, loff_t *ppos) \ -{ \ - char *endp; \ - char buf[32]; \ - int buf_size; \ - type *val = file->private_data; \ - temptype tmp; \ - \ - memset(buf, 0x00, sizeof(buf)); \ - buf_size = min(count, (sizeof(buf)-1)); \ - if (copy_from_user(buf, user_buf, buf_size)) \ - return -EFAULT; \ - \ - tmp = strtolfn(buf, &endp, 0); \ - if ((endp == buf) || ((type)tmp != tmp)) \ - return -EINVAL; \ - *val = tmp; \ - 
return count; \ -} \ -static struct file_operations fops_##type = { \ - .read = read_file_##type, \ - .write = write_file_##type, \ - .open = default_open, \ -}; -simple_type(u8, "%c", unsigned long, simple_strtoul); -simple_type(u16, "%hi", unsigned long, simple_strtoul); -simple_type(u32, "%i", unsigned long, simple_strtoul); +static void debugfs_u8_set(void *data, u64 val) +{ + *(u8 *)data = val; +} +static u64 debugfs_u8_get(void *data) +{ + return *(u8 *)data; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. @@ -116,6 +87,16 @@ struct dentry *debugfs_create_u8(const char *name, mode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_u8); +static void debugfs_u16_set(void *data, u64 val) +{ + *(u16 *)data = val; +} +static u64 debugfs_u16_get(void *data) +{ + return *(u16 *)data; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); + /** * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. * @@ -148,6 +129,16 @@ struct dentry *debugfs_create_u16(const char *name, mode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_u16); +static void debugfs_u32_set(void *data, u64 val) +{ + *(u32 *)data = val; +} +static u64 debugfs_u32_get(void *data) +{ + return *(u32 *)data; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); + /** * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. 
* diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b529786699e..a86ac4aeaed 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -110,16 +110,6 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); } -static struct dentry * get_dentry(struct dentry *parent, const char *name) -{ - struct qstr qstr; - - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name,qstr.len); - return lookup_hash(&qstr,parent); -} - static struct super_block *debug_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -157,7 +147,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, *dentry = NULL; down(&parent->d_inode->i_sem); - *dentry = get_dentry (parent, name); + *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { if ((mode & S_IFMT) == S_IFDIR) error = debugfs_mkdir(parent->d_inode, *dentry, mode); diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 1ecfe1f184d..8b679b67e5e 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2491,11 +2491,11 @@ static int devfs_mknod(struct inode *dir, struct dentry *dentry, int mode, return 0; } /* End Function devfs_mknod */ -static int devfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *devfs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct devfs_entry *p = get_devfs_entry_from_vfs_inode(dentry->d_inode); nd_set_link(nd, p ? 
p->u.symlink.linkname : ERR_PTR(-ENODEV)); - return 0; + return NULL; } /* End Function devfs_follow_link */ static struct inode_operations devfs_iops = { diff --git a/fs/direct-io.c b/fs/direct-io.c index 1d55e7e6734..0d06097bc99 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -215,7 +215,7 @@ static struct page *dio_get_page(struct dio *dio) static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes) { if (dio->end_io && dio->result) - dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); + dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private); if (dio->lock_type == DIO_LOCKING) up_read(&dio->inode->i_alloc_sem); } diff --git a/fs/dquot.c b/fs/dquot.c index 3995ce7907c..b9732335bcd 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -409,13 +409,10 @@ out_dqlock: * for this sb+type at all. */ static void invalidate_dquots(struct super_block *sb, int type) { - struct dquot *dquot; - struct list_head *head; + struct dquot *dquot, *tmp; spin_lock(&dq_list_lock); - for (head = inuse_list.next; head != &inuse_list;) { - dquot = list_entry(head, struct dquot, dq_inuse); - head = head->next; + list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { if (dquot->dq_sb != sb) continue; if (dquot->dq_type != type) @@ -1519,14 +1516,22 @@ out_path: * This function is used when filesystem needs to initialize quotas * during mount time. 
*/ -int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry) +int vfs_quota_on_mount(struct super_block *sb, char *qf_name, + int format_id, int type) { + struct dentry *dentry; int error; + dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + error = security_quota_on(dentry); - if (error) - return error; - return vfs_quota_on_inode(dentry->d_inode, type, format_id); + if (!error) + error = vfs_quota_on_inode(dentry->d_inode, type, format_id); + + dput(dentry); + return error; } /* Generic routine for getting common part of quota structure */ diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9900e333655..6ab1dd0ca90 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -101,57 +101,6 @@ /* Maximum number of poll wake up nests we are allowing */ #define EP_MAX_POLLWAKE_NESTS 4 -/* Macro to allocate a "struct epitem" from the slab cache */ -#define EPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(epi_cache, SLAB_KERNEL) - -/* Macro to free a "struct epitem" to the slab cache */ -#define EPI_MEM_FREE(p) kmem_cache_free(epi_cache, p) - -/* Macro to allocate a "struct eppoll_entry" from the slab cache */ -#define PWQ_MEM_ALLOC() (struct eppoll_entry *) kmem_cache_alloc(pwq_cache, SLAB_KERNEL) - -/* Macro to free a "struct eppoll_entry" to the slab cache */ -#define PWQ_MEM_FREE(p) kmem_cache_free(pwq_cache, p) - -/* Fast test to see if the file is an evenpoll file */ -#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops) - -/* Setup the structure that is used as key for the rb-tree */ -#define EP_SET_FFD(p, f, d) do { (p)->file = (f); (p)->fd = (d); } while (0) - -/* Compare rb-tree keys */ -#define EP_CMP_FFD(p1, p2) ((p1)->file > (p2)->file ? +1: \ - ((p1)->file < (p2)->file ? 
-1: (p1)->fd - (p2)->fd)) - -/* Special initialization for the rb-tree node to detect linkage */ -#define EP_RB_INITNODE(n) (n)->rb_parent = (n) - -/* Removes a node from the rb-tree and marks it for a fast is-linked check */ -#define EP_RB_ERASE(n, r) do { rb_erase(n, r); (n)->rb_parent = (n); } while (0) - -/* Fast check to verify that the item is linked to the main rb-tree */ -#define EP_RB_LINKED(n) ((n)->rb_parent != (n)) - -/* - * Remove the item from the list and perform its initialization. - * This is useful for us because we can test if the item is linked - * using "EP_IS_LINKED(p)". - */ -#define EP_LIST_DEL(p) do { list_del(p); INIT_LIST_HEAD(p); } while (0) - -/* Tells us if the item is currently linked */ -#define EP_IS_LINKED(p) (!list_empty(p)) - -/* Get the "struct epitem" from a wait queue pointer */ -#define EP_ITEM_FROM_WAIT(p) ((struct epitem *) container_of(p, struct eppoll_entry, wait)->base) - -/* Get the "struct epitem" from an epoll queue wrapper */ -#define EP_ITEM_FROM_EPQUEUE(p) (container_of(p, struct ep_pqueue, pt)->epi) - -/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ -#define EP_OP_HASH_EVENT(op) ((op) != EPOLL_CTL_DEL) - - struct epoll_filefd { struct file *file; int fd; @@ -357,6 +306,82 @@ static struct dentry_operations eventpollfs_dentry_operations = { +/* Fast test to see if the file is an evenpoll file */ +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} + +/* Setup the structure that is used as key for the rb-tree */ +static inline void ep_set_ffd(struct epoll_filefd *ffd, + struct file *file, int fd) +{ + ffd->file = file; + ffd->fd = fd; +} + +/* Compare rb-tree keys */ +static inline int ep_cmp_ffd(struct epoll_filefd *p1, + struct epoll_filefd *p2) +{ + return (p1->file > p2->file ? +1: + (p1->file < p2->file ? 
-1 : p1->fd - p2->fd)); +} + +/* Special initialization for the rb-tree node to detect linkage */ +static inline void ep_rb_initnode(struct rb_node *n) +{ + n->rb_parent = n; +} + +/* Removes a node from the rb-tree and marks it for a fast is-linked check */ +static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r) +{ + rb_erase(n, r); + n->rb_parent = n; +} + +/* Fast check to verify that the item is linked to the main rb-tree */ +static inline int ep_rb_linked(struct rb_node *n) +{ + return n->rb_parent != n; +} + +/* + * Remove the item from the list and perform its initialization. + * This is useful for us because we can test if the item is linked + * using "ep_is_linked(p)". + */ +static inline void ep_list_del(struct list_head *p) +{ + list_del(p); + INIT_LIST_HEAD(p); +} + +/* Tells us if the item is currently linked */ +static inline int ep_is_linked(struct list_head *p) +{ + return !list_empty(p); +} + +/* Get the "struct epitem" from a wait queue pointer */ +static inline struct epitem * ep_item_from_wait(wait_queue_t *p) +{ + return container_of(p, struct eppoll_entry, wait)->base; +} + +/* Get the "struct epitem" from an epoll queue wrapper */ +static inline struct epitem * ep_item_from_epqueue(poll_table *p) +{ + return container_of(p, struct ep_pqueue, pt)->epi; +} + +/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ +static inline int ep_op_hash_event(int op) +{ + return op != EPOLL_CTL_DEL; +} + /* Initialize the poll safe wake up structure */ static void ep_poll_safewake_init(struct poll_safewake *psw) { @@ -456,7 +481,7 @@ void eventpoll_release_file(struct file *file) epi = list_entry(lsthead->next, struct epitem, fllink); ep = epi->ep; - EP_LIST_DEL(&epi->fllink); + ep_list_del(&epi->fllink); down_write(&ep->sem); ep_remove(ep, epi); up_write(&ep->sem); @@ -534,7 +559,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) current, epfd, op, fd, event)); error = -EFAULT; - if 
(EP_OP_HASH_EVENT(op) && + if (ep_op_hash_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) goto eexit_1; @@ -560,7 +585,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) * adding an epoll file descriptor inside itself. */ error = -EINVAL; - if (file == tfile || !IS_FILE_EPOLL(file)) + if (file == tfile || !is_file_epoll(file)) goto eexit_3; /* @@ -656,7 +681,7 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events, * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; - if (!IS_FILE_EPOLL(file)) + if (!is_file_epoll(file)) goto eexit_2; /* @@ -831,11 +856,11 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; - EP_SET_FFD(&ffd, file, fd); + ep_set_ffd(&ffd, file, fd); read_lock_irqsave(&ep->lock, flags); for (rbp = ep->rbr.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); - kcmp = EP_CMP_FFD(&ffd, &epi->ffd); + kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) @@ -875,7 +900,7 @@ static void ep_release_epitem(struct epitem *epi) { if (atomic_dec_and_test(&epi->usecnt)) - EPI_MEM_FREE(epi); + kmem_cache_free(epi_cache, epi); } @@ -886,10 +911,10 @@ static void ep_release_epitem(struct epitem *epi) static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { - struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt); + struct epitem *epi = ep_item_from_epqueue(pt); struct eppoll_entry *pwq; - if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) { + if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, SLAB_KERNEL))) { init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; @@ -912,7 +937,7 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); - kcmp = EP_CMP_FFD(&epi->ffd, &epic->ffd); + kcmp = 
ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) p = &parent->rb_right; else @@ -932,17 +957,17 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct ep_pqueue epq; error = -ENOMEM; - if (!(epi = EPI_MEM_ALLOC())) + if (!(epi = kmem_cache_alloc(epi_cache, SLAB_KERNEL))) goto eexit_1; /* Item initialization follow here ... */ - EP_RB_INITNODE(&epi->rbn); + ep_rb_initnode(&epi->rbn); INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->txlink); INIT_LIST_HEAD(&epi->pwqlist); epi->ep = ep; - EP_SET_FFD(&epi->ffd, tfile, fd); + ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; atomic_set(&epi->usecnt, 1); epi->nwait = 0; @@ -978,7 +1003,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, ep_rbtree_insert(ep, epi); /* If the file is already "ready" we drop it inside the ready list */ - if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) { + if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); /* Notify waiting tasks that events are available */ @@ -1007,11 +1032,11 @@ eexit_2: * allocated wait queue. */ write_lock_irqsave(&ep->lock, flags); - if (EP_IS_LINKED(&epi->rdllink)) - EP_LIST_DEL(&epi->rdllink); + if (ep_is_linked(&epi->rdllink)) + ep_list_del(&epi->rdllink); write_unlock_irqrestore(&ep->lock, flags); - EPI_MEM_FREE(epi); + kmem_cache_free(epi_cache, epi); eexit_1: return error; } @@ -1050,14 +1075,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * If the item is not linked to the hash it means that it's on its * way toward the removal. Do nothing in this case. */ - if (EP_RB_LINKED(&epi->rbn)) { + if (ep_rb_linked(&epi->rbn)) { /* * If the item is "hot" and it is not registered inside the ready * list, push it inside. If the item is not "hot" and it is currently * registered inside the ready list, unlink it. 
*/ if (revents & event->events) { - if (!EP_IS_LINKED(&epi->rdllink)) { + if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); /* Notify waiting tasks that events are available */ @@ -1097,9 +1122,9 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) while (!list_empty(lsthead)) { pwq = list_entry(lsthead->next, struct eppoll_entry, llink); - EP_LIST_DEL(&pwq->llink); + ep_list_del(&pwq->llink); remove_wait_queue(pwq->whead, &pwq->wait); - PWQ_MEM_FREE(pwq); + kmem_cache_free(pwq_cache, pwq); } } } @@ -1118,7 +1143,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) * The check protect us from doing a double unlink ( crash ). */ error = -ENOENT; - if (!EP_RB_LINKED(&epi->rbn)) + if (!ep_rb_linked(&epi->rbn)) goto eexit_1; /* @@ -1133,14 +1158,14 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) * This operation togheter with the above check closes the door to * double unlinks. */ - EP_RB_ERASE(&epi->rbn, &ep->rbr); + ep_rb_erase(&epi->rbn, &ep->rbr); /* * If the item we are going to remove is inside the ready file descriptors * we want to remove it from this list to avoid stale events. 
*/ - if (EP_IS_LINKED(&epi->rdllink)) - EP_LIST_DEL(&epi->rdllink); + if (ep_is_linked(&epi->rdllink)) + ep_list_del(&epi->rdllink); error = 0; eexit_1: @@ -1174,8 +1199,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_ep_lock); - if (EP_IS_LINKED(&epi->fllink)) - EP_LIST_DEL(&epi->fllink); + if (ep_is_linked(&epi->fllink)) + ep_list_del(&epi->fllink); spin_unlock(&file->f_ep_lock); /* We need to acquire the write IRQ lock before calling ep_unlink() */ @@ -1210,7 +1235,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k { int pwake = 0; unsigned long flags; - struct epitem *epi = EP_ITEM_FROM_WAIT(wait); + struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", @@ -1228,7 +1253,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k goto is_disabled; /* If this file is already in the ready list we exit soon */ - if (EP_IS_LINKED(&epi->rdllink)) + if (ep_is_linked(&epi->rdllink)) goto is_linked; list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1307,7 +1332,7 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist lnk = lnk->next; /* If this file is already in the ready list we exit soon */ - if (!EP_IS_LINKED(&epi->txlink)) { + if (!ep_is_linked(&epi->txlink)) { /* * This is initialized in this way so that the default * behaviour of the reinjecting code will be to push back @@ -1322,7 +1347,7 @@ static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist /* * Unlink the item from the ready list. 
*/ - EP_LIST_DEL(&epi->rdllink); + ep_list_del(&epi->rdllink); } } @@ -1401,7 +1426,7 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist) epi = list_entry(txlist->next, struct epitem, txlink); /* Unlink the current item from the transfer list */ - EP_LIST_DEL(&epi->txlink); + ep_list_del(&epi->txlink); /* * If the item is no more linked to the interest set, we don't @@ -1410,8 +1435,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist) * item is set to have an Edge Triggered behaviour, we don't have * to push it back either. */ - if (EP_RB_LINKED(&epi->rbn) && !(epi->event.events & EPOLLET) && - (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) { + if (ep_rb_linked(&epi->rbn) && !(epi->event.events & EPOLLET) && + (epi->revents & epi->event.events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); ricnt++; } diff --git a/fs/exec.c b/fs/exec.c index 3a4b35a14c0..222ab1c572d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -58,6 +58,9 @@ int core_uses_pid; char core_pattern[65] = "core"; +int suid_dumpable = 0; + +EXPORT_SYMBOL(suid_dumpable); /* The maximal length of core_pattern is also specified in sysctl.c */ static struct linux_binfmt *formats; @@ -639,6 +642,18 @@ static inline int de_thread(struct task_struct *tsk) count = 2; if (thread_group_leader(current)) count = 1; + else { + /* + * The SIGALRM timer survives the exec, but needs to point + * at us as the new group leader now. We have a race with + * a timer firing now getting the old leader, so we need to + * synchronize with any firing (by calling del_timer_sync) + * before we can safely let the old group leader die. 
+ */ + sig->real_timer.data = (unsigned long)current; + if (del_timer_sync(&sig->real_timer)) + add_timer(&sig->real_timer); + } while (atomic_read(&sig->count) > count) { sig->group_exit_task = current; sig->notify_count = count; @@ -864,6 +879,9 @@ int flush_old_exec(struct linux_binprm * bprm) if (current->euid == current->uid && current->egid == current->gid) current->mm->dumpable = 1; + else + current->mm->dumpable = suid_dumpable; + name = bprm->filename; /* Copies the binary name from after last slash */ @@ -884,7 +902,7 @@ int flush_old_exec(struct linux_binprm * bprm) permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { suid_keys(current); - current->mm->dumpable = 0; + current->mm->dumpable = suid_dumpable; } /* An exec changes our domain. We are no longer part of the thread @@ -1432,6 +1450,8 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) struct inode * inode; struct file * file; int retval = 0; + int fsuid = current->fsuid; + int flag = 0; binfmt = current->binfmt; if (!binfmt || !binfmt->core_dump) @@ -1441,6 +1461,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) up_write(&mm->mmap_sem); goto fail; } + + /* + * We cannot trust fsuid as being the "true" uid of the + * process nor do we know its entire history. We only know it + * was tainted so we dump it as root in mode 2. 
+ */ + if (mm->dumpable == 2) { /* Setuid core dump mode */ + flag = O_EXCL; /* Stop rewrite attacks */ + current->fsuid = 0; /* Dump root private */ + } mm->dumpable = 0; init_completion(&mm->core_done); spin_lock_irq(¤t->sighand->siglock); @@ -1466,7 +1496,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) lock_kernel(); format_corename(corename, core_pattern, signr); unlock_kernel(); - file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600); + file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); if (IS_ERR(file)) goto fail_unlock; inode = file->f_dentry->d_inode; @@ -1491,6 +1521,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) close_fail: filp_close(file, NULL); fail_unlock: + current->fsuid = fsuid; complete_all(&mm->core_done); fail: return retval; diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index ee240a14e70..c5d02da73bc 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -10,3 +10,4 @@ ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o +ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 25f4a64fd6b..213148c36eb 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -396,12 +396,12 @@ static size_t ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const size_t size = sizeof(XATTR_NAME_ACL_ACCESS); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_size) - memcpy(list, XATTR_NAME_ACL_ACCESS, size); + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); return size; } @@ -409,12 +409,12 @@ static size_t ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size, const char *name, size_t name_len) { - const 
size_t size = sizeof(XATTR_NAME_ACL_DEFAULT); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_size) - memcpy(list, XATTR_NAME_ACL_DEFAULT, size); + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); return size; } @@ -506,14 +506,14 @@ ext2_xattr_set_acl_default(struct inode *inode, const char *name, } struct xattr_handler ext2_xattr_acl_access_handler = { - .prefix = XATTR_NAME_ACL_ACCESS, + .prefix = POSIX_ACL_XATTR_ACCESS, .list = ext2_xattr_list_acl_access, .get = ext2_xattr_get_acl_access, .set = ext2_xattr_set_acl_access, }; struct xattr_handler ext2_xattr_acl_default_handler = { - .prefix = XATTR_NAME_ACL_DEFAULT, + .prefix = POSIX_ACL_XATTR_DEFAULT, .list = ext2_xattr_list_acl_default, .get = ext2_xattr_get_acl_default, .set = ext2_xattr_set_acl_default, diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index fed96ae81a7..0bde85bafe3 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -4,7 +4,7 @@ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/xattr_acl.h> +#include <linux/posix_acl_xattr.h> #define EXT2_ACL_VERSION 0x0001 diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 8f0fd726c3f..e977f8566d1 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -2,6 +2,15 @@ #include <linux/ext2_fs.h> /* + * ext2 mount options + */ +struct ext2_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; +}; + +/* * second extended file system inode data in memory */ struct ext2_inode_info { @@ -147,9 +156,11 @@ extern struct file_operations ext2_dir_operations; /* file.c */ extern struct inode_operations ext2_file_inode_operations; extern struct file_operations ext2_file_operations; +extern struct file_operations ext2_xip_file_operations; /* inode.c */ extern struct address_space_operations ext2_aops; +extern struct address_space_operations ext2_aops_xip; extern struct address_space_operations ext2_nobh_aops; /* namei.c */ diff --git a/fs/ext2/file.c 
b/fs/ext2/file.c index f5e86141ec5..a484412fc78 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -55,6 +55,20 @@ struct file_operations ext2_file_operations = { .sendfile = generic_file_sendfile, }; +#ifdef CONFIG_EXT2_FS_XIP +struct file_operations ext2_xip_file_operations = { + .llseek = generic_file_llseek, + .read = xip_file_read, + .write = xip_file_write, + .ioctl = ext2_ioctl, + .mmap = xip_file_mmap, + .open = generic_file_open, + .release = ext2_release_file, + .fsync = ext2_sync_file, + .sendfile = xip_file_sendfile, +}; +#endif + struct inode_operations ext2_file_inode_operations = { .truncate = ext2_truncate, #ifdef CONFIG_EXT2_FS_XATTR diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 77e05914921..161f156d98c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -612,6 +612,7 @@ got: err = ext2_init_acl(inode, dir); if (err) { DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); goto fail2; } mark_inode_dirty(inode); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index a50d9db4b6e..53dceb0c659 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -33,6 +33,7 @@ #include <linux/mpage.h> #include "ext2.h" #include "acl.h" +#include "xip.h" MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); @@ -594,6 +595,16 @@ out: if (err) goto cleanup; + if (ext2_use_xip(inode->i_sb)) { + /* + * we need to clear the block + */ + err = ext2_clear_xip_target (inode, + le32_to_cpu(chain[depth-1].key)); + if (err) + goto cleanup; + } + if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0) goto changed; @@ -691,6 +702,11 @@ struct address_space_operations ext2_aops = { .writepages = ext2_writepages, }; +struct address_space_operations ext2_aops_xip = { + .bmap = ext2_bmap, + .get_xip_page = ext2_get_xip_page, +}; + struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -910,7 +926,9 @@ void ext2_truncate (struct inode * inode) iblock = (inode->i_size + blocksize-1) >> 
EXT2_BLOCK_SIZE_BITS(inode->i_sb); - if (test_opt(inode->i_sb, NOBH)) + if (mapping_is_xip(inode->i_mapping)) + xip_truncate_page(inode->i_mapping, inode->i_size); + else if (test_opt(inode->i_sb, NOBH)) nobh_truncate_page(inode->i_mapping, inode->i_size); else block_truncate_page(inode->i_mapping, @@ -1110,11 +1128,16 @@ void ext2_read_inode (struct inode * inode) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - inode->i_fop = &ext2_file_operations; - if (test_opt(inode->i_sb, NOBH)) + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; - else + inode->i_fop = &ext2_file_operations; + } else { inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 3176b3d3ffa..c5513953c82 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -34,6 +34,7 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" +#include "xip.h" /* * Couple of helper functions - make the code slightly cleaner. 
@@ -127,11 +128,16 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st int err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext2_file_inode_operations; - inode->i_fop = &ext2_file_operations; - if (test_opt(inode->i_sb, NOBH)) + if (ext2_use_xip(inode->i_sb)) { + inode->i_mapping->a_ops = &ext2_aops_xip; + inode->i_fop = &ext2_xip_file_operations; + } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; - else + inode->i_fop = &ext2_file_operations; + } else { inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_file_operations; + } mark_inode_dirty(inode); err = ext2_add_nondir(dentry, inode); } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 661c3d98d94..dcfe331dc4c 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -31,6 +31,7 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" +#include "xip.h" static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es); @@ -257,7 +258,7 @@ enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh, - Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, + Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, }; @@ -286,6 +287,7 @@ static match_table_t tokens = { {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, + {Opt_xip, "xip"}, {Opt_ignore, "grpquota"}, {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, @@ -397,6 +399,13 @@ static int parse_options (char * options, printk("EXT2 (no)acl options not supported\n"); break; #endif + case Opt_xip: +#ifdef CONFIG_EXT2_FS_XIP + set_opt (sbi->s_mount_opt, XIP); +#else + printk("EXT2 xip option not supported\n"); +#endif + break; case Opt_ignore: break; default: @@ -640,6 +649,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) 
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset + EXT2_MOUNT_XIP if not */ + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || @@ -668,6 +680,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + if ((ext2_use_xip(sb)) && ((blocksize != PAGE_SIZE) || + (sb->s_blocksize != blocksize))) { + if (!silent) + printk("XIP: Unsupported blocksize\n"); + goto failed_mount; + } + /* If the blocksize doesn't match, re-read the thing.. */ if (sb->s_blocksize != blocksize) { brelse(bh); @@ -916,17 +935,34 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) { struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; + unsigned long old_mount_opt = sbi->s_mount_opt; + struct ext2_mount_options old_opts; + unsigned long old_sb_flags; + int err; + + /* Store the old options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options (data, sbi)) - return -EINVAL; + if (!parse_options (data, sbi)) { + err = -EINVAL; + goto restore_opts; + } sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); es = sbi->s_es; + if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != + (old_mount_opt & EXT2_MOUNT_XIP)) && + invalidate_inodes(sb)) + ext2_warning(sb, __FUNCTION__, "busy inodes while remounting "\ + "xip remain in cache (no functional problem)"); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (*flags & MS_RDONLY) { @@ -946,7 +982,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) printk("EXT2-fs: %s: couldn't remount RDWR because of " "unsupported optional features (%x).\n", sb->s_id, le32_to_cpu(ret)); - return -EROFS; + err = -EROFS; + goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread and @@ -959,6 +996,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) } ext2_sync_super(sb, es); return 0; +restore_opts: + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sb->s_flags = old_sb_flags; + return err; } static int ext2_statfs (struct super_block * sb, struct kstatfs * buf) diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 9f7bac01d55..1e67d87cfa9 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -21,11 +21,11 @@ #include "xattr.h" #include <linux/namei.h> -static int ext2_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) { struct ext2_inode_info *ei = EXT2_I(dentry->d_inode); nd_set_link(nd, (char *)ei->i_data); - return 0; + return NULL; } struct inode_operations ext2_symlink_inode_operations = { diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 27982b500e8..0099462d427 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -823,7 +823,7 @@ cleanup: void ext2_xattr_put_super(struct super_block *sb) { - mb_cache_shrink(ext2_xattr_cache, sb->s_bdev); + mb_cache_shrink(sb->s_bdev); } diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c new file mode 100644 index 00000000000..ca7f0031238 --- 
/dev/null +++ b/fs/ext2/xip.c @@ -0,0 +1,93 @@ +/* + * linux/fs/ext2/xip.c + * + * Copyright (C) 2005 IBM Corporation + * Author: Carsten Otte (cotte@de.ibm.com) + */ + +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/buffer_head.h> +#include <linux/ext2_fs_sb.h> +#include <linux/ext2_fs.h> +#include "ext2.h" +#include "xip.h" + +static inline int +__inode_direct_access(struct inode *inode, sector_t sector, + unsigned long *data) +{ + BUG_ON(!inode->i_sb->s_bdev->bd_disk->fops->direct_access); + return inode->i_sb->s_bdev->bd_disk->fops + ->direct_access(inode->i_sb->s_bdev,sector,data); +} + +static inline int +__ext2_get_sector(struct inode *inode, sector_t offset, int create, + sector_t *result) +{ + struct buffer_head tmp; + int rc; + + memset(&tmp, 0, sizeof(struct buffer_head)); + rc = ext2_get_block(inode, offset/ (PAGE_SIZE/512), &tmp, + create); + *result = tmp.b_blocknr; + + /* did we get a sparse block (hole in the file)? */ + if (!tmp.b_blocknr && !rc) { + BUG_ON(create); + rc = -ENODATA; + } + + return rc; +} + +int +ext2_clear_xip_target(struct inode *inode, int block) +{ + sector_t sector = block * (PAGE_SIZE/512); + unsigned long data; + int rc; + + rc = __inode_direct_access(inode, sector, &data); + if (!rc) + clear_page((void*)data); + return rc; +} + +void ext2_xip_verify_sb(struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && + !sb->s_bdev->bd_disk->fops->direct_access) { + sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); + ext2_warning(sb, __FUNCTION__, + "ignoring xip option - not supported by bdev"); + } +} + +struct page * +ext2_get_xip_page(struct address_space *mapping, sector_t offset, + int create) +{ + int rc; + unsigned long data; + sector_t sector; + + /* first, retrieve the sector number */ + rc = __ext2_get_sector(mapping->host, offset, create, §or); + if (rc) + goto error; + + /* retrieve address of the target data */ + rc = 
__inode_direct_access + (mapping->host, sector * (PAGE_SIZE/512), &data); + if (!rc) + return virt_to_page(data); + + error: + return ERR_PTR(rc); +} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h new file mode 100644 index 00000000000..aa85331d6c5 --- /dev/null +++ b/fs/ext2/xip.h @@ -0,0 +1,25 @@ +/* + * linux/fs/ext2/xip.h + * + * Copyright (C) 2005 IBM Corporation + * Author: Carsten Otte (cotte@de.ibm.com) + */ + +#ifdef CONFIG_EXT2_FS_XIP +extern void ext2_xip_verify_sb (struct super_block *); +extern int ext2_clear_xip_target (struct inode *, int); + +static inline int ext2_use_xip (struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + return (sbi->s_mount_opt & EXT2_MOUNT_XIP); +} +struct page* ext2_get_xip_page (struct address_space *, sector_t, int); +#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_page) +#else +#define mapping_is_xip(map) 0 +#define ext2_xip_verify_sb(sb) do { } while (0) +#define ext2_use_xip(sb) 0 +#define ext2_clear_xip_target(inode, chain) 0 +#define ext2_get_xip_page NULL +#endif diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 638c13a26c0..3ac38266fc9 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -393,7 +393,8 @@ ext3_acl_chmod(struct inode *inode) int retries = 0; retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); + handle = ext3_journal_start(inode, + EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); ext3_std_error(inode->i_sb, error); @@ -417,12 +418,12 @@ static size_t ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len, const char *name, size_t name_len) { - const size_t size = sizeof(XATTR_NAME_ACL_ACCESS); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_len) - memcpy(list, XATTR_NAME_ACL_ACCESS, size); + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); return size; } @@ -430,12 +431,12 @@ static size_t ext3_xattr_list_acl_default(struct inode *inode, 
char *list, size_t list_len, const char *name, size_t name_len) { - const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); if (!test_opt(inode->i_sb, POSIX_ACL)) return 0; if (list && size <= list_len) - memcpy(list, XATTR_NAME_ACL_DEFAULT, size); + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); return size; } @@ -503,7 +504,7 @@ ext3_xattr_set_acl(struct inode *inode, int type, const void *value, acl = NULL; retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); error = ext3_set_acl(handle, inode, type, acl); @@ -535,14 +536,14 @@ ext3_xattr_set_acl_default(struct inode *inode, const char *name, } struct xattr_handler ext3_xattr_acl_access_handler = { - .prefix = XATTR_NAME_ACL_ACCESS, + .prefix = POSIX_ACL_XATTR_ACCESS, .list = ext3_xattr_list_acl_access, .get = ext3_xattr_get_acl_access, .set = ext3_xattr_set_acl_access, }; struct xattr_handler ext3_xattr_acl_default_handler = { - .prefix = XATTR_NAME_ACL_DEFAULT, + .prefix = POSIX_ACL_XATTR_DEFAULT, .list = ext3_xattr_list_acl_default, .get = ext3_xattr_get_acl_default, .set = ext3_xattr_set_acl_default, diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 98af0c0d0ba..92d50b53a93 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -4,7 +4,7 @@ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ -#include <linux/xattr_acl.h> +#include <linux/posix_acl_xattr.h> #define EXT3_ACL_VERSION 0x0001 diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index ccd632fcc6d..e463dca008e 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -749,24 +749,24 @@ fail_access: * to find a free region that is of my size and has not * been reserved. * - * on succeed, it returns the reservation window to be appended to. - * failed, return NULL. 
*/ -static struct ext3_reserve_window_node *find_next_reservable_window( +static int find_next_reservable_window( struct ext3_reserve_window_node *search_head, - unsigned long size, int *start_block, + struct ext3_reserve_window_node *my_rsv, + struct super_block * sb, int start_block, int last_block) { struct rb_node *next; struct ext3_reserve_window_node *rsv, *prev; int cur; + int size = my_rsv->rsv_goal_size; /* TODO: make the start of the reservation window byte-aligned */ /* cur = *start_block & ~7;*/ - cur = *start_block; + cur = start_block; rsv = search_head; if (!rsv) - return NULL; + return -1; while (1) { if (cur <= rsv->rsv_end) @@ -782,11 +782,11 @@ static struct ext3_reserve_window_node *find_next_reservable_window( * space with expected-size (or more)... */ if (cur > last_block) - return NULL; /* fail */ + return -1; /* fail */ prev = rsv; next = rb_next(&rsv->rsv_node); - rsv = list_entry(next, struct ext3_reserve_window_node, rsv_node); + rsv = list_entry(next,struct ext3_reserve_window_node,rsv_node); /* * Reached the last reservation, we can just append to the @@ -813,8 +813,25 @@ static struct ext3_reserve_window_node *find_next_reservable_window( * return the reservation window that we could append to. * succeed. */ - *start_block = cur; - return prev; + + if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) + rsv_window_remove(sb, my_rsv); + + /* + * Let's book the whole avaliable window for now. We will check the + * disk bitmap later and then, if there are free blocks then we adjust + * the window size if it's larger than requested. + * Otherwise, we will remove this node from the tree next time + * call find_next_reservable_window. 
+ */ + my_rsv->rsv_start = cur; + my_rsv->rsv_end = cur + size - 1; + my_rsv->rsv_alloc_hit = 0; + + if (prev != my_rsv) + ext3_rsv_window_add(sb, my_rsv); + + return 0; } /** @@ -852,6 +869,7 @@ static struct ext3_reserve_window_node *find_next_reservable_window( * @sb: the super block * @group: the group we are trying to allocate in * @bitmap_bh: the block group block bitmap + * */ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, int goal, struct super_block *sb, @@ -860,10 +878,10 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, struct ext3_reserve_window_node *search_head; int group_first_block, group_end_block, start_block; int first_free_block; - int reservable_space_start; - struct ext3_reserve_window_node *prev_rsv; struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root; unsigned long size; + int ret; + spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; group_first_block = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + group * EXT3_BLOCKS_PER_GROUP(sb); @@ -875,6 +893,7 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, start_block = goal + group_first_block; size = my_rsv->rsv_goal_size; + if (!rsv_is_empty(&my_rsv->rsv_window)) { /* * if the old reservation is cross group boundary @@ -908,6 +927,8 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, my_rsv->rsv_goal_size= size; } } + + spin_lock(rsv_lock); /* * shift the search start to the window near the goal block */ @@ -921,11 +942,16 @@ static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv, * need to check the bitmap after we found a reservable window. 
*/ retry: - prev_rsv = find_next_reservable_window(search_head, size, - &start_block, group_end_block); - if (prev_rsv == NULL) - goto failed; - reservable_space_start = start_block; + ret = find_next_reservable_window(search_head, my_rsv, sb, + start_block, group_end_block); + + if (ret == -1) { + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; + } + /* * On success, find_next_reservable_window() returns the * reservation window where there is a reservable space after it. @@ -937,8 +963,9 @@ retry: * block. Search start from the start block of the reservable space * we just found. */ + spin_unlock(rsv_lock); first_free_block = bitmap_search_next_usable_block( - reservable_space_start - group_first_block, + my_rsv->rsv_start - group_first_block, bitmap_bh, group_end_block - group_first_block + 1); if (first_free_block < 0) { @@ -946,54 +973,29 @@ retry: * no free block left on the bitmap, no point * to reserve the space. return failed. 
*/ - goto failed; + spin_lock(rsv_lock); + if (!rsv_is_empty(&my_rsv->rsv_window)) + rsv_window_remove(sb, my_rsv); + spin_unlock(rsv_lock); + return -1; /* failed */ } + start_block = first_free_block + group_first_block; /* * check if the first free block is within the - * free space we just found + * free space we just reserved */ - if ((start_block >= reservable_space_start) && - (start_block < reservable_space_start + size)) - goto found_rsv_window; + if (start_block >= my_rsv->rsv_start && start_block < my_rsv->rsv_end) + return 0; /* success */ /* * if the first free bit we found is out of the reservable space - * this means there is no free block on the reservable space - * we should continue search for next reservable space, + * continue search for next reservable space, * start from where the free block is, * we also shift the list head to where we stopped last time */ - search_head = prev_rsv; + search_head = my_rsv; + spin_lock(rsv_lock); goto retry; - -found_rsv_window: - /* - * great! the reservable space contains some free blocks. - * if the search returns that we should add the new - * window just next to where the old window, we don't - * need to remove the old window first then add it to the - * same place, just update the new start and new end. 
- */ - if (my_rsv != prev_rsv) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - } - my_rsv->rsv_start = reservable_space_start; - my_rsv->rsv_end = my_rsv->rsv_start + size - 1; - my_rsv->rsv_alloc_hit = 0; - if (my_rsv != prev_rsv) { - ext3_rsv_window_add(sb, my_rsv); - } - return 0; /* succeed */ -failed: - /* - * failed to find a new reservation window in the current - * group, remove the current(stale) reservation window - * if there is any - */ - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - return -1; /* failed */ } /* @@ -1023,7 +1025,6 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, int goal, struct ext3_reserve_window_node * my_rsv, int *errp) { - spinlock_t *rsv_lock; unsigned long group_first_block; int ret = 0; int fatal; @@ -1052,7 +1053,6 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, NULL); goto out; } - rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock; /* * goal is a group relative block number (if there is a goal) * 0 < goal < EXT3_BLOCKS_PER_GROUP(sb) @@ -1078,30 +1078,21 @@ ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, * then we could go to allocate from the reservation window directly. 
*/ while (1) { - struct ext3_reserve_window rsv_copy; - - rsv_copy._rsv_start = my_rsv->rsv_start; - rsv_copy._rsv_end = my_rsv->rsv_end; - - if (rsv_is_empty(&rsv_copy) || (ret < 0) || - !goal_in_my_reservation(&rsv_copy, goal, group, sb)) { - spin_lock(rsv_lock); + if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || + !goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) { ret = alloc_new_reservation(my_rsv, goal, sb, group, bitmap_bh); - rsv_copy._rsv_start = my_rsv->rsv_start; - rsv_copy._rsv_end = my_rsv->rsv_end; - spin_unlock(rsv_lock); if (ret < 0) break; /* failed */ - if (!goal_in_my_reservation(&rsv_copy, goal, group, sb)) + if (!goal_in_my_reservation(&my_rsv->rsv_window, goal, group, sb)) goal = -1; } - if ((rsv_copy._rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) - || (rsv_copy._rsv_end < group_first_block)) + if ((my_rsv->rsv_start >= group_first_block + EXT3_BLOCKS_PER_GROUP(sb)) + || (my_rsv->rsv_end < group_first_block)) BUG(); ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh, goal, - &rsv_copy); + &my_rsv->rsv_window); if (ret >= 0) { my_rsv->rsv_alloc_hit++; break; /* succeed */ diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5ad8cf0292d..98e78345ead 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -36,7 +36,11 @@ static int ext3_release_file (struct inode * inode, struct file * filp) /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) + { + down(&EXT3_I(inode)->truncate_sem); ext3_discard_reservation(inode); + up(&EXT3_I(inode)->truncate_sem); + } if (is_dx(inode) && filp->private_data) ext3_htree_free_dir_info(filp->private_data); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 1e6f3ea2871..6981bd014ed 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -604,12 +604,14 @@ got: err = ext3_init_acl(handle, inode, dir); if (err) { DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); goto fail2; } err = 
ext3_mark_inode_dirty(handle, inode); if (err) { ext3_std_error(sb, err); DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); goto fail2; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 0d5fa73b18d..9989fdcf4d5 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -128,7 +128,7 @@ static unsigned long blocks_for_truncate(struct inode *inode) if (needed > EXT3_MAX_TRANS_DATA) needed = EXT3_MAX_TRANS_DATA; - return EXT3_DATA_TRANS_BLOCKS + needed; + return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed; } /* @@ -2663,7 +2663,7 @@ static int ext3_do_update_inode(handle_t *handle, } else for (block = 0; block < EXT3_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; - if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) + if (ei->i_extra_isize) raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); @@ -2763,7 +2763,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? 
- but truncate inode update has it) */ - handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3); + handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ + EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -2861,7 +2862,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) #ifdef CONFIG_QUOTA /* We know that structure was already allocated during DQUOT_INIT so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS; + ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); #endif return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 79742d824a0..50378d8ff84 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -932,8 +932,16 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; sb = dir->i_sb; - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) - return NULL; + /* NFS may look up ".." - look at dx_root directory block */ + if (namelen > 2 || name[0] != '.'||(name[1] != '.' 
&& name[1] != '\0')){ + if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) + return NULL; + } else { + frame = frames; + frame->bh = NULL; /* for dx_release() */ + frame->at = (struct dx_entry *)frames; /* hack for zero entry*/ + dx_set_block(frame->at, 0); /* dx_root block is 0 */ + } hash = hinfo.hash; do { block = dx_get_block(frame->at); @@ -1637,9 +1645,9 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, int err, retries = 0; retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS); + 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1671,9 +1679,9 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, return -EINVAL; retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS); + 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1707,9 +1715,9 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) return -EMLINK; retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS); + 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1998,7 +2006,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ DQUOT_INIT(dentry->d_inode); - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2057,7 +2065,7 @@ static int ext3_unlink(struct inode * dir, 
struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ DQUOT_INIT(dentry->d_inode); - handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2112,9 +2120,9 @@ static int ext3_symlink (struct inode * dir, return -ENAMETOOLONG; retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT3_QUOTA_INIT_BLOCKS); + 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2166,7 +2174,7 @@ static int ext3_link (struct dentry * old_dentry, return -EMLINK; retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2208,7 +2216,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, * in separate transaction */ if (new_dentry->d_inode) DQUOT_INIT(new_dentry->d_inode); - handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + + handle = ext3_journal_start(old_dir, 2 * + EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 981ccb233ef..3c3c6e399fb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -589,7 +589,7 @@ enum { Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, - Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, }; @@ -634,10 +634,10 @@ static match_table_t tokens = { {Opt_grpjquota, "grpjquota=%s"}, 
{Opt_jqfmt_vfsold, "jqfmt=vfsold"}, {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, - {Opt_ignore, "grpquota"}, - {Opt_ignore, "noquota"}, - {Opt_ignore, "quota"}, - {Opt_ignore, "usrquota"}, + {Opt_quota, "grpquota"}, + {Opt_noquota, "noquota"}, + {Opt_quota, "quota"}, + {Opt_quota, "usrquota"}, {Opt_barrier, "barrier=%u"}, {Opt_err, NULL}, {Opt_resize, "resize"}, @@ -876,6 +876,7 @@ set_qf_name: sbi->s_qf_names[qtype] = NULL; return 0; } + set_opt(sbi->s_mount_opt, QUOTA); break; case Opt_offusrjquota: qtype = USRQUOTA; @@ -889,7 +890,10 @@ clear_qf_name: "quota turned on.\n"); return 0; } - kfree(sbi->s_qf_names[qtype]); + /* + * The space will be released later when all options + * are confirmed to be correct + */ sbi->s_qf_names[qtype] = NULL; break; case Opt_jqfmt_vfsold: @@ -898,6 +902,17 @@ clear_qf_name: case Opt_jqfmt_vfsv0: sbi->s_jquota_fmt = QFMT_VFS_V0; break; + case Opt_quota: + set_opt(sbi->s_mount_opt, QUOTA); + break; + case Opt_noquota: + if (sb_any_quota_enabled(sb)) { + printk(KERN_ERR "EXT3-fs: Cannot change quota " + "options when quota turned on.\n"); + return 0; + } + clear_opt(sbi->s_mount_opt, QUOTA); + break; #else case Opt_usrjquota: case Opt_grpjquota: @@ -909,6 +924,9 @@ clear_qf_name: "EXT3-fs: journalled quota options not " "supported.\n"); break; + case Opt_quota: + case Opt_noquota: + break; #endif case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); @@ -924,12 +942,13 @@ clear_qf_name: case Opt_ignore: break; case Opt_resize: - if (!n_blocks_count) { + if (!is_remount) { printk("EXT3-fs: resize option only available " "for remount\n"); return 0; } - match_int(&args[0], &option); + if (match_int(&args[0], &option) != 0) + return 0; *n_blocks_count = option; break; case Opt_nobh: @@ -2093,14 +2112,33 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) { struct ext3_super_block * es; struct ext3_sb_info *sbi = EXT3_SB(sb); - unsigned long tmp; unsigned long n_blocks_count = 0; + unsigned long old_sb_flags; + struct 
ext3_mount_options old_opts; + int err; +#ifdef CONFIG_QUOTA + int i; +#endif + + /* Store the original options */ + old_sb_flags = sb->s_flags; + old_opts.s_mount_opt = sbi->s_mount_opt; + old_opts.s_resuid = sbi->s_resuid; + old_opts.s_resgid = sbi->s_resgid; + old_opts.s_commit_interval = sbi->s_commit_interval; +#ifdef CONFIG_QUOTA + old_opts.s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) + old_opts.s_qf_names[i] = sbi->s_qf_names[i]; +#endif /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) - return -EINVAL; + if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { + err = -EINVAL; + goto restore_opts; + } if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) ext3_abort(sb, __FUNCTION__, "Abort forced by user"); @@ -2114,8 +2152,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > le32_to_cpu(es->s_blocks_count)) { - if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - return -EROFS; + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) { + err = -EROFS; + goto restore_opts; + } if (*flags & MS_RDONLY) { /* @@ -2142,7 +2182,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) "remount RDWR because of unsupported " "optional features (%x).\n", sb->s_id, le32_to_cpu(ret)); - return -EROFS; + err = -EROFS; + goto restore_opts; } /* * Mounting a RDONLY partition read-write, so reread @@ -2152,13 +2193,38 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) */ ext3_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if ((ret = ext3_group_extend(sb, es, n_blocks_count))) - return ret; + if ((ret = ext3_group_extend(sb, es, n_blocks_count))) { + err = ret; + goto restore_opts; + } if (!ext3_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; } } +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < 
MAXQUOTAS; i++) + if (old_opts.s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(old_opts.s_qf_names[i]); +#endif return 0; +restore_opts: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_opts.s_mount_opt; + sbi->s_resuid = old_opts.s_resuid; + sbi->s_resgid = old_opts.s_resgid; + sbi->s_commit_interval = old_opts.s_commit_interval; +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = old_opts.s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i] && + old_opts.s_qf_names[i] != sbi->s_qf_names[i]) + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = old_opts.s_qf_names[i]; + } +#endif + return err; } static int ext3_statfs (struct super_block * sb, struct kstatfs * buf) @@ -2238,7 +2304,7 @@ static int ext3_dquot_initialize(struct inode *inode, int type) int ret, err; /* We may create quota structure so we need to reserve enough blocks */ - handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS); + handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_initialize(inode, type); @@ -2254,7 +2320,7 @@ static int ext3_dquot_drop(struct inode *inode) int ret, err; /* We may delete quota structure so we need to reserve enough blocks */ - handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS); + handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_drop(inode); @@ -2272,7 +2338,7 @@ static int ext3_write_dquot(struct dquot *dquot) inode = dquot_to_inode(dquot); handle = ext3_journal_start(inode, - EXT3_QUOTA_TRANS_BLOCKS); + EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); @@ -2288,7 +2354,7 @@ static int ext3_acquire_dquot(struct dquot *dquot) handle_t *handle; handle = ext3_journal_start(dquot_to_inode(dquot), - EXT3_QUOTA_INIT_BLOCKS); + EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return 
PTR_ERR(handle); ret = dquot_acquire(dquot); @@ -2304,7 +2370,7 @@ static int ext3_release_dquot(struct dquot *dquot) handle_t *handle; handle = ext3_journal_start(dquot_to_inode(dquot), - EXT3_QUOTA_INIT_BLOCKS); + EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_release(dquot); @@ -2348,22 +2414,8 @@ static int ext3_write_info(struct super_block *sb, int type) */ static int ext3_quota_on_mount(struct super_block *sb, int type) { - int err; - struct dentry *dentry; - struct qstr name = { .name = EXT3_SB(sb)->s_qf_names[type], - .hash = 0, - .len = strlen(EXT3_SB(sb)->s_qf_names[type])}; - - dentry = lookup_hash(&name, sb->s_root); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - err = vfs_quota_on_mount(type, EXT3_SB(sb)->s_jquota_fmt, dentry); - /* Now invalidate and put the dentry - quota got its own reference - * to inode and dentry has at least wrong hash so we had better - * throw it away */ - d_invalidate(dentry); - dput(dentry); - return err; + return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], + EXT3_SB(sb)->s_jquota_fmt, type); } /* @@ -2375,6 +2427,8 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, int err; struct nameidata nd; + if (!test_opt(sb, QUOTA)) + return -EINVAL; /* Not journalling quota? 
*/ if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] && !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index 8c3e72818fb..4f79122cde6 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -23,11 +23,11 @@ #include <linux/namei.h> #include "xattr.h" -static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) +static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) { struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); nd_set_link(nd, (char*)ei->i_data); - return 0; + return NULL; } struct inode_operations ext3_symlink_inode_operations = { diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 4cbc6d0212d..269c7b92db9 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -1044,7 +1044,7 @@ ext3_xattr_set(struct inode *inode, int name_index, const char *name, int error, retries = 0; retry: - handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS); + handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { @@ -1106,7 +1106,7 @@ cleanup: void ext3_xattr_put_super(struct super_block *sb) { - mb_cache_shrink(ext3_xattr_cache, sb->s_bdev); + mb_cache_shrink(sb->s_bdev); } /* diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 7c52e465a61..77c24fcf712 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -56,7 +56,7 @@ int __init fat_cache_init(void) return 0; } -void __exit fat_cache_destroy(void) +void fat_cache_destroy(void) { if (kmem_cache_destroy(fat_cache_cachep)) printk(KERN_INFO "fat_cache: not all structures were freed\n"); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8ccee841548..96ae85b67eb 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1327,16 +1327,25 @@ out_fail: EXPORT_SYMBOL(fat_fill_super); int __init fat_cache_init(void); -void __exit fat_cache_destroy(void); +void fat_cache_destroy(void); static int __init init_fat_fs(void) { - int ret; + int err; - ret = fat_cache_init(); - if (ret < 0) - return ret; - 
return fat_init_inodecache(); + err = fat_cache_init(); + if (err) + return err; + + err = fat_init_inodecache(); + if (err) + goto failed; + + return 0; + +failed: + fat_cache_destroy(); + return err; } static void __exit exit_fat_fs(void) diff --git a/fs/fcntl.c b/fs/fcntl.c index 286a9f8f3d4..6fbc9d8fcc3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -288,7 +288,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_SETLK: case F_SETLKW: - err = fcntl_setlk(filp, cmd, (struct flock __user *) arg); + err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg); break; case F_GETOWN: /* @@ -376,7 +376,8 @@ asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg break; case F_SETLK64: case F_SETLKW64: - err = fcntl_setlk64(filp, cmd, (struct flock64 __user *) arg); + err = fcntl_setlk64(fd, filp, cmd, + (struct flock64 __user *) arg); break; default: err = do_fcntl(fd, cmd, arg, filp); diff --git a/fs/file_table.c b/fs/file_table.c index 03d83cb686b..1d3de78e6bc 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -16,6 +16,7 @@ #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/cdev.h> +#include <linux/fsnotify.h> /* sysctl tunables... 
*/ struct files_stat_struct files_stat = { @@ -63,42 +64,45 @@ static inline void file_free(struct file *f) */ struct file *get_empty_filp(void) { -static int old_max; + static int old_max; struct file * f; /* * Privileged users can go above max_files */ - if (files_stat.nr_files < files_stat.max_files || - capable(CAP_SYS_ADMIN)) { - f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); - if (f) { - memset(f, 0, sizeof(*f)); - if (security_file_alloc(f)) { - file_free(f); - goto fail; - } - eventpoll_init_file(f); - atomic_set(&f->f_count, 1); - f->f_uid = current->fsuid; - f->f_gid = current->fsgid; - rwlock_init(&f->f_owner.lock); - /* f->f_version: 0 */ - INIT_LIST_HEAD(&f->f_list); - f->f_maxcount = INT_MAX; - return f; - } - } - + if (files_stat.nr_files >= files_stat.max_files && + !capable(CAP_SYS_ADMIN)) + goto over; + + f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); + if (f == NULL) + goto fail; + + memset(f, 0, sizeof(*f)); + if (security_file_alloc(f)) + goto fail_sec; + + eventpoll_init_file(f); + atomic_set(&f->f_count, 1); + f->f_uid = current->fsuid; + f->f_gid = current->fsgid; + rwlock_init(&f->f_owner.lock); + /* f->f_version: 0 */ + INIT_LIST_HEAD(&f->f_list); + f->f_maxcount = INT_MAX; + return f; + +over: /* Ran out of filps - report that */ - if (files_stat.max_files >= old_max) { + if (files_stat.nr_files > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", files_stat.max_files); - old_max = files_stat.max_files; - } else { - /* Big problems... */ - printk(KERN_WARNING "VFS: filp allocation failed\n"); + old_max = files_stat.nr_files; } + goto fail; + +fail_sec: + file_free(f); fail: return NULL; } @@ -123,6 +127,8 @@ void fastcall __fput(struct file *file) struct inode *inode = dentry->d_inode; might_sleep(); + + fsnotify_close(file); /* * The function eventpoll_release() should be the first called * in the file cleanup chain. 
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h index 8da0252642a..583bd78086d 100644 --- a/fs/freevxfs/vxfs.h +++ b/fs/freevxfs/vxfs.h @@ -37,7 +37,6 @@ * superblocks of the Veritas Filesystem. */ #include <linux/types.h> -#include "vxfs_kcompat.h" /* diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index bc4b57da306..d3f6b2835bc 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -101,7 +101,7 @@ vxfs_bmap_ext4(struct inode *ip, long bn) return 0; fail_size: - printk("vxfs: indirect extent to big!\n"); + printk("vxfs: indirect extent too big!\n"); fail_buf: return 0; } diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c index 05b19f70bf9..6dee109aeea 100644 --- a/fs/freevxfs/vxfs_fshead.c +++ b/fs/freevxfs/vxfs_fshead.c @@ -78,17 +78,18 @@ vxfs_getfsh(struct inode *ip, int which) struct buffer_head *bp; bp = vxfs_bread(ip, which); - if (buffer_mapped(bp)) { + if (bp) { struct vxfs_fsh *fhp; - if (!(fhp = kmalloc(sizeof(*fhp), SLAB_KERNEL))) - return NULL; + if (!(fhp = kmalloc(sizeof(*fhp), GFP_KERNEL))) + goto out; memcpy(fhp, bp->b_data, sizeof(*fhp)); - brelse(bp); + put_bh(bp); return (fhp); } - +out: + brelse(bp); return NULL; } diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index ac677ab262b..d0401dc68d4 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -38,7 +38,7 @@ #include "vxfs_inode.h" -static int vxfs_immed_follow_link(struct dentry *, struct nameidata *); +static void * vxfs_immed_follow_link(struct dentry *, struct nameidata *); static int vxfs_immed_readpage(struct file *, struct page *); @@ -72,12 +72,12 @@ struct address_space_operations vxfs_immed_aops = { * Returns: * Zero on success, else a negative error code. 
*/ -static int +static void * vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) { struct vxfs_inode_info *vip = VXFS_INO(dp->d_inode); nd_set_link(np, vip->vii_immed.vi_immed); - return 0; + return NULL; } /** diff --git a/fs/freevxfs/vxfs_kcompat.h b/fs/freevxfs/vxfs_kcompat.h deleted file mode 100644 index 342a4cc860f..00000000000 --- a/fs/freevxfs/vxfs_kcompat.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _VXFS_KCOMPAT_H -#define _VXFS_KCOMPAT_H - -#include <linux/version.h> - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - -#include <linux/blkdev.h> - -typedef long sector_t; - -/* From include/linux/fs.h (Linux 2.5.2-pre3) */ -static inline struct buffer_head * sb_bread(struct super_block *sb, int block) -{ - return bread(sb->s_dev, block, sb->s_blocksize); -} - -/* Dito. */ -static inline void map_bh(struct buffer_head *bh, struct super_block *sb, int block) -{ - bh->b_state |= 1 << BH_Mapped; - bh->b_dev = sb->s_dev; - bh->b_blocknr = block; -} - -/* From fs/block_dev.c (Linux 2.5.2-pre2) */ -static inline int sb_set_blocksize(struct super_block *sb, int size) -{ - int bits; - if (set_blocksize(sb->s_dev, size) < 0) - return 0; - sb->s_blocksize = size; - for (bits = 9, size >>= 9; size >>= 1; bits++) - ; - sb->s_blocksize_bits = bits; - return sb->s_blocksize; -} - -/* Dito. 
*/ -static inline int sb_min_blocksize(struct super_block *sb, int size) -{ - int minsize = get_hardsect_size(sb->s_dev); - if (size < minsize) - size = minsize; - return sb_set_blocksize(sb, size); -} - -#endif /* Kernel 2.4 */ -#endif /* _VXFS_KCOMPAT_H */ diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 506ae251d2c..554eb455722 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -61,13 +61,13 @@ struct file_operations vxfs_dir_operations = { }; -static __inline__ u_long +static inline u_long dir_pages(struct inode *inode) { return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; } -static __inline__ u_long +static inline u_long dir_blocks(struct inode *ip) { u_long bsize = ip->i_sb->s_blocksize; @@ -79,7 +79,7 @@ dir_blocks(struct inode *ip) * * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller. */ -static __inline__ int +static inline int vxfs_match(int len, const char * const name, struct vxfs_direct *de) { if (len != de->d_namelen) @@ -89,7 +89,7 @@ vxfs_match(int len, const char * const name, struct vxfs_direct *de) return !memcmp(name, de->d_name, len); } -static __inline__ struct vxfs_direct * +static inline struct vxfs_direct * vxfs_next_entry(struct vxfs_direct *de) { return ((struct vxfs_direct *)((char*)de + de->d_reclen)); diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index 7a204e31aad..133476201d8 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -38,7 +38,7 @@ #include "vxfs_olt.h" -static __inline__ void +static inline void vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) { if (infp->vsi_fshino) @@ -46,7 +46,7 @@ vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) infp->vsi_fshino = fshp->olt_fsino[0]; } -static __inline__ void +static inline void vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) { if (infp->vsi_iext) @@ -54,7 +54,7 @@ vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct 
vxfs_sb_info *infp) infp->vsi_iext = ilistp->olt_iext[0]; } -static __inline__ u_long +static inline u_long vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) { if (sbp->s_blocksize % bsize) @@ -104,8 +104,8 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize) goto fail; } - oaddr = (char *)bp->b_data + op->olt_size; - eaddr = (char *)bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize); + oaddr = bp->b_data + op->olt_size; + eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize); while (oaddr < eaddr) { struct vxfs_oltcommon *ocp = diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c index 5e305612054..50aae77651b 100644 --- a/fs/freevxfs/vxfs_subr.c +++ b/fs/freevxfs/vxfs_subr.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/pagemap.h> -#include "vxfs_kcompat.h" #include "vxfs_extern.h" diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 0ae2c7b8182..27f66d3e8a0 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -155,12 +155,11 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) sbp->s_flags |= MS_RDONLY; - infp = kmalloc(sizeof(*infp), GFP_KERNEL); + infp = kcalloc(1, sizeof(*infp), GFP_KERNEL); if (!infp) { printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n"); return -ENOMEM; } - memset(infp, 0, sizeof(*infp)); bsize = sb_min_blocksize(sbp, BLOCK_SIZE); if (!bsize) { @@ -196,7 +195,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) #endif sbp->s_magic = rsbp->vs_magic; - sbp->s_fs_info = (void *)infp; + sbp->s_fs_info = infp; infp->vsi_raw = rsbp; infp->vsi_bp = bp; @@ -263,7 +262,7 @@ vxfs_init(void) sizeof(struct vxfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT, NULL, NULL); if (vxfs_inode_cachep) - return (register_filesystem(&vxfs_fs_type)); + return register_filesystem(&vxfs_fs_type); return -ENOMEM; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8e050fa5821..e94ab398b71 100644 --- a/fs/fs-writeback.c 
+++ b/fs/fs-writeback.c @@ -485,32 +485,6 @@ static void set_sb_syncing(int val) spin_unlock(&sb_lock); } -/* - * Find a superblock with inodes that need to be synced - */ -static struct super_block *get_super_to_sync(void) -{ - struct super_block *sb; -restart: - spin_lock(&sb_lock); - sb = sb_entry(super_blocks.prev); - for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { - if (sb->s_syncing) - continue; - sb->s_syncing = 1; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (!sb->s_root) { - drop_super(sb); - goto restart; - } - return sb; - } - spin_unlock(&sb_lock); - return NULL; -} - /** * sync_inodes - writes all inodes to disk * @wait: wait for completion @@ -530,23 +504,39 @@ restart: * outstanding dirty inodes, the writeback goes block-at-a-time within the * filesystem's write_inode(). This is extremely slow. */ -void sync_inodes(int wait) +static void __sync_inodes(int wait) { struct super_block *sb; - set_sb_syncing(0); - while ((sb = get_super_to_sync()) != NULL) { - sync_inodes_sb(sb, 0); - sync_blockdev(sb->s_bdev); - drop_super(sb); + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_syncing) + continue; + sb->s_syncing = 1; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) { + sync_inodes_sb(sb, wait); + sync_blockdev(sb->s_bdev); + } + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } + spin_unlock(&sb_lock); +} + +void sync_inodes(int wait) +{ + set_sb_syncing(0); + __sync_inodes(0); + if (wait) { set_sb_syncing(0); - while ((sb = get_super_to_sync()) != NULL) { - sync_inodes_sb(sb, 1); - sync_blockdev(sb->s_bdev); - drop_super(sb); - } + __sync_inodes(1); } } diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 6ad1211f84e..a096c5a5666 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -480,6 +480,8 @@ void hfs_bnode_put(struct hfs_bnode *node) return; } for (i = 0; i 
< tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; mark_page_accessed(node->page[i]); #if REF_PAGES put_page(node->page[i]); diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index cbc8510ad22..5ea6b3d45ea 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -482,7 +482,8 @@ void hfs_file_truncate(struct inode *inode) page_cache_release(page); mark_inode_dirty(inode); return; - } + } else if (inode->i_size == HFS_I(inode)->phys_size) + return; size = inode->i_size + HFS_SB(sb)->alloc_blksz - 1; blk_cnt = size / HFS_SB(sb)->alloc_blksz; alloc_cnt = HFS_I(inode)->alloc_blocks; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 267872e84d7..8868d3b766f 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -643,6 +643,8 @@ void hfs_bnode_put(struct hfs_bnode *node) return; } for (i = 0; i < tree->pages_per_bnode; i++) { + if (!node->page[i]) + continue; mark_page_accessed(node->page[i]); #if REF_PAGES put_page(node->page[i]); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 376498cc64f..e7235ca79a9 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -461,7 +461,9 @@ void hfsplus_file_truncate(struct inode *inode) page_cache_release(page); mark_inode_dirty(inode); return; - } + } else if (inode->i_size == HFSPLUS_I(inode).phys_size) + return; + blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; alloc_cnt = HFSPLUS_I(inode).alloc_blocks; if (blk_cnt == alloc_cnt) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index c1516d013bf..67bca0d4a33 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -69,6 +69,7 @@ extern int read_file(int fd, unsigned long long *offset, char *buf, int len); extern int write_file(int fd, unsigned long long *offset, const char *buf, int len); extern int lseek_file(int fd, long long offset, int whence); +extern int fsync_file(int fd, int datasync); extern int file_create(char *name, int ur, int uw, int ux, int gr, int gw, int gx, int or, 
int ow, int ox); extern int set_attr(const char *file, struct hostfs_iattr *attrs); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 4bf43ea87c4..b2d18200a00 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -15,7 +15,6 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/list.h> -#include <linux/root_dev.h> #include <linux/statfs.h> #include <linux/kdev_t.h> #include <asm/uaccess.h> @@ -160,8 +159,6 @@ static int read_name(struct inode *ino, char *name) ino->i_size = i_size; ino->i_blksize = i_blksize; ino->i_blocks = i_blocks; - if((ino->i_sb->s_dev == ROOT_DEV) && (ino->i_uid == getuid())) - ino->i_uid = 0; return(0); } @@ -385,7 +382,7 @@ int hostfs_file_open(struct inode *ino, struct file *file) int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - return(0); + return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); } static struct file_operations hostfs_file_fops = { @@ -841,16 +838,10 @@ int hostfs_setattr(struct dentry *dentry, struct iattr *attr) attrs.ia_mode = attr->ia_mode; } if(attr->ia_valid & ATTR_UID){ - if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && - (attr->ia_uid == 0)) - attr->ia_uid = getuid(); attrs.ia_valid |= HOSTFS_ATTR_UID; attrs.ia_uid = attr->ia_uid; } if(attr->ia_valid & ATTR_GID){ - if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && - (attr->ia_gid == 0)) - attr->ia_gid = getgid(); attrs.ia_valid |= HOSTFS_ATTR_GID; attrs.ia_gid = attr->ia_gid; } diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 4796e8490f7..b97809deba6 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -153,10 +153,24 @@ int lseek_file(int fd, long long offset, int whence) int ret; ret = lseek64(fd, offset, whence); - if(ret < 0) return(-errno); + if(ret < 0) + return(-errno); return(0); } +int fsync_file(int fd, int datasync) +{ + int ret; + if (datasync) + ret = fdatasync(fd); + else + ret = fsync(fd); + + if (ret < 0) + return -errno; + return 
0; +} + void close_file(void *stream) { close(*((int *) stream)); diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index f8e0cbd0cb6..52930915bad 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -4,6 +4,7 @@ */ #include <linux/fs.h> +#include <linux/file.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> @@ -37,7 +38,7 @@ struct hppfs_inode_info { static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) { - return(list_entry(inode, struct hppfs_inode_info, vfs_inode)); + return container_of(inode, struct hppfs_inode_info, vfs_inode); } #define HPPFS_SUPER_MAGIC 0xb00000ee @@ -232,7 +233,7 @@ static ssize_t read_proc(struct file *file, char *buf, ssize_t count, set_fs(USER_DS); if(ppos) *ppos = file->f_pos; - return(n); + return n; } static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count) @@ -253,7 +254,7 @@ static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count) err = os_read_file(fd, new_buf, cur); if(err < 0){ printk("hppfs_read : read failed, errno = %d\n", - count); + err); n = err; goto out_free; } @@ -270,7 +271,7 @@ static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count) out_free: kfree(new_buf); out: - return(n); + return n; } static ssize_t hppfs_read(struct file *file, char *buf, size_t count, @@ -491,7 +492,7 @@ static int hppfs_open(struct inode *inode, struct file *file) fd = open_host_sock(host_file, &filter); if(fd > 0){ data->contents = hppfs_get_data(fd, filter, - &data->proc_file, + data->proc_file, file, &data->len); if(!IS_ERR(data->contents)) data->host_fd = fd; @@ -543,7 +544,7 @@ static int hppfs_dir_open(struct inode *inode, struct file *file) static loff_t hppfs_llseek(struct file *file, loff_t off, int where) { struct hppfs_private *data = file->private_data; - struct file *proc_file = &data->proc_file; + struct file *proc_file = data->proc_file; loff_t (*llseek)(struct file *, loff_t, int); loff_t ret; @@ -586,7 +587,7 @@ static int 
hppfs_filldir(void *d, const char *name, int size, static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) { struct hppfs_private *data = file->private_data; - struct file *proc_file = &data->proc_file; + struct file *proc_file = data->proc_file; int (*readdir)(struct file *, void *, filldir_t); struct hppfs_dirent dirent = ((struct hppfs_dirent) { .vfs_dirent = ent, @@ -661,42 +662,36 @@ static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen) { struct file *proc_file; struct dentry *proc_dentry; - int (*readlink)(struct dentry *, char *, int); - int err, n; + int ret; proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; proc_file = dentry_open(dget(proc_dentry), NULL, O_RDONLY); - err = PTR_ERR(proc_dentry); - if(IS_ERR(proc_dentry)) - return(err); + if (IS_ERR(proc_file)) + return PTR_ERR(proc_file); - readlink = proc_dentry->d_inode->i_op->readlink; - n = (*readlink)(proc_dentry, buffer, buflen); + ret = proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, buflen); fput(proc_file); - return(n); + return ret; } -static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void* hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct file *proc_file; struct dentry *proc_dentry; - int (*follow_link)(struct dentry *, struct nameidata *); - int err, n; + void *ret; proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; proc_file = dentry_open(dget(proc_dentry), NULL, O_RDONLY); - err = PTR_ERR(proc_dentry); - if(IS_ERR(proc_dentry)) - return(err); + if (IS_ERR(proc_file)) + return proc_file; - follow_link = proc_dentry->d_inode->i_op->follow_link; - n = (*follow_link)(proc_dentry, nd); + ret = proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); fput(proc_file); - return(n); + return ret; } static struct inode_operations hppfs_dir_iops = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2af3338f891..3a9b6d179cb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c 
@@ -122,6 +122,9 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, start_addr = mm->free_area_cache; + if (len <= mm->cached_hole_size) + start_addr = TASK_UNMAPPED_BASE; + full_search: addr = ALIGN(start_addr, HPAGE_SIZE); diff --git a/fs/inode.c b/fs/inode.c index 801fe7f3628..e57f1724db3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/cdev.h> #include <linux/bootmem.h> +#include <linux/inotify.h> /* * This is needed for the following functions: @@ -202,6 +203,10 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); spin_lock_init(&inode->i_lock); i_size_ordered_init(inode); +#ifdef CONFIG_INOTIFY + INIT_LIST_HEAD(&inode->inotify_watches); + sema_init(&inode->inotify_sem, 1); +#endif } EXPORT_SYMBOL(inode_init_once); @@ -282,6 +287,13 @@ static void dispose_list(struct list_head *head) if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); + + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode_lock); + + wake_up_inode(inode); destroy_inode(inode); nr_disposed++; } @@ -317,8 +329,6 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) inode = list_entry(tmp, struct inode, i_sb_list); invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { - hlist_del_init(&inode->i_hash); - list_del(&inode->i_sb_list); list_move(&inode->i_list, dispose); inode->i_state |= I_FREEING; count++; @@ -346,6 +356,7 @@ int invalidate_inodes(struct super_block * sb) down(&iprune_sem); spin_lock(&inode_lock); + inotify_unmount_inodes(&sb->s_inodes); busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); @@ -439,8 +450,6 @@ static void prune_icache(int nr_to_scan) if (!can_unuse(inode)) continue; } - hlist_del_init(&inode->i_hash); - list_del_init(&inode->i_sb_list); list_move(&inode->i_list, &freeable); inode->i_state |= 
I_FREEING; nr_pruned++; @@ -500,7 +509,7 @@ repeat: continue; if (!test(inode, data)) continue; - if (inode->i_state & (I_FREEING|I_CLEAR)) { + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } @@ -525,7 +534,7 @@ repeat: continue; if (inode->i_sb != sb) continue; - if (inode->i_state & (I_FREEING|I_CLEAR)) { + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) { __wait_on_freeing_inode(inode); goto repeat; } @@ -727,7 +736,7 @@ EXPORT_SYMBOL(iunique); struct inode *igrab(struct inode *inode) { spin_lock(&inode_lock); - if (!(inode->i_state & I_FREEING)) + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) __iget(inode); else /* @@ -748,6 +757,7 @@ EXPORT_SYMBOL(igrab); * @head: the head of the list to search * @test: callback used for comparisons between inodes * @data: opaque data pointer to pass to @test + * @wait: if true wait for the inode to be unlocked, if false do not * * ifind() searches for the inode specified by @data in the inode * cache. 
This is a generalized version of ifind_fast() for file systems where @@ -762,7 +772,7 @@ EXPORT_SYMBOL(igrab); */ static inline struct inode *ifind(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), - void *data) + void *data, const int wait) { struct inode *inode; @@ -771,7 +781,8 @@ static inline struct inode *ifind(struct super_block *sb, if (inode) { __iget(inode); spin_unlock(&inode_lock); - wait_on_inode(inode); + if (likely(wait)) + wait_on_inode(inode); return inode; } spin_unlock(&inode_lock); @@ -811,7 +822,7 @@ static inline struct inode *ifind_fast(struct super_block *sb, } /** - * ilookup5 - search for an inode in the inode cache + * ilookup5_nowait - search for an inode in the inode cache * @sb: super block of file system to search * @hashval: hash value (usually inode number) to search for * @test: callback used for comparisons between inodes @@ -823,7 +834,38 @@ static inline struct inode *ifind_fast(struct super_block *sb, * identification of an inode. * * If the inode is in the cache, the inode is returned with an incremented - * reference count. + * reference count. Note, the inode lock is not waited upon so you have to be + * very careful what you do with the returned inode. You probably should be + * using ilookup5() instead. + * + * Otherwise NULL is returned. + * + * Note, @test is called with the inode_lock held, so can't sleep. 
+ */ +struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + + return ifind(sb, head, test, data, 0); +} + +EXPORT_SYMBOL(ilookup5_nowait); + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * ilookup5() uses ifind() to search for the inode specified by @hashval and + * @data in the inode cache. This is a generalized version of ilookup() for + * file systems where the inode number is not sufficient for unique + * identification of an inode. + * + * If the inode is in the cache, the inode lock is waited upon and the inode is + * returned with an incremented reference count. * * Otherwise NULL is returned. * @@ -834,7 +876,7 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(sb, hashval); - return ifind(sb, head, test, data); + return ifind(sb, head, test, data, 1); } EXPORT_SYMBOL(ilookup5); @@ -891,7 +933,7 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, struct hlist_head *head = inode_hashtable + hash(sb, hashval); struct inode *inode; - inode = ifind(sb, head, test, data); + inode = ifind(sb, head, test, data, 1); if (inode) return inode; /* @@ -1024,17 +1066,21 @@ static void generic_forget_inode(struct inode *inode) if (!(inode->i_state & (I_DIRTY|I_LOCK))) list_move(&inode->i_list, &inode_unused); inodes_stat.nr_unused++; - spin_unlock(&inode_lock); - if (!sb || (sb->s_flags & MS_ACTIVE)) + if (!sb || (sb->s_flags & MS_ACTIVE)) { + spin_unlock(&inode_lock); return; + } + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + inode->i_state 
&= ~I_WILL_FREE; inodes_stat.nr_unused--; hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); - inode->i_state|=I_FREEING; + inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); if (inode->i_data.nrpages) @@ -1048,7 +1094,7 @@ static void generic_forget_inode(struct inode *inode) * inode when the usage count drops to zero, and * i_nlink is zero. */ -static void generic_drop_inode(struct inode *inode) +void generic_drop_inode(struct inode *inode) { if (!inode->i_nlink) generic_delete_inode(inode); @@ -1056,6 +1102,8 @@ static void generic_drop_inode(struct inode *inode) generic_forget_inode(inode); } +EXPORT_SYMBOL_GPL(generic_drop_inode); + /* * Called when we're dropping the last reference * to an inode. @@ -1238,29 +1286,21 @@ int inode_wait(void *word) } /* - * If we try to find an inode in the inode hash while it is being deleted, we - * have to wait until the filesystem completes its deletion before reporting - * that it isn't found. This is because iget will immediately call - * ->read_inode, and we want to be sure that evidence of the deletion is found - * by ->read_inode. + * If we try to find an inode in the inode hash while it is being + * deleted, we have to wait until the filesystem completes its + * deletion before reporting that it isn't found. This function waits + * until the deletion _might_ have completed. Callers are responsible + * to recheck inode state. + * + * It doesn't matter if I_LOCK is not set initially, a call to + * wake_up_inode() after removing from the hash list will DTRT. + * * This is called with inode_lock held. */ static void __wait_on_freeing_inode(struct inode *inode) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK); - - /* - * I_FREEING and I_CLEAR are cleared in process context under - * inode_lock, so we have to give the tasks who would clear them - * a chance to run and acquire inode_lock. 
- */ - if (!(inode->i_state & I_LOCK)) { - spin_unlock(&inode_lock); - yield(); - spin_lock(&inode_lock); - return; - } wq = bit_waitqueue(&inode->i_state, __I_LOCK); prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); spin_unlock(&inode_lock); diff --git a/fs/inotify.c b/fs/inotify.c new file mode 100644 index 00000000000..2e4e2a57708 --- /dev/null +++ b/fs/inotify.c @@ -0,0 +1,1057 @@ +/* + * fs/inotify.c - inode-based file event notifications + * + * Authors: + * John McCutchan <ttb@tentacle.dhs.org> + * Robert Love <rml@novell.com> + * + * Copyright (C) 2005 John McCutchan + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/poll.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/writeback.h> +#include <linux/inotify.h> + +#include <asm/ioctls.h> + +static atomic_t inotify_cookie; + +static kmem_cache_t *watch_cachep; +static kmem_cache_t *event_cachep; + +static struct vfsmount *inotify_mnt; + +/* these are configurable via /proc/sys/fs/inotify/ */ +int inotify_max_user_instances; +int inotify_max_user_watches; +int inotify_max_queued_events; + +/* + * Lock ordering: + * + * dentry->d_lock (used to keep d_move() away from dentry->d_parent) + * iprune_sem (synchronize shrink_icache_memory()) + * inode_lock (protects the super_block->s_inodes list) + * inode->inotify_sem (protects inode->inotify_watches and watches->i_list) + * inotify_dev->sem (protects inotify_device and watches->d_list) + */ + +/* + * Lifetimes of the three main data structures--inotify_device, inode, and + * inotify_watch--are managed by reference count. + * + * inotify_device: Lifetime is from inotify_init() until release. Additional + * references can bump the count via get_inotify_dev() and drop the count via + * put_inotify_dev(). + * + * inotify_watch: Lifetime is from create_watch() until the final + * put_inotify_watch(). Additional references can bump the count via + * get_inotify_watch() and drop the count via put_inotify_watch(). + * + * inode: Pinned so long as the inode is associated with a watch, from + * create_watch() to put_inotify_watch(). + */ + +/* + * struct inotify_device - represents an inotify instance + * + * This structure is protected by the semaphore 'sem'. 
+ */ +struct inotify_device { + wait_queue_head_t wq; /* wait queue for i/o */ + struct idr idr; /* idr mapping wd -> watch */ + struct semaphore sem; /* protects this bad boy */ + struct list_head events; /* list of queued events */ + struct list_head watches; /* list of watches */ + atomic_t count; /* reference count */ + struct user_struct *user; /* user who opened this dev */ + unsigned int queue_size; /* size of the queue (bytes) */ + unsigned int event_count; /* number of pending events */ + unsigned int max_events; /* maximum number of events */ + u32 last_wd; /* the last wd allocated */ +}; + +/* + * struct inotify_kernel_event - An inotify event, originating from a watch and + * queued for user-space. A list of these is attached to each instance of the + * device. In read(), this list is walked and all events that can fit in the + * buffer are returned. + * + * Protected by dev->sem of the device in which we are queued. + */ +struct inotify_kernel_event { + struct inotify_event event; /* the user-space event */ + struct list_head list; /* entry in inotify_device's list */ + char *name; /* filename, if any */ +}; + +/* + * struct inotify_watch - represents a watch request on a specific inode + * + * d_list is protected by dev->sem of the associated watch->dev. + * i_list and mask are protected by inode->inotify_sem of the associated inode. + * dev, inode, and wd are never written to once the watch is created. 
+ */ +struct inotify_watch { + struct list_head d_list; /* entry in inotify_device's list */ + struct list_head i_list; /* entry in inode's list */ + atomic_t count; /* reference count */ + struct inotify_device *dev; /* associated device */ + struct inode *inode; /* associated inode */ + s32 wd; /* watch descriptor */ + u32 mask; /* event mask for this watch */ +}; + +#ifdef CONFIG_SYSCTL + +#include <linux/sysctl.h> + +static int zero; + +ctl_table inotify_table[] = { + { + .ctl_name = INOTIFY_MAX_USER_INSTANCES, + .procname = "max_user_instances", + .data = &inotify_max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = INOTIFY_MAX_USER_WATCHES, + .procname = "max_user_watches", + .data = &inotify_max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = INOTIFY_MAX_QUEUED_EVENTS, + .procname = "max_queued_events", + .data = &inotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + +static inline void get_inotify_dev(struct inotify_device *dev) +{ + atomic_inc(&dev->count); +} + +static inline void put_inotify_dev(struct inotify_device *dev) +{ + if (atomic_dec_and_test(&dev->count)) { + atomic_dec(&dev->user->inotify_devs); + free_uid(dev->user); + kfree(dev); + } +} + +static inline void get_inotify_watch(struct inotify_watch *watch) +{ + atomic_inc(&watch->count); +} + +/* + * put_inotify_watch - decrements the ref count on a given watch. cleans up + * the watch and its references if the count reaches zero. 
+ */ +static inline void put_inotify_watch(struct inotify_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + put_inotify_dev(watch->dev); + iput(watch->inode); + kmem_cache_free(watch_cachep, watch); + } +} + +/* + * kernel_event - create a new kernel event with the given parameters + * + * This function can sleep. + */ +static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, + const char *name) +{ + struct inotify_kernel_event *kevent; + + kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL); + if (unlikely(!kevent)) + return NULL; + + /* we hand this out to user-space, so zero it just in case */ + memset(&kevent->event, 0, sizeof(struct inotify_event)); + + kevent->event.wd = wd; + kevent->event.mask = mask; + kevent->event.cookie = cookie; + + INIT_LIST_HEAD(&kevent->list); + + if (name) { + size_t len, rem, event_size = sizeof(struct inotify_event); + + /* + * We need to pad the filename so as to properly align an + * array of inotify_event structures. Because the structure is + * small and the common case is a small filename, we just round + * up to the next multiple of the structure's sizeof. This is + * simple and safe for all architectures. + */ + len = strlen(name) + 1; + rem = event_size - len; + if (len > event_size) { + rem = event_size - (len % event_size); + if (len % event_size == 0) + rem = 0; + } + + kevent->name = kmalloc(len + rem, GFP_KERNEL); + if (unlikely(!kevent->name)) { + kmem_cache_free(event_cachep, kevent); + return NULL; + } + memcpy(kevent->name, name, len); + if (rem) + memset(kevent->name + len, 0, rem); + kevent->event.len = len + rem; + } else { + kevent->event.len = 0; + kevent->name = NULL; + } + + return kevent; +} + +/* + * inotify_dev_get_event - return the next event in the given dev's queue + * + * Caller must hold dev->sem. 
+ */ +static inline struct inotify_kernel_event * +inotify_dev_get_event(struct inotify_device *dev) +{ + return list_entry(dev->events.next, struct inotify_kernel_event, list); +} + +/* + * inotify_dev_queue_event - add a new event to the given device + * + * An event identical to the most recently queued one (same wd, mask, cookie + * and name) is coalesced, i.e. dropped. When the queue already holds + * max_events events, a single IN_Q_OVERFLOW event is queued instead, and + * further events are dropped while event_count exceeds max_events. + * + * Caller must hold dev->sem. Can sleep (calls kernel_event()). + */ +static void inotify_dev_queue_event(struct inotify_device *dev, + struct inotify_watch *watch, u32 mask, + u32 cookie, const char *name) +{ + struct inotify_kernel_event *kevent, *last; + + /* + * coalescing: drop this event if it is a dupe of the previous one. + * Events are appended with list_add_tail(), so the previous event is + * the tail of the queue, not its head. An empty queue has no previous + * event; list_entry() on the bare list head would not yield NULL but + * a bogus pointer, so check for emptiness explicitly. + */ + last = NULL; + if (!list_empty(&dev->events)) + last = list_entry(dev->events.prev, + struct inotify_kernel_event, list); + if (last && last->event.mask == mask && last->event.wd == watch->wd && + last->event.cookie == cookie) { + const char *lastname = last->name; + + if (!name && !lastname) + return; + if (name && lastname && !strcmp(lastname, name)) + return; + } + + /* the queue overflowed and we already sent the Q_OVERFLOW event */ + if (unlikely(dev->event_count > dev->max_events)) + return; + + /* if the queue overflows, we need to notify user space */ + if (unlikely(dev->event_count == dev->max_events)) + kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); + else + kevent = kernel_event(watch->wd, mask, cookie, name); + + /* allocation failed: the event is silently dropped */ + if (unlikely(!kevent)) + return; + + /* queue the event and wake up anyone waiting */ + dev->event_count++; + dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; + list_add_tail(&kevent->list, &dev->events); + wake_up_interruptible(&dev->wq); +} + +/* + * remove_kevent - cleans up and ultimately frees the given kevent + * + * Caller must hold dev->sem. 
+ */ +static void remove_kevent(struct inotify_device *dev, + struct inotify_kernel_event *kevent) +{ + list_del(&kevent->list); + + dev->event_count--; + dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; + + kfree(kevent->name); + kmem_cache_free(event_cachep, kevent); +} + +/* + * inotify_dev_event_dequeue - destroy an event on the given device + * + * Caller must hold dev->sem. + */ +static void inotify_dev_event_dequeue(struct inotify_device *dev) +{ + if (!list_empty(&dev->events)) { + struct inotify_kernel_event *kevent; + kevent = inotify_dev_get_event(dev); + remove_kevent(dev, kevent); + } +} + +/* + * inotify_dev_get_wd - returns the next WD for use by the given dev + * + * Callers must hold dev->sem. This function can sleep. + */ +static int inotify_dev_get_wd(struct inotify_device *dev, + struct inotify_watch *watch) +{ + int ret; + + do { + if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL))) + return -ENOSPC; + ret = idr_get_new_above(&dev->idr, watch, dev->last_wd+1, &watch->wd); + } while (ret == -EAGAIN); + + return ret; +} + +/* + * find_inode - resolve a user-given path to a specific inode and return a nd + */ +static int find_inode(const char __user *dirname, struct nameidata *nd) +{ + int error; + + error = __user_walk(dirname, LOOKUP_FOLLOW, nd); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = permission(nd->dentry->d_inode, MAY_READ, NULL); + if (error) + path_release(nd); + return error; +} + +/* + * create_watch - creates a watch on the given device. + * + * Callers must hold dev->sem. Calls inotify_dev_get_wd() so may sleep. + * Both 'dev' and 'inode' (by way of nameidata) need to be pinned. 
+ */ +static struct inotify_watch *create_watch(struct inotify_device *dev, + u32 mask, struct inode *inode) +{ + struct inotify_watch *watch; + int ret; + + if (atomic_read(&dev->user->inotify_watches) >= + inotify_max_user_watches) + return ERR_PTR(-ENOSPC); + + watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + ret = inotify_dev_get_wd(dev, watch); + if (unlikely(ret)) { + kmem_cache_free(watch_cachep, watch); + return ERR_PTR(ret); + } + + dev->last_wd = watch->wd; + watch->mask = mask; + atomic_set(&watch->count, 0); + INIT_LIST_HEAD(&watch->d_list); + INIT_LIST_HEAD(&watch->i_list); + + /* save a reference to device and bump the count to make it official */ + get_inotify_dev(dev); + watch->dev = dev; + + /* + * Save a reference to the inode and bump the ref count to make it + * official. We hold a reference to nameidata, which makes this safe. + */ + watch->inode = igrab(inode); + + /* bump our own count, corresponding to our entry in dev->watches */ + get_inotify_watch(watch); + + atomic_inc(&dev->user->inotify_watches); + + return watch; +} + +/* + * inode_find_dev - find the watch associated with the given inode and dev + * + * Callers must hold inode->inotify_sem. + */ +static struct inotify_watch *inode_find_dev(struct inode *inode, + struct inotify_device *dev) +{ + struct inotify_watch *watch; + + list_for_each_entry(watch, &inode->inotify_watches, i_list) { + if (watch->dev == dev) + return watch; + } + + return NULL; +} + +/* + * remove_watch_no_event - remove_watch() without the IN_IGNORED event. + */ +static void remove_watch_no_event(struct inotify_watch *watch, + struct inotify_device *dev) +{ + list_del(&watch->i_list); + list_del(&watch->d_list); + + atomic_dec(&dev->user->inotify_watches); + idr_remove(&dev->idr, watch->wd); + put_inotify_watch(watch); +} + +/* + * remove_watch - Remove a watch from both the device and the inode. 
Sends + * the IN_IGNORED event to the given device signifying that the inode is no + * longer watched. + * + * Callers must hold both inode->inotify_sem and dev->sem. We drop the + * watch's own reference before returning. + * + * If that was the last reference, put_inotify_watch() also drops the watch's + * references on the device and the inode. Nothing is returned. + */ +static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev) +{ + inotify_dev_queue_event(dev, watch, IN_IGNORED, 0, NULL); + remove_watch_no_event(watch, dev); +} + +/* + * inotify_inode_watched - returns nonzero if there are watches on this inode + * and zero otherwise. We call this lockless, we do not care if we race. + */ +static inline int inotify_inode_watched(struct inode *inode) +{ + return !list_empty(&inode->inotify_watches); +} + +/* Kernel API */ + +/** + * inotify_inode_queue_event - queue an event to all watches on this inode + * @inode: inode event is originating from + * @mask: event mask describing this event + * @cookie: cookie for synchronization, or zero + * @name: filename, if any + */ +void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie, + const char *name) +{ + struct inotify_watch *watch, *next; + + if (!inotify_inode_watched(inode)) + return; + + down(&inode->inotify_sem); + list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) { + u32 watch_mask = watch->mask; + if (watch_mask & mask) { + struct inotify_device *dev = watch->dev; + get_inotify_watch(watch); + down(&dev->sem); + inotify_dev_queue_event(dev, watch, mask, cookie, name); + if (watch_mask & IN_ONESHOT) + remove_watch_no_event(watch, dev); + up(&dev->sem); + put_inotify_watch(watch); + } + } + up(&inode->inotify_sem); +} +EXPORT_SYMBOL_GPL(inotify_inode_queue_event); + +/** + * inotify_dentry_parent_queue_event - queue an event to a dentry's parent + * @dentry: the dentry in question, we queue against this dentry's parent + * @mask: event mask describing this 
event + * @cookie: cookie for synchronization, or zero + * @name: filename, if any + */ +void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask, + u32 cookie, const char *name) +{ + struct dentry *parent; + struct inode *inode; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; + inode = parent->d_inode; + + if (inotify_inode_watched(inode)) { + dget(parent); + spin_unlock(&dentry->d_lock); + inotify_inode_queue_event(inode, mask, cookie, name); + dput(parent); + } else + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event); + +/** + * inotify_get_cookie - return a unique cookie for use in synchronizing events. + */ +u32 inotify_get_cookie(void) +{ + return atomic_inc_return(&inotify_cookie); +} +EXPORT_SYMBOL_GPL(inotify_get_cookie); + +/** + * inotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @list: list of inodes being unmounted (sb->s_inodes) + * + * Called with inode_lock held, protecting the unmounting super block's list + * of inodes, and with iprune_sem held, keeping shrink_icache_memory() at bay. + * We temporarily drop inode_lock, however, and CAN block. + */ +void inotify_unmount_inodes(struct list_head *list) +{ + struct inode *inode, *next_i, *need_iput = NULL; + + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inotify_watch *watch, *next_w; + struct inode *need_iput_tmp; + struct list_head *watches; + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) + continue; + + /* + * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or + * I_WILL_FREE which is fine because by that point the inode + * cannot have any associated watches. 
+ */ + if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE)) + continue; + + need_iput_tmp = need_iput; + need_iput = NULL; + /* In case the remove_watch() drops a reference. */ + if (inode != need_iput_tmp) + __iget(inode); + else + need_iput_tmp = NULL; + /* In case the dropping of a reference would nuke next_i. */ + if ((&next_i->i_sb_list != list) && + atomic_read(&next_i->i_count) && + !(next_i->i_state & (I_CLEAR | I_FREEING | + I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + + /* + * We can safely drop inode_lock here because we hold + * references on both inode and next_i. Also no new inodes + * will be added since the umount has begun. Finally, + * iprune_sem keeps shrink_icache_memory() away. + */ + spin_unlock(&inode_lock); + + if (need_iput_tmp) + iput(need_iput_tmp); + + /* for each watch, send IN_UNMOUNT and then remove it */ + down(&inode->inotify_sem); + watches = &inode->inotify_watches; + list_for_each_entry_safe(watch, next_w, watches, i_list) { + struct inotify_device *dev = watch->dev; + down(&dev->sem); + inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL); + remove_watch(watch, dev); + up(&dev->sem); + } + up(&inode->inotify_sem); + iput(inode); + + spin_lock(&inode_lock); + } +} +EXPORT_SYMBOL_GPL(inotify_unmount_inodes); + +/** + * inotify_inode_is_dead - an inode has been deleted, cleanup any watches + * @inode: inode that is about to be removed + */ +void inotify_inode_is_dead(struct inode *inode) +{ + struct inotify_watch *watch, *next; + + down(&inode->inotify_sem); + list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) { + struct inotify_device *dev = watch->dev; + down(&dev->sem); + remove_watch(watch, dev); + up(&dev->sem); + } + up(&inode->inotify_sem); +} +EXPORT_SYMBOL_GPL(inotify_inode_is_dead); + +/* Device Interface */ + +static unsigned int inotify_poll(struct file *file, poll_table *wait) +{ + struct inotify_device *dev = file->private_data; + int ret = 0; + + poll_wait(file, &dev->wq, 
wait); + down(&dev->sem); + if (!list_empty(&dev->events)) + ret = POLLIN | POLLRDNORM; + up(&dev->sem); + + return ret; +} + +static ssize_t inotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + size_t event_size = sizeof (struct inotify_event); + struct inotify_device *dev; + char __user *start; + int ret; + DEFINE_WAIT(wait); + + start = buf; + dev = file->private_data; + + while (1) { + int events; + + prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); + + down(&dev->sem); + events = !list_empty(&dev->events); + up(&dev->sem); + if (events) { + ret = 0; + break; + } + + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + schedule(); + } + + finish_wait(&dev->wq, &wait); + if (ret) + return ret; + + down(&dev->sem); + while (1) { + struct inotify_kernel_event *kevent; + + ret = buf - start; + if (list_empty(&dev->events)) + break; + + kevent = inotify_dev_get_event(dev); + if (event_size + kevent->event.len > count) + break; + + if (copy_to_user(buf, &kevent->event, event_size)) { + ret = -EFAULT; + break; + } + buf += event_size; + count -= event_size; + + if (kevent->name) { + if (copy_to_user(buf, kevent->name, kevent->event.len)){ + ret = -EFAULT; + break; + } + buf += kevent->event.len; + count -= kevent->event.len; + } + + remove_kevent(dev, kevent); + } + up(&dev->sem); + + return ret; +} + +static int inotify_release(struct inode *ignored, struct file *file) +{ + struct inotify_device *dev = file->private_data; + + /* + * Destroy all of the watches on this device. Unfortunately, not very + * pretty. We cannot do a simple iteration over the list, because we + * do not know the inode until we iterate to the watch. But we need to + * hold inode->inotify_sem before dev->sem. The following works. 
+ */ + while (1) { + struct inotify_watch *watch; + struct list_head *watches; + struct inode *inode; + + down(&dev->sem); + watches = &dev->watches; + if (list_empty(watches)) { + up(&dev->sem); + break; + } + watch = list_entry(watches->next, struct inotify_watch, d_list); + get_inotify_watch(watch); + up(&dev->sem); + + inode = watch->inode; + down(&inode->inotify_sem); + down(&dev->sem); + remove_watch_no_event(watch, dev); + up(&dev->sem); + up(&inode->inotify_sem); + put_inotify_watch(watch); + } + + /* destroy all of the events on this device */ + down(&dev->sem); + while (!list_empty(&dev->events)) + inotify_dev_event_dequeue(dev); + up(&dev->sem); + + /* free this device: the put matching the get in inotify_init() */ + put_inotify_dev(dev); + + return 0; +} + +/* + * inotify_ignore - remove a given wd from this inotify instance. + * + * Returns zero on success, or -EINVAL if wd does not name a watch on dev. + * + * Can sleep. + */ +static int inotify_ignore(struct inotify_device *dev, s32 wd) +{ + struct inotify_watch *watch; + struct inode *inode; + + down(&dev->sem); + watch = idr_find(&dev->idr, wd); + if (unlikely(!watch)) { + up(&dev->sem); + return -EINVAL; + } + /* pin the watch so it survives while dev->sem is dropped below */ + get_inotify_watch(watch); + inode = watch->inode; + up(&dev->sem); + + /* retake the locks in order: inode->inotify_sem before dev->sem */ + down(&inode->inotify_sem); + down(&dev->sem); + + /* + * Make sure that we did not race: dev->sem was dropped above, so the + * watch may have been removed in the meantime. Compare the lookup + * against our pinned watch instead of overwriting it, so that we only + * ever remove the watch whose inode semaphore we hold, and the put + * below always drops the reference we took (never a NULL pointer). + */ + if (likely(idr_find(&dev->idr, wd) == watch)) + remove_watch(watch, dev); + + up(&dev->sem); + up(&inode->inotify_sem); + put_inotify_watch(watch); + + return 0; +} + +static long inotify_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inotify_device *dev; + void __user *p; + int ret = -ENOTTY; + + dev = file->private_data; + p = (void __user *) arg; + + switch (cmd) { + case FIONREAD: + ret = put_user(dev->queue_size, (int __user *) p); + break; + } + + return ret; +} + +static struct file_operations inotify_fops = { + .poll = inotify_poll, + .read = inotify_read, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, + .compat_ioctl = inotify_ioctl, +}; + 
+asmlinkage long sys_inotify_init(void) +{ + struct inotify_device *dev; + struct user_struct *user; + struct file *filp; + int fd, ret; + + fd = get_unused_fd(); + if (fd < 0) + return fd; + + filp = get_empty_filp(); + if (!filp) { + ret = -ENFILE; + goto out_put_fd; + } + + user = get_uid(current->user); + if (unlikely(atomic_read(&user->inotify_devs) >= + inotify_max_user_instances)) { + ret = -EMFILE; + goto out_free_uid; + } + + dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); + if (unlikely(!dev)) { + ret = -ENOMEM; + goto out_free_uid; + } + + filp->f_op = &inotify_fops; + filp->f_vfsmnt = mntget(inotify_mnt); + filp->f_dentry = dget(inotify_mnt->mnt_root); + filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + filp->f_mode = FMODE_READ; + filp->f_flags = O_RDONLY; + filp->private_data = dev; + + idr_init(&dev->idr); + INIT_LIST_HEAD(&dev->events); + INIT_LIST_HEAD(&dev->watches); + init_waitqueue_head(&dev->wq); + sema_init(&dev->sem, 1); + dev->event_count = 0; + dev->queue_size = 0; + dev->max_events = inotify_max_queued_events; + dev->user = user; + dev->last_wd = 0; + atomic_set(&dev->count, 0); + + get_inotify_dev(dev); + atomic_inc(&user->inotify_devs); + fd_install(fd, filp); + + return fd; +out_free_uid: + free_uid(user); + put_filp(filp); +out_put_fd: + put_unused_fd(fd); + return ret; +} + +asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) +{ + struct inotify_watch *watch, *old; + struct inode *inode; + struct inotify_device *dev; + struct nameidata nd; + struct file *filp; + int ret, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + if (unlikely(filp->f_op != &inotify_fops)) { + ret = -EINVAL; + goto fput_and_out; + } + + ret = find_inode(path, &nd); + if (unlikely(ret)) + goto fput_and_out; + + /* inode held in place by reference to nd; dev by fget on fd */ + inode = nd.dentry->d_inode; + dev = 
filp->private_data; + + down(&inode->inotify_sem); + down(&dev->sem); + + /* don't let user-space set invalid bits: we don't want flags set */ + mask &= IN_ALL_EVENTS; + if (unlikely(!mask)) { + ret = -EINVAL; + goto out; + } + + /* + * Handle the case of re-adding a watch on an (inode,dev) pair that we + * are already watching. We just update the mask and return its wd. + */ + old = inode_find_dev(inode, dev); + if (unlikely(old)) { + old->mask = mask; + ret = old->wd; + goto out; + } + + watch = create_watch(dev, mask, inode); + if (unlikely(IS_ERR(watch))) { + ret = PTR_ERR(watch); + goto out; + } + + /* Add the watch to the device's and the inode's list */ + list_add(&watch->d_list, &dev->watches); + list_add(&watch->i_list, &inode->inotify_watches); + ret = watch->wd; +out: + up(&dev->sem); + up(&inode->inotify_sem); + path_release(&nd); +fput_and_out: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) +{ + struct file *filp; + struct inotify_device *dev; + int ret, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + if (unlikely(filp->f_op != &inotify_fops)) { + ret = -EINVAL; + goto out; + } + + dev = filp->private_data; + ret = inotify_ignore(dev, wd); + +out: + fput_light(filp, fput_needed); + return ret; +} + +static struct super_block * +inotify_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA); +} + +static struct file_system_type inotify_fs_type = { + .name = "inotifyfs", + .get_sb = inotify_get_sb, + .kill_sb = kill_anon_super, +}; + +/* + * inotify_setup - Our initialization function. Note that we cannnot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). 
+ */ +static int __init inotify_setup(void) +{ + int ret; + + ret = register_filesystem(&inotify_fs_type); + if (unlikely(ret)) + panic("inotify: register_filesystem returned %d!\n", ret); + + inotify_mnt = kern_mount(&inotify_fs_type); + if (IS_ERR(inotify_mnt)) + panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); + + inotify_max_queued_events = 16384; + inotify_max_user_instances = 128; + inotify_max_user_watches = 8192; + + atomic_set(&inotify_cookie, 0); + + watch_cachep = kmem_cache_create("inotify_watch_cache", + sizeof(struct inotify_watch), + 0, SLAB_PANIC, NULL, NULL); + event_cachep = kmem_cache_create("inotify_event_cache", + sizeof(struct inotify_kernel_event), + 0, SLAB_PANIC, NULL, NULL); + + return 0; +} + +module_init(inotify_setup); diff --git a/fs/ioprio.c b/fs/ioprio.c new file mode 100644 index 00000000000..d1c1f2b2c9d --- /dev/null +++ b/fs/ioprio.c @@ -0,0 +1,174 @@ +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe <axboe@suse.de> + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimmick getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest. 
+ * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.txt + * + */ +#include <linux/kernel.h> +#include <linux/ioprio.h> +#include <linux/blkdev.h> + +static int set_task_ioprio(struct task_struct *task, int ioprio) +{ + struct io_context *ioc; + + if (task->uid != current->euid && + task->uid != current->uid && !capable(CAP_SYS_NICE)) + return -EPERM; + + task_lock(task); + + task->ioprio = ioprio; + + ioc = task->io_context; + if (ioc && ioc->set_ioprio) + ioc->set_ioprio(ioc, ioprio); + + task_unlock(task); + return 0; +} + +asmlinkage long sys_ioprio_set(int which, int who, int ioprio) +{ + int class = IOPRIO_PRIO_CLASS(ioprio); + int data = IOPRIO_PRIO_DATA(ioprio); + struct task_struct *p, *g; + struct user_struct *user; + int ret; + + switch (class) { + case IOPRIO_CLASS_RT: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* fall through, rt has prio field too */ + case IOPRIO_CLASS_BE: + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + break; + case IOPRIO_CLASS_IDLE: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + break; + default: + return -EINVAL; + } + + ret = -ESRCH; + read_lock_irq(&tasklist_lock); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_pid(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); + do_each_task_pid(who, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_task_pid(who, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current->user; + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (p->uid != who) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_thread(g, p); + + if (who) + free_uid(user); + 
break; + default: + ret = -EINVAL; + } + + read_unlock_irq(&tasklist_lock); + return ret; +} + +asmlinkage long sys_ioprio_get(int which, int who) +{ + struct task_struct *g, *p; + struct user_struct *user; + int ret = -ESRCH; + + read_lock_irq(&tasklist_lock); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_pid(who); + if (p) + ret = p->ioprio; + break; + case IOPRIO_WHO_PGRP: + if (!who) + who = process_group(current); + do_each_task_pid(who, PIDTYPE_PGID, p) { + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); + } while_each_task_pid(who, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + if (!who) + user = current->user; + else + user = find_user(who); + + if (!user) + break; + + do_each_thread(g, p) { + if (p->uid != user->uid) + continue; + if (ret == -ESRCH) + ret = p->ioprio; + else + ret = ioprio_best(ret, p->ioprio); + } while_each_thread(g, p); + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + read_unlock_irq(&tasklist_lock); + return ret; +} + diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 34a44e45168..4917315db73 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -129,8 +129,14 @@ static int zisofs_readpage(struct file *file, struct page *page) cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask))); brelse(bh); + if (cstart > cend) + goto eio; + csize = cend-cstart; + if (csize > deflateBound(1UL << zisofs_block_shift)) + goto eio; + /* Now page[] contains an array of pages, any of which can be NULL, and the locks on which we hold. We should now read the data and release the pages. If the pages are NULL the decompressed data diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 6030956b894..7901ac9f97a 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -193,12 +193,17 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, /* Handle everything else. 
Do name translation if there is no Rock Ridge NM field. */ - if (sbi->s_unhide == 'n') { - /* Do not report hidden or associated files */ - if (de->flags[-sbi->s_high_sierra] & 5) { - filp->f_pos += de_len; - continue; - } + + /* + * Do not report hidden files if so instructed, or associated + * files unless instructed to do so + */ + if ((sbi->s_hide == 'y' && + (de->flags[-sbi->s_high_sierra] & 1)) || + (sbi->s_showassoc =='n' && + (de->flags[-sbi->s_high_sierra] & 4))) { + filp->f_pos += de_len; + continue; } map = 1; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index abd7b12eeca..1652de1b6cb 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -28,11 +28,6 @@ #define BEQUIET -#ifdef LEAK_CHECK -static int check_malloc; -static int check_bread; -#endif - static int isofs_hashi(struct dentry *parent, struct qstr *qstr); static int isofs_hash(struct dentry *parent, struct qstr *qstr); static int isofs_dentry_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); @@ -55,11 +50,6 @@ static void isofs_put_super(struct super_block *sb) } #endif -#ifdef LEAK_CHECK - printk("Outstanding mallocs:%d, outstanding buffers: %d\n", - check_malloc, check_bread); -#endif - kfree(sbi); sb->s_fs_info = NULL; return; @@ -73,7 +63,7 @@ static kmem_cache_t *isofs_inode_cachep; static struct inode *isofs_alloc_inode(struct super_block *sb) { struct iso_inode_info *ei; - ei = (struct iso_inode_info *)kmem_cache_alloc(isofs_inode_cachep, SLAB_KERNEL); + ei = kmem_cache_alloc(isofs_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; @@ -84,9 +74,9 @@ static void isofs_destroy_inode(struct inode *inode) kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } -static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) { - struct iso_inode_info *ei = (struct iso_inode_info *) foo; + struct iso_inode_info *ei = foo; if ((flags & 
(SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) @@ -107,7 +97,8 @@ static int init_inodecache(void) static void destroy_inodecache(void) { if (kmem_cache_destroy(isofs_inode_cachep)) - printk(KERN_INFO "iso_inode_cache: not all structures were freed\n"); + printk(KERN_INFO "iso_inode_cache: not all structures were " + "freed\n"); } static int isofs_remount(struct super_block *sb, int *flags, char *data) @@ -144,7 +135,7 @@ static struct dentry_operations isofs_dentry_ops[] = { { .d_hash = isofs_hashi_ms, .d_compare = isofs_dentry_cmpi_ms, - } + }, #endif }; @@ -153,7 +144,8 @@ struct iso9660_options{ char rock; char joliet; char cruft; - char unhide; + char hide; + char showassoc; char nocompress; unsigned char check; unsigned int blocksize; @@ -219,8 +211,8 @@ isofs_hashi_common(struct dentry *dentry, struct qstr *qstr, int ms) /* * Case insensitive compare of two isofs names. */ -static int -isofs_dentry_cmpi_common(struct dentry *dentry,struct qstr *a,struct qstr *b,int ms) +static int isofs_dentry_cmpi_common(struct dentry *dentry, struct qstr *a, + struct qstr *b, int ms) { int alen, blen; @@ -243,8 +235,8 @@ isofs_dentry_cmpi_common(struct dentry *dentry,struct qstr *a,struct qstr *b,int /* * Case sensitive compare of two isofs names. 
*/ -static int -isofs_dentry_cmp_common(struct dentry *dentry,struct qstr *a,struct qstr *b,int ms) +static int isofs_dentry_cmp_common(struct dentry *dentry, struct qstr *a, + struct qstr *b, int ms) { int alen, blen; @@ -318,13 +310,15 @@ enum { Opt_block, Opt_check_r, Opt_check_s, Opt_cruft, Opt_gid, Opt_ignore, Opt_iocharset, Opt_map_a, Opt_map_n, Opt_map_o, Opt_mode, Opt_nojoliet, Opt_norock, Opt_sb, Opt_session, Opt_uid, Opt_unhide, Opt_utf8, Opt_err, - Opt_nocompress, + Opt_nocompress, Opt_hide, Opt_showassoc, }; static match_table_t tokens = { {Opt_norock, "norock"}, {Opt_nojoliet, "nojoliet"}, {Opt_unhide, "unhide"}, + {Opt_hide, "hide"}, + {Opt_showassoc, "showassoc"}, {Opt_cruft, "cruft"}, {Opt_utf8, "utf8"}, {Opt_iocharset, "iocharset=%s"}, @@ -356,7 +350,7 @@ static match_table_t tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, struct iso9660_options * popt) +static int parse_options(char *options, struct iso9660_options *popt) { char *p; int option; @@ -365,7 +359,8 @@ static int parse_options(char *options, struct iso9660_options * popt) popt->rock = 'y'; popt->joliet = 'y'; popt->cruft = 'n'; - popt->unhide = 'n'; + popt->hide = 'n'; + popt->showassoc = 'n'; popt->check = 'u'; /* unset */ popt->nocompress = 0; popt->blocksize = 1024; @@ -398,8 +393,12 @@ static int parse_options(char *options, struct iso9660_options * popt) case Opt_nojoliet: popt->joliet = 'n'; break; + case Opt_hide: + popt->hide = 'y'; + break; case Opt_unhide: - popt->unhide = 'y'; + case Opt_showassoc: + popt->showassoc = 'y'; break; case Opt_cruft: popt->cruft = 'y'; @@ -493,7 +492,7 @@ static int parse_options(char *options, struct iso9660_options * popt) */ #define WE_OBEY_THE_WRITTEN_STANDARDS 1 -static unsigned int isofs_get_last_session(struct super_block *sb,s32 session ) +static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) { struct cdrom_multisession ms_info; unsigned int vol_desc_start; @@ -518,7 +517,8 @@ static 
unsigned int isofs_get_last_session(struct super_block *sb,s32 session ) printk(KERN_ERR "Invalid session number or type of track\n"); } i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); - if(session > 0) printk(KERN_ERR "Invalid session number\n"); + if (session > 0) + printk(KERN_ERR "Invalid session number\n"); #if 0 printk("isofs.inode: CDROMMULTISESSION: rc=%d\n",i); if (i==0) { @@ -557,13 +557,13 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) struct iso9660_options opt; struct isofs_sb_info * sbi; - sbi = kmalloc(sizeof(struct isofs_sb_info), GFP_KERNEL); + sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; s->s_fs_info = sbi; - memset(sbi, 0, sizeof(struct isofs_sb_info)); + memset(sbi, 0, sizeof(*sbi)); - if (!parse_options((char *) data, &opt)) + if (!parse_options((char *)data, &opt)) goto out_freesbi; /* @@ -792,7 +792,8 @@ root_found: sbi->s_rock = (opt.rock == 'y' ? 2 : 0); sbi->s_rock_offset = -1; /* initial offset, will guess until SP is found*/ sbi->s_cruft = opt.cruft; - sbi->s_unhide = opt.unhide; + sbi->s_hide = opt.hide; + sbi->s_showassoc = opt.showassoc; sbi->s_uid = opt.uid; sbi->s_gid = opt.gid; sbi->s_utf8 = opt.utf8; @@ -1002,7 +1003,6 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock_s, rv++; } - abort: unlock_kernel(); return rv; @@ -1014,7 +1014,7 @@ abort: static int isofs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - if ( create ) { + if (create) { printk("isofs_get_block: Kernel tries to allocate a block\n"); return -EROFS; } @@ -1061,19 +1061,17 @@ static struct address_space_operations isofs_aops = { static inline void test_and_set_uid(uid_t *p, uid_t value) { - if(value) { + if (value) *p = value; - } } static inline void test_and_set_gid(gid_t *p, gid_t value) { - if(value) { + if (value) *p = value; - } } -static int isofs_read_level3_size(struct inode * inode) +static int isofs_read_level3_size(struct 
inode *inode) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); int high_sierra = ISOFS_SB(inode->i_sb)->s_high_sierra; @@ -1136,7 +1134,7 @@ static int isofs_read_level3_size(struct inode * inode) bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; - memcpy((void *) tmpde + slop, bh->b_data, offset); + memcpy((void *)tmpde+slop, bh->b_data, offset); } de = tmpde; } @@ -1150,12 +1148,11 @@ static int isofs_read_level3_size(struct inode * inode) more_entries = de->flags[-high_sierra] & 0x80; i++; - if(i > 100) + if (i > 100) goto out_toomany; - } while(more_entries); + } while (more_entries); out: - if (tmpde) - kfree(tmpde); + kfree(tmpde); if (bh) brelse(bh); return 0; @@ -1179,7 +1176,7 @@ out_toomany: goto out; } -static void isofs_read_inode(struct inode * inode) +static void isofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; struct isofs_sb_info *sbi = ISOFS_SB(sb); @@ -1249,7 +1246,7 @@ static void isofs_read_inode(struct inode * inode) ei->i_format_parm[2] = 0; ei->i_section_size = isonum_733 (de->size); - if(de->flags[-high_sierra] & 0x80) { + if (de->flags[-high_sierra] & 0x80) { if(isofs_read_level3_size(inode)) goto fail; } else { ei->i_next_section_block = 0; @@ -1336,16 +1333,16 @@ static void isofs_read_inode(struct inode * inode) /* XXX - parse_rock_ridge_inode() had already set i_rdev. 
*/ init_special_inode(inode, inode->i_mode, inode->i_rdev); - out: +out: if (tmpde) kfree(tmpde); if (bh) brelse(bh); return; - out_badread: +out_badread: printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); - fail: +fail: make_bad_inode(inode); goto out; } @@ -1394,11 +1391,8 @@ struct inode *isofs_iget(struct super_block *sb, hashval = (block << sb->s_blocksize_bits) | offset; - inode = iget5_locked(sb, - hashval, - &isofs_iget5_test, - &isofs_iget5_set, - &data); + inode = iget5_locked(sb, hashval, &isofs_iget5_test, + &isofs_iget5_set, &data); if (inode && (inode->i_state & I_NEW)) { sb->s_op->read_inode(inode); @@ -1408,36 +1402,6 @@ struct inode *isofs_iget(struct super_block *sb, return inode; } -#ifdef LEAK_CHECK -#undef malloc -#undef free_s -#undef sb_bread -#undef brelse - -void * leak_check_malloc(unsigned int size){ - void * tmp; - check_malloc++; - tmp = kmalloc(size, GFP_KERNEL); - return tmp; -} - -void leak_check_free_s(void * obj, int size){ - check_malloc--; - return kfree(obj); -} - -struct buffer_head * leak_check_bread(struct super_block *sb, int block){ - check_bread++; - return sb_bread(sb, block); -} - -void leak_check_brelse(struct buffer_head * bh){ - check_bread--; - return brelse(bh); -} - -#endif - static struct super_block *isofs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 9ce7b51fb61..38c75151fc6 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -47,6 +47,8 @@ struct isofs_sb_info { unsigned char s_nosuid; unsigned char s_nodev; unsigned char s_nocompress; + unsigned char s_hide; + unsigned char s_showassoc; mode_t s_mode; gid_t s_gid; diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 690edf37173..e37e82b7cbf 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -131,14 +131,16 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, } /* - * Skip hidden or associated files unless unhide is set + * Skip hidden 
or associated files unless hide or showassoc, + * respectively, is set */ match = 0; if (dlen > 0 && - (!(de->flags[-sbi->s_high_sierra] & 5) - || sbi->s_unhide == 'y')) - { - match = (isofs_cmp(dentry,dpnt,dlen) == 0); + (sbi->s_hide =='n' || + (!(de->flags[-sbi->s_high_sierra] & 1))) && + (sbi->s_showassoc =='y' || + (!(de->flags[-sbi->s_high_sierra] & 4)))) { + match = (isofs_cmp(dentry, dpnt, dlen) == 0); } if (match) { isofs_normalize_block_and_offset(de, @@ -146,11 +148,11 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, &offset_saved); *block_rv = block_saved; *offset_rv = offset_saved; - if (bh) brelse(bh); + brelse(bh); return 1; } } - if (bh) brelse(bh); + brelse(bh); return 0; } diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 089e79c6558..4326cb47f8f 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -13,352 +13,542 @@ #include "isofs.h" #include "rock.h" -/* These functions are designed to read the system areas of a directory record +/* + * These functions are designed to read the system areas of a directory record * and extract relevant information. There are different functions provided * depending upon what information we need at the time. One function fills * out an inode structure, a second one extracts a filename, a third one * returns a symbolic link name, and a fourth one returns the extent number - * for the file. */ - -#define SIG(A,B) ((A) | ((B) << 8)) /* isonum_721() */ - - -/* This is a way of ensuring that we have something in the system - use fields that is compatible with Rock Ridge */ -#define CHECK_SP(FAIL) \ - if(rr->u.SP.magic[0] != 0xbe) FAIL; \ - if(rr->u.SP.magic[1] != 0xef) FAIL; \ - ISOFS_SB(inode->i_sb)->s_rock_offset=rr->u.SP.skip; -/* We define a series of macros because each function must do exactly the - same thing in certain places. 
We use the macros to ensure that everything - is done correctly */ - -#define CONTINUE_DECLS \ - int cont_extent = 0, cont_offset = 0, cont_size = 0; \ - void *buffer = NULL - -#define CHECK_CE \ - {cont_extent = isonum_733(rr->u.CE.extent); \ - cont_offset = isonum_733(rr->u.CE.offset); \ - cont_size = isonum_733(rr->u.CE.size);} - -#define SETUP_ROCK_RIDGE(DE,CHR,LEN) \ - {LEN= sizeof(struct iso_directory_record) + DE->name_len[0]; \ - if(LEN & 1) LEN++; \ - CHR = ((unsigned char *) DE) + LEN; \ - LEN = *((unsigned char *) DE) - LEN; \ - if (LEN<0) LEN=0; \ - if (ISOFS_SB(inode->i_sb)->s_rock_offset!=-1) \ - { \ - LEN-=ISOFS_SB(inode->i_sb)->s_rock_offset; \ - CHR+=ISOFS_SB(inode->i_sb)->s_rock_offset; \ - if (LEN<0) LEN=0; \ - } \ -} - -#define MAYBE_CONTINUE(LABEL,DEV) \ - {if (buffer) { kfree(buffer); buffer = NULL; } \ - if (cont_extent){ \ - int block, offset, offset1; \ - struct buffer_head * pbh; \ - buffer = kmalloc(cont_size,GFP_KERNEL); \ - if (!buffer) goto out; \ - block = cont_extent; \ - offset = cont_offset; \ - offset1 = 0; \ - pbh = sb_bread(DEV->i_sb, block); \ - if(pbh){ \ - if (offset > pbh->b_size || offset + cont_size > pbh->b_size){ \ - brelse(pbh); \ - goto out; \ - } \ - memcpy(buffer + offset1, pbh->b_data + offset, cont_size - offset1); \ - brelse(pbh); \ - chr = (unsigned char *) buffer; \ - len = cont_size; \ - cont_extent = 0; \ - cont_size = 0; \ - cont_offset = 0; \ - goto LABEL; \ - } \ - printk("Unable to read rock-ridge attributes\n"); \ - }} - -/* return length of name field; 0: not found, -1: to be ignored */ -int get_rock_ridge_filename(struct iso_directory_record * de, - char * retname, struct inode * inode) + * for the file. 
+ */ + +#define SIG(A,B) ((A) | ((B) << 8)) /* isonum_721() */ + +struct rock_state { + void *buffer; + unsigned char *chr; + int len; + int cont_size; + int cont_extent; + int cont_offset; + struct inode *inode; +}; + +/* + * This is a way of ensuring that we have something in the system + * use fields that is compatible with Rock Ridge. Return zero on success. + */ + +static int check_sp(struct rock_ridge *rr, struct inode *inode) { - int len; - unsigned char * chr; - CONTINUE_DECLS; - int retnamlen = 0, truncate=0; - - if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; - *retname = 0; - - SETUP_ROCK_RIDGE(de, chr, len); - repeat: - { - struct rock_ridge * rr; - int sig; - - while (len > 2){ /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; - if (rr->len < 3) goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; - if (len < 0) goto out; /* corrupted isofs */ - - switch(sig){ - case SIG('R','R'): - if((rr->u.RR.flags[0] & RR_NM) == 0) goto out; - break; - case SIG('S','P'): - CHECK_SP(goto out); - break; - case SIG('C','E'): - CHECK_CE; - break; - case SIG('N','M'): - if (truncate) break; - if (rr->len < 5) break; - /* - * If the flags are 2 or 4, this indicates '.' or '..'. - * We don't want to do anything with this, because it - * screws up the code that calls us. We don't really - * care anyways, since we can just use the non-RR - * name. 
- */ - if (rr->u.NM.flags & 6) { - break; + if (rr->u.SP.magic[0] != 0xbe) + return -1; + if (rr->u.SP.magic[1] != 0xef) + return -1; + ISOFS_SB(inode->i_sb)->s_rock_offset = rr->u.SP.skip; + return 0; +} + +static void setup_rock_ridge(struct iso_directory_record *de, + struct inode *inode, struct rock_state *rs) +{ + rs->len = sizeof(struct iso_directory_record) + de->name_len[0]; + if (rs->len & 1) + (rs->len)++; + rs->chr = (unsigned char *)de + rs->len; + rs->len = *((unsigned char *)de) - rs->len; + if (rs->len < 0) + rs->len = 0; + + if (ISOFS_SB(inode->i_sb)->s_rock_offset != -1) { + rs->len -= ISOFS_SB(inode->i_sb)->s_rock_offset; + rs->chr += ISOFS_SB(inode->i_sb)->s_rock_offset; + if (rs->len < 0) + rs->len = 0; + } +} + +static void init_rock_state(struct rock_state *rs, struct inode *inode) +{ + memset(rs, 0, sizeof(*rs)); + rs->inode = inode; +} + +/* + * Returns 0 if the caller should continue scanning, 1 if the scan must end + * and -ve on error. + */ +static int rock_continue(struct rock_state *rs) +{ + int ret = 1; + int blocksize = 1 << rs->inode->i_blkbits; + const int min_de_size = offsetof(struct rock_ridge, u); + + kfree(rs->buffer); + rs->buffer = NULL; + + if ((unsigned)rs->cont_offset > blocksize - min_de_size || + (unsigned)rs->cont_size > blocksize || + (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) { + printk(KERN_NOTICE "rock: corrupted directory entry. 
" + "extent=%d, offset=%d, size=%d\n", + rs->cont_extent, rs->cont_offset, rs->cont_size); + ret = -EIO; + goto out; } - if (rr->u.NM.flags & ~1) { - printk("Unsupported NM flag settings (%d)\n",rr->u.NM.flags); - break; + if (rs->cont_extent) { + struct buffer_head *bh; + + rs->buffer = kmalloc(rs->cont_size, GFP_KERNEL); + if (!rs->buffer) { + ret = -ENOMEM; + goto out; + } + ret = -EIO; + bh = sb_bread(rs->inode->i_sb, rs->cont_extent); + if (bh) { + memcpy(rs->buffer, bh->b_data + rs->cont_offset, + rs->cont_size); + put_bh(bh); + rs->chr = rs->buffer; + rs->len = rs->cont_size; + rs->cont_extent = 0; + rs->cont_size = 0; + rs->cont_offset = 0; + return 0; + } + printk("Unable to read rock-ridge attributes\n"); + } +out: + kfree(rs->buffer); + rs->buffer = NULL; + return ret; +} + +/* + * We think there's a record of type `sig' at rs->chr. Parse the signature + * and make sure that there's really room for a record of that type. + */ +static int rock_check_overflow(struct rock_state *rs, int sig) +{ + int len; + + switch (sig) { + case SIG('S', 'P'): + len = sizeof(struct SU_SP_s); + break; + case SIG('C', 'E'): + len = sizeof(struct SU_CE_s); + break; + case SIG('E', 'R'): + len = sizeof(struct SU_ER_s); + break; + case SIG('R', 'R'): + len = sizeof(struct RR_RR_s); + break; + case SIG('P', 'X'): + len = sizeof(struct RR_PX_s); + break; + case SIG('P', 'N'): + len = sizeof(struct RR_PN_s); + break; + case SIG('S', 'L'): + len = sizeof(struct RR_SL_s); + break; + case SIG('N', 'M'): + len = sizeof(struct RR_NM_s); + break; + case SIG('C', 'L'): + len = sizeof(struct RR_CL_s); + break; + case SIG('P', 'L'): + len = sizeof(struct RR_PL_s); + break; + case SIG('T', 'F'): + len = sizeof(struct RR_TF_s); + break; + case SIG('Z', 'F'): + len = sizeof(struct RR_ZF_s); + break; + default: + len = 0; + break; } - if((strlen(retname) + rr->len - 5) >= 254) { - truncate = 1; - break; + len += offsetof(struct rock_ridge, u); + if (len > rs->len) { + printk(KERN_NOTICE 
"rock: directory entry would overflow " + "storage\n"); + printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n", + sig, len, rs->len); + return -EIO; + } + return 0; +} + +/* + * return length of name field; 0: not found, -1: to be ignored + */ +int get_rock_ridge_filename(struct iso_directory_record *de, + char *retname, struct inode *inode) +{ + struct rock_state rs; + struct rock_ridge *rr; + int sig; + int retnamlen = 0; + int truncate = 0; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + *retname = 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); +repeat: + + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + if (rs.len < 0) + goto eio; /* corrupted isofs */ + + switch (sig) { + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & RR_NM) == 0) + goto out; + break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('N', 'M'): + if (truncate) + break; + if (rr->len < 5) + break; + /* + * If the flags are 2 or 4, this indicates '.' or '..'. + * We don't want to do anything with this, because it + * screws up the code that calls us. We don't really + * care anyways, since we can just use the non-RR + * name. 
+ */ + if (rr->u.NM.flags & 6) + break; + + if (rr->u.NM.flags & ~1) { + printk("Unsupported NM flag settings (%d)\n", + rr->u.NM.flags); + break; + } + if ((strlen(retname) + rr->len - 5) >= 254) { + truncate = 1; + break; + } + strncat(retname, rr->u.NM.name, rr->len - 5); + retnamlen += rr->len - 5; + break; + case SIG('R', 'E'): + kfree(rs.buffer); + return -1; + default: + break; + } } - strncat(retname, rr->u.NM.name, rr->len - 5); - retnamlen += rr->len - 5; - break; - case SIG('R','E'): - if (buffer) kfree(buffer); - return -1; - default: - break; - } - } - } - MAYBE_CONTINUE(repeat,inode); - if (buffer) kfree(buffer); - return retnamlen; /* If 0, this file did not have a NM field */ - out: - if(buffer) kfree(buffer); - return 0; + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + return retnamlen; /* If 0, this file did not have a NM field */ +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; } static int parse_rock_ridge_inode_internal(struct iso_directory_record *de, struct inode *inode, int regard_xa) { - int len; - unsigned char * chr; - int symlink_len = 0; - CONTINUE_DECLS; - - if (!ISOFS_SB(inode->i_sb)->s_rock) return 0; - - SETUP_ROCK_RIDGE(de, chr, len); - if (regard_xa) - { - chr+=14; - len-=14; - if (len<0) len=0; - } - - repeat: - { - int cnt, sig; - struct inode * reloc; - struct rock_ridge * rr; - int rootflag; - - while (len > 2){ /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; - if (rr->len < 3) goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; - if (len < 0) goto out; /* corrupted isofs */ - - switch(sig){ + int symlink_len = 0; + int cnt, sig; + struct inode *reloc; + struct rock_ridge *rr; + int rootflag; + struct rock_state rs; + int ret = 0; + + if (!ISOFS_SB(inode->i_sb)->s_rock) + return 0; + + init_rock_state(&rs, inode); + setup_rock_ridge(de, inode, &rs); + if (regard_xa) { + rs.chr += 14; + 
rs.len -= 14; + if (rs.len < 0) + rs.len = 0; + } + +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; + if (rr->len < 3) + goto out; /* Something got screwed up here */ + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto eio; + rs.chr += rr->len; + rs.len -= rr->len; + if (rs.len < 0) + goto eio; /* corrupted isofs */ + + switch (sig) { #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ - case SIG('R','R'): - if((rr->u.RR.flags[0] & - (RR_PX | RR_TF | RR_SL | RR_CL)) == 0) goto out; - break; + case SIG('R', 'R'): + if ((rr->u.RR.flags[0] & + (RR_PX | RR_TF | RR_SL | RR_CL)) == 0) + goto out; + break; #endif - case SIG('S','P'): - CHECK_SP(goto out); - break; - case SIG('C','E'): - CHECK_CE; - break; - case SIG('E','R'): - ISOFS_SB(inode->i_sb)->s_rock = 1; - printk(KERN_DEBUG "ISO 9660 Extensions: "); - { int p; - for(p=0;p<rr->u.ER.len_id;p++) printk("%c",rr->u.ER.data[p]); - } - printk("\n"); - break; - case SIG('P','X'): - inode->i_mode = isonum_733(rr->u.PX.mode); - inode->i_nlink = isonum_733(rr->u.PX.n_links); - inode->i_uid = isonum_733(rr->u.PX.uid); - inode->i_gid = isonum_733(rr->u.PX.gid); - break; - case SIG('P','N'): - { int high, low; - high = isonum_733(rr->u.PN.dev_high); - low = isonum_733(rr->u.PN.dev_low); - /* - * The Rock Ridge standard specifies that if sizeof(dev_t) <= 4, - * then the high field is unused, and the device number is completely - * stored in the low field. Some writers may ignore this subtlety, - * and as a result we test to see if the entire device number is - * stored in the low field, and use that. - */ - if((low & ~0xff) && high == 0) { - inode->i_rdev = MKDEV(low >> 8, low & 0xff); - } else { - inode->i_rdev = MKDEV(high, low); - } - } - break; - case SIG('T','F'): - /* Some RRIP writers incorrectly place ctime in the TF_CREATE field. - Try to handle this correctly for either case. 
*/ - cnt = 0; /* Rock ridge never appears on a High Sierra disk */ - if(rr->u.TF.flags & TF_CREATE) { - inode->i_ctime.tv_sec = iso_date(rr->u.TF.times[cnt++].time, 0); - inode->i_ctime.tv_nsec = 0; - } - if(rr->u.TF.flags & TF_MODIFY) { - inode->i_mtime.tv_sec = iso_date(rr->u.TF.times[cnt++].time, 0); - inode->i_mtime.tv_nsec = 0; - } - if(rr->u.TF.flags & TF_ACCESS) { - inode->i_atime.tv_sec = iso_date(rr->u.TF.times[cnt++].time, 0); - inode->i_atime.tv_nsec = 0; - } - if(rr->u.TF.flags & TF_ATTRIBUTES) { - inode->i_ctime.tv_sec = iso_date(rr->u.TF.times[cnt++].time, 0); - inode->i_ctime.tv_nsec = 0; - } - break; - case SIG('S','L'): - {int slen; - struct SL_component * slp; - struct SL_component * oldslp; - slen = rr->len - 5; - slp = &rr->u.SL.link; - inode->i_size = symlink_len; - while (slen > 1){ - rootflag = 0; - switch(slp->flags &~1){ - case 0: - inode->i_size += slp->len; - break; - case 2: - inode->i_size += 1; - break; - case 4: - inode->i_size += 2; - break; - case 8: - rootflag = 1; - inode->i_size += 1; - break; - default: - printk("Symlink component flag not implemented\n"); - } - slen -= slp->len + 2; - oldslp = slp; - slp = (struct SL_component *) (((char *) slp) + slp->len + 2); - - if(slen < 2) { - if( ((rr->u.SL.flags & 1) != 0) - && ((oldslp->flags & 1) == 0) ) inode->i_size += 1; - break; - } - - /* - * If this component record isn't continued, then append a '/'. 
- */ - if (!rootflag && (oldslp->flags & 1) == 0) - inode->i_size += 1; - } - } - symlink_len = inode->i_size; - break; - case SIG('R','E'): - printk(KERN_WARNING "Attempt to read inode for relocated directory\n"); - goto out; - case SIG('C','L'): - ISOFS_I(inode)->i_first_extent = isonum_733(rr->u.CL.location); - reloc = isofs_iget(inode->i_sb, ISOFS_I(inode)->i_first_extent, 0); - if (!reloc) - goto out; - inode->i_mode = reloc->i_mode; - inode->i_nlink = reloc->i_nlink; - inode->i_uid = reloc->i_uid; - inode->i_gid = reloc->i_gid; - inode->i_rdev = reloc->i_rdev; - inode->i_size = reloc->i_size; - inode->i_blocks = reloc->i_blocks; - inode->i_atime = reloc->i_atime; - inode->i_ctime = reloc->i_ctime; - inode->i_mtime = reloc->i_mtime; - iput(reloc); - break; + case SIG('S', 'P'): + if (check_sp(rr, inode)) + goto out; + break; + case SIG('C', 'E'): + rs.cont_extent = isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('E', 'R'): + ISOFS_SB(inode->i_sb)->s_rock = 1; + printk(KERN_DEBUG "ISO 9660 Extensions: "); + { + int p; + for (p = 0; p < rr->u.ER.len_id; p++) + printk("%c", rr->u.ER.data[p]); + } + printk("\n"); + break; + case SIG('P', 'X'): + inode->i_mode = isonum_733(rr->u.PX.mode); + inode->i_nlink = isonum_733(rr->u.PX.n_links); + inode->i_uid = isonum_733(rr->u.PX.uid); + inode->i_gid = isonum_733(rr->u.PX.gid); + break; + case SIG('P', 'N'): + { + int high, low; + high = isonum_733(rr->u.PN.dev_high); + low = isonum_733(rr->u.PN.dev_low); + /* + * The Rock Ridge standard specifies that if + * sizeof(dev_t) <= 4, then the high field is + * unused, and the device number is completely + * stored in the low field. Some writers may + * ignore this subtlety, + * and as a result we test to see if the entire + * device number is + * stored in the low field, and use that. 
+ */ + if ((low & ~0xff) && high == 0) { + inode->i_rdev = + MKDEV(low >> 8, low & 0xff); + } else { + inode->i_rdev = + MKDEV(high, low); + } + } + break; + case SIG('T', 'F'): + /* + * Some RRIP writers incorrectly place ctime in the + * TF_CREATE field. Try to handle this correctly for + * either case. + */ + /* Rock ridge never appears on a High Sierra disk */ + cnt = 0; + if (rr->u.TF.flags & TF_CREATE) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_MODIFY) { + inode->i_mtime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_mtime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ACCESS) { + inode->i_atime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_atime.tv_nsec = 0; + } + if (rr->u.TF.flags & TF_ATTRIBUTES) { + inode->i_ctime.tv_sec = + iso_date(rr->u.TF.times[cnt++].time, + 0); + inode->i_ctime.tv_nsec = 0; + } + break; + case SIG('S', 'L'): + { + int slen; + struct SL_component *slp; + struct SL_component *oldslp; + slen = rr->len - 5; + slp = &rr->u.SL.link; + inode->i_size = symlink_len; + while (slen > 1) { + rootflag = 0; + switch (slp->flags & ~1) { + case 0: + inode->i_size += + slp->len; + break; + case 2: + inode->i_size += 1; + break; + case 4: + inode->i_size += 2; + break; + case 8: + rootflag = 1; + inode->i_size += 1; + break; + default: + printk("Symlink component flag " + "not implemented\n"); + } + slen -= slp->len + 2; + oldslp = slp; + slp = (struct SL_component *) + (((char *)slp) + slp->len + 2); + + if (slen < 2) { + if (((rr->u.SL. + flags & 1) != 0) + && + ((oldslp-> + flags & 1) == 0)) + inode->i_size += + 1; + break; + } + + /* + * If this component record isn't + * continued, then append a '/'. 
+ */ + if (!rootflag + && (oldslp->flags & 1) == 0) + inode->i_size += 1; + } + } + symlink_len = inode->i_size; + break; + case SIG('R', 'E'): + printk(KERN_WARNING "Attempt to read inode for " + "relocated directory\n"); + goto out; + case SIG('C', 'L'): + ISOFS_I(inode)->i_first_extent = + isonum_733(rr->u.CL.location); + reloc = + isofs_iget(inode->i_sb, + ISOFS_I(inode)->i_first_extent, + 0); + if (!reloc) + goto out; + inode->i_mode = reloc->i_mode; + inode->i_nlink = reloc->i_nlink; + inode->i_uid = reloc->i_uid; + inode->i_gid = reloc->i_gid; + inode->i_rdev = reloc->i_rdev; + inode->i_size = reloc->i_size; + inode->i_blocks = reloc->i_blocks; + inode->i_atime = reloc->i_atime; + inode->i_ctime = reloc->i_ctime; + inode->i_mtime = reloc->i_mtime; + iput(reloc); + break; #ifdef CONFIG_ZISOFS - case SIG('Z','F'): - if ( !ISOFS_SB(inode->i_sb)->s_nocompress ) { - int algo; - algo = isonum_721(rr->u.ZF.algorithm); - if ( algo == SIG('p','z') ) { - int block_shift = isonum_711(&rr->u.ZF.parms[1]); - if ( block_shift < PAGE_CACHE_SHIFT || block_shift > 17 ) { - printk(KERN_WARNING "isofs: Can't handle ZF block size of 2^%d\n", block_shift); - } else { - /* Note: we don't change i_blocks here */ - ISOFS_I(inode)->i_file_format = isofs_file_compressed; - /* Parameters to compression algorithm (header size, block size) */ - ISOFS_I(inode)->i_format_parm[0] = isonum_711(&rr->u.ZF.parms[0]); - ISOFS_I(inode)->i_format_parm[1] = isonum_711(&rr->u.ZF.parms[1]); - inode->i_size = isonum_733(rr->u.ZF.real_size); - } - } else { - printk(KERN_WARNING "isofs: Unknown ZF compression algorithm: %c%c\n", - rr->u.ZF.algorithm[0], rr->u.ZF.algorithm[1]); - } - } - break; + case SIG('Z', 'F'): { + int algo; + + if (ISOFS_SB(inode->i_sb)->s_nocompress) + break; + algo = isonum_721(rr->u.ZF.algorithm); + if (algo == SIG('p', 'z')) { + int block_shift = + isonum_711(&rr->u.ZF.parms[1]); + if (block_shift < PAGE_CACHE_SHIFT + || block_shift > 17) { + printk(KERN_WARNING "isofs: " + 
"Can't handle ZF block " + "size of 2^%d\n", + block_shift); + } else { + /* + * Note: we don't change + * i_blocks here + */ + ISOFS_I(inode)->i_file_format = + isofs_file_compressed; + /* + * Parameters to compression + * algorithm (header size, + * block size) + */ + ISOFS_I(inode)->i_format_parm[0] = + isonum_711(&rr->u.ZF.parms[0]); + ISOFS_I(inode)->i_format_parm[1] = + isonum_711(&rr->u.ZF.parms[1]); + inode->i_size = + isonum_733(rr->u.ZF. + real_size); + } + } else { + printk(KERN_WARNING + "isofs: Unknown ZF compression " + "algorithm: %c%c\n", + rr->u.ZF.algorithm[0], + rr->u.ZF.algorithm[1]); + } + break; + } #endif - default: - break; - } - } - } - MAYBE_CONTINUE(repeat,inode); - out: - if(buffer) kfree(buffer); - return 0; + default: + break; + } + } + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret == 1) + ret = 0; +out: + kfree(rs.buffer); + return ret; +eio: + ret = -EIO; + goto out; } static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) @@ -376,32 +566,32 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) if (slp->len > plimit - rpnt) return NULL; memcpy(rpnt, slp->text, slp->len); - rpnt+=slp->len; + rpnt += slp->len; break; case 2: if (rpnt >= plimit) return NULL; - *rpnt++='.'; + *rpnt++ = '.'; break; case 4: if (2 > plimit - rpnt) return NULL; - *rpnt++='.'; - *rpnt++='.'; + *rpnt++ = '.'; + *rpnt++ = '.'; break; case 8: if (rpnt >= plimit) return NULL; rootflag = 1; - *rpnt++='/'; + *rpnt++ = '/'; break; default: printk("Symlink component flag not implemented (%d)\n", - slp->flags); + slp->flags); } slen -= slp->len + 2; oldslp = slp; - slp = (struct SL_component *) ((char *) slp + slp->len + 2); + slp = (struct SL_component *)((char *)slp + slp->len + 2); if (slen < 2) { /* @@ -412,7 +602,7 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) !(oldslp->flags & 1)) { if (rpnt >= plimit) return NULL; - *rpnt++='/'; + *rpnt++ = '/'; } 
break; } @@ -423,59 +613,61 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) if (!rootflag && !(oldslp->flags & 1)) { if (rpnt >= plimit) return NULL; - *rpnt++='/'; + *rpnt++ = '/'; } } return rpnt; } -int parse_rock_ridge_inode(struct iso_directory_record * de, - struct inode * inode) +int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) { - int result=parse_rock_ridge_inode_internal(de,inode,0); - /* if rockridge flag was reset and we didn't look for attributes - * behind eventual XA attributes, have a look there */ - if ((ISOFS_SB(inode->i_sb)->s_rock_offset==-1) - &&(ISOFS_SB(inode->i_sb)->s_rock==2)) - { - result=parse_rock_ridge_inode_internal(de,inode,14); - } - return result; -} + int result = parse_rock_ridge_inode_internal(de, inode, 0); -/* readpage() for symlinks: reads symlink contents into the page and either - makes it uptodate and returns 0 or returns error (-EIO) */ + /* + * if rockridge flag was reset and we didn't look for attributes + * behind eventual XA attributes, have a look there + */ + if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) + && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { + result = parse_rock_ridge_inode_internal(de, inode, 14); + } + return result; +} +/* + * readpage() for symlinks: reads symlink contents into the page and either + * makes it uptodate and returns 0 or returns error (-EIO) + */ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; - struct iso_inode_info *ei = ISOFS_I(inode); + struct iso_inode_info *ei = ISOFS_I(inode); char *link = kmap(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); struct buffer_head *bh; char *rpnt = link; unsigned char *pnt; - struct iso_directory_record *raw_inode; - CONTINUE_DECLS; + struct iso_directory_record *raw_de; unsigned long block, offset; int sig; - int len; - unsigned char *chr; struct rock_ridge *rr; + struct rock_state rs; + int ret; if 
(!ISOFS_SB(inode->i_sb)->s_rock) goto error; + init_rock_state(&rs, inode); block = ei->i_iget5_block; lock_kernel(); bh = sb_bread(inode->i_sb, block); if (!bh) goto out_noread; - offset = ei->i_iget5_offset; - pnt = (unsigned char *) bh->b_data + offset; + offset = ei->i_iget5_offset; + pnt = (unsigned char *)bh->b_data + offset; - raw_inode = (struct iso_directory_record *) pnt; + raw_de = (struct iso_directory_record *)pnt; /* * If we go past the end of the buffer, there is some sort of error. @@ -483,20 +675,24 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) if (offset + *pnt > bufsize) goto out_bad_span; - /* Now test for possible Rock Ridge extensions which will override - some of these numbers in the inode structure. */ + /* + * Now test for possible Rock Ridge extensions which will override + * some of these numbers in the inode structure. + */ - SETUP_ROCK_RIDGE(raw_inode, chr, len); + setup_rock_ridge(raw_de, inode, &rs); - repeat: - while (len > 2) { /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; +repeat: + while (rs.len > 2) { /* There may be one byte for padding somewhere */ + rr = (struct rock_ridge *)rs.chr; if (rr->len < 3) goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; - if (len < 0) + sig = isonum_721(rs.chr); + if (rock_check_overflow(&rs, sig)) + goto out; + rs.chr += rr->len; + rs.len -= rr->len; + if (rs.len < 0) goto out; /* corrupted isofs */ switch (sig) { @@ -505,7 +701,8 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) goto out; break; case SIG('S', 'P'): - CHECK_SP(goto out); + if (check_sp(rr, inode)) + goto out; break; case SIG('S', 'L'): rpnt = get_symlink_chunk(rpnt, rr, @@ -515,14 +712,18 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) break; case SIG('C', 'E'): /* This tells is if there is a continuation record */ - CHECK_CE; + rs.cont_extent 
= isonum_733(rr->u.CE.extent); + rs.cont_offset = isonum_733(rr->u.CE.offset); + rs.cont_size = isonum_733(rr->u.CE.size); default: break; } } - MAYBE_CONTINUE(repeat, inode); - if (buffer) - kfree(buffer); + ret = rock_continue(&rs); + if (ret == 0) + goto repeat; + if (ret < 0) + goto fail; if (rpnt == link) goto fail; @@ -535,19 +736,18 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) return 0; /* error exit from macro */ - out: - if (buffer) - kfree(buffer); +out: + kfree(rs.buffer); goto fail; - out_noread: +out_noread: printk("unable to read i-node block"); goto fail; - out_bad_span: +out_bad_span: printk("symlink spans iso9660 blocks\n"); - fail: +fail: brelse(bh); unlock_kernel(); - error: +error: SetPageError(page); kunmap(page); unlock_page(page); @@ -555,5 +755,5 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) } struct address_space_operations isofs_symlink_aops = { - .readpage = rock_ridge_symlink_readpage + .readpage = rock_ridge_symlink_readpage }; diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index deaf5c8e8b4..ed09e2b0863 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -1,85 +1,88 @@ -/* These structs are used by the system-use-sharing protocol, in which the - Rock Ridge extensions are embedded. It is quite possible that other - extensions are present on the disk, and this is fine as long as they - all use SUSP */ - -struct SU_SP{ - unsigned char magic[2]; - unsigned char skip; -} __attribute__((packed)); - -struct SU_CE{ - char extent[8]; - char offset[8]; - char size[8]; +/* + * These structs are used by the system-use-sharing protocol, in which the + * Rock Ridge extensions are embedded. 
It is quite possible that other + * extensions are present on the disk, and this is fine as long as they + * all use SUSP + */ + +struct SU_SP_s { + unsigned char magic[2]; + unsigned char skip; +} __attribute__ ((packed)); + +struct SU_CE_s { + char extent[8]; + char offset[8]; + char size[8]; }; -struct SU_ER{ - unsigned char len_id; - unsigned char len_des; - unsigned char len_src; - unsigned char ext_ver; - char data[0]; -} __attribute__((packed)); - -struct RR_RR{ - char flags[1]; -} __attribute__((packed)); - -struct RR_PX{ - char mode[8]; - char n_links[8]; - char uid[8]; - char gid[8]; +struct SU_ER_s { + unsigned char len_id; + unsigned char len_des; + unsigned char len_src; + unsigned char ext_ver; + char data[0]; +} __attribute__ ((packed)); + +struct RR_RR_s { + char flags[1]; +} __attribute__ ((packed)); + +struct RR_PX_s { + char mode[8]; + char n_links[8]; + char uid[8]; + char gid[8]; }; -struct RR_PN{ - char dev_high[8]; - char dev_low[8]; +struct RR_PN_s { + char dev_high[8]; + char dev_low[8]; }; +struct SL_component { + unsigned char flags; + unsigned char len; + char text[0]; +} __attribute__ ((packed)); -struct SL_component{ - unsigned char flags; - unsigned char len; - char text[0]; -} __attribute__((packed)); +struct RR_SL_s { + unsigned char flags; + struct SL_component link; +} __attribute__ ((packed)); -struct RR_SL{ - unsigned char flags; - struct SL_component link; -} __attribute__((packed)); +struct RR_NM_s { + unsigned char flags; + char name[0]; +} __attribute__ ((packed)); -struct RR_NM{ - unsigned char flags; - char name[0]; -} __attribute__((packed)); - -struct RR_CL{ - char location[8]; +struct RR_CL_s { + char location[8]; }; -struct RR_PL{ - char location[8]; +struct RR_PL_s { + char location[8]; }; -struct stamp{ - char time[7]; -} __attribute__((packed)); +struct stamp { + char time[7]; +} __attribute__ ((packed)); -struct RR_TF{ - char flags; - struct stamp times[0]; /* Variable number of these beasts */ -} 
__attribute__((packed)); +struct RR_TF_s { + char flags; + struct stamp times[0]; /* Variable number of these beasts */ +} __attribute__ ((packed)); /* Linux-specific extension for transparent decompression */ -struct RR_ZF{ - char algorithm[2]; - char parms[2]; - char real_size[8]; +struct RR_ZF_s { + char algorithm[2]; + char parms[2]; + char real_size[8]; }; -/* These are the bits and their meanings for flags in the TF structure. */ +/* + * These are the bits and their meanings for flags in the TF structure. + */ #define TF_CREATE 1 #define TF_MODIFY 2 #define TF_ACCESS 4 @@ -89,31 +92,31 @@ struct RR_ZF{ #define TF_EFFECTIVE 64 #define TF_LONG_FORM 128 -struct rock_ridge{ - char signature[2]; - unsigned char len; - unsigned char version; - union{ - struct SU_SP SP; - struct SU_CE CE; - struct SU_ER ER; - struct RR_RR RR; - struct RR_PX PX; - struct RR_PN PN; - struct RR_SL SL; - struct RR_NM NM; - struct RR_CL CL; - struct RR_PL PL; - struct RR_TF TF; - struct RR_ZF ZF; - } u; +struct rock_ridge { + char signature[2]; + unsigned char len; + unsigned char version; + union { + struct SU_SP_s SP; + struct SU_CE_s CE; + struct SU_ER_s ER; + struct RR_RR_s RR; + struct RR_PX_s PX; + struct RR_PN_s PN; + struct RR_SL_s SL; + struct RR_NM_s NM; + struct RR_CL_s CL; + struct RR_PL_s PL; + struct RR_TF_s TF; + struct RR_ZF_s ZF; + } u; }; -#define RR_PX 1 /* POSIX attributes */ -#define RR_PN 2 /* POSIX devices */ -#define RR_SL 4 /* Symbolic link */ -#define RR_NM 8 /* Alternate Name */ -#define RR_CL 16 /* Child link */ -#define RR_PL 32 /* Parent link */ -#define RR_RE 64 /* Relocation directory */ -#define RR_TF 128 /* Timestamps */ +#define RR_PX 1 /* POSIX attributes */ +#define RR_PN 2 /* POSIX devices */ +#define RR_SL 4 /* Symbolic link */ +#define RR_NM 8 /* Alternate Name */ +#define RR_CL 16 /* Child link */ +#define RR_PL 32 /* Parent link */ +#define RR_RE 64 /* Relocation directory */ +#define RR_TF 128 /* Timestamps */ diff --git a/fs/jbd/journal.c 
b/fs/jbd/journal.c index 1e6f2e2ad4a..5e7b4394951 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -167,7 +167,7 @@ loop: } wake_up(&journal->j_wait_done_commit); - if (current->flags & PF_FREEZE) { + if (freezing(current)) { /* * The simpler the better. Flushing journal isn't a * good idea, because that depends on threads that may @@ -175,7 +175,7 @@ loop: */ jbd_debug(1, "Now suspending kjournald\n"); spin_unlock(&journal->j_state_lock); - refrigerator(PF_FREEZE); + refrigerator(); spin_lock(&journal->j_state_lock); } else { /* diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index 8cc6893fc56..456d7e6e29c 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -175,8 +175,64 @@ jffs_hexdump(struct mtd_info *mtd, loff_t pos, int size) } } +/* Print the contents of a node. */ +static void +jffs_print_node(struct jffs_node *n) +{ + D(printk("jffs_node: 0x%p\n", n)); + D(printk("{\n")); + D(printk(" 0x%08x, /* version */\n", n->version)); + D(printk(" 0x%08x, /* data_offset */\n", n->data_offset)); + D(printk(" 0x%08x, /* data_size */\n", n->data_size)); + D(printk(" 0x%08x, /* removed_size */\n", n->removed_size)); + D(printk(" 0x%08x, /* fm_offset */\n", n->fm_offset)); + D(printk(" 0x%02x, /* name_size */\n", n->name_size)); + D(printk(" 0x%p, /* fm, fm->offset: %u */\n", + n->fm, (n->fm ? n->fm->offset : 0))); + D(printk(" 0x%p, /* version_prev */\n", n->version_prev)); + D(printk(" 0x%p, /* version_next */\n", n->version_next)); + D(printk(" 0x%p, /* range_prev */\n", n->range_prev)); + D(printk(" 0x%p, /* range_next */\n", n->range_next)); + D(printk("}\n")); +} + #endif +/* Print the contents of a raw inode. 
*/ +static void +jffs_print_raw_inode(struct jffs_raw_inode *raw_inode) +{ + D(printk("jffs_raw_inode: inode number: %u\n", raw_inode->ino)); + D(printk("{\n")); + D(printk(" 0x%08x, /* magic */\n", raw_inode->magic)); + D(printk(" 0x%08x, /* ino */\n", raw_inode->ino)); + D(printk(" 0x%08x, /* pino */\n", raw_inode->pino)); + D(printk(" 0x%08x, /* version */\n", raw_inode->version)); + D(printk(" 0x%08x, /* mode */\n", raw_inode->mode)); + D(printk(" 0x%04x, /* uid */\n", raw_inode->uid)); + D(printk(" 0x%04x, /* gid */\n", raw_inode->gid)); + D(printk(" 0x%08x, /* atime */\n", raw_inode->atime)); + D(printk(" 0x%08x, /* mtime */\n", raw_inode->mtime)); + D(printk(" 0x%08x, /* ctime */\n", raw_inode->ctime)); + D(printk(" 0x%08x, /* offset */\n", raw_inode->offset)); + D(printk(" 0x%08x, /* dsize */\n", raw_inode->dsize)); + D(printk(" 0x%08x, /* rsize */\n", raw_inode->rsize)); + D(printk(" 0x%02x, /* nsize */\n", raw_inode->nsize)); + D(printk(" 0x%02x, /* nlink */\n", raw_inode->nlink)); + D(printk(" 0x%02x, /* spare */\n", + raw_inode->spare)); + D(printk(" %u, /* rename */\n", + raw_inode->rename)); + D(printk(" %u, /* deleted */\n", + raw_inode->deleted)); + D(printk(" 0x%02x, /* accurate */\n", + raw_inode->accurate)); + D(printk(" 0x%08x, /* dchksum */\n", raw_inode->dchksum)); + D(printk(" 0x%04x, /* nchksum */\n", raw_inode->nchksum)); + D(printk(" 0x%04x, /* chksum */\n", raw_inode->chksum)); + D(printk("}\n")); +} + #define flash_safe_acquire(arg) #define flash_safe_release(arg) @@ -2507,64 +2563,6 @@ jffs_update_file(struct jffs_file *f, struct jffs_node *node) return 0; } -/* Print the contents of a node. 
*/ -void -jffs_print_node(struct jffs_node *n) -{ - D(printk("jffs_node: 0x%p\n", n)); - D(printk("{\n")); - D(printk(" 0x%08x, /* version */\n", n->version)); - D(printk(" 0x%08x, /* data_offset */\n", n->data_offset)); - D(printk(" 0x%08x, /* data_size */\n", n->data_size)); - D(printk(" 0x%08x, /* removed_size */\n", n->removed_size)); - D(printk(" 0x%08x, /* fm_offset */\n", n->fm_offset)); - D(printk(" 0x%02x, /* name_size */\n", n->name_size)); - D(printk(" 0x%p, /* fm, fm->offset: %u */\n", - n->fm, (n->fm ? n->fm->offset : 0))); - D(printk(" 0x%p, /* version_prev */\n", n->version_prev)); - D(printk(" 0x%p, /* version_next */\n", n->version_next)); - D(printk(" 0x%p, /* range_prev */\n", n->range_prev)); - D(printk(" 0x%p, /* range_next */\n", n->range_next)); - D(printk("}\n")); -} - - -/* Print the contents of a raw inode. */ -void -jffs_print_raw_inode(struct jffs_raw_inode *raw_inode) -{ - D(printk("jffs_raw_inode: inode number: %u\n", raw_inode->ino)); - D(printk("{\n")); - D(printk(" 0x%08x, /* magic */\n", raw_inode->magic)); - D(printk(" 0x%08x, /* ino */\n", raw_inode->ino)); - D(printk(" 0x%08x, /* pino */\n", raw_inode->pino)); - D(printk(" 0x%08x, /* version */\n", raw_inode->version)); - D(printk(" 0x%08x, /* mode */\n", raw_inode->mode)); - D(printk(" 0x%04x, /* uid */\n", raw_inode->uid)); - D(printk(" 0x%04x, /* gid */\n", raw_inode->gid)); - D(printk(" 0x%08x, /* atime */\n", raw_inode->atime)); - D(printk(" 0x%08x, /* mtime */\n", raw_inode->mtime)); - D(printk(" 0x%08x, /* ctime */\n", raw_inode->ctime)); - D(printk(" 0x%08x, /* offset */\n", raw_inode->offset)); - D(printk(" 0x%08x, /* dsize */\n", raw_inode->dsize)); - D(printk(" 0x%08x, /* rsize */\n", raw_inode->rsize)); - D(printk(" 0x%02x, /* nsize */\n", raw_inode->nsize)); - D(printk(" 0x%02x, /* nlink */\n", raw_inode->nlink)); - D(printk(" 0x%02x, /* spare */\n", - raw_inode->spare)); - D(printk(" %u, /* rename */\n", - raw_inode->rename)); - D(printk(" %u, /* deleted */\n", - 
raw_inode->deleted)); - D(printk(" 0x%02x, /* accurate */\n", - raw_inode->accurate)); - D(printk(" 0x%08x, /* dchksum */\n", raw_inode->dchksum)); - D(printk(" 0x%04x, /* nchksum */\n", raw_inode->nchksum)); - D(printk(" 0x%04x, /* chksum */\n", raw_inode->chksum)); - D(printk("}\n")); -} - - /* Print the contents of a file. */ #if 0 int @@ -3399,6 +3397,9 @@ jffs_garbage_collect_thread(void *ptr) siginfo_t info; unsigned long signr = 0; + if (try_to_freeze()) + continue; + spin_lock_irq(&current->sighand->siglock); signr = dequeue_signal(current, &current->blocked, &info); spin_unlock_irq(&current->sighand->siglock); diff --git a/fs/jffs/intrep.h b/fs/jffs/intrep.h index 4ae97b17911..5c7abe0e269 100644 --- a/fs/jffs/intrep.h +++ b/fs/jffs/intrep.h @@ -49,8 +49,6 @@ int jffs_garbage_collect_thread(void *c); void jffs_garbage_collect_trigger(struct jffs_control *c); /* For debugging purposes. */ -void jffs_print_node(struct jffs_node *n); -void jffs_print_raw_inode(struct jffs_raw_inode *raw_inode); #if 0 int jffs_print_file(struct jffs_file *f); #endif /* 0 */ diff --git a/fs/jffs/jffs_fm.c b/fs/jffs/jffs_fm.c index 0cab8da49d3..053e3a98a27 100644 --- a/fs/jffs/jffs_fm.c +++ b/fs/jffs/jffs_fm.c @@ -31,6 +31,60 @@ static void jffs_free_fm(struct jffs_fm *n); extern kmem_cache_t *fm_cache; extern kmem_cache_t *node_cache; +#if CONFIG_JFFS_FS_VERBOSE > 0 +void +jffs_print_fmcontrol(struct jffs_fmcontrol *fmc) +{ + D(printk("struct jffs_fmcontrol: 0x%p\n", fmc)); + D(printk("{\n")); + D(printk(" %u, /* flash_size */\n", fmc->flash_size)); + D(printk(" %u, /* used_size */\n", fmc->used_size)); + D(printk(" %u, /* dirty_size */\n", fmc->dirty_size)); + D(printk(" %u, /* free_size */\n", fmc->free_size)); + D(printk(" %u, /* sector_size */\n", fmc->sector_size)); + D(printk(" %u, /* min_free_size */\n", fmc->min_free_size)); + D(printk(" %u, /* max_chunk_size */\n", fmc->max_chunk_size)); + D(printk(" 0x%p, /* mtd */\n", fmc->mtd)); + D(printk(" 0x%p, /* head */ " + "(head->offset = 
0x%08x)\n", + fmc->head, (fmc->head ? fmc->head->offset : 0))); + D(printk(" 0x%p, /* tail */ " + "(tail->offset + tail->size = 0x%08x)\n", + fmc->tail, + (fmc->tail ? fmc->tail->offset + fmc->tail->size : 0))); + D(printk(" 0x%p, /* head_extra */\n", fmc->head_extra)); + D(printk(" 0x%p, /* tail_extra */\n", fmc->tail_extra)); + D(printk("}\n")); +} +#endif /* CONFIG_JFFS_FS_VERBOSE > 0 */ + +#if CONFIG_JFFS_FS_VERBOSE > 2 +static void +jffs_print_fm(struct jffs_fm *fm) +{ + D(printk("struct jffs_fm: 0x%p\n", fm)); + D(printk("{\n")); + D(printk(" 0x%08x, /* offset */\n", fm->offset)); + D(printk(" %u, /* size */\n", fm->size)); + D(printk(" 0x%p, /* prev */\n", fm->prev)); + D(printk(" 0x%p, /* next */\n", fm->next)); + D(printk(" 0x%p, /* nodes */\n", fm->nodes)); + D(printk("}\n")); +} +#endif /* CONFIG_JFFS_FS_VERBOSE > 2 */ + +#if 0 +void +jffs_print_node_ref(struct jffs_node_ref *ref) +{ + D(printk("struct jffs_node_ref: 0x%p\n", ref)); + D(printk("{\n")); + D(printk(" 0x%p, /* node */\n", ref->node)); + D(printk(" 0x%p, /* next */\n", ref->next)); + D(printk("}\n")); +} +#endif /* 0 */ + /* This function creates a new shiny flash memory control structure. 
*/ struct jffs_fmcontrol * jffs_build_begin(struct jffs_control *c, int unit) @@ -742,54 +796,3 @@ int jffs_get_node_inuse(void) { return no_jffs_node; } - -void -jffs_print_fmcontrol(struct jffs_fmcontrol *fmc) -{ - D(printk("struct jffs_fmcontrol: 0x%p\n", fmc)); - D(printk("{\n")); - D(printk(" %u, /* flash_size */\n", fmc->flash_size)); - D(printk(" %u, /* used_size */\n", fmc->used_size)); - D(printk(" %u, /* dirty_size */\n", fmc->dirty_size)); - D(printk(" %u, /* free_size */\n", fmc->free_size)); - D(printk(" %u, /* sector_size */\n", fmc->sector_size)); - D(printk(" %u, /* min_free_size */\n", fmc->min_free_size)); - D(printk(" %u, /* max_chunk_size */\n", fmc->max_chunk_size)); - D(printk(" 0x%p, /* mtd */\n", fmc->mtd)); - D(printk(" 0x%p, /* head */ " - "(head->offset = 0x%08x)\n", - fmc->head, (fmc->head ? fmc->head->offset : 0))); - D(printk(" 0x%p, /* tail */ " - "(tail->offset + tail->size = 0x%08x)\n", - fmc->tail, - (fmc->tail ? fmc->tail->offset + fmc->tail->size : 0))); - D(printk(" 0x%p, /* head_extra */\n", fmc->head_extra)); - D(printk(" 0x%p, /* tail_extra */\n", fmc->tail_extra)); - D(printk("}\n")); -} - -void -jffs_print_fm(struct jffs_fm *fm) -{ - D(printk("struct jffs_fm: 0x%p\n", fm)); - D(printk("{\n")); - D(printk(" 0x%08x, /* offset */\n", fm->offset)); - D(printk(" %u, /* size */\n", fm->size)); - D(printk(" 0x%p, /* prev */\n", fm->prev)); - D(printk(" 0x%p, /* next */\n", fm->next)); - D(printk(" 0x%p, /* nodes */\n", fm->nodes)); - D(printk("}\n")); -} - -#if 0 -void -jffs_print_node_ref(struct jffs_node_ref *ref) -{ - D(printk("struct jffs_node_ref: 0x%p\n", ref)); - D(printk("{\n")); - D(printk(" 0x%p, /* node */\n", ref->node)); - D(printk(" 0x%p, /* next */\n", ref->next)); - D(printk("}\n")); -} -#endif /* 0 */ - diff --git a/fs/jffs/jffs_fm.h b/fs/jffs/jffs_fm.h index bc291c43182..f64151e7412 100644 --- a/fs/jffs/jffs_fm.h +++ b/fs/jffs/jffs_fm.h @@ -139,8 +139,9 @@ int jffs_add_node(struct jffs_node *node); void 
jffs_fmfree_partly(struct jffs_fmcontrol *fmc, struct jffs_fm *fm, __u32 size); +#if CONFIG_JFFS_FS_VERBOSE > 0 void jffs_print_fmcontrol(struct jffs_fmcontrol *fmc); -void jffs_print_fm(struct jffs_fm *fm); +#endif #if 0 void jffs_print_node_ref(struct jffs_node_ref *ref); #endif /* 0 */ diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile index e3c38ccf9c7..f1afe681ecd 100644 --- a/fs/jffs2/Makefile +++ b/fs/jffs2/Makefile @@ -1,7 +1,7 @@ # # Makefile for the Linux Journalling Flash File System v2 (JFFS2) # -# $Id: Makefile.common,v 1.7 2004/11/03 12:57:38 jwboyer Exp $ +# $Id: Makefile.common,v 1.9 2005/02/09 09:23:53 pavlov Exp $ # obj-$(CONFIG_JFFS2_FS) += jffs2.o @@ -11,8 +11,7 @@ jffs2-y += read.o nodemgmt.o readinode.o write.o scan.o gc.o jffs2-y += symlink.o build.o erase.o background.o fs.o writev.o jffs2-y += super.o -jffs2-$(CONFIG_JFFS2_FS_NAND) += wbuf.o -jffs2-$(CONFIG_JFFS2_FS_NOR_ECC) += wbuf.o +jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER) += wbuf.o jffs2-$(CONFIG_JFFS2_RUBIN) += compr_rubin.o jffs2-$(CONFIG_JFFS2_RTIME) += compr_rtime.o jffs2-$(CONFIG_JFFS2_ZLIB) += compr_zlib.o diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking index 49771cf8513..b7943439b6e 100644 --- a/fs/jffs2/README.Locking +++ b/fs/jffs2/README.Locking @@ -1,4 +1,4 @@ - $Id: README.Locking,v 1.9 2004/11/20 10:35:40 dwmw2 Exp $ + $Id: README.Locking,v 1.12 2005/04/13 13:22:35 dwmw2 Exp $ JFFS2 LOCKING DOCUMENTATION --------------------------- @@ -108,6 +108,10 @@ in-core jffs2_inode_cache objects (each inode in JFFS2 has the correspondent jffs2_inode_cache object). So, the inocache_lock has to be locked while walking the c->inocache_list hash buckets. +This spinlock also covers allocation of new inode numbers, which is +currently just '++->highest_ino++', but might one day get more complicated +if we need to deal with wrapping after 4 milliard inode numbers are used. + Note, the f->sem guarantees that the correspondent jffs2_inode_cache will not be removed. 
So, it is allowed to access it without locking the inocache_lock spinlock. diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 1be6de27dd8..0f224384f17 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: background.c,v 1.50 2004/11/16 20:36:10 dwmw2 Exp $ + * $Id: background.c,v 1.54 2005/05/20 21:37:12 gleixner Exp $ * */ @@ -37,7 +37,7 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) if (c->gc_task) BUG(); - init_MUTEX_LOCKED(&c->gc_thread_start); + init_completion(&c->gc_thread_start); init_completion(&c->gc_thread_exit); pid = kernel_thread(jffs2_garbage_collect_thread, c, CLONE_FS|CLONE_FILES); @@ -48,7 +48,7 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) } else { /* Wait for it... */ D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", pid)); - down(&c->gc_thread_start); + wait_for_completion(&c->gc_thread_start); } return ret; @@ -56,13 +56,16 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c) { + int wait = 0; spin_lock(&c->erase_completion_lock); if (c->gc_task) { D1(printk(KERN_DEBUG "jffs2: Killing GC task %d\n", c->gc_task->pid)); send_sig(SIGKILL, c->gc_task, 1); + wait = 1; } spin_unlock(&c->erase_completion_lock); - wait_for_completion(&c->gc_thread_exit); + if (wait) + wait_for_completion(&c->gc_thread_exit); } static int jffs2_garbage_collect_thread(void *_c) @@ -75,7 +78,7 @@ static int jffs2_garbage_collect_thread(void *_c) allow_signal(SIGCONT); c->gc_task = current; - up(&c->gc_thread_start); + complete(&c->gc_thread_start); set_user_nice(current, 10); @@ -92,7 +95,7 @@ static int jffs2_garbage_collect_thread(void *_c) schedule(); } - if (try_to_freeze(0)) + if (try_to_freeze()) continue; cond_resched(); diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index a01dd5fdbb9..97dc39796e2 100644 --- 
a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: build.c,v 1.69 2004/12/16 20:22:18 dmarlin Exp $ + * $Id: build.c,v 1.71 2005/07/12 16:37:08 dedekind Exp $ * */ @@ -97,14 +97,16 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) /* First, scan the medium and build all the inode caches with lists of physical nodes */ - c->flags |= JFFS2_SB_FLAG_MOUNTING; + c->flags |= JFFS2_SB_FLAG_SCANNING; ret = jffs2_scan_medium(c); + c->flags &= ~JFFS2_SB_FLAG_SCANNING; if (ret) goto exit; D1(printk(KERN_DEBUG "Scanned flash completely\n")); D2(jffs2_dump_block_lists(c)); + c->flags |= JFFS2_SB_FLAG_BUILDING; /* Now scan the directory tree, increasing nlink according to every dirent found. */ for_each_inode(i, c, ic) { D1(printk(KERN_DEBUG "Pass 1: ino #%u\n", ic->ino)); @@ -116,7 +118,6 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) cond_resched(); } } - c->flags &= ~JFFS2_SB_FLAG_MOUNTING; D1(printk(KERN_DEBUG "Pass 1 complete\n")); @@ -164,6 +165,8 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) ic->scan_dents = NULL; cond_resched(); } + c->flags &= ~JFFS2_SB_FLAG_BUILDING; + D1(printk(KERN_DEBUG "Pass 3 complete\n")); D2(jffs2_dump_block_lists(c)); @@ -333,13 +336,6 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) c->blocks[i].bad_count = 0; } - init_MUTEX(&c->alloc_sem); - init_MUTEX(&c->erase_free_sem); - init_waitqueue_head(&c->erase_wait); - init_waitqueue_head(&c->inocache_wq); - spin_lock_init(&c->erase_completion_lock); - spin_lock_init(&c->inocache_lock); - INIT_LIST_HEAD(&c->clean_list); INIT_LIST_HEAD(&c->very_dirty_list); INIT_LIST_HEAD(&c->dirty_list); diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 078a30e406b..83f7e0788fd 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. 
* - * $Id: compr_zlib.c,v 1.29 2004/11/16 20:36:11 dwmw2 Exp $ + * $Id: compr_zlib.c,v 1.31 2005/05/20 19:30:06 gleixner Exp $ * */ @@ -17,10 +17,10 @@ #include <linux/config.h> #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/zlib.h> #include <linux/zutil.h> -#include <asm/semaphore.h> #include "nodelist.h" #include "compr.h" diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 757306fa3ff..3ca0d25eef1 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: dir.c,v 1.84 2004/11/16 20:36:11 dwmw2 Exp $ + * $Id: dir.c,v 1.86 2005/07/06 12:13:09 dwmw2 Exp $ * */ @@ -22,16 +22,6 @@ #include <linux/time.h> #include "nodelist.h" -/* Urgh. Please tell me there's a nicer way of doing these. */ -#include <linux/version.h> -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,48) -typedef int mknod_arg_t; -#define NAMEI_COMPAT(x) ((void *)x) -#else -typedef dev_t mknod_arg_t; -#define NAMEI_COMPAT(x) (x) -#endif - static int jffs2_readdir (struct file *, void *, filldir_t); static int jffs2_create (struct inode *,struct dentry *,int, @@ -43,7 +33,7 @@ static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct inode *,struct dentry *,const char *); static int jffs2_mkdir (struct inode *,struct dentry *,int); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct inode *,struct dentry *,int,mknod_arg_t); +static int jffs2_mknod (struct inode *,struct dentry *,int,dev_t); static int jffs2_rename (struct inode *, struct dentry *, struct inode *, struct dentry *); @@ -58,8 +48,8 @@ struct file_operations jffs2_dir_operations = struct inode_operations jffs2_dir_inode_operations = { - .create = NAMEI_COMPAT(jffs2_create), - .lookup = NAMEI_COMPAT(jffs2_lookup), + .create = jffs2_create, + .lookup = jffs2_lookup, .link = jffs2_link, .unlink = jffs2_unlink, .symlink = jffs2_symlink, @@ -296,11 
+286,11 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char struct jffs2_full_dirent *fd; int namelen; uint32_t alloclen, phys_ofs; - int ret; + int ret, targetlen = strlen(target); /* FIXME: If you care. We'd need to use frags for the target if it grows much more than this */ - if (strlen(target) > 254) + if (targetlen > 254) return -EINVAL; ri = jffs2_alloc_raw_inode(); @@ -314,7 +304,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char * Just the node will do for now, though */ namelen = dentry->d_name.len; - ret = jffs2_reserve_space(c, sizeof(*ri) + strlen(target), &phys_ofs, &alloclen, ALLOC_NORMAL); + ret = jffs2_reserve_space(c, sizeof(*ri) + targetlen, &phys_ofs, &alloclen, ALLOC_NORMAL); if (ret) { jffs2_free_raw_inode(ri); @@ -333,16 +323,16 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char f = JFFS2_INODE_INFO(inode); - inode->i_size = strlen(target); + inode->i_size = targetlen; ri->isize = ri->dsize = ri->csize = cpu_to_je32(inode->i_size); ri->totlen = cpu_to_je32(sizeof(*ri) + inode->i_size); ri->hdr_crc = cpu_to_je32(crc32(0, ri, sizeof(struct jffs2_unknown_node)-4)); ri->compr = JFFS2_COMPR_NONE; - ri->data_crc = cpu_to_je32(crc32(0, target, strlen(target))); + ri->data_crc = cpu_to_je32(crc32(0, target, targetlen)); ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); - fn = jffs2_write_dnode(c, f, ri, target, strlen(target), phys_ofs, ALLOC_NORMAL); + fn = jffs2_write_dnode(c, f, ri, target, targetlen, phys_ofs, ALLOC_NORMAL); jffs2_free_raw_inode(ri); @@ -353,6 +343,20 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char jffs2_clear_inode(inode); return PTR_ERR(fn); } + + /* We use f->dents field to store the target path. 
*/ + f->dents = kmalloc(targetlen + 1, GFP_KERNEL); + if (!f->dents) { + printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); + up(&f->sem); + jffs2_complete_reservation(c); + jffs2_clear_inode(inode); + return -ENOMEM; + } + + memcpy(f->dents, target, targetlen + 1); + D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->dents)); + /* No data here. Only a metadata node, which will be obsoleted by the first data write */ @@ -564,7 +568,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, mknod_arg_t rdev) +static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 41451e8bf36..787d84ac2bc 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: erase.c,v 1.66 2004/11/16 20:36:11 dwmw2 Exp $ + * $Id: erase.c,v 1.80 2005/07/14 19:46:24 joern Exp $ * */ @@ -48,6 +48,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, #else /* Linux */ struct erase_info *instr; + D1(printk(KERN_DEBUG "jffs2_erase_block(): erase block %#x (range %#x-%#x)\n", jeb->offset, jeb->offset, jeb->offset + c->sector_size)); instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); if (!instr) { printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. 
Refiling block for later\n"); @@ -233,7 +234,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, continue; } - if (((*prev)->flash_offset & ~(c->sector_size -1)) == jeb->offset) { + if (SECTOR_ADDR((*prev)->flash_offset) == jeb->offset) { /* It's in the block we're erasing */ struct jffs2_raw_node_ref *this; @@ -277,11 +278,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, printk("\n"); }); - if (ic->nodes == (void *)ic) { - D1(printk(KERN_DEBUG "inocache for ino #%u is all gone now. Freeing\n", ic->ino)); + if (ic->nodes == (void *)ic && ic->nlink == 0) jffs2_del_ino_cache(c, ic); - jffs2_free_inode_cache(ic); - } } static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) @@ -302,92 +300,86 @@ static void jffs2_free_all_node_refs(struct jffs2_sb_info *c, struct jffs2_erase jeb->last_node = NULL; } -static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, uint32_t *bad_offset) { - struct jffs2_raw_node_ref *marker_ref = NULL; - unsigned char *ebuf; + void *ebuf; + uint32_t ofs; size_t retlen; - int ret; - uint32_t bad_offset; - - if (!jffs2_cleanmarker_oob(c)) { - marker_ref = jffs2_alloc_raw_node_ref(); - if (!marker_ref) { - printk(KERN_WARNING "Failed to allocate raw node ref for clean marker\n"); - /* Stick it back on the list from whence it came and come back later */ - jffs2_erase_pending_trigger(c); - spin_lock(&c->erase_completion_lock); - list_add(&jeb->list, &c->erase_complete_list); - spin_unlock(&c->erase_completion_lock); - return; - } - } + int ret = -EIO; + ebuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!ebuf) { - printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. 
Assuming it worked\n", jeb->offset); - } else { - uint32_t ofs = jeb->offset; + printk(KERN_WARNING "Failed to allocate page buffer for verifying erase at 0x%08x. Refiling\n", jeb->offset); + return -EAGAIN; + } - D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset)); - while(ofs < jeb->offset + c->sector_size) { - uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs); - int i; + D1(printk(KERN_DEBUG "Verifying erase at 0x%08x\n", jeb->offset)); - bad_offset = ofs; + for (ofs = jeb->offset; ofs < jeb->offset + c->sector_size; ) { + uint32_t readlen = min((uint32_t)PAGE_SIZE, jeb->offset + c->sector_size - ofs); + int i; - ret = jffs2_flash_read(c, ofs, readlen, &retlen, ebuf); - if (ret) { - printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); - goto bad; - } - if (retlen != readlen) { - printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen); - goto bad; - } - for (i=0; i<readlen; i += sizeof(unsigned long)) { - /* It's OK. We know it's properly aligned */ - unsigned long datum = *(unsigned long *)(&ebuf[i]); - if (datum + 1) { - bad_offset += i; - printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", datum, bad_offset); - bad: - if (!jffs2_cleanmarker_oob(c)) - jffs2_free_raw_node_ref(marker_ref); - kfree(ebuf); - bad2: - spin_lock(&c->erase_completion_lock); - /* Stick it on a list (any list) so - erase_failed can take it right off - again. Silly, but shouldn't happen - often. */ - list_add(&jeb->list, &c->erasing_list); - spin_unlock(&c->erase_completion_lock); - jffs2_erase_failed(c, jeb, bad_offset); - return; - } + *bad_offset = ofs; + + ret = jffs2_flash_read(c, ofs, readlen, &retlen, ebuf); + if (ret) { + printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. 
Putting on bad_list\n", ofs, ret); + goto fail; + } + if (retlen != readlen) { + printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen); + goto fail; + } + for (i=0; i<readlen; i += sizeof(unsigned long)) { + /* It's OK. We know it's properly aligned */ + unsigned long *datum = ebuf + i; + if (*datum + 1) { + *bad_offset += i; + printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", *datum, *bad_offset); + goto fail; } - ofs += readlen; - cond_resched(); } - kfree(ebuf); + ofs += readlen; + cond_resched(); } + ret = 0; +fail: + kfree(ebuf); + return ret; +} - bad_offset = jeb->offset; +static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +{ + struct jffs2_raw_node_ref *marker_ref = NULL; + size_t retlen; + int ret; + uint32_t bad_offset; + + switch (jffs2_block_check_erase(c, jeb, &bad_offset)) { + case -EAGAIN: goto refile; + case -EIO: goto filebad; + } /* Write the erase complete marker */ D1(printk(KERN_DEBUG "Writing erased marker to block at 0x%08x\n", jeb->offset)); - if (jffs2_cleanmarker_oob(c)) { + bad_offset = jeb->offset; - if (jffs2_write_nand_cleanmarker(c, jeb)) - goto bad2; - - jeb->first_node = jeb->last_node = NULL; + /* Cleanmarker in oob area or no cleanmarker at all ? 
*/ + if (jffs2_cleanmarker_oob(c) || c->cleanmarker_size == 0) { + + if (jffs2_cleanmarker_oob(c)) { + if (jffs2_write_nand_cleanmarker(c, jeb)) + goto filebad; + } + jeb->first_node = jeb->last_node = NULL; jeb->free_size = c->sector_size; jeb->used_size = 0; jeb->dirty_size = 0; jeb->wasted_size = 0; + } else { + struct kvec vecs[1]; struct jffs2_unknown_node marker = { .magic = cpu_to_je16(JFFS2_MAGIC_BITMASK), @@ -395,21 +387,28 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb .totlen = cpu_to_je32(c->cleanmarker_size) }; + marker_ref = jffs2_alloc_raw_node_ref(); + if (!marker_ref) { + printk(KERN_WARNING "Failed to allocate raw node ref for clean marker. Refiling\n"); + goto refile; + } + marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4)); vecs[0].iov_base = (unsigned char *) &marker; vecs[0].iov_len = sizeof(marker); ret = jffs2_flash_direct_writev(c, vecs, 1, jeb->offset, &retlen); - if (ret) { - printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n", - jeb->offset, ret); - goto bad2; - } - if (retlen != sizeof(marker)) { - printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n", - jeb->offset, sizeof(marker), retlen); - goto bad2; + if (ret || retlen != sizeof(marker)) { + if (ret) + printk(KERN_WARNING "Write clean marker to block at 0x%08x failed: %d\n", + jeb->offset, ret); + else + printk(KERN_WARNING "Short write to newly-erased block at 0x%08x: Wanted %zd, got %zd\n", + jeb->offset, sizeof(marker), retlen); + + jffs2_free_raw_node_ref(marker_ref); + goto filebad; } marker_ref->next_in_ino = NULL; @@ -438,5 +437,22 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb c->nr_free_blocks++; spin_unlock(&c->erase_completion_lock); wake_up(&c->erase_wait); -} + return; + +filebad: + spin_lock(&c->erase_completion_lock); + /* Stick it on a list (any list) so erase_failed can take it + right off again. 
Silly, but shouldn't happen often. */ + list_add(&jeb->list, &c->erasing_list); + spin_unlock(&c->erase_completion_lock); + jffs2_erase_failed(c, jeb, bad_offset); + return; +refile: + /* Stick it back on the list from whence it came and come back later */ + jffs2_erase_pending_trigger(c); + spin_lock(&c->erase_completion_lock); + list_add(&jeb->list, &c->erase_complete_list); + spin_unlock(&c->erase_completion_lock); + return; +} diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 771a554701d..bd9ed9b0247 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -7,11 +7,10 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: file.c,v 1.99 2004/11/16 20:36:11 dwmw2 Exp $ + * $Id: file.c,v 1.102 2005/07/06 12:13:09 dwmw2 Exp $ * */ -#include <linux/version.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/fs.h> @@ -51,9 +50,7 @@ struct file_operations jffs2_file_operations = .ioctl = jffs2_ioctl, .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,29) .sendfile = generic_file_sendfile -#endif }; /* jffs2_file_inode_operations */ diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 30ab233fe42..5687c3f4200 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -7,11 +7,10 @@ * * For licensing information, see the file 'LICENCE' in this directory. 
* - * $Id: fs.c,v 1.51 2004/11/28 12:19:37 dedekind Exp $ + * $Id: fs.c,v 1.56 2005/07/06 12:13:09 dwmw2 Exp $ * */ -#include <linux/version.h> #include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -450,11 +449,15 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) c = JFFS2_SB_INFO(sb); -#ifndef CONFIG_JFFS2_FS_NAND +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER if (c->mtd->type == MTD_NANDFLASH) { printk(KERN_ERR "jffs2: Cannot operate on NAND flash unless jffs2 NAND support is compiled in.\n"); return -EINVAL; } + if (c->mtd->type == MTD_DATAFLASH) { + printk(KERN_ERR "jffs2: Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in.\n"); + return -EINVAL; + } #endif c->flash_size = c->mtd->size; @@ -522,9 +525,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) goto out_root_i; -#if LINUX_VERSION_CODE >= 0x20403 sb->s_maxbytes = 0xFFFFFFFF; -#endif sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = JFFS2_SUPER_MAGIC; @@ -661,6 +662,14 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) { if (ret) return ret; } + + /* and Dataflash */ + if (jffs2_dataflash(c)) { + ret = jffs2_dataflash_setup(c); + if (ret) + return ret; + } + return ret; } @@ -674,4 +683,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) { if (jffs2_nor_ecc(c)) { jffs2_nor_ecc_flash_cleanup(c); } + + /* and DataFlash */ + if (jffs2_dataflash(c)) { + jffs2_dataflash_cleanup(c); + } } diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 87ec74ff593..7086cd63450 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: gc.c,v 1.144 2004/12/21 11:18:50 dwmw2 Exp $ + * $Id: gc.c,v 1.148 2005/04/09 10:47:00 dedekind Exp $ * */ @@ -50,6 +50,7 @@ static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c) put the clever wear-levelling algorithms. Eventually. 
*/ /* We possibly want to favour the dirtier blocks more when the number of free blocks is low. */ +again: if (!list_empty(&c->bad_used_list) && c->nr_free_blocks > c->resv_blocks_gcbad) { D1(printk(KERN_DEBUG "Picking block from bad_used_list to GC next\n")); nextlist = &c->bad_used_list; @@ -79,6 +80,13 @@ static struct jffs2_eraseblock *jffs2_find_gc_block(struct jffs2_sb_info *c) D1(printk(KERN_DEBUG "Picking block from erasable_list to GC next (clean_list and {very_,}dirty_list were empty)\n")); nextlist = &c->erasable_list; + } else if (!list_empty(&c->erasable_pending_wbuf_list)) { + /* There are blocks are wating for the wbuf sync */ + D1(printk(KERN_DEBUG "Synching wbuf in order to reuse erasable_pending_wbuf_list blocks\n")); + spin_unlock(&c->erase_completion_lock); + jffs2_flush_wbuf_pad(c); + spin_lock(&c->erase_completion_lock); + goto again; } else { /* Eep. All were empty */ D1(printk(KERN_NOTICE "jffs2: No clean, dirty _or_ erasable blocks to GC from! Where are they all?\n")); @@ -661,9 +669,10 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_ { struct jffs2_full_dnode *new_fn; struct jffs2_raw_inode ri; + struct jffs2_node_frag *last_frag; jint16_t dev; char *mdata = NULL, mdatalen = 0; - uint32_t alloclen, phys_ofs; + uint32_t alloclen, phys_ofs, ilen; int ret; if (S_ISBLK(JFFS2_F_I_MODE(f)) || @@ -699,6 +708,14 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_ goto out; } + last_frag = frag_last(&f->fragtree); + if (last_frag) + /* Fetch the inode length from the fragtree rather then + * from i_size since i_size may have not been updated yet */ + ilen = last_frag->ofs + last_frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); ri.nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); @@ -710,7 +727,7 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_ ri.mode = 
cpu_to_jemode(JFFS2_F_I_MODE(f)); ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); - ri.isize = cpu_to_je32(JFFS2_F_I_SIZE(f)); + ri.isize = cpu_to_je32(ilen); ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); @@ -816,8 +833,7 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct /* Doesn't matter if there's one in the same erase block. We're going to delete it too at the same time. */ - if ((raw->flash_offset & ~(c->sector_size-1)) == - (fd->raw->flash_offset & ~(c->sector_size-1))) + if (SECTOR_ADDR(raw->flash_offset) == SECTOR_ADDR(fd->raw->flash_offset)) continue; D1(printk(KERN_DEBUG "Check potential deletion dirent at %08x\n", ref_offset(raw))); @@ -891,7 +907,7 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras struct jffs2_raw_inode ri; struct jffs2_node_frag *frag; struct jffs2_full_dnode *new_fn; - uint32_t alloclen, phys_ofs; + uint32_t alloclen, phys_ofs, ilen; int ret; D1(printk(KERN_DEBUG "Writing replacement hole node for ino #%u from offset 0x%x to 0x%x\n", @@ -951,10 +967,19 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras ri.csize = cpu_to_je32(0); ri.compr = JFFS2_COMPR_ZERO; } + + frag = frag_last(&f->fragtree); + if (frag) + /* Fetch the inode length from the fragtree rather then + * from i_size since i_size may have not been updated yet */ + ilen = frag->ofs + frag->size; + else + ilen = JFFS2_F_I_SIZE(f); + ri.mode = cpu_to_jemode(JFFS2_F_I_MODE(f)); ri.uid = cpu_to_je16(JFFS2_F_I_UID(f)); ri.gid = cpu_to_je16(JFFS2_F_I_GID(f)); - ri.isize = cpu_to_je32(JFFS2_F_I_SIZE(f)); + ri.isize = cpu_to_je32(ilen); ri.atime = cpu_to_je32(JFFS2_F_I_ATIME(f)); ri.ctime = cpu_to_je32(JFFS2_F_I_CTIME(f)); ri.mtime = cpu_to_je32(JFFS2_F_I_MTIME(f)); @@ -1161,7 +1186,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era 
D1(printk(KERN_DEBUG "Expanded dnode to write from (0x%x-0x%x) to (0x%x-0x%x)\n", orig_start, orig_end, start, end)); - BUG_ON(end > JFFS2_F_I_SIZE(f)); + D1(BUG_ON(end > frag_last(&f->fragtree)->ofs + frag_last(&f->fragtree)->size)); BUG_ON(end < orig_end); BUG_ON(start > orig_start); } diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index cd6a8bd13e0..4991c348f6e 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: nodelist.c,v 1.90 2004/12/08 17:59:20 dwmw2 Exp $ + * $Id: nodelist.c,v 1.98 2005/07/10 15:15:32 dedekind Exp $ * */ @@ -55,30 +55,63 @@ void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new }); } -/* Put a new tmp_dnode_info into the list, keeping the list in - order of increasing version -*/ -static void jffs2_add_tn_to_list(struct jffs2_tmp_dnode_info *tn, struct jffs2_tmp_dnode_info **list) +/* + * Put a new tmp_dnode_info into the temporaty RB-tree, keeping the list in + * order of increasing version. + */ +static void jffs2_add_tn_to_tree(struct jffs2_tmp_dnode_info *tn, struct rb_root *list) { - struct jffs2_tmp_dnode_info **prev = list; - - while ((*prev) && (*prev)->version < tn->version) { - prev = &((*prev)->next); - } - tn->next = (*prev); - *prev = tn; + struct rb_node **p = &list->rb_node; + struct rb_node * parent = NULL; + struct jffs2_tmp_dnode_info *this; + + while (*p) { + parent = *p; + this = rb_entry(parent, struct jffs2_tmp_dnode_info, rb); + + /* There may actually be a collision here, but it doesn't + actually matter. As long as the two nodes with the same + version are together, it's all fine. 
*/ + if (tn->version < this->version) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(&tn->rb, parent, p); + rb_insert_color(&tn->rb, list); } -static void jffs2_free_tmp_dnode_info_list(struct jffs2_tmp_dnode_info *tn) +static void jffs2_free_tmp_dnode_info_list(struct rb_root *list) { - struct jffs2_tmp_dnode_info *next; + struct rb_node *this; + struct jffs2_tmp_dnode_info *tn; + + this = list->rb_node; + + /* Now at bottom of tree */ + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb); + jffs2_free_full_dnode(tn->fn); + jffs2_free_tmp_dnode_info(tn); + + this = this->rb_parent; + if (!this) + break; - while (tn) { - next = tn; - tn = tn->next; - jffs2_free_full_dnode(next->fn); - jffs2_free_tmp_dnode_info(next); + if (this->rb_left == &tn->rb) + this->rb_left = NULL; + else if (this->rb_right == &tn->rb) + this->rb_right = NULL; + else BUG(); + } } + list->rb_node = NULL; } static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) @@ -108,12 +141,13 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r with this ino, returning the former in order of version */ int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, - struct jffs2_tmp_dnode_info **tnp, struct jffs2_full_dirent **fdp, + struct rb_root *tnp, struct jffs2_full_dirent **fdp, uint32_t *highest_version, uint32_t *latest_mctime, uint32_t *mctime_ver) { struct jffs2_raw_node_ref *ref, *valid_ref; - struct jffs2_tmp_dnode_info *tn, *ret_tn = NULL; + struct jffs2_tmp_dnode_info *tn; + struct rb_root ret_tn = RB_ROOT; struct jffs2_full_dirent *fd, *ret_fd = NULL; union jffs2_node_union node; size_t retlen; @@ -127,7 +161,7 @@ int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, valid_ref = jffs2_first_valid_node(f->inocache->nodes); - if (!valid_ref) + if (!valid_ref 
&& (f->inocache->ino != 1)) printk(KERN_WARNING "Eep. No valid nodes for ino #%u\n", f->inocache->ino); while (valid_ref) { @@ -386,7 +420,7 @@ int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, D1(printk(KERN_DEBUG "dnode @%08x: ver %u, offset %04x, dsize %04x\n", ref_offset(ref), je32_to_cpu(node.i.version), je32_to_cpu(node.i.offset), je32_to_cpu(node.i.dsize))); - jffs2_add_tn_to_list(tn, &ret_tn); + jffs2_add_tn_to_tree(tn, &ret_tn); break; default: @@ -450,7 +484,7 @@ int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, return 0; free_out: - jffs2_free_tmp_dnode_info_list(ret_tn); + jffs2_free_tmp_dnode_info_list(&ret_tn); jffs2_free_full_dirent_list(ret_fd); return err; } @@ -489,9 +523,13 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new) { struct jffs2_inode_cache **prev; - D2(printk(KERN_DEBUG "jffs2_add_ino_cache: Add %p (ino #%u)\n", new, new->ino)); + spin_lock(&c->inocache_lock); - + if (!new->ino) + new->ino = ++c->highest_ino; + + D2(printk(KERN_DEBUG "jffs2_add_ino_cache: Add %p (ino #%u)\n", new, new->ino)); + prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE]; while ((*prev) && (*prev)->ino < new->ino) { @@ -506,7 +544,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) { struct jffs2_inode_cache **prev; - D2(printk(KERN_DEBUG "jffs2_del_ino_cache: Del %p (ino #%u)\n", old, old->ino)); + D1(printk(KERN_DEBUG "jffs2_del_ino_cache: Del %p (ino #%u)\n", old, old->ino)); spin_lock(&c->inocache_lock); prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE]; @@ -518,6 +556,14 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) *prev = old->next; } + /* Free it now unless it's in READING or CLEARING state, which + are the transitions upon 
read_inode() and clear_inode(). The + rest of the time we know nobody else is looking at it, and + if it's held by read_inode() or clear_inode() they'll free it + for themselves. */ + if (old->state != INO_STATE_READING && old->state != INO_STATE_CLEARING) + jffs2_free_inode_cache(old); + spin_unlock(&c->inocache_lock); } @@ -530,7 +576,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c) this = c->inocache_list[i]; while (this) { next = this->next; - D2(printk(KERN_DEBUG "jffs2_free_ino_caches: Freeing ino #%u at %p\n", this->ino, this)); jffs2_free_inode_cache(this); this = next; } diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index a4864d05ea9..b34c397909e 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: nodelist.h,v 1.126 2004/11/19 15:06:29 dedekind Exp $ + * $Id: nodelist.h,v 1.131 2005/07/05 21:03:07 dwmw2 Exp $ * */ @@ -135,6 +135,7 @@ struct jffs2_inode_cache { #define INO_STATE_CHECKEDABSENT 3 /* Checked, cleared again */ #define INO_STATE_GC 4 /* GCing a 'pristine' node */ #define INO_STATE_READING 5 /* In read_inode() */ +#define INO_STATE_CLEARING 6 /* In clear_inode() */ #define INOCACHE_HASHSIZE 128 @@ -160,7 +161,7 @@ struct jffs2_full_dnode */ struct jffs2_tmp_dnode_info { - struct jffs2_tmp_dnode_info *next; + struct rb_node rb; struct jffs2_full_dnode *fn; uint32_t version; }; @@ -362,6 +363,18 @@ static inline struct jffs2_node_frag *frag_first(struct rb_root *root) node = node->rb_left; return rb_entry(node, struct jffs2_node_frag, rb); } + +static inline struct jffs2_node_frag *frag_last(struct rb_root *root) +{ + struct rb_node *node = root->rb_node; + + if (!node) + return NULL; + while(node->rb_right) + node = node->rb_right; + return rb_entry(node, struct jffs2_node_frag, rb); +} + #define rb_parent(rb) ((rb)->rb_parent) #define frag_next(frag) rb_entry(rb_next(&(frag)->rb), struct jffs2_node_frag, rb) #define 
frag_prev(frag) rb_entry(rb_prev(&(frag)->rb), struct jffs2_node_frag, rb) @@ -374,7 +387,7 @@ static inline struct jffs2_node_frag *frag_first(struct rb_root *root) D2(void jffs2_print_frag_list(struct jffs2_inode_info *f)); void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list); int jffs2_get_inode_nodes(struct jffs2_sb_info *c, struct jffs2_inode_info *f, - struct jffs2_tmp_dnode_info **tnp, struct jffs2_full_dirent **fdp, + struct rb_root *tnp, struct jffs2_full_dirent **fdp, uint32_t *highest_version, uint32_t *latest_mctime, uint32_t *mctime_ver); void jffs2_set_inocache_state(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic, int state); @@ -462,7 +475,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c); /* erase.c */ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count); -#ifdef CONFIG_JFFS2_FS_NAND +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER /* wbuf.c */ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 2651135bdf4..c1d8b5ed9ab 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: nodemgmt.c,v 1.115 2004/11/22 11:07:21 dwmw2 Exp $ + * $Id: nodemgmt.c,v 1.122 2005/05/06 09:30:27 dedekind Exp $ * */ @@ -75,7 +75,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs dirty = c->dirty_size + c->erasing_size - c->nr_erasing_blocks * c->sector_size + c->unchecked_size; if (dirty < c->nospc_dirty_size) { if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { - printk(KERN_NOTICE "jffs2_reserve_space(): Low on dirty space to GC, but it's a deletion. Allowing...\n"); + D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on dirty space to GC, but it's a deletion. 
Allowing...\n")); break; } D1(printk(KERN_DEBUG "dirty size 0x%08x + unchecked_size 0x%08x < nospc_dirty_size 0x%08x, returning -ENOSPC\n", @@ -98,7 +98,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, uint32_t *ofs avail = c->free_size + c->dirty_size + c->erasing_size + c->unchecked_size; if ( (avail / c->sector_size) <= blocksneeded) { if (prio == ALLOC_DELETION && c->nr_free_blocks + c->nr_erasing_blocks >= c->resv_blocks_deletion) { - printk(KERN_NOTICE "jffs2_reserve_space(): Low on possibly available space, but it's a deletion. Allowing...\n"); + D1(printk(KERN_NOTICE "jffs2_reserve_space(): Low on possibly available space, but it's a deletion. Allowing...\n")); break; } @@ -308,7 +308,10 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r D1(printk(KERN_DEBUG "jffs2_add_physical_node_ref(): Node at 0x%x(%d), size 0x%x\n", ref_offset(new), ref_flags(new), len)); #if 1 - if (jeb != c->nextblock || (ref_offset(new)) != jeb->offset + (c->sector_size - jeb->free_size)) { + /* we could get some obsolete nodes after nextblock was refiled + in wbuf.c */ + if ((c->nextblock || !ref_obsolete(new)) + &&(jeb != c->nextblock || ref_offset(new) != jeb->offset + (c->sector_size - jeb->free_size))) { printk(KERN_WARNING "argh. 
node added in wrong place\n"); jffs2_free_raw_node_ref(new); return -EINVAL; @@ -332,7 +335,7 @@ int jffs2_add_physical_node_ref(struct jffs2_sb_info *c, struct jffs2_raw_node_r c->used_size += len; } - if (!jeb->free_size && !jeb->dirty_size) { + if (!jeb->free_size && !jeb->dirty_size && !ISDIRTY(jeb->wasted_size)) { /* If it lives on the dirty_list, jffs2_reserve_space will put it there */ D1(printk(KERN_DEBUG "Adding full erase block at 0x%08x to clean_list (free 0x%08x, dirty 0x%08x, used 0x%08x\n", jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size)); @@ -400,7 +403,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref jeb = &c->blocks[blocknr]; if (jffs2_can_mark_obsolete(c) && !jffs2_is_readonly(c) && - !(c->flags & JFFS2_SB_FLAG_MOUNTING)) { + !(c->flags & (JFFS2_SB_FLAG_SCANNING | JFFS2_SB_FLAG_BUILDING))) { /* Hm. This may confuse static lock analysis. If any of the above three conditions is false, we're going to return from this function without actually obliterating any nodes or freeing @@ -434,7 +437,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref // Take care, that wasted size is taken into concern if ((jeb->dirty_size || ISDIRTY(jeb->wasted_size + ref_totlen(c, jeb, ref))) && jeb != c->nextblock) { - D1(printk("Dirtying\n")); + D1(printk(KERN_DEBUG "Dirtying\n")); addedsize = ref_totlen(c, jeb, ref); jeb->dirty_size += ref_totlen(c, jeb, ref); c->dirty_size += ref_totlen(c, jeb, ref); @@ -456,7 +459,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref } } } else { - D1(printk("Wasting\n")); + D1(printk(KERN_DEBUG "Wasting\n")); addedsize = 0; jeb->wasted_size += ref_totlen(c, jeb, ref); c->wasted_size += ref_totlen(c, jeb, ref); @@ -467,8 +470,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref D1(ACCT_PARANOIA_CHECK(jeb)); - if (c->flags & JFFS2_SB_FLAG_MOUNTING) { - /* Mount in progress. 
Don't muck about with the block + if (c->flags & JFFS2_SB_FLAG_SCANNING) { + /* Flash scanning is in progress. Don't muck about with the block lists because they're not ready yet, and don't actually obliterate nodes that look obsolete. If they weren't marked obsolete on the flash at the time they _became_ @@ -527,7 +530,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref spin_unlock(&c->erase_completion_lock); - if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c)) { + if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c) || + (c->flags & JFFS2_SB_FLAG_BUILDING)) { /* We didn't lock the erase_free_sem */ return; } @@ -590,11 +594,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *p = ref->next_in_ino; ref->next_in_ino = NULL; - if (ic->nodes == (void *)ic) { - D1(printk(KERN_DEBUG "inocache for ino #%u is all gone now. Freeing\n", ic->ino)); + if (ic->nodes == (void *)ic && ic->nlink == 0) jffs2_del_ino_cache(c, ic); - jffs2_free_inode_cache(ic); - } spin_unlock(&c->erase_completion_lock); } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 03b0acc37b7..d900c8929b0 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -7,41 +7,24 @@ * * For licensing information, see the file 'LICENCE' in this directory. 
* - * $Id: os-linux.h,v 1.51 2004/11/16 20:36:11 dwmw2 Exp $ + * $Id: os-linux.h,v 1.58 2005/07/12 02:34:35 tpoynor Exp $ * */ #ifndef __JFFS2_OS_LINUX_H__ #define __JFFS2_OS_LINUX_H__ -#include <linux/version.h> /* JFFS2 uses Linux mode bits natively -- no need for conversion */ #define os_to_jffs2_mode(x) (x) #define jffs2_to_os_mode(x) (x) -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,73) -#define kstatfs statfs -#endif - struct kstatfs; struct kvec; -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2) #define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) #define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) #define JFFS2_SB_INFO(sb) (sb->s_fs_info) #define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) -#elif defined(JFFS2_OUT_OF_KERNEL) -#define JFFS2_INODE_INFO(i) ((struct jffs2_inode_info *) &(i)->u) -#define OFNI_EDONI_2SFFJ(f) ((struct inode *) ( ((char *)f) - ((char *)(&((struct inode *)NULL)->u)) ) ) -#define JFFS2_SB_INFO(sb) ((struct jffs2_sb_info *) &(sb)->u) -#define OFNI_BS_2SFFJ(c) ((struct super_block *) ( ((char *)c) - ((char *)(&((struct super_block *)NULL)->u)) ) ) -#else -#define JFFS2_INODE_INFO(i) (&i->u.jffs2_i) -#define OFNI_EDONI_2SFFJ(f) ((struct inode *) ( ((char *)f) - ((char *)(&((struct inode *)NULL)->u)) ) ) -#define JFFS2_SB_INFO(sb) (&sb->u.jffs2_sb) -#define OFNI_BS_2SFFJ(c) ((struct super_block *) ( ((char *)c) - ((char *)(&((struct super_block *)NULL)->u)) ) ) -#endif #define JFFS2_F_I_SIZE(f) (OFNI_EDONI_2SFFJ(f)->i_size) @@ -49,28 +32,14 @@ struct kvec; #define JFFS2_F_I_UID(f) (OFNI_EDONI_2SFFJ(f)->i_uid) #define JFFS2_F_I_GID(f) (OFNI_EDONI_2SFFJ(f)->i_gid) -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,1) #define JFFS2_F_I_RDEV_MIN(f) (iminor(OFNI_EDONI_2SFFJ(f))) #define JFFS2_F_I_RDEV_MAJ(f) (imajor(OFNI_EDONI_2SFFJ(f))) -#else -#define JFFS2_F_I_RDEV_MIN(f) (MINOR(to_kdev_t(OFNI_EDONI_2SFFJ(f)->i_rdev))) -#define JFFS2_F_I_RDEV_MAJ(f) (MAJOR(to_kdev_t(OFNI_EDONI_2SFFJ(f)->i_rdev))) -#endif -/* Urgh. 
The things we do to keep the 2.4 build working */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,47) #define ITIME(sec) ((struct timespec){sec, 0}) #define I_SEC(tv) ((tv).tv_sec) #define JFFS2_F_I_CTIME(f) (OFNI_EDONI_2SFFJ(f)->i_ctime.tv_sec) #define JFFS2_F_I_MTIME(f) (OFNI_EDONI_2SFFJ(f)->i_mtime.tv_sec) #define JFFS2_F_I_ATIME(f) (OFNI_EDONI_2SFFJ(f)->i_atime.tv_sec) -#else -#define ITIME(x) (x) -#define I_SEC(x) (x) -#define JFFS2_F_I_CTIME(f) (OFNI_EDONI_2SFFJ(f)->i_ctime) -#define JFFS2_F_I_MTIME(f) (OFNI_EDONI_2SFFJ(f)->i_mtime) -#define JFFS2_F_I_ATIME(f) (OFNI_EDONI_2SFFJ(f)->i_atime) -#endif #define sleep_on_spinunlock(wq, s) \ do { \ @@ -84,23 +53,21 @@ struct kvec; static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) { -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2) f->highest_version = 0; f->fragtree = RB_ROOT; f->metadata = NULL; f->dents = NULL; f->flags = 0; f->usercompr = 0; -#else - memset(f, 0, sizeof(*f)); - init_MUTEX_LOCKED(&f->sem); -#endif } + #define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) -#if (!defined CONFIG_JFFS2_FS_NAND && !defined CONFIG_JFFS2_FS_NOR_ECC) +#ifndef CONFIG_JFFS2_FS_WRITEBUFFER +#define SECTOR_ADDR(x) ( ((unsigned long)(x) & ~(c->sector_size-1)) ) #define jffs2_can_mark_obsolete(c) (1) +#define jffs2_is_writebuffered(c) (0) #define jffs2_cleanmarker_oob(c) (0) #define jffs2_write_nand_cleanmarker(c,jeb) (-EIO) @@ -116,11 +83,16 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) #define jffs2_wbuf_timeout NULL #define jffs2_wbuf_process NULL #define jffs2_nor_ecc(c) (0) +#define jffs2_dataflash(c) (0) #define jffs2_nor_ecc_flash_setup(c) (0) #define jffs2_nor_ecc_flash_cleanup(c) do {} while (0) +#define jffs2_dataflash_setup(c) (0) +#define jffs2_dataflash_cleanup(c) do {} while (0) #else /* NAND and/or ECC'd NOR support present */ +#define jffs2_is_writebuffered(c) (c->wbuf != NULL) +#define SECTOR_ADDR(x) ( ((unsigned long)(x) / (unsigned long)(c->sector_size)) 
* c->sector_size ) #define jffs2_can_mark_obsolete(c) ((c->mtd->type == MTD_NORFLASH && !(c->mtd->flags & MTD_ECC)) || c->mtd->type == MTD_RAM) #define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH) @@ -142,16 +114,16 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino); int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c); int jffs2_nand_flash_setup(struct jffs2_sb_info *c); void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c); -#ifdef CONFIG_JFFS2_FS_NOR_ECC + #define jffs2_nor_ecc(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_ECC)) int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c); void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c); -#else -#define jffs2_nor_ecc(c) (0) -#define jffs2_nor_ecc_flash_setup(c) (0) -#define jffs2_nor_ecc_flash_cleanup(c) do {} while (0) -#endif /* NOR ECC */ -#endif /* NAND */ + +#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH) +int jffs2_dataflash_setup(struct jffs2_sb_info *c); +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c); + +#endif /* WRITEBUFFER */ /* erase.c */ static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c) diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c index eb493dc06db..c7f9068907c 100644 --- a/fs/jffs2/read.c +++ b/fs/jffs2/read.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: read.c,v 1.38 2004/11/16 20:36:12 dwmw2 Exp $ + * $Id: read.c,v 1.39 2005/03/01 10:34:03 dedekind Exp $ * */ @@ -214,33 +214,3 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, return 0; } -/* Core function to read symlink target. 
*/ -char *jffs2_getlink(struct jffs2_sb_info *c, struct jffs2_inode_info *f) -{ - char *buf; - int ret; - - down(&f->sem); - - if (!f->metadata) { - printk(KERN_NOTICE "No metadata for symlink inode #%u\n", f->inocache->ino); - up(&f->sem); - return ERR_PTR(-EINVAL); - } - buf = kmalloc(f->metadata->size+1, GFP_USER); - if (!buf) { - up(&f->sem); - return ERR_PTR(-ENOMEM); - } - buf[f->metadata->size]=0; - - ret = jffs2_read_dnode(c, f, f->metadata, buf, 0, f->metadata->size); - - up(&f->sem); - - if (ret) { - kfree(buf); - return ERR_PTR(ret); - } - return buf; -} diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index aca4a0b17bc..5b2a83599d7 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: readinode.c,v 1.117 2004/11/20 18:06:54 dwmw2 Exp $ + * $Id: readinode.c,v 1.125 2005/07/10 13:13:55 dedekind Exp $ * */ @@ -151,6 +151,9 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in D1(printk(KERN_DEBUG "jffs2_add_full_dnode_to_inode(ino #%u, f %p, fn %p)\n", f->inocache->ino, f, fn)); + if (unlikely(!fn->size)) + return 0; + newfrag = jffs2_alloc_node_frag(); if (unlikely(!newfrag)) return -ENOMEM; @@ -158,11 +161,6 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in D2(printk(KERN_DEBUG "adding node %04x-%04x @0x%08x on flash, newfrag *%p\n", fn->ofs, fn->ofs+fn->size, ref_offset(fn->raw), newfrag)); - if (unlikely(!fn->size)) { - jffs2_free_node_frag(newfrag); - return 0; - } - newfrag->ofs = fn->ofs; newfrag->size = fn->size; newfrag->node = fn; @@ -500,7 +498,9 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_raw_inode *latest_node) { - struct jffs2_tmp_dnode_info *tn_list, *tn; + struct jffs2_tmp_dnode_info *tn = NULL; + struct rb_root tn_list; + struct rb_node *rb, *repl_rb; struct jffs2_full_dirent *fd_list; struct 
jffs2_full_dnode *fn = NULL; uint32_t crc; @@ -522,9 +522,10 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, } f->dents = fd_list; - while (tn_list) { - tn = tn_list; + rb = rb_first(&tn_list); + while (rb) { + tn = rb_entry(rb, struct jffs2_tmp_dnode_info, rb); fn = tn->fn; if (f->metadata) { @@ -556,7 +557,29 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, mdata_ver = tn->version; } next_tn: - tn_list = tn->next; + BUG_ON(rb->rb_left); + if (rb->rb_parent && rb->rb_parent->rb_left == rb) { + /* We were then left-hand child of our parent. We need + to move our own right-hand child into our place. */ + repl_rb = rb->rb_right; + if (repl_rb) + repl_rb->rb_parent = rb->rb_parent; + } else + repl_rb = NULL; + + rb = rb_next(rb); + + /* Remove the spent tn from the tree; don't bother rebalancing + but put our right-hand child in our own place. */ + if (tn->rb.rb_parent) { + if (tn->rb.rb_parent->rb_left == &tn->rb) + tn->rb.rb_parent->rb_left = repl_rb; + else if (tn->rb.rb_parent->rb_right == &tn->rb) + tn->rb.rb_parent->rb_right = repl_rb; + else BUG(); + } else if (tn->rb.rb_right) + tn->rb.rb_right->rb_parent = NULL; + jffs2_free_tmp_dnode_info(tn); } D1(jffs2_sanitycheck_fragtree(f)); @@ -623,6 +646,40 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, case. */ if (!je32_to_cpu(latest_node->isize)) latest_node->isize = latest_node->dsize; + + if (f->inocache->state != INO_STATE_CHECKING) { + /* Symlink's inode data is the target path. Read it and + * keep in RAM to facilitate quick follow symlink operation. + * We use f->dents field to store the target path, which + * is somewhat ugly. 
*/ + f->dents = kmalloc(je32_to_cpu(latest_node->csize) + 1, GFP_KERNEL); + if (!f->dents) { + printk(KERN_WARNING "Can't allocate %d bytes of memory " + "for the symlink target path cache\n", + je32_to_cpu(latest_node->csize)); + up(&f->sem); + jffs2_do_clear_inode(c, f); + return -ENOMEM; + } + + ret = jffs2_flash_read(c, ref_offset(fn->raw) + sizeof(*latest_node), + je32_to_cpu(latest_node->csize), &retlen, (char *)f->dents); + + if (ret || retlen != je32_to_cpu(latest_node->csize)) { + if (retlen != je32_to_cpu(latest_node->csize)) + ret = -EIO; + kfree(f->dents); + f->dents = NULL; + up(&f->sem); + jffs2_do_clear_inode(c, f); + return -ret; + } + + ((char *)f->dents)[je32_to_cpu(latest_node->csize)] = '\0'; + D1(printk(KERN_DEBUG "jffs2_do_read_inode(): symlink's target '%s' cached\n", + (char *)f->dents)); + } + /* fall through... */ case S_IFBLK: @@ -672,6 +729,9 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) down(&f->sem); deleted = f->inocache && !f->inocache->nlink; + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) + jffs2_set_inocache_state(c, f->inocache, INO_STATE_CLEARING); + if (f->metadata) { if (deleted) jffs2_mark_node_obsolete(c, f->metadata->raw); @@ -680,16 +740,27 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_kill_fragtree(&f->fragtree, deleted?c:NULL); - fds = f->dents; + /* For symlink inodes we us f->dents to store the target path name */ + if (S_ISLNK(OFNI_EDONI_2SFFJ(f)->i_mode)) { + if (f->dents) { + kfree(f->dents); + f->dents = NULL; + } + } else { + fds = f->dents; - while(fds) { - fd = fds; - fds = fd->next; - jffs2_free_full_dirent(fd); + while(fds) { + fd = fds; + fds = fd->next; + jffs2_free_full_dirent(fd); + } } - if (f->inocache && f->inocache->state != INO_STATE_CHECKING) + if (f->inocache && f->inocache->state != INO_STATE_CHECKING) { jffs2_set_inocache_state(c, f->inocache, INO_STATE_CHECKEDABSENT); + if (f->inocache->nodes == (void 
*)f->inocache) + jffs2_del_ino_cache(c, f->inocache); + } up(&f->sem); } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index ded53584a89..b63160f83ba 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: scan.c,v 1.115 2004/11/17 12:59:08 dedekind Exp $ + * $Id: scan.c,v 1.119 2005/02/17 17:51:13 dedekind Exp $ * */ #include <linux/kernel.h> @@ -19,7 +19,7 @@ #include <linux/compiler.h> #include "nodelist.h" -#define EMPTY_SCAN_SIZE 1024 +#define DEFAULT_EMPTY_SCAN_SIZE 1024 #define DIRTY_SPACE(x) do { typeof(x) _x = (x); \ c->free_size -= _x; c->dirty_size += _x; \ @@ -68,13 +68,21 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo static inline int min_free(struct jffs2_sb_info *c) { uint32_t min = 2 * sizeof(struct jffs2_raw_inode); -#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (!jffs2_can_mark_obsolete(c) && min < c->wbuf_pagesize) return c->wbuf_pagesize; #endif return min; } + +static inline uint32_t EMPTY_SCAN_SIZE(uint32_t sector_size) { + if (sector_size < DEFAULT_EMPTY_SCAN_SIZE) + return sector_size; + else + return DEFAULT_EMPTY_SCAN_SIZE; +} + int jffs2_scan_medium(struct jffs2_sb_info *c) { int i, ret; @@ -220,7 +228,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) c->dirty_size -= c->nextblock->dirty_size; c->nextblock->dirty_size = 0; } -#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size & (c->wbuf_pagesize-1))) { /* If we're going to start writing into a block which already contains data, and the end of the data isn't page-aligned, @@ -286,7 +294,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo uint32_t hdr_crc, buf_ofs, buf_len; int err; int noise = 0; -#ifdef CONFIG_JFFS2_FS_NAND +#ifdef 
CONFIG_JFFS2_FS_WRITEBUFFER int cleanmarkerfound = 0; #endif @@ -295,7 +303,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo D1(printk(KERN_DEBUG "jffs2_scan_eraseblock(): Scanning block at 0x%x\n", ofs)); -#ifdef CONFIG_JFFS2_FS_NAND +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (jffs2_cleanmarker_oob(c)) { int ret = jffs2_check_nand_cleanmarker(c, jeb); D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n",ret)); @@ -316,7 +324,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo if (!buf_size) { buf_len = c->sector_size; } else { - buf_len = EMPTY_SCAN_SIZE; + buf_len = EMPTY_SCAN_SIZE(c->sector_size); err = jffs2_fill_scan_buf(c, buf, buf_ofs, buf_len); if (err) return err; @@ -326,11 +334,11 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo ofs = 0; /* Scan only 4KiB of 0xFF before declaring it's empty */ - while(ofs < EMPTY_SCAN_SIZE && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) + while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) ofs += 4; - if (ofs == EMPTY_SCAN_SIZE) { -#ifdef CONFIG_JFFS2_FS_NAND + if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) { +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (jffs2_cleanmarker_oob(c)) { /* scan oob, take care of cleanmarker */ int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound); @@ -343,7 +351,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo } #endif D1(printk(KERN_DEBUG "Block at 0x%08x is empty (erased)\n", jeb->offset)); - return BLK_STATE_ALLFF; /* OK to erase if all blocks are like this */ + if (c->cleanmarker_size == 0) + return BLK_STATE_CLEANMARKER; /* don't bother with re-erase */ + else + return BLK_STATE_ALLFF; /* OK to erase if all blocks are like this */ } if (ofs) { D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset, @@ -422,8 +433,8 @@ scan_more: /* If we're only checking the beginning of a block with a cleanmarker, 
bail now */ if (buf_ofs == jeb->offset && jeb->used_size == PAD(c->cleanmarker_size) && - c->cleanmarker_size && !jeb->dirty_size && !jeb->first_node->next_in_ino) { - D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE)); + c->cleanmarker_size && !jeb->dirty_size && !jeb->first_node->next_phys) { + D1(printk(KERN_DEBUG "%d bytes at start of block seems clean... assuming all clean\n", EMPTY_SCAN_SIZE(c->sector_size))); return BLK_STATE_CLEANMARKER; } @@ -618,7 +629,7 @@ scan_more: } if ((jeb->used_size + jeb->unchecked_size) == PAD(c->cleanmarker_size) && !jeb->dirty_size - && (!jeb->first_node || !jeb->first_node->next_in_ino) ) + && (!jeb->first_node || !jeb->first_node->next_phys) ) return BLK_STATE_CLEANMARKER; /* move blocks with max 4 byte dirty space to cleanlist */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 6b2a441d276..aaf9475cfb6 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. 
* - * $Id: super.c,v 1.104 2004/11/23 15:37:31 gleixner Exp $ + * $Id: super.c,v 1.107 2005/07/12 16:37:08 dedekind Exp $ * */ @@ -140,6 +140,15 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): New superblock for device %d (\"%s\")\n", mtd->index, mtd->name)); + /* Initialize JFFS2 superblock locks, the further initialization will be + * done later */ + init_MUTEX(&c->alloc_sem); + init_MUTEX(&c->erase_free_sem); + init_waitqueue_head(&c->erase_wait); + init_waitqueue_head(&c->inocache_wq); + spin_lock_init(&c->erase_completion_lock); + spin_lock_init(&c->inocache_lock); + sb->s_op = &jffs2_super_operations; sb->s_flags = flags | MS_NOATIME; @@ -270,8 +279,6 @@ static void jffs2_put_super (struct super_block *sb) D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); - if (!(sb->s_flags & MS_RDONLY)) - jffs2_stop_garbage_collect_thread(c); down(&c->alloc_sem); jffs2_flush_wbuf_pad(c); up(&c->alloc_sem); @@ -292,6 +299,8 @@ static void jffs2_put_super (struct super_block *sb) static void jffs2_kill_sb(struct super_block *sb) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + if (!(sb->s_flags & MS_RDONLY)) + jffs2_stop_garbage_collect_thread(c); generic_shutdown_super(sb); put_mtd_device(c->mtd); kfree(c); @@ -309,7 +318,7 @@ static int __init init_jffs2_fs(void) int ret; printk(KERN_INFO "JFFS2 version 2.2." -#ifdef CONFIG_JFFS2_FS_NAND +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER " (NAND)" #endif " (C) 2001-2003 Red Hat, Inc.\n"); diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 7b1820d1371..82ef484f5e1 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. 
* - * $Id: symlink.c,v 1.14 2004/11/16 20:36:12 dwmw2 Exp $ + * $Id: symlink.c,v 1.16 2005/03/01 10:50:48 dedekind Exp $ * */ @@ -18,28 +18,48 @@ #include <linux/namei.h> #include "nodelist.h" -static int jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); -static void jffs2_put_link(struct dentry *dentry, struct nameidata *nd); +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .put_link = jffs2_put_link, .setattr = jffs2_setattr }; -static int jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) { - unsigned char *buf; - buf = jffs2_getlink(JFFS2_SB_INFO(dentry->d_inode->i_sb), JFFS2_INODE_INFO(dentry->d_inode)); - nd_set_link(nd, buf); - return 0; -} + struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + char *p = (char *)f->dents; + + /* + * We don't acquire the f->sem mutex here since the only data we + * use is f->dents which in case of the symlink inode points to the + * symlink's target path. + * + * 1. If we are here the inode has already built and f->dents has + * to point to the target path. + * 2. Nobody uses f->dents (if the inode is symlink's inode). The + * exception is inode freeing function which frees f->dents. But + * it can't be called while we are here and before VFS has + * stopped using our f->dents string which we provide by means of + * nd_set_link() call. 
+ */ + + if (!p) { + printk(KERN_ERR "jffs2_follow_link(): can't find symlink taerget\n"); + p = ERR_PTR(-EIO); + } else { + D1(printk(KERN_DEBUG "jffs2_follow_link(): target path is '%s'\n", (char *) f->dents)); + } -static void jffs2_put_link(struct dentry *dentry, struct nameidata *nd) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); + nd_set_link(nd, p); + + /* + * We unlock the f->sem mutex but VFS will use the f->dents string. This is safe + * since the only way that may cause f->dents to be changed is iput() operation. + * But VFS will not use f->dents after iput() has been called. + */ + return NULL; } + diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index c8128069ecf..996d922e503 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -9,7 +9,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. * - * $Id: wbuf.c,v 1.82 2004/11/20 22:08:31 dwmw2 Exp $ + * $Id: wbuf.c,v 1.92 2005/04/05 12:51:54 dedekind Exp $ * */ @@ -83,7 +83,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino) struct jffs2_inodirty *new; /* Mark the superblock dirty so that kupdated will flush... 
*/ - OFNI_BS_2SFFJ(c)->s_dirt = 1; + jffs2_erase_pending_trigger(c); if (jffs2_wbuf_pending_for_ino(c, ino)) return; @@ -130,7 +130,10 @@ static inline void jffs2_refile_wbuf_blocks(struct jffs2_sb_info *c) } } -static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) +#define REFILE_NOTEMPTY 0 +#define REFILE_ANYWAY 1 + +static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, int allow_empty) { D1(printk("About to refile bad block at %08x\n", jeb->offset)); @@ -144,7 +147,7 @@ static void jffs2_block_refile(struct jffs2_sb_info *c, struct jffs2_eraseblock D1(printk("Refiling block at %08x to bad_used_list\n", jeb->offset)); list_add(&jeb->list, &c->bad_used_list); } else { - BUG(); + BUG_ON(allow_empty == REFILE_NOTEMPTY); /* It has to have had some nodes or we couldn't be here */ D1(printk("Refiling block at %08x to erase_pending_list\n", jeb->offset)); list_add(&jeb->list, &c->erase_pending_list); @@ -179,7 +182,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; - jffs2_block_refile(c, jeb); + jffs2_block_refile(c, jeb, REFILE_NOTEMPTY); /* Find the first node to be recovered, by skipping over every node which ends before the wbuf starts, or which is obsolete. */ @@ -264,17 +267,16 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) ret = jffs2_reserve_space_gc(c, end-start, &ofs, &len); if (ret) { printk(KERN_WARNING "Failed to allocate space for wbuf recovery. Data loss ensues.\n"); - if (buf) - kfree(buf); + kfree(buf); return; } if (end-start >= c->wbuf_pagesize) { - /* Need to do another write immediately. This, btw, - means that we'll be writing from 'buf' and not from - the wbuf. Since if we're writing from the wbuf there - won't be more than a wbuf full of data, now will - there? 
:) */ - + /* Need to do another write immediately, but it's possible + that this is just because the wbuf itself is completely + full, and there's nothing earlier read back from the + flash. Hence 'buf' isn't necessarily what we're writing + from. */ + unsigned char *rewrite_buf = buf?:c->wbuf; uint32_t towrite = (end-start) - ((end-start)%c->wbuf_pagesize); D1(printk(KERN_DEBUG "Write 0x%x bytes at 0x%08x in wbuf recover\n", @@ -292,9 +294,9 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) #endif if (jffs2_cleanmarker_oob(c)) ret = c->mtd->write_ecc(c->mtd, ofs, towrite, &retlen, - buf, NULL, c->oobinfo); + rewrite_buf, NULL, c->oobinfo); else - ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, buf); + ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, rewrite_buf); if (ret || retlen != towrite) { /* Argh. We tried. Really we did. */ @@ -321,10 +323,10 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) c->wbuf_len = (end - start) - towrite; c->wbuf_ofs = ofs + towrite; - memcpy(c->wbuf, buf + towrite, c->wbuf_len); + memmove(c->wbuf, rewrite_buf + towrite, c->wbuf_len); /* Don't muck about with c->wbuf_inodes. False positives are harmless. */ - - kfree(buf); + if (buf) + kfree(buf); } else { /* OK, now we're left with the dregs in whichever buffer we're using */ if (buf) { @@ -413,9 +415,9 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) int ret; size_t retlen; - /* Nothing to do if not NAND flash. In particular, we shouldn't + /* Nothing to do if not write-buffering the flash. In particular, we shouldn't del_timer() the timer we never initialised. 
*/ - if (jffs2_can_mark_obsolete(c)) + if (!jffs2_is_writebuffered(c)) return 0; if (!down_trylock(&c->alloc_sem)) { @@ -424,7 +426,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) BUG(); } - if(!c->wbuf || !c->wbuf_len) + if (!c->wbuf_len) /* already checked c->wbuf above */ return 0; /* claim remaining space on the page @@ -433,7 +435,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if we have a switch to next page, we will not have enough remaining space for this. */ - if (pad) { + if (pad && !jffs2_dataflash(c)) { c->wbuf_len = PAD(c->wbuf_len); /* Pad with JFFS2_DIRTY_BITMASK initially. this helps out ECC'd NOR @@ -484,7 +486,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) spin_lock(&c->erase_completion_lock); /* Adjust free size of the block if we padded. */ - if (pad) { + if (pad && !jffs2_dataflash(c)) { struct jffs2_eraseblock *jeb; jeb = &c->blocks[c->wbuf_ofs / c->sector_size]; @@ -532,6 +534,9 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() called for ino #%u...\n", ino)); + if (!c->wbuf) + return 0; + down(&c->alloc_sem); if (!jffs2_wbuf_pending_for_ino(c, ino)) { D1(printk(KERN_DEBUG "Ino #%d not pending in wbuf. Returning\n", ino)); @@ -547,6 +552,10 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() padding. 
Not finished checking\n")); down_write(&c->wbuf_sem); ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); up_write(&c->wbuf_sem); } else while (old_wbuf_len && old_wbuf_ofs == c->wbuf_ofs) { @@ -561,6 +570,10 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) down(&c->alloc_sem); down_write(&c->wbuf_sem); ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); + /* retry flushing wbuf in case jffs2_wbuf_recover + left some data in the wbuf */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); up_write(&c->wbuf_sem); break; } @@ -578,15 +591,27 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c) { int ret; + if (!c->wbuf) + return 0; + down_write(&c->wbuf_sem); ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); + /* retry - maybe wbuf recover left some data in wbuf. */ + if (ret) + ret = __jffs2_flush_wbuf(c, PAD_NOACCOUNT); up_write(&c->wbuf_sem); return ret; } +#ifdef CONFIG_JFFS2_FS_WRITEBUFFER +#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(c->wbuf_pagesize)) * (unsigned long)(c->wbuf_pagesize) ) +#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(c->wbuf_pagesize) ) +#else #define PAGE_DIV(x) ( (x) & (~(c->wbuf_pagesize - 1)) ) #define PAGE_MOD(x) ( (x) & (c->wbuf_pagesize - 1) ) +#endif + int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino) { struct kvec outvecs[3]; @@ -601,7 +626,7 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig uint32_t outvec_to = to; /* If not NAND flash, don't bother */ - if (!c->wbuf) + if (!jffs2_is_writebuffered(c)) return jffs2_flash_direct_writev(c, invecs, count, to, retlen); down_write(&c->wbuf_sem); @@ -630,7 +655,7 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig erase block. Anything else, and you die. 
New block starts at xxx000c (0-b = block header) */ - if ( (to & ~(c->sector_size-1)) != (c->wbuf_ofs & ~(c->sector_size-1)) ) { + if (SECTOR_ADDR(to) != SECTOR_ADDR(c->wbuf_ofs)) { /* It's a write to a new block */ if (c->wbuf_len) { D1(printk(KERN_DEBUG "jffs2_flash_writev() to 0x%lx causes flush of wbuf at 0x%08x\n", (unsigned long)to, c->wbuf_ofs)); @@ -762,9 +787,18 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsig if (ret < 0 || wbuf_retlen != PAGE_DIV(totlen)) { /* At this point we have no problem, - c->wbuf is empty. + c->wbuf is empty. However refile nextblock to avoid + writing again to same address. */ - *retlen = donelen; + struct jffs2_eraseblock *jeb; + + spin_lock(&c->erase_completion_lock); + + jeb = &c->blocks[outvec_to / c->sector_size]; + jffs2_block_refile(c, jeb, REFILE_ANYWAY); + + *retlen = 0; + spin_unlock(&c->erase_completion_lock); goto exit; } @@ -819,7 +853,7 @@ int jffs2_flash_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *r { struct kvec vecs[1]; - if (jffs2_can_mark_obsolete(c)) + if (!jffs2_is_writebuffered(c)) return c->mtd->write(c->mtd, ofs, len, retlen, buf); vecs[0].iov_base = (unsigned char *) buf; @@ -835,39 +869,38 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re loff_t orbf = 0, owbf = 0, lwbf = 0; int ret; - /* Read flash */ - if (!jffs2_can_mark_obsolete(c)) { - down_read(&c->wbuf_sem); - - if (jffs2_cleanmarker_oob(c)) - ret = c->mtd->read_ecc(c->mtd, ofs, len, retlen, buf, NULL, c->oobinfo); - else - ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); - - if ( (ret == -EBADMSG) && (*retlen == len) ) { - printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n", - len, ofs); - /* - * We have the raw data without ECC correction in the buffer, maybe - * we are lucky and all data or parts are correct. We check the node. - * If data are corrupted node check will sort it out. 
- * We keep this block, it will fail on write or erase and the we - * mark it bad. Or should we do that now? But we should give him a chance. - * Maybe we had a system crash or power loss before the ecc write or - * a erase was completed. - * So we return success. :) - */ - ret = 0; - } - } else + if (!jffs2_is_writebuffered(c)) return c->mtd->read(c->mtd, ofs, len, retlen, buf); + /* Read flash */ + down_read(&c->wbuf_sem); + if (jffs2_cleanmarker_oob(c)) + ret = c->mtd->read_ecc(c->mtd, ofs, len, retlen, buf, NULL, c->oobinfo); + else + ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); + + if ( (ret == -EBADMSG) && (*retlen == len) ) { + printk(KERN_WARNING "mtd->read(0x%zx bytes from 0x%llx) returned ECC error\n", + len, ofs); + /* + * We have the raw data without ECC correction in the buffer, maybe + * we are lucky and all data or parts are correct. We check the node. + * If data are corrupted node check will sort it out. + * We keep this block, it will fail on write or erase and the we + * mark it bad. Or should we do that now? But we should give him a chance. + * Maybe we had a system crash or power loss before the ecc write or + * a erase was completed. + * So we return success. 
:) + */ + ret = 0; + } + /* if no writebuffer available or write buffer empty, return */ if (!c->wbuf_pagesize || !c->wbuf_len) goto exit; /* if we read in a different block, return */ - if ( (ofs & ~(c->sector_size-1)) != (c->wbuf_ofs & ~(c->sector_size-1)) ) + if (SECTOR_ADDR(ofs) != SECTOR_ADDR(c->wbuf_ofs)) goto exit; if (ofs >= c->wbuf_ofs) { @@ -1161,7 +1194,27 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c) kfree(c->wbuf); } -#ifdef CONFIG_JFFS2_FS_NOR_ECC +int jffs2_dataflash_setup(struct jffs2_sb_info *c) { + c->cleanmarker_size = 0; /* No cleanmarkers needed */ + + /* Initialize write buffer */ + init_rwsem(&c->wbuf_sem); + c->wbuf_pagesize = c->sector_size; + c->wbuf_ofs = 0xFFFFFFFF; + + c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf) + return -ENOMEM; + + printk(KERN_INFO "JFFS2 write-buffering enabled (%i)\n", c->wbuf_pagesize); + + return 0; +} + +void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) { + kfree(c->wbuf); +} + int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) { /* Cleanmarker is actually larger on the flashes */ c->cleanmarker_size = 16; @@ -1181,4 +1234,3 @@ int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) { void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c) { kfree(c->wbuf); } -#endif diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index 80a5db54262..69100615d9a 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -7,7 +7,7 @@ * * For licensing information, see the file 'LICENCE' in this directory. 
* - * $Id: write.c,v 1.87 2004/11/16 20:36:12 dwmw2 Exp $ + * $Id: write.c,v 1.92 2005/04/13 13:22:35 dwmw2 Exp $ * */ @@ -35,13 +35,12 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint f->inocache = ic; f->inocache->nlink = 1; f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; - f->inocache->ino = ++c->highest_ino; f->inocache->state = INO_STATE_PRESENT; - ri->ino = cpu_to_je32(f->inocache->ino); - D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino)); jffs2_add_ino_cache(c, f->inocache); + D1(printk(KERN_DEBUG "jffs2_do_new_inode(): Assigned ino# %d\n", f->inocache->ino)); + ri->ino = cpu_to_je32(f->inocache->ino); ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); ri->nodetype = cpu_to_je16(JFFS2_NODETYPE_INODE); @@ -136,6 +135,15 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 raw->__totlen = PAD(sizeof(*ri)+datalen); raw->next_phys = NULL; + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(ri->version) < f->highest_version)) { + BUG_ON(!retried); + D1(printk(KERN_DEBUG "jffs2_write_dnode : dnode_version %d, " + "highest version %d -> updating dnode\n", + je32_to_cpu(ri->version), f->highest_version)); + ri->version = cpu_to_je32(++f->highest_version); + ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); + } + ret = jffs2_flash_writev(c, vecs, cnt, flash_ofs, &retlen, (alloc_mode==ALLOC_GC)?0:f->inocache->ino); @@ -280,6 +288,16 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff raw->__totlen = PAD(sizeof(*rd)+namelen); raw->next_phys = NULL; + if ((alloc_mode!=ALLOC_GC) && (je32_to_cpu(rd->version) < f->highest_version)) { + BUG_ON(!retried); + D1(printk(KERN_DEBUG "jffs2_write_dirent : dirent_version %d, " + "highest version %d -> updating dirent\n", + je32_to_cpu(rd->version), f->highest_version)); + rd->version = cpu_to_je32(++f->highest_version); + fd->version = je32_to_cpu(rd->version); + rd->node_crc = 
cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); + } + ret = jffs2_flash_writev(c, vecs, 2, flash_ofs, &retlen, (alloc_mode==ALLOC_GC)?0:je32_to_cpu(rd->pino)); if (ret || (retlen != sizeof(*rd) + namelen)) { @@ -625,20 +643,23 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, down(&dead_f->sem); - while (dead_f->dents) { - /* There can be only deleted ones */ - fd = dead_f->dents; - - dead_f->dents = fd->next; - - if (fd->ino) { - printk(KERN_WARNING "Deleting inode #%u with active dentry \"%s\"->ino #%u\n", - dead_f->inocache->ino, fd->name, fd->ino); - } else { - D1(printk(KERN_DEBUG "Removing deletion dirent for \"%s\" from dir ino #%u\n", fd->name, dead_f->inocache->ino)); + if (S_ISDIR(OFNI_EDONI_2SFFJ(dead_f)->i_mode)) { + while (dead_f->dents) { + /* There can be only deleted ones */ + fd = dead_f->dents; + + dead_f->dents = fd->next; + + if (fd->ino) { + printk(KERN_WARNING "Deleting inode #%u with active dentry \"%s\"->ino #%u\n", + dead_f->inocache->ino, fd->name, fd->ino); + } else { + D1(printk(KERN_DEBUG "Removing deletion dirent for \"%s\" from dir ino #%u\n", + fd->name, dead_f->inocache->ino)); + } + jffs2_mark_node_obsolete(c, fd->raw); + jffs2_free_full_dirent(fd); } - jffs2_mark_node_obsolete(c, fd->raw); - jffs2_free_full_dirent(fd); } dead_f->inocache->nlink--; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 8d2a9ab981d..e892dab40c2 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/quotaops.h> +#include <linux/posix_acl_xattr.h> #include "jfs_incore.h" #include "jfs_xattr.h" #include "jfs_acl.h" @@ -36,11 +37,11 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - ea_name = XATTR_NAME_ACL_ACCESS; + ea_name = POSIX_ACL_XATTR_ACCESS; p_acl = &ji->i_acl; break; case ACL_TYPE_DEFAULT: - ea_name = XATTR_NAME_ACL_DEFAULT; + ea_name = POSIX_ACL_XATTR_DEFAULT; p_acl = &ji->i_default_acl; break; 
default: @@ -70,8 +71,7 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) if (!IS_ERR(acl)) *p_acl = posix_acl_dup(acl); } - if (value) - kfree(value); + kfree(value); return acl; } @@ -89,11 +89,11 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) switch(type) { case ACL_TYPE_ACCESS: - ea_name = XATTR_NAME_ACL_ACCESS; + ea_name = POSIX_ACL_XATTR_ACCESS; p_acl = &ji->i_acl; break; case ACL_TYPE_DEFAULT: - ea_name = XATTR_NAME_ACL_DEFAULT; + ea_name = POSIX_ACL_XATTR_DEFAULT; p_acl = &ji->i_default_acl; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; @@ -102,7 +102,7 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return -EINVAL; } if (acl) { - size = xattr_acl_size(acl->a_count); + size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; @@ -112,8 +112,7 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) } rc = __jfs_setxattr(inode, ea_name, value, size, 0); out: - if (value) - kfree(value); + kfree(value); if (!rc) { if (*p_acl && (*p_acl != JFS_ACL_NOT_CACHED)) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index a87b06fa8ff..c2c19c9ed9a 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,16 +19,13 @@ #include <linux/fs.h> #include "jfs_incore.h" +#include "jfs_inode.h" #include "jfs_dmap.h" #include "jfs_txnmgr.h" #include "jfs_xattr.h" #include "jfs_acl.h" #include "jfs_debug.h" - -extern int jfs_commit_inode(struct inode *, int); -extern void jfs_truncate(struct inode *); - int jfs_fsync(struct 
file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 24a689179af..767c7ecb429 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/quotaops.h> #include "jfs_incore.h" +#include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_imap.h" #include "jfs_extent.h" @@ -30,14 +31,6 @@ #include "jfs_debug.h" -extern struct inode_operations jfs_dir_inode_operations; -extern struct inode_operations jfs_file_inode_operations; -extern struct inode_operations jfs_symlink_inode_operations; -extern struct file_operations jfs_dir_operations; -extern struct file_operations jfs_file_operations; -struct address_space_operations jfs_aops; -extern int freeZeroLink(struct inode *); - void jfs_read_inode(struct inode *inode) { if (diRead(inode)) { @@ -135,8 +128,12 @@ void jfs_delete_inode(struct inode *inode) { jfs_info("In jfs_delete_inode, inode = 0x%p", inode); + if (is_bad_inode(inode) || + (JFS_IP(inode)->fileset != cpu_to_le32(FILESYSTEM_I))) + return; + if (test_cflag(COMMIT_Freewmap, inode)) - freeZeroLink(inode); + jfs_free_zero_link(inode); diFree(inode); diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index d2ae430adec..a3acd3eec05 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,8 +20,6 @@ #ifdef CONFIG_JFS_POSIX_ACL -#include <linux/xattr_acl.h> - int jfs_permission(struct inode *, int, struct nameidata *); int jfs_init_acl(struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 91a0a889ebc..4caea6b43b9 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -58,8 +58,6 @@ void dump_mem(char *label, void *data, int length) static struct proc_dir_entry *base; #ifdef CONFIG_JFS_DEBUG -extern read_proc_t jfs_txanchor_read; - static int loglevel_read(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -97,14 
+95,6 @@ static int loglevel_write(struct file *file, const char __user *buffer, } #endif - -#ifdef CONFIG_JFS_STATISTICS -extern read_proc_t jfs_lmstats_read; -extern read_proc_t jfs_txstats_read; -extern read_proc_t jfs_xtstat_read; -extern read_proc_t jfs_mpstat_read; -#endif - static struct { const char *name; read_proc_t *read_fn; diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h index a38079ae1e0..ddffbbd4d95 100644 --- a/fs/jfs/jfs_debug.h +++ b/fs/jfs/jfs_debug.h @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -31,7 +31,9 @@ * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined */ #if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS)) - #define PROC_FS_JFS +#define PROC_FS_JFS +extern void jfs_proc_init(void); +extern void jfs_proc_clean(void); #endif /* @@ -65,8 +67,8 @@ extern int jfsloglevel; -/* dump memory contents */ extern void dump_mem(char *label, void *data, int length); +extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *); /* information message: e.g., configuration, major event */ #define jfs_info(fmt, arg...) 
do { \ @@ -110,6 +112,11 @@ extern void dump_mem(char *label, void *data, int length); * ---------- */ #ifdef CONFIG_JFS_STATISTICS +extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *); +extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *); +extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *); +extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *); + #define INCREMENT(x) ((x)++) #define DECREMENT(x) ((x)--) #define HIGHWATERMARK(x,y) ((x) = max((x), (y))) diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 69007fd546e..c739626f5bf 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -26,36 +26,6 @@ #include "jfs_debug.h" /* - * Debug code for double-checking block map - */ -/* #define _JFS_DEBUG_DMAP 1 */ - -#ifdef _JFS_DEBUG_DMAP -#define DBINITMAP(size,ipbmap,results) \ - DBinitmap(size,ipbmap,results) -#define DBALLOC(dbmap,mapsize,blkno,nblocks) \ - DBAlloc(dbmap,mapsize,blkno,nblocks) -#define DBFREE(dbmap,mapsize,blkno,nblocks) \ - DBFree(dbmap,mapsize,blkno,nblocks) -#define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \ - DBAllocCK(dbmap,mapsize,blkno,nblocks) -#define DBFREECK(dbmap,mapsize,blkno,nblocks) \ - DBFreeCK(dbmap,mapsize,blkno,nblocks) - -static void DBinitmap(s64, struct inode *, u32 **); -static void DBAlloc(uint *, s64, s64, s64); -static void DBFree(uint *, s64, s64, s64); -static void DBAllocCK(uint *, s64, s64, s64); -static void DBFreeCK(uint *, s64, s64, s64); -#else -#define DBINITMAP(size,ipbmap,results) -#define DBALLOC(dbmap, mapsize, blkno, nblocks) -#define DBFREE(dbmap, mapsize, blkno, nblocks) -#define DBALLOCCK(dbmap, mapsize, blkno, nblocks) -#define DBFREECK(dbmap, mapsize, blkno, nblocks) -#endif /* _JFS_DEBUG_DMAP */ - -/* * SERIALIZATION of the Block Allocation Map. 
* * the working state of the block allocation map is accessed in @@ -105,7 +75,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks); static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); static void dbBackSplit(dmtree_t * tp, int leafno); -static void dbJoin(dmtree_t * tp, int leafno, int newval); +static int dbJoin(dmtree_t * tp, int leafno, int newval); static void dbAdjTree(dmtree_t * tp, int leafno, int newval); static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level); @@ -128,8 +98,8 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); static int dbFindBits(u32 word, int l2nb); static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); -static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, - int nblocks); +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks); static int dbMaxBud(u8 * cp); @@ -242,7 +212,6 @@ int dbMount(struct inode *ipbmap) JFS_SBI(ipbmap->i_sb)->bmap = bmp; memset(bmp->db_active, 0, sizeof(bmp->db_active)); - DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap); /* * allocate/initialize the bmap lock @@ -272,7 +241,6 @@ int dbMount(struct inode *ipbmap) int dbUnmount(struct inode *ipbmap, int mounterror) { struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; - int i; if (!(mounterror || isReadOnly(ipbmap))) dbSync(ipbmap); @@ -282,14 +250,6 @@ int dbUnmount(struct inode *ipbmap, int mounterror) */ truncate_inode_pages(ipbmap->i_mapping, 0); - /* - * Sanity Check - */ - for (i = 0; i < bmp->db_numag; i++) - if (atomic_read(&bmp->db_active[i])) - printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n", - i, atomic_read(&bmp->db_active[i])); - /* free the memory for the in-memory bmap. 
*/ kfree(bmp); @@ -416,16 +376,13 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) */ nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); - DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); - /* free the blocks. */ if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { + jfs_error(ip->i_sb, "dbFree: error in block map\n"); release_metapage(mp); IREAD_UNLOCK(ipbmap); return (rc); } - - DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); } /* write the last buffer. */ @@ -784,10 +741,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) IWRITE_LOCK(ipbmap); rc = dbAllocAny(bmp, nblocks, l2nb, results); - if (rc == 0) { - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, - nblocks); - } goto write_unlock; } @@ -845,8 +798,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) != -ENOSPC) { if (rc == 0) { *results = blkno; - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, - *results, nblocks); mark_metapage_dirty(mp); } @@ -872,11 +823,8 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) if ((rc = dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) != -ENOSPC) { - if (rc == 0) { - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, - *results, nblocks); + if (rc == 0) mark_metapage_dirty(mp); - } release_metapage(mp); goto read_unlock; @@ -887,11 +835,8 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) */ if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) != -ENOSPC) { - if (rc == 0) { - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, - *results, nblocks); + if (rc == 0) mark_metapage_dirty(mp); - } release_metapage(mp); goto read_unlock; @@ -905,13 +850,9 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) * the same allocation group as the hint. 
*/ IWRITE_LOCK(ipbmap); - if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) - != -ENOSPC) { - if (rc == 0) - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, - *results, nblocks); + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) goto write_unlock; - } + IWRITE_UNLOCK(ipbmap); @@ -927,9 +868,6 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) */ if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) rc = dbAllocAny(bmp, nblocks, l2nb, results); - if (rc == 0) { - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks); - } write_unlock: IWRITE_UNLOCK(ipbmap); @@ -1001,10 +939,9 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) IREAD_UNLOCK(ipbmap); - if (rc == 0) { - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks); + if (rc == 0) mark_metapage_dirty(mp); - } + release_metapage(mp); return (rc); @@ -1153,7 +1090,6 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) return -EIO; } - DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks); dp = (struct dmap *) mp->data; /* try to allocate the blocks immediately following the @@ -1164,11 +1100,9 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) IREAD_UNLOCK(ipbmap); /* were we successful ? */ - if (rc == 0) { - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, extblkno, - addnblocks); + if (rc == 0) write_metapage(mp); - } else + else /* we were not successful */ release_metapage(mp); @@ -2087,7 +2021,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks) { s8 oldroot; - int rc, word; + int rc = 0, word; /* save the current value of the root (i.e. maximum free string) * of the dmap tree. 
@@ -2095,11 +2029,11 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, oldroot = dp->tree.stree[ROOT]; /* free the specified (blocks) bits */ - dbFreeBits(bmp, dp, blkno, nblocks); + rc = dbFreeBits(bmp, dp, blkno, nblocks); - /* if the root has not changed, done. */ - if (dp->tree.stree[ROOT] == oldroot) - return (0); + /* if error or the root has not changed, done. */ + if (rc || (dp->tree.stree[ROOT] == oldroot)) + return (rc); /* root changed. bubble the change up to the dmap control pages. * if the adjustment of the upper level control pages fails, @@ -2288,15 +2222,16 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * blkno - starting block number of the bits to be freed. * nblocks - number of bits to be freed. * - * RETURN VALUES: none + * RETURN VALUES: 0 for success * * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; */ -static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, int nblocks) { int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; dmtree_t *tp = (dmtree_t *) & dp->tree; + int rc = 0; int size; /* determine the bit number and word within the dmap of the @@ -2345,8 +2280,10 @@ static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* update the leaf for this dmap word. */ - dbJoin(tp, word, - dbMaxBud((u8 *) & dp->wmap[word])); + rc = dbJoin(tp, word, + dbMaxBud((u8 *) & dp->wmap[word])); + if (rc) + return rc; word += 1; } else { @@ -2377,7 +2314,9 @@ static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, /* update the leaf. */ - dbJoin(tp, word, size); + rc = dbJoin(tp, word, size); + if (rc) + return rc; /* get the number of dmap words handled. 
*/ @@ -2424,6 +2363,8 @@ static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, } BMAP_UNLOCK(bmp); + + return 0; } @@ -2531,7 +2472,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) } dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); } else { - dbJoin((dmtree_t *) dcp, leafno, newval); + rc = dbJoin((dmtree_t *) dcp, leafno, newval); + if (rc) + return rc; } /* check if the root of the current dmap control page changed due @@ -2756,7 +2699,7 @@ static void dbBackSplit(dmtree_t * tp, int leafno) * * RETURN VALUES: none */ -static void dbJoin(dmtree_t * tp, int leafno, int newval) +static int dbJoin(dmtree_t * tp, int leafno, int newval) { int budsz, buddy; s8 *leaf; @@ -2796,7 +2739,9 @@ static void dbJoin(dmtree_t * tp, int leafno, int newval) if (newval > leaf[buddy]) break; - assert(newval == leaf[buddy]); + /* It shouldn't be less */ + if (newval < leaf[buddy]) + return -EIO; /* check which (leafno or buddy) is the left buddy. * the left buddy gets to claim the blocks resulting @@ -2828,6 +2773,8 @@ static void dbJoin(dmtree_t * tp, int leafno, int newval) /* update the leaf value. */ dbAdjTree(tp, leafno, newval); + + return 0; } @@ -3194,16 +3141,12 @@ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) */ nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); - DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); - /* allocate the blocks. */ if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { release_metapage(mp); IREAD_UNLOCK(ipbmap); return (rc); } - - DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); } /* write the last buffer. 
*/ @@ -4050,223 +3993,3 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap) return (nblocks); } - - -#ifdef _JFS_DEBUG_DMAP -/* - * DBinitmap() - */ -static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results) -{ - int npages; - u32 *dbmap, *d; - int n; - s64 lblkno, cur_block; - struct dmap *dp; - struct metapage *mp; - - npages = size / 32768; - npages += (size % 32768) ? 1 : 0; - - dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap); - if (dbmap == NULL) - BUG(); /* Not robust since this is only unused debug code */ - - for (n = 0, d = dbmap; n < npages; n++, d += 1024) - bzero(d, 4096); - - /* Need to initialize from disk map pages - */ - for (d = dbmap, cur_block = 0; cur_block < size; - cur_block += BPERDMAP, d += LPERDMAP) { - lblkno = BLKTODMAP(cur_block, - JFS_SBI(ipbmap->i_sb)->bmap-> - db_l2nbperpage); - mp = read_metapage(ipbmap, lblkno, PSIZE, 0); - if (mp == NULL) { - jfs_error(ipbmap->i_sb, - "DBinitmap: could not read disk map page"); - continue; - } - dp = (struct dmap *) mp->data; - - for (n = 0; n < LPERDMAP; n++) - d[n] = le32_to_cpu(dp->wmap[n]); - - release_metapage(mp); - } - - *results = dbmap; -} - - -/* - * DBAlloc() - */ -void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) -{ - int word, nb, bitno; - u32 mask; - - assert(blkno > 0 && blkno < mapsize); - assert(nblocks > 0 && nblocks <= mapsize); - - assert(blkno + nblocks <= mapsize); - - dbmap += (blkno / 32); - while (nblocks > 0) { - bitno = blkno & (32 - 1); - nb = min(nblocks, 32 - bitno); - - mask = (0xffffffff << (32 - nb) >> bitno); - assert((mask & *dbmap) == 0); - *dbmap |= mask; - - dbmap++; - blkno += nb; - nblocks -= nb; - } -} - - -/* - * DBFree() - */ -static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) -{ - int word, nb, bitno; - u32 mask; - - assert(blkno > 0 && blkno < mapsize); - assert(nblocks > 0 && nblocks <= mapsize); - - assert(blkno + nblocks <= mapsize); - - dbmap += (blkno / 32); - while (nblocks > 0) { - bitno = 
blkno & (32 - 1); - nb = min(nblocks, 32 - bitno); - - mask = (0xffffffff << (32 - nb) >> bitno); - assert((mask & *dbmap) == mask); - *dbmap &= ~mask; - - dbmap++; - blkno += nb; - nblocks -= nb; - } -} - - -/* - * DBAllocCK() - */ -static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) -{ - int word, nb, bitno; - u32 mask; - - assert(blkno > 0 && blkno < mapsize); - assert(nblocks > 0 && nblocks <= mapsize); - - assert(blkno + nblocks <= mapsize); - - dbmap += (blkno / 32); - while (nblocks > 0) { - bitno = blkno & (32 - 1); - nb = min(nblocks, 32 - bitno); - - mask = (0xffffffff << (32 - nb) >> bitno); - assert((mask & *dbmap) == mask); - - dbmap++; - blkno += nb; - nblocks -= nb; - } -} - - -/* - * DBFreeCK() - */ -static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) -{ - int word, nb, bitno; - u32 mask; - - assert(blkno > 0 && blkno < mapsize); - assert(nblocks > 0 && nblocks <= mapsize); - - assert(blkno + nblocks <= mapsize); - - dbmap += (blkno / 32); - while (nblocks > 0) { - bitno = blkno & (32 - 1); - nb = min(nblocks, 32 - bitno); - - mask = (0xffffffff << (32 - nb) >> bitno); - assert((mask & *dbmap) == 0); - - dbmap++; - blkno += nb; - nblocks -= nb; - } -} - - -/* - * dbPrtMap() - */ -static void dbPrtMap(struct bmap * bmp) -{ - printk(" mapsize: %d%d\n", bmp->db_mapsize); - printk(" nfree: %d%d\n", bmp->db_nfree); - printk(" numag: %d\n", bmp->db_numag); - printk(" agsize: %d%d\n", bmp->db_agsize); - printk(" agl2size: %d\n", bmp->db_agl2size); - printk(" agwidth: %d\n", bmp->db_agwidth); - printk(" agstart: %d\n", bmp->db_agstart); - printk(" agheigth: %d\n", bmp->db_agheigth); - printk(" aglevel: %d\n", bmp->db_aglevel); - printk(" maxlevel: %d\n", bmp->db_maxlevel); - printk(" maxag: %d\n", bmp->db_maxag); - printk(" agpref: %d\n", bmp->db_agpref); - printk(" l2nbppg: %d\n", bmp->db_l2nbperpage); -} - - -/* - * dbPrtCtl() - */ -static void dbPrtCtl(struct dmapctl * dcp) -{ - int i, j, n; - - printk(" height: 
%08x\n", le32_to_cpu(dcp->height)); - printk(" leafidx: %08x\n", le32_to_cpu(dcp->leafidx)); - printk(" budmin: %08x\n", dcp->budmin); - printk(" nleafs: %08x\n", le32_to_cpu(dcp->nleafs)); - printk(" l2nleafs: %08x\n", le32_to_cpu(dcp->l2nleafs)); - - printk("\n Tree:\n"); - for (i = 0; i < CTLLEAFIND; i += 8) { - n = min(8, CTLLEAFIND - i); - - for (j = 0; j < n; j++) - printf(" [%03x]: %02x", i + j, - (char) dcp->stree[i + j]); - printf("\n"); - } - - printk("\n Tree Leaves:\n"); - for (i = 0; i < LPERCTL; i += 8) { - n = min(8, LPERCTL - i); - - for (j = 0; j < n; j++) - printf(" [%03x]: %02x", - i + j, - (char) dcp->stree[i + j + CTLLEAFIND]); - printf("\n"); - } -} -#endif /* _JFS_DEBUG_DMAP */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index ac41f72d6d5..404f33eae50 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -381,9 +381,12 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) * It's time to move the inline table to an external * page and begin to build the xtree */ - if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage) || - dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) - goto clean_up; /* No space */ + if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage)) + goto clean_up; + if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { + DQUOT_FREE_BLOCK(ip, sbi->nbperpage); + goto clean_up; + } /* * Save the table, we're going to overwrite it with the @@ -397,13 +400,15 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) xtInitRoot(tid, ip); /* - * Allocate the first block & add it to the xtree + * Add the first block to the xtree */ if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { /* This really shouldn't fail */ jfs_warn("add_index: xtInsert failed!"); memcpy(&jfs_ip->i_dirtable, temp_table, sizeof (temp_table)); + dbFree(ip, xaddr, sbi->nbperpage); + DQUOT_FREE_BLOCK(ip, sbi->nbperpage); goto clean_up; } ip->i_size = PSIZE; @@ -2931,6 +2936,9 @@ static void add_missing_indices(struct inode *inode, s64 bn) ASSERT(p->header.flag 
& BT_LEAF); tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); + if (BT_IS_ROOT(mp)) + tlck->type |= tlckBTROOT; + dtlck = (struct dt_lock *) &tlck->lock; stbl = DT_GETSTBL(p); @@ -4551,202 +4559,3 @@ int dtModify(tid_t tid, struct inode *ip, return 0; } - -#ifdef _JFS_DEBUG_DTREE -/* - * dtDisplayTree() - * - * function: traverse forward - */ -int dtDisplayTree(struct inode *ip) -{ - int rc; - struct metapage *mp; - dtpage_t *p; - s64 bn, pbn; - int index, lastindex, v, h; - pxd_t *xd; - struct btstack btstack; - struct btframe *btsp; - struct btframe *parent; - u8 *stbl; - int psize = 256; - - printk("display B+-tree.\n"); - - /* clear stack */ - btsp = btstack.stack; - - /* - * start with root - * - * root resides in the inode - */ - bn = 0; - v = h = 0; - - /* - * first access of each page: - */ - newPage: - DT_GETPAGE(ip, bn, mp, psize, p, rc); - if (rc) - return rc; - - /* process entries forward from first index */ - index = 0; - lastindex = p->header.nextindex - 1; - - if (p->header.flag & BT_INTERNAL) { - /* - * first access of each internal page - */ - printf("internal page "); - dtDisplayPage(ip, bn, p); - - goto getChild; - } else { /* (p->header.flag & BT_LEAF) */ - - /* - * first access of each leaf page - */ - printf("leaf page "); - dtDisplayPage(ip, bn, p); - - /* - * process leaf page entries - * - for ( ; index <= lastindex; index++) - { - } - */ - - /* unpin the leaf page */ - DT_PUTPAGE(mp); - } - - /* - * go back up to the parent page - */ - getParent: - /* pop/restore parent entry for the current child page */ - if ((parent = (btsp == btstack.stack ? 
NULL : --btsp)) == NULL) - /* current page must have been root */ - return; - - /* - * parent page scan completed - */ - if ((index = parent->index) == (lastindex = parent->lastindex)) { - /* go back up to the parent page */ - goto getParent; - } - - /* - * parent page has entries remaining - */ - /* get back the parent page */ - bn = parent->bn; - /* v = parent->level; */ - DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* get next parent entry */ - index++; - - /* - * internal page: go down to child page of current entry - */ - getChild: - /* push/save current parent entry for the child page */ - btsp->bn = pbn = bn; - btsp->index = index; - btsp->lastindex = lastindex; - /* btsp->level = v; */ - /* btsp->node = h; */ - ++btsp; - - /* get current entry for the child page */ - stbl = DT_GETSTBL(p); - xd = (pxd_t *) & p->slot[stbl[index]]; - - /* - * first access of each internal entry: - */ - - /* get child page */ - bn = addressPXD(xd); - psize = lengthPXD(xd) << ip->i_ipmnt->i_l2bsize; - - printk("traverse down 0x%Lx[%d]->0x%Lx\n", pbn, index, bn); - v++; - h = index; - - /* release parent page */ - DT_PUTPAGE(mp); - - /* process the child page */ - goto newPage; -} - - -/* - * dtDisplayPage() - * - * function: display page - */ -int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p) -{ - int rc; - struct metapage *mp; - struct ldtentry *lh; - struct idtentry *ih; - pxd_t *xd; - int i, j; - u8 *stbl; - wchar_t name[JFS_NAME_MAX + 1]; - struct component_name key = { 0, name }; - int freepage = 0; - - if (p == NULL) { - freepage = 1; - DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - } - - /* display page control */ - printk("bn:0x%Lx flag:0x%08x nextindex:%d\n", - bn, p->header.flag, p->header.nextindex); - - /* display entries */ - stbl = DT_GETSTBL(p); - for (i = 0, j = 1; i < p->header.nextindex; i++, j++) { - dtGetKey(p, i, &key, JFS_SBI(ip->i_sb)->mntflag); - key.name[key.namlen] = '\0'; - if (p->header.flag & BT_LEAF) { - 
lh = (struct ldtentry *) & p->slot[stbl[i]]; - printf("\t[%d] %s:%d", i, key.name, - le32_to_cpu(lh->inumber)); - } else { - ih = (struct idtentry *) & p->slot[stbl[i]]; - xd = (pxd_t *) ih; - bn = addressPXD(xd); - printf("\t[%d] %s:0x%Lx", i, key.name, bn); - } - - if (j == 4) { - printf("\n"); - j = 0; - } - } - - printf("\n"); - - if (freepage) - DT_PUTPAGE(mp); - - return 0; -} -#endif /* _JFS_DEBUG_DTREE */ diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index 273a80130c9..13e4fdf0772 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -269,11 +269,4 @@ extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag); extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); - -#ifdef _JFS_DEBUG_DTREE -extern int dtDisplayTree(struct inode *ip); - -extern int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p); -#endif /* _JFS_DEBUG_DTREE */ - #endif /* !_H_JFS_DTREE */ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 1953acb7926..4879603daa1 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -19,6 +19,7 @@ #include <linux/fs.h> #include <linux/quotaops.h> #include "jfs_incore.h" +#include "jfs_inode.h" #include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_extent.h" @@ -33,12 +34,6 @@ static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); #endif static s64 extRoundDown(s64 nb); -/* - * external references - */ -extern int jfs_commit_inode(struct inode *, int); - - #define DPD(a) (printk("(a): %d\n",(a))) #define DPC(a) (printk("(a): %c\n",(a))) #define DPL1(a) \ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 7acff2ce3c8..4021d46da7e 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -47,6 +47,7 @@ #include <linux/quotaops.h> #include "jfs_incore.h" +#include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_dinode.h" #include "jfs_dmap.h" @@ -69,11 +70,6 @@ #define AG_UNLOCK(imap,agno) up(&imap->im_aglock[agno]) 
/* - * external references - */ -extern struct address_space_operations jfs_aops; - -/* * forward references */ static int diAllocAG(struct inomap *, int, boolean_t, struct inode *); @@ -91,25 +87,6 @@ static int copy_from_dinode(struct dinode *, struct inode *); static void copy_to_dinode(struct dinode *, struct inode *); /* - * debug code for double-checking inode map - */ -/* #define _JFS_DEBUG_IMAP 1 */ - -#ifdef _JFS_DEBUG_IMAP -#define DBG_DIINIT(imap) DBGdiInit(imap) -#define DBG_DIALLOC(imap, ino) DBGdiAlloc(imap, ino) -#define DBG_DIFREE(imap, ino) DBGdiFree(imap, ino) - -static void *DBGdiInit(struct inomap * imap); -static void DBGdiAlloc(struct inomap * imap, ino_t ino); -static void DBGdiFree(struct inomap * imap, ino_t ino); -#else -#define DBG_DIINIT(imap) -#define DBG_DIALLOC(imap, ino) -#define DBG_DIFREE(imap, ino) -#endif /* _JFS_DEBUG_IMAP */ - -/* * NAME: diMount() * * FUNCTION: initialize the incore inode map control structures for @@ -192,8 +169,6 @@ int diMount(struct inode *ipimap) imap->im_ipimap = ipimap; JFS_IP(ipimap)->i_imap = imap; -// DBG_DIINIT(imap); - return (0); } @@ -1047,7 +1022,6 @@ int diFree(struct inode *ip) /* update the bitmap. */ iagp->wmap[extno] = cpu_to_le32(bitmap); - DBG_DIFREE(imap, inum); /* update the free inode counts at the iag, ag and * map level. 
@@ -1235,7 +1209,6 @@ int diFree(struct inode *ip) jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); } iagp->wmap[extno] = 0; - DBG_DIFREE(imap, inum); PXDlength(&iagp->inoext[extno], 0); PXDaddress(&iagp->inoext[extno], 0); @@ -1354,7 +1327,6 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) struct jfs_inode_info *jfs_ip = JFS_IP(ip); ip->i_ino = (iagno << L2INOSPERIAG) + ino; - DBG_DIALLOC(JFS_IP(ipimap)->i_imap, ip->i_ino); jfs_ip->ixpxd = iagp->inoext[extno]; jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); jfs_ip->active_ag = -1; @@ -3189,84 +3161,3 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) dip->di_rdev = cpu_to_le32(jfs_ip->dev); } - -#ifdef _JFS_DEBUG_IMAP -/* - * DBGdiInit() - */ -static void *DBGdiInit(struct inomap * imap) -{ - u32 *dimap; - int size; - size = 64 * 1024; - if ((dimap = (u32 *) xmalloc(size, L2PSIZE, kernel_heap)) == NULL) - assert(0); - bzero((void *) dimap, size); - imap->im_DBGdimap = dimap; -} - -/* - * DBGdiAlloc() - */ -static void DBGdiAlloc(struct inomap * imap, ino_t ino) -{ - u32 *dimap = imap->im_DBGdimap; - int w, b; - u32 m; - w = ino >> 5; - b = ino & 31; - m = 0x80000000 >> b; - assert(w < 64 * 256); - if (dimap[w] & m) { - printk("DEBUG diAlloc: duplicate alloc ino:0x%x\n", ino); - } - dimap[w] |= m; -} - -/* - * DBGdiFree() - */ -static void DBGdiFree(struct inomap * imap, ino_t ino) -{ - u32 *dimap = imap->im_DBGdimap; - int w, b; - u32 m; - w = ino >> 5; - b = ino & 31; - m = 0x80000000 >> b; - assert(w < 64 * 256); - if ((dimap[w] & m) == 0) { - printk("DEBUG diFree: duplicate free ino:0x%x\n", ino); - } - dimap[w] &= ~m; -} - -static void dump_cp(struct inomap * ipimap, char *function, int line) -{ - printk("\n* ********* *\nControl Page %s %d\n", function, line); - printk("FreeIAG %d\tNextIAG %d\n", ipimap->im_freeiag, - ipimap->im_nextiag); - printk("NumInos %d\tNumFree %d\n", - 
atomic_read(&ipimap->im_numinos), - atomic_read(&ipimap->im_numfree)); - printk("AG InoFree %d\tAG ExtFree %d\n", - ipimap->im_agctl[0].inofree, ipimap->im_agctl[0].extfree); - printk("AG NumInos %d\tAG NumFree %d\n", - ipimap->im_agctl[0].numinos, ipimap->im_agctl[0].numfree); -} - -static void dump_iag(struct iag * iag, char *function, int line) -{ - printk("\n* ********* *\nIAG %s %d\n", function, line); - printk("IagNum %d\tIAG Free %d\n", le32_to_cpu(iag->iagnum), - le32_to_cpu(iag->iagfree)); - printk("InoFreeFwd %d\tInoFreeBack %d\n", - le32_to_cpu(iag->inofreefwd), - le32_to_cpu(iag->inofreeback)); - printk("ExtFreeFwd %d\tExtFreeBack %d\n", - le32_to_cpu(iag->extfreefwd), - le32_to_cpu(iag->extfreeback)); - printk("NFreeInos %d\tNFreeExts %d\n", le32_to_cpu(iag->nfreeinos), - le32_to_cpu(iag->nfreeexts)); -} -#endif /* _JFS_DEBUG_IMAP */ diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 84f2459b219..2af5efbfd06 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -19,6 +19,7 @@ #include <linux/fs.h> #include <linux/quotaops.h> #include "jfs_incore.h" +#include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_imap.h" #include "jfs_dinode.h" diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 3df91fbfe78..b54bac576cb 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2001 + * Copyright (C) International Business Machines Corp., 2000-2001 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,5 +19,22 @@ #define _H_JFS_INODE extern struct inode *ialloc(struct inode *, umode_t); +extern int jfs_fsync(struct file *, struct dentry *, int); +extern void jfs_read_inode(struct inode *); +extern int jfs_commit_inode(struct inode *, int); +extern int jfs_write_inode(struct inode*, int); +extern void jfs_delete_inode(struct inode *); +extern void 
jfs_dirty_inode(struct inode *); +extern void jfs_truncate(struct inode *); +extern void jfs_truncate_nolock(struct inode *, loff_t); +extern void jfs_free_zero_link(struct inode *); +extern struct dentry *jfs_get_parent(struct dentry *dentry); +extern struct address_space_operations jfs_aops; +extern struct inode_operations jfs_dir_inode_operations; +extern struct file_operations jfs_dir_operations; +extern struct inode_operations jfs_file_inode_operations; +extern struct file_operations jfs_file_operations; +extern struct inode_operations jfs_symlink_inode_operations; +extern struct dentry_operations jfs_ci_dentry_operations; #endif /* _H_JFS_INODE */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index dfa1200daa6..d27bac6acaa 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -71,6 +71,7 @@ #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" +#include "jfs_superblock.h" #include "jfs_txnmgr.h" #include "jfs_debug.h" @@ -167,14 +168,6 @@ static struct jfs_log *dummy_log = NULL; static DECLARE_MUTEX(jfs_log_sem); /* - * external references - */ -extern void txLazyUnlock(struct tblock * tblk); -extern int jfs_stop_threads; -extern struct completion jfsIOwait; -extern int jfs_tlocks_low; - -/* * forward references */ static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, @@ -198,7 +191,7 @@ static int lbmIOWait(struct lbuf * bp, int flag); static bio_end_io_t lbmIODone; static void lbmStartIO(struct lbuf * bp); static void lmGCwrite(struct jfs_log * log, int cant_block); -static int lmLogSync(struct jfs_log * log, int nosyncwait); +static int lmLogSync(struct jfs_log * log, int hard_sync); @@ -922,19 +915,17 @@ static void lmPostGC(struct lbuf * bp) * if new sync address is available * (normally the case if sync() is executed by back-ground * process). - * if not, explicitly run jfs_blogsync() to initiate - * getting of new sync address. 
* calculate new value of i_nextsync which determines when * this code is called again. * * PARAMETERS: log - log structure - * nosyncwait - 1 if called asynchronously + * hard_sync - 1 to force all metadata to be written * * RETURN: 0 * * serialization: LOG_LOCK() held on entry/exit */ -static int lmLogSync(struct jfs_log * log, int nosyncwait) +static int lmLogSync(struct jfs_log * log, int hard_sync) { int logsize; int written; /* written since last syncpt */ @@ -948,11 +939,18 @@ static int lmLogSync(struct jfs_log * log, int nosyncwait) unsigned long flags; /* push dirty metapages out to disk */ - list_for_each_entry(sbi, &log->sb_list, log_list) { - filemap_flush(sbi->ipbmap->i_mapping); - filemap_flush(sbi->ipimap->i_mapping); - filemap_flush(sbi->direct_inode->i_mapping); - } + if (hard_sync) + list_for_each_entry(sbi, &log->sb_list, log_list) { + filemap_fdatawrite(sbi->ipbmap->i_mapping); + filemap_fdatawrite(sbi->ipimap->i_mapping); + filemap_fdatawrite(sbi->direct_inode->i_mapping); + } + else + list_for_each_entry(sbi, &log->sb_list, log_list) { + filemap_flush(sbi->ipbmap->i_mapping); + filemap_flush(sbi->ipimap->i_mapping); + filemap_flush(sbi->direct_inode->i_mapping); + } /* * forward syncpt @@ -1028,16 +1026,13 @@ static int lmLogSync(struct jfs_log * log, int nosyncwait) /* next syncpt trigger = written + more */ log->nextsync = written + more; - /* return if lmLogSync() from outside of transaction, e.g., sync() */ - if (nosyncwait) - return lsn; - /* if number of bytes written from last sync point is more * than 1/4 of the log size, stop new transactions from * starting until all current transactions are completed * by setting syncbarrier flag. 
*/ - if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) { + if (!test_bit(log_SYNCBARRIER, &log->flag) && + (written > LOGSYNC_BARRIER(logsize)) && log->active) { set_bit(log_SYNCBARRIER, &log->flag); jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, log->syncpt); @@ -1055,11 +1050,12 @@ static int lmLogSync(struct jfs_log * log, int nosyncwait) * * FUNCTION: write log SYNCPT record for specified log * - * PARAMETERS: log - log structure + * PARAMETERS: log - log structure + * hard_sync - set to 1 to force metadata to be written */ -void jfs_syncpt(struct jfs_log *log) +void jfs_syncpt(struct jfs_log *log, int hard_sync) { LOG_LOCK(log); - lmLogSync(log, 1); + lmLogSync(log, hard_sync); LOG_UNLOCK(log); } @@ -1624,6 +1620,8 @@ void jfs_flush_journal(struct jfs_log *log, int wait) } } assert(list_empty(&log->cqueue)); + +#ifdef CONFIG_JFS_DEBUG if (!list_empty(&log->synclist)) { struct logsyncblk *lp; @@ -1638,9 +1636,8 @@ void jfs_flush_journal(struct jfs_log *log, int wait) dump_mem("orphan tblock", lp, sizeof(struct tblock)); } -// current->state = TASK_INTERRUPTIBLE; -// schedule(); } +#endif //assert(list_empty(&log->synclist)); clear_bit(log_FLUSH, &log->flag); } @@ -2365,9 +2362,9 @@ int jfsIOWait(void *arg) lbmStartIO(bp); spin_lock_irq(&log_redrive_lock); } - if (current->flags & PF_FREEZE) { + if (freezing(current)) { spin_unlock_irq(&log_redrive_lock); - refrigerator(PF_FREEZE); + refrigerator(); } else { add_wait_queue(&jfs_IO_thread_wait, &wq); set_current_state(TASK_INTERRUPTIBLE); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 51291fbc420..e4978b5b65e 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -507,7 +507,9 @@ extern int lmLogClose(struct super_block *sb); extern int lmLogShutdown(struct jfs_log * log); extern int lmLogInit(struct jfs_log * log); extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize); +extern int lmGroupCommit(struct jfs_log *, struct tblock *); +extern int 
jfsIOWait(void *); extern void jfs_flush_journal(struct jfs_log * log, int wait); -extern void jfs_syncpt(struct jfs_log *log); +extern void jfs_syncpt(struct jfs_log *log, int hard_sync); #endif /* _H_JFS_LOGMGR */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 41bf078dce0..13d7e3f1feb 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -198,7 +198,7 @@ static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) } } -static inline struct metapage *alloc_metapage(int gfp_mask) +static inline struct metapage *alloc_metapage(unsigned int gfp_mask) { return mempool_alloc(metapage_mempool, gfp_mask); } @@ -561,7 +561,6 @@ static int metapage_releasepage(struct page *page, int gfp_mask) dump_mem("page", page, sizeof(struct page)); dump_stack(); } - WARN_ON(mp->lsn); if (mp->lsn) remove_from_logsync(mp); remove_metapage(page, mp); @@ -641,7 +640,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, } else { page = read_cache_page(mapping, page_index, (filler_t *)mapping->a_ops->readpage, NULL); - if (IS_ERR(page)) { + if (IS_ERR(page) || !PageUptodate(page)) { jfs_err("read_cache_page failed!"); return NULL; } @@ -726,12 +725,12 @@ void force_metapage(struct metapage *mp) page_cache_release(page); } -extern void hold_metapage(struct metapage *mp) +void hold_metapage(struct metapage *mp) { lock_page(mp->page); } -extern void put_metapage(struct metapage *mp) +void put_metapage(struct metapage *mp) { if (mp->count || mp->nohomeok) { /* Someone else will release this */ @@ -783,14 +782,6 @@ void release_metapage(struct metapage * mp) if (test_bit(META_discard, &mp->flag) && !mp->count) { clear_page_dirty(page); ClearPageUptodate(page); -#ifdef _NOT_YET - if (page->mapping) { - /* Remove from page cache and page cache reference */ - remove_from_page_cache(page); - page_cache_release(page); - metapage_releasepage(page, 0); - } -#endif } #else /* Try to keep metapages from using up too much memory */ 
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h index 991e9fb84c7..f0b7d3282b0 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -58,6 +58,8 @@ struct metapage { #define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag) /* function prototypes */ +extern int metapage_init(void); +extern void metapage_exit(void); extern struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, unsigned int size, int absolute, unsigned long new); diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h index ab0566f70cf..fcf781bf31c 100644 --- a/fs/jfs/jfs_superblock.h +++ b/fs/jfs/jfs_superblock.h @@ -109,5 +109,16 @@ struct jfs_superblock { extern int readSuper(struct super_block *, struct buffer_head **); extern int updateSuper(struct super_block *, uint); extern void jfs_error(struct super_block *, const char *, ...); +extern int jfs_mount(struct super_block *); +extern int jfs_mount_rw(struct super_block *, int); +extern int jfs_umount(struct super_block *); +extern int jfs_umount_rw(struct super_block *); + +extern int jfs_stop_threads; +extern struct completion jfsIOwait; +extern wait_queue_head_t jfs_IO_thread_wait; +extern wait_queue_head_t jfs_commit_thread_wait; +extern wait_queue_head_t jfs_sync_thread_wait; +extern int jfs_extendfs(struct super_block *, s64, int); #endif /*_H_JFS_SUPERBLOCK */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index e93d01aa12c..c7a92f9deb2 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -42,7 +42,6 @@ * hold on to mp+lock thru update of maps */ - #include 
<linux/fs.h> #include <linux/vmalloc.h> #include <linux/smp_lock.h> @@ -51,6 +50,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include "jfs_incore.h" +#include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dinode.h" @@ -109,7 +109,6 @@ static int TxLockHWM; /* High water mark for number of txLocks used */ static int TxLockVHWM; /* Very High water mark */ struct tlock *TxLock; /* transaction lock table */ - /* * transaction management lock */ @@ -149,7 +148,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) #define TXN_WAKEUP(event) wake_up_all(event) - /* * statistics */ @@ -161,16 +159,6 @@ static struct { int waitlock; /* 4: # of tlock wait */ } stattx; - -/* - * external references - */ -extern int lmGroupCommit(struct jfs_log *, struct tblock *); -extern int jfs_commit_inode(struct inode *, int); -extern int jfs_stop_threads; - -extern struct completion jfsIOwait; - /* * forward references */ @@ -358,7 +346,6 @@ void txExit(void) TxBlock = NULL; } - /* * NAME: txBegin() * @@ -460,7 +447,6 @@ tid_t txBegin(struct super_block *sb, int flag) return t; } - /* * NAME: txBeginAnon() * @@ -503,7 +489,6 @@ void txBeginAnon(struct super_block *sb) TXN_UNLOCK(); } - /* * txEnd() * @@ -567,6 +552,11 @@ void txEnd(tid_t tid) * synchronize with logsync barrier */ if (test_bit(log_SYNCBARRIER, &log->flag)) { + TXN_UNLOCK(); + + /* write dirty metadata & forward log syncpt */ + jfs_syncpt(log, 1); + jfs_info("log barrier off: 0x%x", log->lsn); /* enable new transactions start */ @@ -575,11 +565,6 @@ void txEnd(tid_t tid) /* wakeup all waitors for logsync barrier */ TXN_WAKEUP(&log->syncwait); - TXN_UNLOCK(); - - /* forward log syncpt */ - jfs_syncpt(log); - goto wakeup; } } @@ -592,7 +577,6 @@ wakeup: TXN_WAKEUP(&TxAnchor.freewait); } - /* * txLock() * @@ -673,7 +657,9 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, /* only anonymous txn. 
* Remove from anon_list */ + TXN_LOCK(); list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); } jfs_ip->atlhead = tlck->next; } else { @@ -868,7 +854,6 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, return NULL; } - /* * NAME: txRelease() * @@ -908,7 +893,6 @@ static void txRelease(struct tblock * tblk) TXN_UNLOCK(); } - /* * NAME: txUnlock() * @@ -996,7 +980,6 @@ static void txUnlock(struct tblock * tblk) } } - /* * txMaplock() * @@ -1069,7 +1052,6 @@ struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) return tlck; } - /* * txLinelock() * @@ -1103,8 +1085,6 @@ struct linelock *txLinelock(struct linelock * tlock) return linelock; } - - /* * transaction commit management * ----------------------------- @@ -1373,7 +1353,6 @@ int txCommit(tid_t tid, /* transaction identifier */ return rc; } - /* * NAME: txLog() * @@ -1437,7 +1416,6 @@ static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) return rc; } - /* * diLog() * @@ -1465,7 +1443,6 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, if (tlck->type & tlckENTRY) { /* log after-image for logredo(): */ lrd->type = cpu_to_le16(LOG_REDOPAGE); -// *pxd = mp->cm_pxd; PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); @@ -1552,7 +1529,6 @@ static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, return rc; } - /* * dataLog() * @@ -1599,7 +1575,6 @@ static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, return 0; } - /* * dtLog() * @@ -1639,7 +1614,6 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); else lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); -// *pxd = mp->cm_pxd; PXDaddress(pxd, mp->index); PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); @@ -1704,7 +1678,6 @@ static void dtLog(struct jfs_log * log, struct tblock * tblk, struct 
lrd * lrd, return; } - /* * xtLog() * @@ -1760,7 +1733,6 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * applying the after-image to the meta-data page. */ lrd->type = cpu_to_le16(LOG_REDOPAGE); -// *page_pxd = mp->cm_pxd; PXDaddress(page_pxd, mp->index); PXDlength(page_pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); @@ -2093,7 +2065,6 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, return; } - /* * mapLog() * @@ -2180,7 +2151,6 @@ void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, } } - /* * txEA() * @@ -2233,7 +2203,6 @@ void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) } } - /* * txForce() * @@ -2300,7 +2269,6 @@ void txForce(struct tblock * tblk) } } - /* * txUpdateMap() * @@ -2437,7 +2405,6 @@ static void txUpdateMap(struct tblock * tblk) } } - /* * txAllocPMap() * @@ -2509,7 +2476,6 @@ static void txAllocPMap(struct inode *ip, struct maplock * maplock, } } - /* * txFreeMap() * @@ -2611,7 +2577,6 @@ void txFreeMap(struct inode *ip, } } - /* * txFreelock() * @@ -2652,7 +2617,6 @@ void txFreelock(struct inode *ip) TXN_UNLOCK(); } - /* * txAbort() * @@ -2826,9 +2790,9 @@ int jfs_lazycommit(void *arg) /* In case a wakeup came while all threads were active */ jfs_commit_thread_waking = 0; - if (current->flags & PF_FREEZE) { + if (freezing(current)) { LAZY_UNLOCK(flags); - refrigerator(PF_FREEZE); + refrigerator(); } else { DECLARE_WAITQUEUE(wq, current); @@ -3025,9 +2989,9 @@ int jfs_sync(void *arg) /* Add anon_list2 back to anon_list */ list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); - if (current->flags & PF_FREEZE) { + if (freezing(current)) { TXN_UNLOCK(); - refrigerator(PF_FREEZE); + refrigerator(); } else { DECLARE_WAITQUEUE(wq, current); diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h index b71b82c2df0..59ad0f6b723 100644 --- a/fs/jfs/jfs_txnmgr.h +++ b/fs/jfs/jfs_txnmgr.h @@ -285,34 +285,26 @@ struct commit { /* * 
external declarations */ -extern struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage *mp, - int flag); - -extern struct tlock *txMaplock(tid_t tid, struct inode *ip, int flag); - -extern int txCommit(tid_t tid, int nip, struct inode **iplist, int flag); - -extern tid_t txBegin(struct super_block *sb, int flag); - -extern void txBeginAnon(struct super_block *sb); - -extern void txEnd(tid_t tid); - -extern void txAbort(tid_t tid, int dirty); - -extern struct linelock *txLinelock(struct linelock * tlock); - -extern void txFreeMap(struct inode *ip, struct maplock * maplock, - struct tblock * tblk, int maptype); - -extern void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea); - -extern void txFreelock(struct inode *ip); - -extern int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, - struct tlock * tlck); - -extern void txQuiesce(struct super_block *sb); - -extern void txResume(struct super_block *sb); +extern int jfs_tlocks_low; + +extern int txInit(void); +extern void txExit(void); +extern struct tlock *txLock(tid_t, struct inode *, struct metapage *, int); +extern struct tlock *txMaplock(tid_t, struct inode *, int); +extern int txCommit(tid_t, int, struct inode **, int); +extern tid_t txBegin(struct super_block *, int); +extern void txBeginAnon(struct super_block *); +extern void txEnd(tid_t); +extern void txAbort(tid_t, int); +extern struct linelock *txLinelock(struct linelock *); +extern void txFreeMap(struct inode *, struct maplock *, struct tblock *, int); +extern void txEA(tid_t, struct inode *, dxd_t *, dxd_t *); +extern void txFreelock(struct inode *); +extern int lmLog(struct jfs_log *, struct tblock *, struct lrd *, + struct tlock *); +extern void txQuiesce(struct super_block *); +extern void txResume(struct super_block *); +extern void txLazyUnlock(struct tblock *); +extern int jfs_lazycommit(void *); +extern int jfs_sync(void *); #endif /* _H_JFS_TXNMGR */ diff --git a/fs/jfs/jfs_unicode.c 
b/fs/jfs/jfs_unicode.c index b32208aad55..f327decfb15 100644 --- a/fs/jfs/jfs_unicode.c +++ b/fs/jfs/jfs_unicode.c @@ -51,8 +51,9 @@ int jfs_strfromUCS_le(char *to, const __le16 * from, } } else { for (i = 0; (i < len) && from[i]; i++) { - if (le16_to_cpu(from[i]) & 0xff00) { - if (warn) { + if (unlikely(le16_to_cpu(from[i]) & 0xff00)) { + to[i] = '?'; + if (unlikely(warn)) { warn--; warn_again--; printk(KERN_ERR @@ -61,7 +62,7 @@ int jfs_strfromUCS_le(char *to, const __le16 * from, printk(KERN_ERR "mount with iocharset=utf8 to access\n"); } - to[i] = '?'; + } else to[i] = (char) (le16_to_cpu(from[i])); diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 31b34db4519..a7fe2f2b969 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -135,14 +135,6 @@ static int xtSearchNode(struct inode *ip, static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); #endif /* _STILL_TO_PORT */ -/* External references */ - -/* - * debug control - */ -/* #define _JFS_DEBUG_XTREE 1 */ - - /* * xtLookup() * @@ -4140,338 +4132,6 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) return 0; } - -#ifdef _JFS_DEBUG_XTREE -/* - * xtDisplayTree() - * - * function: traverse forward - */ -int xtDisplayTree(struct inode *ip) -{ - int rc = 0; - struct metapage *mp; - xtpage_t *p; - s64 bn, pbn; - int index, lastindex, v, h; - xad_t *xad; - struct btstack btstack; - struct btframe *btsp; - struct btframe *parent; - - printk("display B+-tree.\n"); - - /* clear stack */ - btsp = btstack.stack; - - /* - * start with root - * - * root resides in the inode - */ - bn = 0; - v = h = 0; - - /* - * first access of each page: - */ - getPage: - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* process entries forward from first index */ - index = XTENTRYSTART; - lastindex = le16_to_cpu(p->header.nextindex) - 1; - - if (p->header.flag & BT_INTERNAL) { - /* - * first access of each internal page - */ - goto getChild; - } else { /* (p->header.flag & 
BT_LEAF) */ - - /* - * first access of each leaf page - */ - printf("leaf page "); - xtDisplayPage(ip, bn, p); - - /* unpin the leaf page */ - XT_PUTPAGE(mp); - } - - /* - * go back up to the parent page - */ - getParent: - /* pop/restore parent entry for the current child page */ - if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL) - /* current page must have been root */ - return; - - /* - * parent page scan completed - */ - if ((index = parent->index) == (lastindex = parent->lastindex)) { - /* go back up to the parent page */ - goto getParent; - } - - /* - * parent page has entries remaining - */ - /* get back the parent page */ - bn = parent->bn; - /* v = parent->level; */ - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* get next parent entry */ - index++; - - /* - * internal page: go down to child page of current entry - */ - getChild: - /* push/save current parent entry for the child page */ - btsp->bn = pbn = bn; - btsp->index = index; - btsp->lastindex = lastindex; - /* btsp->level = v; */ - /* btsp->node = h; */ - ++btsp; - - /* get child page */ - xad = &p->xad[index]; - bn = addressXAD(xad); - - /* - * first access of each internal entry: - */ - /* release parent page */ - XT_PUTPAGE(mp); - - printk("traverse down 0x%lx[%d]->0x%lx\n", (ulong) pbn, index, - (ulong) bn); - v++; - h = index; - - /* process the child page */ - goto getPage; -} - - -/* - * xtDisplayPage() - * - * function: display page - */ -int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p) -{ - int rc = 0; - xad_t *xad; - s64 xaddr, xoff; - int xlen, i, j; - - /* display page control */ - printf("bn:0x%lx flag:0x%x nextindex:%d\n", - (ulong) bn, p->header.flag, - le16_to_cpu(p->header.nextindex)); - - /* display entries */ - xad = &p->xad[XTENTRYSTART]; - for (i = XTENTRYSTART, j = 1; i < le16_to_cpu(p->header.nextindex); - i++, xad++, j++) { - xoff = offsetXAD(xad); - xaddr = addressXAD(xad); - xlen = lengthXAD(xad); - printf("\t[%d] 
0x%lx:0x%lx(0x%x)", i, (ulong) xoff, - (ulong) xaddr, xlen); - - if (j == 4) { - printf("\n"); - j = 0; - } - } - - printf("\n"); -} -#endif /* _JFS_DEBUG_XTREE */ - - -#ifdef _JFS_WIP -/* - * xtGather() - * - * function: - * traverse for allocation acquiring tlock at commit time - * (vs at the time of update) logging backward top down - * - * note: - * problem - establishing that all new allocation have been - * processed both for append and random write in sparse file - * at the current entry at the current subtree root page - * - */ -int xtGather(btree_t *t) -{ - int rc = 0; - xtpage_t *p; - u64 bn; - int index; - btentry_t *e; - struct btstack btstack; - struct btsf *parent; - - /* clear stack */ - BT_CLR(&btstack); - - /* - * start with root - * - * root resides in the inode - */ - bn = 0; - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* new root is NOT pointed by a new entry - if (p->header.flag & NEW) - allocate new page lock; - write a NEWPAGE log; - */ - - dopage: - /* - * first access of each page: - */ - /* process entries backward from last index */ - index = le16_to_cpu(p->header.nextindex) - 1; - - if (p->header.flag & BT_LEAF) { - /* - * first access of each leaf page - */ - /* process leaf page entries backward */ - for (; index >= XTENTRYSTART; index--) { - e = &p->xad[index]; - /* - * if newpage, log NEWPAGE. - * - if (e->flag & XAD_NEW) { - nfound =+ entry->length; - update current page lock for the entry; - newpage(entry); - * - * if moved, log move. 
- * - } else if (e->flag & XAD_MOVED) { - reset flag; - update current page lock for the entry; - } - */ - } - - /* unpin the leaf page */ - XT_PUTPAGE(mp); - - /* - * go back up to the parent page - */ - getParent: - /* restore parent entry for the current child page */ - if ((parent = BT_POP(&btstack)) == NULL) - /* current page must have been root */ - return 0; - - if ((index = parent->index) == XTENTRYSTART) { - /* - * parent page scan completed - */ - /* go back up to the parent page */ - goto getParent; - } else { - /* - * parent page has entries remaining - */ - /* get back the parent page */ - bn = parent->bn; - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return -EIO; - - /* first subroot page which - * covers all new allocated blocks - * itself not new/modified. - * (if modified from split of descendent, - * go down path of split page) - - if (nfound == nnew && - !(p->header.flag & (NEW | MOD))) - exit scan; - */ - - /* process parent page entries backward */ - index--; - } - } else { - /* - * first access of each internal page - */ - } - - /* - * internal page: go down to child page of current entry - */ - - /* save current parent entry for the child page */ - BT_PUSH(&btstack, bn, index); - - /* get current entry for the child page */ - e = &p->xad[index]; - - /* - * first access of each internal entry: - */ - /* - * if new entry, log btree_tnewentry. - * - if (e->flag & XAD_NEW) - update parent page lock for the entry; - */ - - /* release parent page */ - XT_PUTPAGE(mp); - - /* get child page */ - bn = e->bn; - XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); - if (rc) - return rc; - - /* - * first access of each non-root page: - */ - /* - * if new, log btree_newpage. 
- * - if (p->header.flag & NEW) - allocate new page lock; - write a NEWPAGE log (next, prev); - */ - - /* process the child page */ - goto dopage; - - out: - return 0; -} -#endif /* _JFS_WIP */ - - #ifdef CONFIG_JFS_STATISTICS int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, int *eof, void *data) diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index a69784254fe..af668a80b40 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -131,10 +131,4 @@ extern int xtRelocate(tid_t tid, struct inode *ip, extern int xtAppend(tid_t tid, struct inode *ip, int xflag, s64 xoff, int maxblocks, int *xlenp, s64 * xaddrp, int flag); - -#ifdef _JFS_DEBUG_XTREE -extern int xtDisplayTree(struct inode *ip); -extern int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p); -#endif /* _JFS_DEBUG_XTREE */ - #endif /* !_H_JFS_XTREE */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 8413a368f44..1cae14e741e 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -31,20 +31,9 @@ #include "jfs_acl.h" #include "jfs_debug.h" -extern struct inode_operations jfs_file_inode_operations; -extern struct inode_operations jfs_symlink_inode_operations; -extern struct file_operations jfs_file_operations; -extern struct address_space_operations jfs_aops; - -extern int jfs_fsync(struct file *, struct dentry *, int); -extern void jfs_truncate_nolock(struct inode *, loff_t); -extern int jfs_init_acl(struct inode *, struct inode *); - /* * forward references */ -struct inode_operations jfs_dir_inode_operations; -struct file_operations jfs_dir_operations; struct dentry_operations jfs_ci_dentry_operations; static s64 commitZeroLink(tid_t, struct inode *); @@ -655,7 +644,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip) /* - * NAME: freeZeroLink() + * NAME: jfs_free_zero_link() * * FUNCTION: for non-directory, called by iClose(), * free resources of a file from cache and WORKING map @@ -663,15 +652,12 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip) * while 
associated with a pager object, * * PARAMETER: ip - pointer to inode of file. - * - * RETURN: 0 -ok */ -int freeZeroLink(struct inode *ip) +void jfs_free_zero_link(struct inode *ip) { - int rc = 0; int type; - jfs_info("freeZeroLink: ip = 0x%p", ip); + jfs_info("jfs_free_zero_link: ip = 0x%p", ip); /* return if not reg or symbolic link or if size is * already ok. @@ -684,10 +670,10 @@ int freeZeroLink(struct inode *ip) case S_IFLNK: /* if its contained in inode nothing to do */ if (ip->i_size < IDATASIZE) - return 0; + return; break; default: - return 0; + return; } /* @@ -737,9 +723,7 @@ int freeZeroLink(struct inode *ip) * free xtree/data blocks from working block map; */ if (ip->i_size) - rc = xtTruncate(0, ip, 0, COMMIT_WMAP); - - return rc; + xtTruncate(0, ip, 0, COMMIT_WMAP); } /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 5e774ed7fb6..9ff89720f93 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -24,10 +24,12 @@ #include <linux/completion.h> #include <linux/vfs.h> #include <linux/moduleparam.h> +#include <linux/posix_acl.h> #include <asm/uaccess.h> #include "jfs_incore.h" #include "jfs_filsys.h" +#include "jfs_inode.h" #include "jfs_metapage.h" #include "jfs_superblock.h" #include "jfs_dmap.h" @@ -62,37 +64,6 @@ module_param(jfsloglevel, int, 0644); MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); #endif -/* - * External declarations - */ -extern int jfs_mount(struct super_block *); -extern int jfs_mount_rw(struct super_block *, int); -extern int jfs_umount(struct super_block *); -extern int jfs_umount_rw(struct super_block *); - -extern int jfsIOWait(void *); -extern int jfs_lazycommit(void *); -extern int jfs_sync(void *); - -extern void jfs_read_inode(struct inode *inode); -extern void jfs_dirty_inode(struct inode *inode); -extern void jfs_delete_inode(struct inode *inode); -extern int jfs_write_inode(struct inode *inode, int wait); - -extern struct dentry *jfs_get_parent(struct dentry *dentry); -extern int jfs_extendfs(struct 
super_block *, s64, int); - -extern struct dentry_operations jfs_ci_dentry_operations; - -#ifdef PROC_FS_JFS /* see jfs_debug.h */ -extern void jfs_proc_init(void); -extern void jfs_proc_clean(void); -#endif - -extern wait_queue_head_t jfs_IO_thread_wait; -extern wait_queue_head_t jfs_commit_thread_wait; -extern wait_queue_head_t jfs_sync_thread_wait; - static void jfs_handle_error(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -143,6 +114,8 @@ static void jfs_destroy_inode(struct inode *inode) { struct jfs_inode_info *ji = JFS_IP(inode); + BUG_ON(!list_empty(&ji->anon_inode_list)); + spin_lock_irq(&ji->ag_lock); if (ji->active_ag != -1) { struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; @@ -560,7 +533,7 @@ static int jfs_sync_fs(struct super_block *sb, int wait) /* log == NULL indicates read-only mount */ if (log) { jfs_flush_journal(log, wait); - jfs_syncpt(log); + jfs_syncpt(log, 0); } return 0; @@ -593,11 +566,6 @@ static struct file_system_type jfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -extern int metapage_init(void); -extern int txInit(void); -extern void txExit(void); -extern void metapage_exit(void); - static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) { struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index ef4c07ee92b..16477b3835e 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,13 +19,14 @@ #include <linux/fs.h> #include <linux/namei.h> #include "jfs_incore.h" +#include "jfs_inode.h" #include "jfs_xattr.h" -static int jfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) { char *s = 
JFS_IP(dentry->d_inode)->i_inline; nd_set_link(nd, s); - return 0; + return NULL; } struct inode_operations jfs_symlink_inode_operations = { diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 7a9ffd5d03d..554ec739e49 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -19,6 +19,7 @@ #include <linux/fs.h> #include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> #include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_superblock.h" @@ -718,9 +719,9 @@ static int can_set_system_xattr(struct inode *inode, const char *name, return -EPERM; /* - * XATTR_NAME_ACL_ACCESS is tied to i_mode + * POSIX_ACL_XATTR_ACCESS is tied to i_mode */ - if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) { + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { acl = posix_acl_from_xattr(value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); @@ -750,7 +751,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED; return 0; - } else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0) { + } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { acl = posix_acl_from_xattr(value, value_len); if (IS_ERR(acl)) { rc = PTR_ERR(acl); @@ -780,7 +781,7 @@ static int can_set_xattr(struct inode *inode, const char *name, if (IS_RDONLY(inode)) return -EROFS; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode) || S_ISLNK(inode->i_mode)) + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0) @@ -789,12 +790,12 @@ static int can_set_xattr(struct inode *inode, const char *name, */ return can_set_system_xattr(inode, name, value, value_len); - if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0) + if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) return (capable(CAP_SYS_ADMIN) ? 
0 : -EPERM); #ifdef CONFIG_JFS_SECURITY if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) - != 0) + == 0) return 0; /* Leave it to the security module */ #endif @@ -946,8 +947,7 @@ int __jfs_setxattr(struct inode *inode, const char *name, const void *value, out: up_write(&JFS_IP(inode)->xattr_sem); - if (os2name) - kfree(os2name); + kfree(os2name); return rc; } @@ -1042,8 +1042,7 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, out: up_read(&JFS_IP(inode)->xattr_sem); - if (os2name) - kfree(os2name); + kfree(os2name); return size; } diff --git a/fs/libfs.c b/fs/libfs.c index f90b2959592..58101dff2c6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -183,6 +183,7 @@ struct file_operations simple_dir_operations = { .llseek = dcache_dir_lseek, .read = generic_read_dir, .readdir = dcache_readdir, + .fsync = simple_sync_file, }; struct inode_operations simple_dir_inode_operations = { @@ -519,6 +520,102 @@ int simple_transaction_release(struct inode *inode, struct file *file) return 0; } +/* Simple attribute files */ + +struct simple_attr { + u64 (*get)(void *); + void (*set)(void *, u64); + char get_buf[24]; /* enough to store a u64 and "\n\0" */ + char set_buf[24]; + void *data; + const char *fmt; /* format for read operation */ + struct semaphore sem; /* protects access to these buffers */ +}; + +/* simple_attr_open is called by an actual attribute open file operation + * to set the attribute specific access operations. 
*/ +int simple_attr_open(struct inode *inode, struct file *file, + u64 (*get)(void *), void (*set)(void *, u64), + const char *fmt) +{ + struct simple_attr *attr; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return -ENOMEM; + + attr->get = get; + attr->set = set; + attr->data = inode->u.generic_ip; + attr->fmt = fmt; + init_MUTEX(&attr->sem); + + file->private_data = attr; + + return nonseekable_open(inode, file); +} + +int simple_attr_close(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +/* read from the buffer that is filled with the get function */ +ssize_t simple_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + size_t size; + ssize_t ret; + + attr = file->private_data; + + if (!attr->get) + return -EACCES; + + down(&attr->sem); + if (*ppos) /* continued read */ + size = strlen(attr->get_buf); + else /* first read */ + size = scnprintf(attr->get_buf, sizeof(attr->get_buf), + attr->fmt, + (unsigned long long)attr->get(attr->data)); + + ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); + up(&attr->sem); + return ret; +} + +/* interpret the buffer as a number to call the set function with */ +ssize_t simple_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + struct simple_attr *attr; + u64 val; + size_t size; + ssize_t ret; + + attr = file->private_data; + + if (!attr->set) + return -EACCES; + + down(&attr->sem); + ret = -EFAULT; + size = min(sizeof(attr->set_buf) - 1, len); + if (copy_from_user(attr->set_buf, buf, size)) + goto out; + + ret = len; /* claim we got the whole input */ + attr->set_buf[size] = '\0'; + val = simple_strtol(attr->set_buf, NULL, 0); + attr->set(attr->data, val); +out: + up(&attr->sem); + return ret; +} + EXPORT_SYMBOL(dcache_dir_close); EXPORT_SYMBOL(dcache_dir_lseek); EXPORT_SYMBOL(dcache_dir_open); @@ -547,3 +644,7 @@ EXPORT_SYMBOL(simple_read_from_buffer); 
EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); +EXPORT_SYMBOL_GPL(simple_attr_open); +EXPORT_SYMBOL_GPL(simple_attr_close); +EXPORT_SYMBOL_GPL(simple_attr_read); +EXPORT_SYMBOL_GPL(simple_attr_write); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index ef7103b8c5b..006bb9e1457 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -31,7 +31,7 @@ static int reclaimer(void *ptr); * This is the representation of a blocked client lock. */ struct nlm_wait { - struct nlm_wait * b_next; /* linked list */ + struct list_head b_list; /* linked list */ wait_queue_head_t b_wait; /* where to wait on */ struct nlm_host * b_host; struct file_lock * b_lock; /* local file lock */ @@ -39,27 +39,54 @@ struct nlm_wait { u32 b_status; /* grant callback status */ }; -static struct nlm_wait * nlm_blocked; +static LIST_HEAD(nlm_blocked); /* - * Block on a lock + * Queue up a lock for blocking so that the GRANTED request can see it */ -int -nlmclnt_block(struct nlm_host *host, struct file_lock *fl, u32 *statp) +int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl) +{ + struct nlm_wait *block; + + BUG_ON(req->a_block != NULL); + block = kmalloc(sizeof(*block), GFP_KERNEL); + if (block == NULL) + return -ENOMEM; + block->b_host = host; + block->b_lock = fl; + init_waitqueue_head(&block->b_wait); + block->b_status = NLM_LCK_BLOCKED; + + list_add(&block->b_list, &nlm_blocked); + req->a_block = block; + + return 0; +} + +void nlmclnt_finish_block(struct nlm_rqst *req) { - struct nlm_wait block, **head; - int err; - u32 pstate; + struct nlm_wait *block = req->a_block; - block.b_host = host; - block.b_lock = fl; - init_waitqueue_head(&block.b_wait); - block.b_status = NLM_LCK_BLOCKED; - block.b_next = nlm_blocked; - nlm_blocked = █ + if (block == NULL) + return; + req->a_block = NULL; + list_del(&block->b_list); + kfree(block); +} + +/* + * Block on a lock + */ +long 
nlmclnt_block(struct nlm_rqst *req, long timeout) +{ + struct nlm_wait *block = req->a_block; + long ret; - /* Remember pseudo nsm state */ - pstate = host->h_state; + /* A borken server might ask us to block even if we didn't + * request it. Just say no! + */ + if (!req->a_args.block) + return -EAGAIN; /* Go to sleep waiting for GRANT callback. Some servers seem * to lose callbacks, however, so we're going to poll from @@ -69,28 +96,16 @@ nlmclnt_block(struct nlm_host *host, struct file_lock *fl, u32 *statp) * a 1 minute timeout would do. See the comment before * nlmclnt_lock for an explanation. */ - sleep_on_timeout(&block.b_wait, 30*HZ); - - for (head = &nlm_blocked; *head; head = &(*head)->b_next) { - if (*head == &block) { - *head = block.b_next; - break; - } - } + ret = wait_event_interruptible_timeout(block->b_wait, + block->b_status != NLM_LCK_BLOCKED, + timeout); - if (!signalled()) { - *statp = block.b_status; - return 0; + if (block->b_status != NLM_LCK_BLOCKED) { + req->a_res.status = block->b_status; + block->b_status = NLM_LCK_BLOCKED; } - /* Okay, we were interrupted. Cancel the pending request - * unless the server has rebooted. - */ - if (pstate == host->h_state && (err = nlmclnt_cancel(host, fl)) < 0) - printk(KERN_NOTICE - "lockd: CANCEL call failed (errno %d)\n", -err); - - return -ERESTARTSYS; + return ret; } /* @@ -100,27 +115,23 @@ u32 nlmclnt_grant(struct nlm_lock *lock) { struct nlm_wait *block; + u32 res = nlm_lck_denied; /* * Look up blocked request based on arguments. * Warning: must not use cookie to match it! */ - for (block = nlm_blocked; block; block = block->b_next) { - if (nlm_compare_locks(block->b_lock, &lock->fl)) - break; + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_compare_locks(block->b_lock, &lock->fl)) { + /* Alright, we found a lock. 
Set the return status + * and wake up the caller + */ + block->b_status = NLM_LCK_GRANTED; + wake_up(&block->b_wait); + res = nlm_granted; + } } - - /* Ooops, no blocked request found. */ - if (block == NULL) - return nlm_lck_denied; - - /* Alright, we found the lock. Set the return status and - * wake up the caller. - */ - block->b_status = NLM_LCK_GRANTED; - wake_up(&block->b_wait); - - return nlm_granted; + return res; } /* @@ -230,7 +241,7 @@ restart: host->h_reclaiming = 0; /* Now, wake up all processes that sleep on a blocked lock */ - for (block = nlm_blocked; block; block = block->b_next) { + list_for_each_entry(block, &nlm_blocked, b_list) { if (block->b_host == host) { block->b_status = NLM_LCK_DENIED_GRACE_PERIOD; wake_up(&block->b_wait); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index a4407619b1f..14b3ce87fa2 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -21,6 +21,7 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) +#define NLMCLNT_POLL_TIMEOUT (30*HZ) static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); @@ -312,7 +313,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue) prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE); if (!signalled ()) { schedule_timeout(NLMCLNT_GRACE_WAIT); - try_to_freeze(PF_FREEZE); + try_to_freeze(); if (!signalled ()) status = 0; } @@ -553,7 +554,8 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) { struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; - int status; + long timeout; + int status; if (!host->h_monitored && nsm_monitor(host) < 0) { printk(KERN_NOTICE "lockd: failed to monitor %s\n", @@ -562,15 +564,32 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) goto out; } - do { - if ((status = nlmclnt_call(req, NLMPROC_LOCK)) >= 0) { - if (resp->status != NLM_LCK_BLOCKED) - break; - status = nlmclnt_block(host, fl, &resp->status); - } + if 
(req->a_args.block) { + status = nlmclnt_prepare_block(req, host, fl); if (status < 0) goto out; - } while (resp->status == NLM_LCK_BLOCKED && req->a_args.block); + } + for(;;) { + status = nlmclnt_call(req, NLMPROC_LOCK); + if (status < 0) + goto out_unblock; + if (resp->status != NLM_LCK_BLOCKED) + break; + /* Wait on an NLM blocking lock */ + timeout = nlmclnt_block(req, NLMCLNT_POLL_TIMEOUT); + /* Did a reclaimer thread notify us of a server reboot? */ + if (resp->status == NLM_LCK_DENIED_GRACE_PERIOD) + continue; + if (resp->status != NLM_LCK_BLOCKED) + break; + if (timeout >= 0) + continue; + /* We were interrupted. Send a CANCEL request to the server + * and exit + */ + status = (int)timeout; + goto out_unblock; + } if (resp->status == NLM_LCK_GRANTED) { fl->fl_u.nfs_fl.state = host->h_state; @@ -579,6 +598,11 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) do_vfs_lock(fl); } status = nlm_stat_to_errno(resp->status); +out_unblock: + nlmclnt_finish_block(req); + /* Cancel the blocked request if it is still pending */ + if (resp->status == NLM_LCK_BLOCKED) + nlmclnt_cancel(host, fl); out: nlmclnt_release_lockargs(req); return status; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 52707c5ad6e..82c77df81c5 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -189,17 +189,15 @@ nlm_bind_host(struct nlm_host *host) goto forgetit; xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout); + xprt->nocong = 1; /* No congestion control for NLM */ + xprt->resvport = 1; /* NLM requires a reserved port */ /* Existing NLM servers accept AUTH_UNIX only */ clnt = rpc_create_client(xprt, host->h_name, &nlm_program, host->h_version, RPC_AUTH_UNIX); - if (IS_ERR(clnt)) { - xprt_destroy(xprt); + if (IS_ERR(clnt)) goto forgetit; - } clnt->cl_autobind = 1; /* turn on pmap queries */ - xprt->nocong = 1; /* No congestion control for NLM */ - xprt->resvport = 1; /* NLM requires a reserved port */ host->h_rpcclnt = clnt; } diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c 
index 6fc1bebeec1..2d144abe84a 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -115,20 +115,19 @@ nsm_create(void) xprt = xprt_create_proto(IPPROTO_UDP, &sin, NULL); if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; + xprt->resvport = 1; /* NSM requires a reserved port */ clnt = rpc_create_client(xprt, "localhost", &nsm_program, SM_VERSION, RPC_AUTH_NULL); if (IS_ERR(clnt)) - goto out_destroy; + goto out_err; clnt->cl_softrtry = 1; clnt->cl_chatty = 1; clnt->cl_oneshot = 1; - xprt->resvport = 1; /* NSM requires a reserved port */ return clnt; -out_destroy: - xprt_destroy(xprt); +out_err: return clnt; } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b82e470912e..12a857c29e2 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -191,7 +191,9 @@ lockd(struct svc_rqst *rqstp) printk(KERN_DEBUG "lockd: new process, skipping host shutdown\n"); wake_up(&lockd_exit); - + + flush_signals(current); + /* Exit the RPC thread */ svc_exit_thread(rqstp); @@ -329,7 +331,7 @@ static ctl_table nlm_sysctls[] = { .ctl_name = CTL_UNNUMBERED, .procname = "nlm_grace_period", .data = &nlm_grace_period, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_grace_period_min, @@ -339,7 +341,7 @@ static ctl_table nlm_sysctls[] = { .ctl_name = CTL_UNNUMBERED, .procname = "nlm_timeout", .data = &nlm_timeout, - .maxlen = sizeof(int), + .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &proc_doulongvec_minmax, .extra1 = (unsigned long *) &nlm_timeout_min, diff --git a/fs/locks.c b/fs/locks.c index 3fa6a7ce57a..11956b6179f 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1276,7 +1276,7 @@ int fcntl_getlease(struct file *filp) */ static int __setlease(struct file *filp, long arg, struct file_lock **flp) { - struct file_lock *fl, **before, **my_before = NULL, *lease = *flp; + struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_dentry; struct inode 
*inode = dentry->d_inode; int error, rdlease_count = 0, wrlease_count = 0; @@ -1287,6 +1287,8 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp) if (!flp || !(*flp) || !(*flp)->fl_lmops || !(*flp)->fl_lmops->fl_break) goto out; + lease = *flp; + error = -EAGAIN; if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) goto out; @@ -1548,6 +1550,8 @@ int fcntl_getlk(struct file *filp, struct flock __user *l) if (filp->f_op && filp->f_op->lock) { error = filp->f_op->lock(filp, F_GETLK, &file_lock); + if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private) + file_lock.fl_ops->fl_release_private(&file_lock); if (error < 0) goto out; else @@ -1587,7 +1591,8 @@ out: /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ -int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l) +int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock __user *l) { struct file_lock *file_lock = locks_alloc_lock(); struct flock flock; @@ -1616,6 +1621,7 @@ int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l) goto out; } +again: error = flock_to_posix_lock(filp, file_lock, &flock); if (error) goto out; @@ -1644,25 +1650,33 @@ int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (error) goto out; - if (filp->f_op && filp->f_op->lock != NULL) { + if (filp->f_op && filp->f_op->lock != NULL) error = filp->f_op->lock(filp, cmd, file_lock); - goto out; - } + else { + for (;;) { + error = __posix_lock_file(inode, file_lock); + if ((error != -EAGAIN) || (cmd == F_SETLK)) + break; + error = wait_event_interruptible(file_lock->fl_wait, + !file_lock->fl_next); + if (!error) + continue; - for (;;) { - error = __posix_lock_file(inode, file_lock); - if ((error != -EAGAIN) || (cmd == F_SETLK)) + locks_delete_block(file_lock); break; - error = wait_event_interruptible(file_lock->fl_wait, - 
!file_lock->fl_next); - if (!error) - continue; + } + } - locks_delete_block(file_lock); - break; + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. + */ + if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; } - out: +out: locks_free_lock(file_lock); return error; } @@ -1690,6 +1704,8 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l) if (filp->f_op && filp->f_op->lock) { error = filp->f_op->lock(filp, F_GETLK, &file_lock); + if (file_lock.fl_ops && file_lock.fl_ops->fl_release_private) + file_lock.fl_ops->fl_release_private(&file_lock); if (error < 0) goto out; else @@ -1718,7 +1734,8 @@ out: /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ -int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) +int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, + struct flock64 __user *l) { struct file_lock *file_lock = locks_alloc_lock(); struct flock64 flock; @@ -1747,6 +1764,7 @@ int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) goto out; } +again: error = flock64_to_posix_lock(filp, file_lock, &flock); if (error) goto out; @@ -1775,22 +1793,30 @@ int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) if (error) goto out; - if (filp->f_op && filp->f_op->lock != NULL) { + if (filp->f_op && filp->f_op->lock != NULL) error = filp->f_op->lock(filp, cmd, file_lock); - goto out; - } + else { + for (;;) { + error = __posix_lock_file(inode, file_lock); + if ((error != -EAGAIN) || (cmd == F_SETLK64)) + break; + error = wait_event_interruptible(file_lock->fl_wait, + !file_lock->fl_next); + if (!error) + continue; - for (;;) { - error = __posix_lock_file(inode, file_lock); - if ((error != -EAGAIN) || (cmd == F_SETLK64)) + locks_delete_block(file_lock); break; - error = 
wait_event_interruptible(file_lock->fl_wait, - !file_lock->fl_next); - if (!error) - continue; + } + } - locks_delete_block(file_lock); - break; + /* + * Attempt to detect a close/fcntl race and recover by + * releasing the lock that was just acquired. + */ + if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + flock.l_type = F_UNLCK; + goto again; } out: @@ -1873,6 +1899,8 @@ void locks_remove_flock(struct file *filp) .fl_end = OFFSET_MAX, }; filp->f_op->flock(filp, F_SETLKW, &fl); + if (fl.fl_ops && fl.fl_ops->fl_release_private) + fl.fl_ops->fl_release_private(&fl); } lock_kernel(); @@ -1880,12 +1908,7 @@ void locks_remove_flock(struct file *filp) while ((fl = *before) != NULL) { if (fl->fl_file == filp) { - /* - * We might have a POSIX lock that was created at the same time - * the filp was closed for the last time. Just remove that too, - * regardless of ownership, since nobody can own it. - */ - if (IS_FLOCK(fl) || IS_POSIX(fl)) { + if (IS_FLOCK(fl)) { locks_delete_lock(before); continue; } diff --git a/fs/mbcache.c b/fs/mbcache.c index c7170b9221a..b002a088857 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -316,11 +316,10 @@ fail: * currently in use cannot be freed, and thus remain in the cache. All others * are freed. 
* - * @cache: which cache to shrink * @bdev: which device's cache entries to shrink */ void -mb_cache_shrink(struct mb_cache *cache, struct block_device *bdev) +mb_cache_shrink(struct block_device *bdev) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/namei.c b/fs/namei.c index a7f7f44119b..6ec1f0fefc5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -21,7 +21,7 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/pagemap.h> -#include <linux/dnotify.h> +#include <linux/fsnotify.h> #include <linux/smp_lock.h> #include <linux/personality.h> #include <linux/security.h> @@ -314,7 +314,7 @@ void path_release(struct nameidata *nd) void path_release_on_umount(struct nameidata *nd) { dput(nd->dentry); - _mntput(nd->mnt); + mntput_no_expire(nd->mnt); } /* @@ -501,6 +501,7 @@ struct path { static inline int __do_follow_link(struct path *path, struct nameidata *nd) { int error; + void *cookie; struct dentry *dentry = path->dentry; touch_atime(path->mnt, dentry); @@ -508,13 +509,15 @@ static inline int __do_follow_link(struct path *path, struct nameidata *nd) if (path->mnt == nd->mnt) mntget(path->mnt); - error = dentry->d_inode->i_op->follow_link(dentry, nd); - if (!error) { + cookie = dentry->d_inode->i_op->follow_link(dentry, nd); + error = PTR_ERR(cookie); + if (!IS_ERR(cookie)) { char *s = nd_get_link(nd); + error = 0; if (s) error = __vfs_follow_link(nd, s); if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, nd); + dentry->d_inode->i_op->put_link(dentry, nd, cookie); } dput(dentry); mntput(path->mnt); @@ -1312,7 +1315,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, DQUOT_INIT(dir); error = dir->i_op->create(dir, dentry, mode, nd); if (!error) { - inode_dir_notify(dir, DN_CREATE); + fsnotify_create(dir, dentry->d_name.name); security_inode_post_create(dir, dentry, mode); } return error; @@ -1577,19 +1580,35 @@ do_link: * * Simple function to lookup and return a dentry and create it * if 
it doesn't exist. Is SMP-safe. + * + * Returns with nd->dentry->d_inode->i_sem locked. */ struct dentry *lookup_create(struct nameidata *nd, int is_dir) { - struct dentry *dentry; + struct dentry *dentry = ERR_PTR(-EEXIST); down(&nd->dentry->d_inode->i_sem); - dentry = ERR_PTR(-EEXIST); + /* + * Yucky last component or no last component at all? + * (foo/., foo/.., /////) + */ if (nd->last_type != LAST_NORM) goto fail; nd->flags &= ~LOOKUP_PARENT; + + /* + * Do the final lookup. + */ dentry = lookup_hash(&nd->last, nd->dentry); if (IS_ERR(dentry)) goto fail; + + /* + * Special case - lookup gave negative, but... we had foo/bar/ + * From the vfs_mknod() POV we just have a negative dentry - + * all is fine. Let's be bastards - you had / on the end, you've + * been asking for (non-existent) directory. -ENOENT for you. + */ if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) goto enoent; return dentry; @@ -1621,7 +1640,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) DQUOT_INIT(dir); error = dir->i_op->mknod(dir, dentry, mode, dev); if (!error) { - inode_dir_notify(dir, DN_CREATE); + fsnotify_create(dir, dentry->d_name.name); security_inode_post_mknod(dir, dentry, mode, dev); } return error; @@ -1694,7 +1713,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) DQUOT_INIT(dir); error = dir->i_op->mkdir(dir, dentry, mode); if (!error) { - inode_dir_notify(dir, DN_CREATE); + fsnotify_mkdir(dir, dentry->d_name.name); security_inode_post_mkdir(dir,dentry, mode); } return error; @@ -1785,7 +1804,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) } up(&dentry->d_inode->i_sem); if (!error) { - inode_dir_notify(dir, DN_DELETE); d_delete(dentry); } dput(dentry); @@ -1859,8 +1877,8 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) /* We don't d_delete() NFS sillyrenamed files--they still exist. 
*/ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { d_delete(dentry); - inode_dir_notify(dir, DN_DELETE); } + return error; } @@ -1934,7 +1952,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i DQUOT_INIT(dir); error = dir->i_op->symlink(dir, dentry, oldname); if (!error) { - inode_dir_notify(dir, DN_CREATE); + fsnotify_create(dir, dentry->d_name.name); security_inode_post_symlink(dir, dentry, oldname); } return error; @@ -2007,7 +2025,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de error = dir->i_op->link(old_dentry, dir, new_dentry); up(&old_dentry->d_inode->i_sem); if (!error) { - inode_dir_notify(dir, DN_CREATE); + fsnotify_create(dir, new_dentry->d_name.name); security_inode_post_link(old_dentry, dir, new_dentry); } return error; @@ -2171,6 +2189,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, { int error; int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); + const char *old_name; if (old_dentry->d_inode == new_dentry->d_inode) return 0; @@ -2192,18 +2211,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, DQUOT_INIT(old_dir); DQUOT_INIT(new_dir); + old_name = fsnotify_oldname_init(old_dentry->d_name.name); + if (is_dir) error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); else error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); if (!error) { - if (old_dir == new_dir) - inode_dir_notify(old_dir, DN_RENAME); - else { - inode_dir_notify(old_dir, DN_DELETE); - inode_dir_notify(new_dir, DN_CREATE); - } + const char *new_name = old_dentry->d_name.name; + fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir, + new_dentry->d_inode, old_dentry->d_inode); } + fsnotify_oldname_free(old_name); + return error; } @@ -2327,15 +2347,17 @@ out: int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct nameidata nd; - int res; + void *cookie; + nd.depth = 0; - res = 
dentry->d_inode->i_op->follow_link(dentry, &nd); - if (!res) { - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); + if (!IS_ERR(cookie)) { + int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd); + dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + cookie = ERR_PTR(res); } - return res; + return PTR_ERR(cookie); } int vfs_follow_link(struct nameidata *nd, const char *link) @@ -2378,23 +2400,20 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) return res; } -int page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) { - struct page *page; + struct page *page = NULL; nd_set_link(nd, page_getlink(dentry, &page)); - return 0; + return page; } -void page_put_link(struct dentry *dentry, struct nameidata *nd) +void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { - if (!IS_ERR(nd_get_link(nd))) { - struct page *page; - page = find_get_page(dentry->d_inode->i_mapping, 0); - if (!page) - BUG(); + struct page *page = cookie; + + if (page) { kunmap(page); page_cache_release(page); - page_cache_release(page); } } diff --git a/fs/namespace.c b/fs/namespace.c index 3b93e5d750e..79bd8a46e1e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -61,7 +61,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); - INIT_LIST_HEAD(&mnt->mnt_fslink); + INIT_LIST_HEAD(&mnt->mnt_expire); if (name) { int size = strlen(name)+1; char *newname = kmalloc(size, GFP_KERNEL); @@ -160,13 +160,13 @@ clone_mnt(struct vfsmount *old, struct dentry *root) mnt->mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; - mnt->mnt_namespace = old->mnt_namespace; + 
mnt->mnt_namespace = current->namespace; /* stick the duplicate mount on the same expiry list * as the original if that was on one */ spin_lock(&vfsmount_lock); - if (!list_empty(&old->mnt_fslink)) - list_add(&mnt->mnt_fslink, &old->mnt_fslink); + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); spin_unlock(&vfsmount_lock); } return mnt; @@ -337,7 +337,7 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); -void umount_tree(struct vfsmount *mnt) +static void umount_tree(struct vfsmount *mnt) { struct vfsmount *p; LIST_HEAD(kill); @@ -345,12 +345,13 @@ void umount_tree(struct vfsmount *mnt) for (p = mnt; p; p = next_mnt(p, mnt)) { list_del(&p->mnt_list); list_add(&p->mnt_list, &kill); + p->mnt_namespace = NULL; } while (!list_empty(&kill)) { mnt = list_entry(kill.next, struct vfsmount, mnt_list); list_del_init(&mnt->mnt_list); - list_del_init(&mnt->mnt_fslink); + list_del_init(&mnt->mnt_expire); if (mnt->mnt_parent == mnt) { spin_unlock(&vfsmount_lock); } else { @@ -644,7 +645,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse) if (mnt) { /* stop bind mounts from expiring */ spin_lock(&vfsmount_lock); - list_del_init(&mnt->mnt_fslink); + list_del_init(&mnt->mnt_expire); spin_unlock(&vfsmount_lock); err = graft_tree(mnt, nd); @@ -743,7 +744,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name) /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_nd.mnt->mnt_fslink); + list_del_init(&old_nd.mnt->mnt_expire); out2: spin_unlock(&vfsmount_lock); out1: @@ -807,12 +808,13 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, goto unlock; newmnt->mnt_flags = mnt_flags; + newmnt->mnt_namespace = current->namespace; err = graft_tree(newmnt, nd); if (err == 0 && fslist) { /* add to the specified expiration list */ spin_lock(&vfsmount_lock); - list_add_tail(&newmnt->mnt_fslink, fslist); + list_add_tail(&newmnt->mnt_expire, fslist); 
spin_unlock(&vfsmount_lock); } @@ -824,6 +826,54 @@ unlock: EXPORT_SYMBOL_GPL(do_add_mount); +static void expire_mount(struct vfsmount *mnt, struct list_head *mounts) +{ + spin_lock(&vfsmount_lock); + + /* + * Check if mount is still attached, if not, let whoever holds it deal + * with the sucker + */ + if (mnt->mnt_parent == mnt) { + spin_unlock(&vfsmount_lock); + return; + } + + /* + * Check that it is still dead: the count should now be 2 - as + * contributed by the vfsmount parent and the mntget above + */ + if (atomic_read(&mnt->mnt_count) == 2) { + struct nameidata old_nd; + + /* delete from the namespace */ + list_del_init(&mnt->mnt_list); + mnt->mnt_namespace = NULL; + detach_mnt(mnt, &old_nd); + spin_unlock(&vfsmount_lock); + path_release(&old_nd); + + /* + * Now lay it to rest if this was the last ref on the superblock + */ + if (atomic_read(&mnt->mnt_sb->s_active) == 1) { + /* last instance - try to be smart */ + lock_kernel(); + DQUOT_OFF(mnt->mnt_sb); + acct_auto_close(mnt->mnt_sb); + unlock_kernel(); + } + mntput(mnt); + } else { + /* + * Someone brought it back to life whilst we didn't have any + * locks held so return it to the expiration list + */ + list_add_tail(&mnt->mnt_expire, mounts); + spin_unlock(&vfsmount_lock); + } +} + /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came @@ -846,13 +896,13 @@ void mark_mounts_for_expiry(struct list_head *mounts) * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ - list_for_each_entry_safe(mnt, next, mounts, mnt_fslink) { + list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!xchg(&mnt->mnt_expiry_mark, 1) || atomic_read(&mnt->mnt_count) != 1) continue; mntget(mnt); - list_move(&mnt->mnt_fslink, &graveyard); + list_move(&mnt->mnt_expire, &graveyard); } /* @@ -862,61 +912,19 @@ void mark_mounts_for_expiry(struct list_head *mounts) * - dispose of the 
corpse */ while (!list_empty(&graveyard)) { - mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink); - list_del_init(&mnt->mnt_fslink); + mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire); + list_del_init(&mnt->mnt_expire); /* don't do anything if the namespace is dead - all the * vfsmounts from it are going away anyway */ namespace = mnt->mnt_namespace; - if (!namespace || atomic_read(&namespace->count) <= 0) + if (!namespace || !namespace->root) continue; get_namespace(namespace); spin_unlock(&vfsmount_lock); down_write(&namespace->sem); - spin_lock(&vfsmount_lock); - - /* check that it is still dead: the count should now be 2 - as - * contributed by the vfsmount parent and the mntget above */ - if (atomic_read(&mnt->mnt_count) == 2) { - struct vfsmount *xdmnt; - struct dentry *xdentry; - - /* delete from the namespace */ - list_del_init(&mnt->mnt_list); - list_del_init(&mnt->mnt_child); - list_del_init(&mnt->mnt_hash); - mnt->mnt_mountpoint->d_mounted--; - - xdentry = mnt->mnt_mountpoint; - mnt->mnt_mountpoint = mnt->mnt_root; - xdmnt = mnt->mnt_parent; - mnt->mnt_parent = mnt; - - spin_unlock(&vfsmount_lock); - - mntput(xdmnt); - dput(xdentry); - - /* now lay it to rest if this was the last ref on the - * superblock */ - if (atomic_read(&mnt->mnt_sb->s_active) == 1) { - /* last instance - try to be smart */ - lock_kernel(); - DQUOT_OFF(mnt->mnt_sb); - acct_auto_close(mnt->mnt_sb); - unlock_kernel(); - } - - mntput(mnt); - } else { - /* someone brought it back to life whilst we didn't - * have any locks held so return it to the expiration - * list */ - list_add_tail(&mnt->mnt_fslink, mounts); - spin_unlock(&vfsmount_lock); - } - + expire_mount(mnt, mounts); up_write(&namespace->sem); mntput(mnt); @@ -1449,16 +1457,12 @@ void __init mnt_init(unsigned long mempages) void __put_namespace(struct namespace *namespace) { - struct vfsmount *mnt; - + struct vfsmount *root = namespace->root; + namespace->root = NULL; + spin_unlock(&vfsmount_lock); 
down_write(&namespace->sem); spin_lock(&vfsmount_lock); - - list_for_each_entry(mnt, &namespace->list, mnt_list) { - mnt->mnt_namespace = NULL; - } - - umount_tree(namespace->root); + umount_tree(root); spin_unlock(&vfsmount_lock); up_write(&namespace->sem); kfree(namespace); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 2dc2d869396..a9f7a8ab1d5 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -705,18 +705,6 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, DPRINTK("ncp_do_readdir: init failed, err=%d\n", err); return; } -#ifdef USE_OLD_SLOW_DIRECTORY_LISTING - for (;;) { - err = ncp_search_for_file_or_subdir(server, &seq, &entry.i); - if (err) { - DPRINTK("ncp_do_readdir: search failed, err=%d\n", err); - break; - } - entry.volume = entry.i.volNumber; - if (!ncp_fill_cache(filp, dirent, filldir, ctl, &entry)) - break; - } -#else /* We MUST NOT use server->buffer_size handshaked with server if we are using UDP, as for UDP server uses max. buffer size determined by MTU, and for TCP server uses hardwired value 65KB (== 66560 bytes). @@ -754,7 +742,6 @@ ncp_do_readdir(struct file *filp, void *dirent, filldir_t filldir, } } while (more); vfree(buf); -#endif return; } diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index e4eb5ed4bee..c755e1848a4 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -845,46 +845,6 @@ out: return result; } -/* Search for everything */ -int ncp_search_for_file_or_subdir(struct ncp_server *server, - struct nw_search_sequence *seq, - struct nw_info_struct *target) -{ - int result; - - ncp_init_request(server); - ncp_add_byte(server, 3); /* subfunction */ - ncp_add_byte(server, server->name_space[seq->volNumber]); - ncp_add_byte(server, 0); /* data stream (???) 
*/ - ncp_add_word(server, cpu_to_le16(0x8006)); /* Search attribs */ - ncp_add_dword(server, RIM_ALL); /* return info mask */ - ncp_add_mem(server, seq, 9); -#ifdef CONFIG_NCPFS_NFS_NS - if (server->name_space[seq->volNumber] == NW_NS_NFS) { - ncp_add_byte(server, 0); /* 0 byte pattern */ - } else -#endif - { - ncp_add_byte(server, 2); /* 2 byte pattern */ - ncp_add_byte(server, 0xff); /* following is a wildcard */ - ncp_add_byte(server, '*'); - } - - if ((result = ncp_request(server, 87)) != 0) - goto out; - memcpy(seq, ncp_reply_data(server, 0), sizeof(*seq)); - ncp_extract_file_info(ncp_reply_data(server, 10), target); - - ncp_unlock_server(server); - - result = ncp_obtain_nfs_info(server, target); - return result; - -out: - ncp_unlock_server(server); - return result; -} - int ncp_search_for_fileset(struct ncp_server *server, struct nw_search_sequence *seq, int* more, diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 05ec2e9d90c..9e4dc30c243 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -87,9 +87,6 @@ int ncp_open_create_file_or_subdir(struct ncp_server *, struct inode *, char *, int ncp_initialize_search(struct ncp_server *, struct inode *, struct nw_search_sequence *target); -int ncp_search_for_file_or_subdir(struct ncp_server *server, - struct nw_search_sequence *seq, - struct nw_info_struct *target); int ncp_search_for_fileset(struct ncp_server *server, struct nw_search_sequence *seq, int* more, int* cnt, diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index b4baa031edf..8b3bb715d17 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -8,6 +8,7 @@ nfs-y := dir.o file.o inode.o nfs2xdr.o pagelist.o \ proc.o read.o symlink.o unlink.o write.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o diff 
--git a/fs/nfs/callback.c b/fs/nfs/callback.c index 560d6175dd5..f2ca782aba3 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -14,6 +14,7 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> +#include "nfs4_fs.h" #include "callback.h" #define NFSDBG_FACILITY NFSDBG_CALLBACK diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index ece27e42b93..65f1e19e4d1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -8,6 +8,7 @@ #include <linux/config.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include "nfs4_fs.h" #include "callback.h" #include "delegation.h" diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d271df9df2b..7c33b9a81a9 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -10,6 +10,7 @@ #include <linux/sunrpc/svc.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include "nfs4_fs.h" #include "callback.h" #define CB_OP_TAGLEN_MAXSZ (512) @@ -410,7 +411,6 @@ static int nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); p = (uint32_t*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); - rqstp->rq_res.head[0].iov_len = PAGE_SIZE; xdr_init_encode(&xdr_out, &rqstp->rq_res, p); decode_compound_hdr_arg(&xdr_in, &hdr_arg); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5b9c60f9779..d7f7eb669d0 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -16,6 +16,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_xdr.h> +#include "nfs4_fs.h" #include "delegation.h" static struct nfs_delegation *nfs_alloc_delegation(void) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ff6155f5e8d..2df639f143e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -32,6 +32,7 @@ #include <linux/smp_lock.h> #include <linux/namei.h> +#include "nfs4_fs.h" #include "delegation.h" #define NFS_PARANOIA 1 @@ -50,8 +51,10 @@ static int nfs_mknod(struct inode *, 
struct dentry *, int, dev_t); static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, struct dentry *, int); +static loff_t nfs_llseek_dir(struct file *, loff_t, int); struct file_operations nfs_dir_operations = { + .llseek = nfs_llseek_dir, .read = generic_read_dir, .readdir = nfs_readdir, .open = nfs_opendir, @@ -74,6 +77,27 @@ struct inode_operations nfs_dir_inode_operations = { .setattr = nfs_setattr, }; +#ifdef CONFIG_NFS_V3 +struct inode_operations nfs3_dir_inode_operations = { + .create = nfs_create, + .lookup = nfs_lookup, + .link = nfs_link, + .unlink = nfs_unlink, + .symlink = nfs_symlink, + .mkdir = nfs_mkdir, + .rmdir = nfs_rmdir, + .mknod = nfs_mknod, + .rename = nfs_rename, + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_V3 */ + #ifdef CONFIG_NFS_V4 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *); @@ -90,6 +114,9 @@ struct inode_operations nfs4_dir_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, .setattr = nfs_setattr, + .getxattr = nfs4_getxattr, + .setxattr = nfs4_setxattr, + .listxattr = nfs4_listxattr, }; #endif /* CONFIG_NFS_V4 */ @@ -116,7 +143,8 @@ typedef struct { struct page *page; unsigned long page_index; u32 *ptr; - u64 target; + u64 *dir_cookie; + loff_t current_index; struct nfs_entry *entry; decode_dirent_t decode; int plus; @@ -154,22 +182,22 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) /* We requested READDIRPLUS, but the server doesn't grok it */ if (error == -ENOTSUPP && desc->plus) { NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; - NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); desc->plus = 0; goto again; } goto error; } 
SetPageUptodate(page); - NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME; + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + spin_unlock(&inode->i_lock); /* Ensure consistent page alignment of the data. * Note: assumes we have exclusive access to this mapping either - * throught inode->i_sem or some other mechanism. + * through inode->i_sem or some other mechanism. */ - if (page->index == 0) { - invalidate_inode_pages(inode->i_mapping); - NFS_I(inode)->readdir_timestamp = timestamp; - } + if (page->index == 0) + invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1); unlock_page(page); return 0; error: @@ -202,22 +230,22 @@ void dir_page_release(nfs_readdir_descriptor_t *desc) /* * Given a pointer to a buffer that has already been filled by a call - * to readdir, find the next entry. + * to readdir, find the next entry with cookie '*desc->dir_cookie'. * * If the end of the buffer has been reached, return -EAGAIN, if not, * return the offset within the buffer of the next entry to be * read. */ static inline -int find_dirent(nfs_readdir_descriptor_t *desc, struct page *page) +int find_dirent(nfs_readdir_descriptor_t *desc) { struct nfs_entry *entry = desc->entry; int loop_count = 0, status; while((status = dir_decode(desc)) == 0) { - dfprintk(VFS, "NFS: found cookie %Lu\n", (long long)entry->cookie); - if (entry->prev_cookie == desc->target) + dfprintk(VFS, "NFS: found cookie %Lu\n", (unsigned long long)entry->cookie); + if (entry->prev_cookie == *desc->dir_cookie) break; if (loop_count++ > 200) { loop_count = 0; @@ -229,8 +257,44 @@ int find_dirent(nfs_readdir_descriptor_t *desc, struct page *page) } /* - * Find the given page, and call find_dirent() in order to try to - * return the next entry. + * Given a pointer to a buffer that has already been filled by a call + * to readdir, find the entry at offset 'desc->file->f_pos'. 
+ * + * If the end of the buffer has been reached, return -EAGAIN, if not, + * return the offset within the buffer of the next entry to be + * read. + */ +static inline +int find_dirent_index(nfs_readdir_descriptor_t *desc) +{ + struct nfs_entry *entry = desc->entry; + int loop_count = 0, + status; + + for(;;) { + status = dir_decode(desc); + if (status) + break; + + dfprintk(VFS, "NFS: found cookie %Lu at index %Ld\n", (unsigned long long)entry->cookie, desc->current_index); + + if (desc->file->f_pos == desc->current_index) { + *desc->dir_cookie = entry->cookie; + break; + } + desc->current_index++; + if (loop_count++ > 200) { + loop_count = 0; + schedule(); + } + } + dfprintk(VFS, "NFS: find_dirent_index() returns %d\n", status); + return status; +} + +/* + * Find the given page, and call find_dirent() or find_dirent_index in + * order to try to return the next entry. */ static inline int find_dirent_page(nfs_readdir_descriptor_t *desc) @@ -253,7 +317,10 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc) /* NOTE: Someone else may have changed the READDIRPLUS flag */ desc->page = page; desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ - status = find_dirent(desc, page); + if (*desc->dir_cookie != 0) + status = find_dirent(desc); + else + status = find_dirent_index(desc); if (status < 0) dir_page_release(desc); out: @@ -268,7 +335,8 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc) * Recurse through the page cache pages, and return a * filled nfs_entry structure of the next directory entry if possible. * - * The target for the search is 'desc->target'. 
+ * The target for the search is '*desc->dir_cookie' if non-0, + * 'desc->file->f_pos' otherwise */ static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) @@ -276,7 +344,16 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) int loop_count = 0; int res; - dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", (long long)desc->target); + /* Always search-by-index from the beginning of the cache */ + if (*desc->dir_cookie == 0) { + dfprintk(VFS, "NFS: readdir_search_pagecache() searching for offset %Ld\n", (long long)desc->file->f_pos); + desc->page_index = 0; + desc->entry->cookie = desc->entry->prev_cookie = 0; + desc->entry->eof = 0; + desc->current_index = 0; + } else + dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); + for (;;) { res = find_dirent_page(desc); if (res != -EAGAIN) @@ -313,7 +390,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int loop_count = 0, res; - dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)desc->target); + dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)entry->cookie); for(;;) { unsigned d_type = DT_UNKNOWN; @@ -333,10 +410,11 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, } res = filldir(dirent, entry->name, entry->len, - entry->prev_cookie, fileid, d_type); + file->f_pos, fileid, d_type); if (res < 0) break; - file->f_pos = desc->target = entry->cookie; + file->f_pos++; + *desc->dir_cookie = entry->cookie; if (dir_decode(desc) != 0) { desc->page_index ++; break; @@ -349,7 +427,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, dir_page_release(desc); if (dentry != NULL) dput(dentry); - dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target, res); + dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", 
(unsigned long long)*desc->dir_cookie, res); return res; } @@ -375,23 +453,25 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct page *page = NULL; int status; - dfprintk(VFS, "NFS: uncached_readdir() searching for cookie %Lu\n", (long long)desc->target); + dfprintk(VFS, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); page = alloc_page(GFP_HIGHUSER); if (!page) { status = -ENOMEM; goto out; } - desc->error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, desc->target, + desc->error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); - NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME; + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + spin_unlock(&inode->i_lock); desc->page = page; desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (desc->error >= 0) { if ((status = dir_decode(desc)) == 0) - desc->entry->prev_cookie = desc->target; + desc->entry->prev_cookie = *desc->dir_cookie; } else status = -EIO; if (status < 0) @@ -412,8 +492,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } -/* The file offset position is now represented as a true offset into the - * page cache as is the case in most of the other filesystems. +/* The file offset position represents the dirent entry number. A + last cookie cache takes care of the common case of reading the + whole directory. */ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { @@ -435,15 +516,15 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) } /* - * filp->f_pos points to the file offset in the page cache. - * but if the cache has meanwhile been zapped, we need to - * read from the last dirent to revalidate f_pos - * itself. + * filp->f_pos points to the dirent entry number. + * *desc->dir_cookie has the cookie for the next entry. 
We have + * to either find the entry with the appropriate number or + * revalidate the cookie. */ memset(desc, 0, sizeof(*desc)); desc->file = filp; - desc->target = filp->f_pos; + desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie; desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = NFS_USE_READDIRPLUS(inode); @@ -455,9 +536,10 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) while(!desc->entry->eof) { res = readdir_search_pagecache(desc); + if (res == -EBADCOOKIE) { /* This means either end of directory */ - if (desc->entry->cookie != desc->target) { + if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc, dirent, filldir); if (res >= 0) @@ -467,7 +549,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; } if (res == -ETOOSMALL && desc->plus) { - NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; + clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); nfs_zap_caches(inode); desc->plus = 0; desc->entry->eof = 0; @@ -490,6 +572,28 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) return 0; } +loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) +{ + down(&filp->f_dentry->d_inode->i_sem); + switch (origin) { + case 1: + offset += filp->f_pos; + case 0: + if (offset >= 0) + break; + default: + offset = -EINVAL; + goto out; + } + if (offset != filp->f_pos) { + filp->f_pos = offset; + ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0; + } +out: + up(&filp->f_dentry->d_inode->i_sem); + return offset; +} + /* * All directory operations under NFS are synchronous, so fsync() * is a dummy operation. 
@@ -508,7 +612,7 @@ static inline int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; - if ((NFS_FLAGS(dir) & NFS_INO_INVALID_ATTR) != 0 + if ((NFS_I(dir)->cache_validity & NFS_INO_INVALID_ATTR) != 0 || nfs_attribute_timeout(dir)) return 0; return nfs_verify_change_attribute(dir, (unsigned long)dentry->d_fsdata); @@ -835,6 +939,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry error = nfs_revalidate_inode(NFS_SERVER(dir), dir); if (error < 0) { res = ERR_PTR(error); + unlock_kernel(); goto out; } @@ -1475,11 +1580,12 @@ out: int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res) { - struct nfs_access_entry *cache = &NFS_I(inode)->cache_access; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache = &nfsi->cache_access; if (cache->cred != cred || time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode)) - || (NFS_FLAGS(inode) & NFS_INO_INVALID_ACCESS)) + || (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)) return -ENOENT; memcpy(res, cache, sizeof(*res)); return 0; @@ -1487,14 +1593,18 @@ int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) { - struct nfs_access_entry *cache = &NFS_I(inode)->cache_access; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_access_entry *cache = &nfsi->cache_access; if (cache->cred != set->cred) { if (cache->cred) put_rpccred(cache->cred); cache->cred = get_rpccred(set->cred); } - NFS_FLAGS(inode) &= ~NFS_INO_INVALID_ACCESS; + /* FIXME: replace current access_cache BKL reliance with inode->i_lock */ + spin_lock(&inode->i_lock); + nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS; + spin_unlock(&inode->i_lock); cache->jiffies = set->jiffies; cache->mask = set->mask; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 68df803f27c..6537f2c4ae4 100644 --- a/fs/nfs/direct.c +++ 
b/fs/nfs/direct.c @@ -517,7 +517,7 @@ retry: result = tot_bytes; out: - nfs_end_data_update_defer(inode); + nfs_end_data_update(inode); nfs_writedata_free(wdata); return result; @@ -751,11 +751,6 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, retval = -EFAULT; if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) goto out; - if (file->f_error) { - retval = file->f_error; - file->f_error = 0; - goto out; - } retval = -EFBIG; if (limit != RLIM_INFINITY) { if (pos >= limit) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 55c90759249..f6b9eda925c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -71,6 +71,18 @@ struct inode_operations nfs_file_inode_operations = { .setattr = nfs_setattr, }; +#ifdef CONFIG_NFS_V3 +struct inode_operations nfs3_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .listxattr = nfs3_listxattr, + .getxattr = nfs3_getxattr, + .setxattr = nfs3_setxattr, + .removexattr = nfs3_removexattr, +}; +#endif /* CONFIG_NFS_v3 */ + /* Hack for future NFS swap support */ #ifndef IS_SWAPFILE # define IS_SWAPFILE(inode) (0) @@ -116,6 +128,22 @@ nfs_file_release(struct inode *inode, struct file *filp) } /** + * nfs_revalidate_file - Revalidate the page cache & related metadata + * @inode - pointer to inode struct + * @file - pointer to file + */ +static int nfs_revalidate_file(struct inode *inode, struct file *filp) +{ + struct nfs_inode *nfsi = NFS_I(inode); + int retval = 0; + + if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode)) + retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + nfs_revalidate_mapping(inode, filp->f_mapping); + return 0; +} + +/** * nfs_revalidate_size - Revalidate the file size * @inode - pointer to inode struct * @file - pointer to struct file @@ -137,7 +165,8 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) goto force_reval; if (nfsi->npages != 0) return 0; - 
return nfs_revalidate_inode(server, inode); + if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode)) + return 0; force_reval: return __nfs_revalidate_inode(server, inode); } @@ -198,7 +227,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos) dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long) pos); - result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + result = nfs_revalidate_file(inode, iocb->ki_filp); if (!result) result = generic_file_aio_read(iocb, buf, count, pos); return result; @@ -216,7 +245,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count, dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + res = nfs_revalidate_file(inode, filp); if (!res) res = generic_file_sendfile(filp, ppos, count, actor, target); return res; @@ -232,7 +261,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) dfprintk(VFS, "nfs: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); - status = nfs_revalidate_inode(NFS_SERVER(inode), inode); + status = nfs_revalidate_file(inode, file); if (!status) status = generic_file_mmap(file, vma); return status; @@ -321,9 +350,15 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; - result = nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (result) - goto out; + /* + * O_APPEND implies that we must revalidate the file length. 
+ */ + if (iocb->ki_filp->f_flags & O_APPEND) { + result = nfs_revalidate_file_size(inode, iocb->ki_filp); + if (result) + goto out; + } + nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); result = count; if (!count) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 87f4f9aeac8..ffb8df91dc3 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -50,6 +50,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> +#include "nfs4_fs.h" #define IDMAP_HASH_SZ 128 diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f2317f3e29f..541b418327c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -39,6 +39,7 @@ #include <asm/system.h> #include <asm/uaccess.h> +#include "nfs4_fs.h" #include "delegation.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -63,6 +64,7 @@ static void nfs_clear_inode(struct inode *); static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct super_block *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); +static void nfs_zap_acl_cache(struct inode *); static struct rpc_program nfs_program; @@ -106,6 +108,21 @@ static struct rpc_program nfs_program = { .pipe_dir_name = "/nfs", }; +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; +static struct rpc_version * nfsacl_version[] = { + [3] = &nfsacl_version3, +}; + +struct rpc_program nfsacl_program = { + .name = "nfsacl", + .number = NFS_ACL_PROGRAM, + .nrvers = sizeof(nfsacl_version) / sizeof(nfsacl_version[0]), + .version = nfsacl_version, + .stats = &nfsacl_rpcstat, +}; +#endif /* CONFIG_NFS_V3_ACL */ + static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) { @@ -118,7 +135,7 @@ nfs_write_inode(struct inode *inode, int sync) int flags = sync ? 
FLUSH_WAIT : 0; int ret; - ret = nfs_commit_inode(inode, 0, 0, flags); + ret = nfs_commit_inode(inode, flags); if (ret < 0) return ret; return 0; @@ -140,10 +157,6 @@ nfs_delete_inode(struct inode * inode) clear_inode(inode); } -/* - * For the moment, the only task for the NFS clear_inode method is to - * release the mmap credential - */ static void nfs_clear_inode(struct inode *inode) { @@ -152,6 +165,7 @@ nfs_clear_inode(struct inode *inode) nfs_wb_all(inode); BUG_ON (!list_empty(&nfsi->open_files)); + nfs_zap_acl_cache(inode); cred = nfsi->cache_access.cred; if (cred) put_rpccred(cred); @@ -161,11 +175,13 @@ nfs_clear_inode(struct inode *inode) void nfs_umount_begin(struct super_block *sb) { - struct nfs_server *server = NFS_SB(sb); - struct rpc_clnt *rpc; + struct rpc_clnt *rpc = NFS_SB(sb)->client; /* -EIO all pending I/O */ - if ((rpc = server->client) != NULL) + if (!IS_ERR(rpc)) + rpc_killall_tasks(rpc); + rpc = NFS_SB(sb)->client_acl; + if (!IS_ERR(rpc)) rpc_killall_tasks(rpc); } @@ -366,13 +382,15 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) xprt = xprt_create_proto(tcp ? IPPROTO_TCP : IPPROTO_UDP, &server->addr, &timeparms); if (IS_ERR(xprt)) { - printk(KERN_WARNING "NFS: cannot create RPC transport.\n"); + dprintk("%s: cannot create RPC transport. Error = %ld\n", + __FUNCTION__, PTR_ERR(xprt)); return (struct rpc_clnt *)xprt; } clnt = rpc_create_client(xprt, server->hostname, &nfs_program, server->rpc_ops->version, data->pseudoflavor); if (IS_ERR(clnt)) { - printk(KERN_WARNING "NFS: cannot create RPC client.\n"); + dprintk("%s: cannot create RPC client. 
Error = %ld\n", + __FUNCTION__, PTR_ERR(xprt)); goto out_fail; } @@ -383,7 +401,6 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) return clnt; out_fail: - xprt_destroy(xprt); return clnt; } @@ -427,21 +444,16 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) /* Check NFS protocol revision and initialize RPC op vector * and file handle pool. */ - if (server->flags & NFS_MOUNT_VER3) { #ifdef CONFIG_NFS_V3 + if (server->flags & NFS_MOUNT_VER3) { server->rpc_ops = &nfs_v3_clientops; server->caps |= NFS_CAP_READDIRPLUS; - if (data->version < 4) { - printk(KERN_NOTICE "NFS: NFSv3 not supported by mount program.\n"); - return -EIO; - } -#else - printk(KERN_NOTICE "NFS: NFSv3 not supported.\n"); - return -EIO; -#endif } else { server->rpc_ops = &nfs_v2_clientops; } +#else + server->rpc_ops = &nfs_v2_clientops; +#endif /* Fill in pseudoflavor for mount version < 5 */ if (!(data->flags & NFS_MOUNT_SECFLAVOUR)) @@ -455,17 +467,34 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) return PTR_ERR(server->client); /* RFC 2623, sec 2.3.2 */ if (authflavor != RPC_AUTH_UNIX) { + struct rpc_auth *auth; + server->client_sys = rpc_clone_client(server->client); if (IS_ERR(server->client_sys)) return PTR_ERR(server->client_sys); - if (!rpcauth_create(RPC_AUTH_UNIX, server->client_sys)) - return -ENOMEM; + auth = rpcauth_create(RPC_AUTH_UNIX, server->client_sys); + if (IS_ERR(auth)) + return PTR_ERR(auth); } else { atomic_inc(&server->client->cl_count); server->client_sys = server->client; } - if (server->flags & NFS_MOUNT_VER3) { +#ifdef CONFIG_NFS_V3_ACL + if (!(server->flags & NFS_MOUNT_NOACL)) { + server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3); + /* No errors! 
Assume that Sun nfsacls are supported */ + if (!IS_ERR(server->client_acl)) + server->caps |= NFS_CAP_ACLS; + } +#else + server->flags &= ~NFS_MOUNT_NOACL; +#endif /* CONFIG_NFS_V3_ACL */ + /* + * The VFS shouldn't apply the umask to mode bits. We will + * do so ourselves when necessary. + */ + sb->s_flags |= MS_POSIXACL; if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN) server->namelen = NFS3_MAXNAMLEN; sb->s_time_gran = 1; @@ -549,6 +578,7 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt) { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", ",lock" }, + { NFS_MOUNT_NOACL, ",noacl", "" }, { 0, NULL, NULL } }; struct proc_nfs_info *nfs_infop; @@ -585,14 +615,30 @@ nfs_zap_caches(struct inode *inode) struct nfs_inode *nfsi = NFS_I(inode); int mode = inode->i_mode; + spin_lock(&inode->i_lock); + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) - nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; else - nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; + + spin_unlock(&inode->i_lock); +} + +static void nfs_zap_acl_cache(struct inode *inode) +{ + void (*clear_acl_cache)(struct inode *); + + clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; + if (clear_acl_cache != NULL) + clear_acl_cache(inode); + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); } /* @@ -689,7 +735,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* Why so? 
Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ - inode->i_op = &nfs_file_inode_operations; + inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { inode->i_fop = &nfs_file_operations; inode->i_data.a_ops = &nfs_file_aops; @@ -699,7 +745,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) inode->i_fop = &nfs_dir_operations; if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) && fattr->size <= NFS_LIMIT_READDIRPLUS) - NFS_FLAGS(inode) |= NFS_INO_ADVISE_RDPLUS; + set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode)); } else if (S_ISLNK(inode->i_mode)) inode->i_op = &nfs_symlink_inode_operations; else @@ -774,55 +820,84 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) nfs_wb_all(inode); } error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr); - if (error == 0) { + if (error == 0) nfs_refresh_inode(inode, &fattr); + nfs_end_data_update(inode); + unlock_kernel(); + return error; +} + +/** + * nfs_setattr_update_inode - Update inode metadata after a setattr call. + * @inode: pointer to struct inode + * @attr: pointer to struct iattr + * + * Note: we do this in the *proc.c in order to ensure that + * it works for things like exclusive creates too. 
+ */ +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +{ + if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { if ((attr->ia_valid & ATTR_MODE) != 0) { - int mode; - mode = inode->i_mode & ~S_IALLUGO; - mode |= attr->ia_mode & S_IALLUGO; + int mode = attr->ia_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } if ((attr->ia_valid & ATTR_UID) != 0) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - if ((attr->ia_valid & ATTR_SIZE) != 0) { - inode->i_size = attr->ia_size; - vmtruncate(inode, attr->ia_size); - } + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + spin_unlock(&inode->i_lock); + } + if ((attr->ia_valid & ATTR_SIZE) != 0) { + inode->i_size = attr->ia_size; + vmtruncate(inode, attr->ia_size); } - if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) - NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS; - nfs_end_data_update(inode); - unlock_kernel(); - return error; +} + +static int nfs_wait_schedule(void *word) +{ + if (signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; } /* * Wait for the inode to get unlocked. - * (Used for NFS_INO_LOCKED and NFS_INO_REVALIDATING). 
*/ -static int -nfs_wait_on_inode(struct inode *inode, int flag) +static int nfs_wait_on_inode(struct inode *inode) { struct rpc_clnt *clnt = NFS_CLIENT(inode); struct nfs_inode *nfsi = NFS_I(inode); - + sigset_t oldmask; int error; - if (!(NFS_FLAGS(inode) & flag)) - return 0; + atomic_inc(&inode->i_count); - error = nfs_wait_event(clnt, nfsi->nfs_i_wait, - !(NFS_FLAGS(inode) & flag)); + rpc_clnt_sigmask(clnt, &oldmask); + error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING, + nfs_wait_schedule, TASK_INTERRUPTIBLE); + rpc_clnt_sigunmask(clnt, &oldmask); iput(inode); + return error; } +static void nfs_wake_up_inode(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + clear_bit(NFS_INO_REVALIDATING, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING); +} + int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; - struct nfs_inode *nfsi = NFS_I(inode); - int need_atime = nfsi->flags & NFS_INO_INVALID_ATIME; + int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err; if (__IS_FLG(inode, MS_NOATIME)) @@ -851,7 +926,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rp ctx->state = NULL; ctx->lockowner = current->files; ctx->error = 0; - init_waitqueue_head(&ctx->waitq); + ctx->dir_cookie = 0; } return ctx; } @@ -968,7 +1043,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) struct nfs_fattr fattr; struct nfs_inode *nfsi = NFS_I(inode); unsigned long verifier; - unsigned int flags; + unsigned long cache_validity; dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); @@ -979,18 +1054,19 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (NFS_STALE(inode)) goto out_nowait; - while (NFS_REVALIDATING(inode)) { - status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING); - if (status < 0) - goto 
out_nowait; - if (NFS_ATTRTIMEO(inode) == 0) - continue; - if (NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) - continue; - status = NFS_STALE(inode) ? -ESTALE : 0; - goto out_nowait; + status = nfs_wait_on_inode(inode); + if (status < 0) + goto out; + if (NFS_STALE(inode)) { + status = -ESTALE; + /* Do we trust the cached ESTALE? */ + if (NFS_ATTRTIMEO(inode) != 0) { + if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) { + /* no */ + } else + goto out; + } } - NFS_FLAGS(inode) |= NFS_INO_REVALIDATING; /* Protect against RPC races by saving the change attribute */ verifier = nfs_save_change_attribute(inode); @@ -1002,7 +1078,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) if (status == -ESTALE) { nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - NFS_FLAGS(inode) |= NFS_INO_STALE; + set_bit(NFS_INO_STALE, &NFS_FLAGS(inode)); } goto out; } @@ -1014,36 +1090,30 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode), status); goto out; } - flags = nfsi->flags; + spin_lock(&inode->i_lock); + cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; + /* * We may need to keep the attributes marked as invalid if * we raced with nfs_end_attr_update(). 
*/ if (verifier == nfsi->cache_change_attribute) - nfsi->flags &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); - /* Do the page cache invalidation */ - if (flags & NFS_INO_INVALID_DATA) { - if (S_ISREG(inode->i_mode)) { - if (filemap_fdatawrite(inode->i_mapping) == 0) - filemap_fdatawait(inode->i_mapping); - nfs_wb_all(inode); - } - nfsi->flags &= ~NFS_INO_INVALID_DATA; - invalidate_inode_pages2(inode->i_mapping); - memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", - inode->i_sb->s_id, - (long long)NFS_FILEID(inode)); - /* This ensures we revalidate dentries */ - nfsi->cache_change_attribute++; - } + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); + spin_unlock(&inode->i_lock); + + nfs_revalidate_mapping(inode, inode->i_mapping); + + if (cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); + dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); -out: - NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; - wake_up(&nfsi->nfs_i_wait); + out: + nfs_wake_up_inode(inode); + out_nowait: unlock_kernel(); return status; @@ -1067,13 +1137,45 @@ int nfs_attribute_timeout(struct inode *inode) */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { - if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) + if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA)) && !nfs_attribute_timeout(inode)) return NFS_STALE(inode) ? 
-ESTALE : 0; return __nfs_revalidate_inode(server, inode); } /** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if (nfsi->cache_validity & NFS_INO_INVALID_DATA) { + if (S_ISREG(inode->i_mode)) { + if (filemap_fdatawrite(mapping) == 0) + filemap_fdatawait(mapping); + nfs_wb_all(inode); + } + invalidate_inode_pages2(mapping); + + spin_lock(&inode->i_lock); + nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; + if (S_ISDIR(inode->i_mode)) { + memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); + /* This ensures we revalidate child dentries */ + nfsi->cache_change_attribute++; + } + spin_unlock(&inode->i_lock); + + dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", + inode->i_sb->s_id, + (long long)NFS_FILEID(inode)); + } +} + +/** * nfs_begin_data_update * @inode - pointer to inode * Declare that a set of operations will update file data on the server @@ -1096,37 +1198,18 @@ void nfs_end_data_update(struct inode *inode) if (!nfs_have_delegation(inode, FMODE_READ)) { /* Mark the attribute cache for revalidation */ - nfsi->flags |= NFS_INO_INVALID_ATTR; + spin_lock(&inode->i_lock); + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; /* Directories and symlinks: invalidate page cache too */ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - nfsi->flags |= NFS_INO_INVALID_DATA; + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + spin_unlock(&inode->i_lock); } nfsi->cache_change_attribute ++; atomic_dec(&nfsi->data_updates); } /** - * nfs_end_data_update_defer - * @inode - pointer to inode - * Declare end of the operations that will update file data - * This will defer marking the inode as needing revalidation - * unless there are no other pending updates. 
- */ -void nfs_end_data_update_defer(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - if (atomic_dec_and_test(&nfsi->data_updates)) { - /* Mark the attribute cache for revalidation */ - nfsi->flags |= NFS_INO_INVALID_ATTR; - /* Directories and symlinks: invalidate page cache too */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - nfsi->flags |= NFS_INO_INVALID_DATA; - nfsi->cache_change_attribute ++; - } -} - -/** * nfs_refresh_inode - verify consistency of the inode attribute cache * @inode - pointer to inode * @fattr - updated attributes @@ -1145,6 +1228,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) if (nfs_have_delegation(inode, FMODE_READ)) return 0; + spin_lock(&inode->i_lock); + /* Are we in the process of updating data on the server? */ data_unstable = nfs_caches_unstable(inode); @@ -1152,17 +1237,24 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0 && nfsi->change_attr == fattr->pre_change_attr) nfsi->change_attr = fattr->change_attr; - if (!data_unstable && nfsi->change_attr != fattr->change_attr) - nfsi->flags |= NFS_INO_INVALID_ATTR; + if (nfsi->change_attr != fattr->change_attr) { + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (!data_unstable) + nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; + } } - if ((fattr->valid & NFS_ATTR_FATTR) == 0) + if ((fattr->valid & NFS_ATTR_FATTR) == 0) { + spin_unlock(&inode->i_lock); return 0; + } /* Has the inode gone and changed behind our back? 
*/ if (nfsi->fileid != fattr->fileid - || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + spin_unlock(&inode->i_lock); return -EIO; + } cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); @@ -1176,27 +1268,32 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) } /* Verify a few of the more important attributes */ - if (!data_unstable) { - if (!timespec_equal(&inode->i_mtime, &fattr->mtime) - || cur_size != new_isize) - nfsi->flags |= NFS_INO_INVALID_ATTR; - } else if (S_ISREG(inode->i_mode) && new_isize > cur_size) - nfsi->flags |= NFS_INO_INVALID_ATTR; + if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (!data_unstable) + nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; + } + if (cur_size != new_isize) { + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (nfsi->npages == 0) + nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; + } /* Have any file permissions changed? */ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || inode->i_uid != fattr->uid || inode->i_gid != fattr->gid) - nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; /* Has the link count changed? 
*/ if (inode->i_nlink != fattr->nlink) - nfsi->flags |= NFS_INO_INVALID_ATTR; + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (!timespec_equal(&inode->i_atime, &fattr->atime)) - nfsi->flags |= NFS_INO_INVALID_ATIME; + nfsi->cache_validity |= NFS_INO_INVALID_ATIME; nfsi->read_cache_jiffies = fattr->timestamp; + spin_unlock(&inode->i_lock); return 0; } @@ -1215,10 +1312,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsigned long verifier) { struct nfs_inode *nfsi = NFS_I(inode); - __u64 new_size; - loff_t new_isize; + loff_t cur_isize, new_isize; unsigned int invalid = 0; - loff_t cur_isize; int data_unstable; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", @@ -1237,11 +1332,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign goto out_err; } + spin_lock(&inode->i_lock); + /* * Make sure the inode's type hasn't changed. */ - if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) + if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { + spin_unlock(&inode->i_lock); goto out_changed; + } /* * Update the read time so we don't revalidate too often. @@ -1251,61 +1350,56 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign /* Are we racing with known updates of the metadata on the server? */ data_unstable = ! nfs_verify_change_attribute(inode, verifier); - /* Check if the file size agrees */ - new_size = fattr->size; + /* Check if our cached file size is stale */ new_isize = nfs_size_to_loff_t(fattr->size); cur_isize = i_size_read(inode); - if (cur_isize != new_size) { -#ifdef NFS_DEBUG_VERBOSE - printk(KERN_DEBUG "NFS: isize change on %s/%ld\n", inode->i_sb->s_id, inode->i_ino); -#endif - /* - * If we have pending writebacks, things can get - * messy. 
- */ - if (S_ISREG(inode->i_mode) && data_unstable) { - if (new_isize > cur_isize) { + if (new_isize != cur_isize) { + /* Do we perhaps have any outstanding writes? */ + if (nfsi->npages == 0) { + /* No, but did we race with nfs_end_data_update()? */ + if (verifier == nfsi->cache_change_attribute) { inode->i_size = new_isize; - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; + invalid |= NFS_INO_INVALID_DATA; } - } else { + invalid |= NFS_INO_INVALID_ATTR; + } else if (new_isize > cur_isize) { inode->i_size = new_isize; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } + dprintk("NFS: isize change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); } - /* - * Note: we don't check inode->i_mtime since pipes etc. - * can change this value in VFS without requiring a - * cache revalidation. - */ + /* Check if the mtime agrees */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); -#ifdef NFS_DEBUG_VERBOSE - printk(KERN_DEBUG "NFS: mtime change on %s/%ld\n", inode->i_sb->s_id, inode->i_ino); -#endif + dprintk("NFS: mtime change on server for file %s/%ld\n", + inode->i_sb->s_id, inode->i_ino); if (!data_unstable) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } if ((fattr->valid & NFS_ATTR_FATTR_V4) && nfsi->change_attr != fattr->change_attr) { -#ifdef NFS_DEBUG_VERBOSE - printk(KERN_DEBUG "NFS: change_attr change on %s/%ld\n", + dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); -#endif nfsi->change_attr = fattr->change_attr; if (!data_unstable) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + /* If ctime has changed we should definitely clear access+acl caches */ + if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { + if 
(!data_unstable) + invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + } memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) || inode->i_uid != fattr->uid || inode->i_gid != fattr->gid) - invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; @@ -1337,8 +1431,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; if (!nfs_have_delegation(inode, FMODE_READ)) - nfsi->flags |= invalid; + nfsi->cache_validity |= invalid; + spin_unlock(&inode->i_lock); return 0; out_changed: /* @@ -1355,7 +1450,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign */ nfs_invalidate_inode(inode); out_err: - NFS_FLAGS(inode) |= NFS_INO_STALE; + set_bit(NFS_INO_STALE, &NFS_FLAGS(inode)); return -ESTALE; } @@ -1385,74 +1480,95 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { int error; - struct nfs_server *server; + struct nfs_server *server = NULL; struct super_block *s; struct nfs_fh *root; struct nfs_mount_data *data = raw_data; - if (!data) { - printk("nfs_read_super: missing data argument\n"); - return ERR_PTR(-EINVAL); + s = ERR_PTR(-EINVAL); + if (data == NULL) { + dprintk("%s: missing data argument\n", __FUNCTION__); + goto out_err; + } + if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { + dprintk("%s: bad mount version\n", __FUNCTION__); + goto out_err; + } + switch (data->version) { + case 1: + data->namlen = 0; + case 2: + data->bsize = 0; + case 3: + if (data->flags & NFS_MOUNT_VER3) { + dprintk("%s: mount structure version %d does not support NFSv3\n", + __FUNCTION__, + data->version); + 
goto out_err; + } + data->root.size = NFS2_FHSIZE; + memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); + case 4: + if (data->flags & NFS_MOUNT_SECFLAVOUR) { + dprintk("%s: mount structure version %d does not support strong security\n", + __FUNCTION__, + data->version); + goto out_err; + } + case 5: + memset(data->context, 0, sizeof(data->context)); } +#ifndef CONFIG_NFS_V3 + /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ + s = ERR_PTR(-EPROTONOSUPPORT); + if (data->flags & NFS_MOUNT_VER3) { + dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); + goto out_err; + } +#endif /* CONFIG_NFS_V3 */ + s = ERR_PTR(-ENOMEM); server = kmalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) - return ERR_PTR(-ENOMEM); + goto out_err; memset(server, 0, sizeof(struct nfs_server)); /* Zero out the NFS state stuff */ init_nfsv4_state(server); - - if (data->version != NFS_MOUNT_VERSION) { - printk("nfs warning: mount version %s than kernel\n", - data->version < NFS_MOUNT_VERSION ? 
"older" : "newer"); - if (data->version < 2) - data->namlen = 0; - if (data->version < 3) - data->bsize = 0; - if (data->version < 4) { - data->flags &= ~NFS_MOUNT_VER3; - data->root.size = NFS2_FHSIZE; - memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); - } - if (data->version < 5) - data->flags &= ~NFS_MOUNT_SECFLAVOUR; - } + server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); root = &server->fh; if (data->flags & NFS_MOUNT_VER3) root->size = data->root.size; else root->size = NFS2_FHSIZE; + s = ERR_PTR(-EINVAL); if (root->size > sizeof(root->data)) { - printk("nfs_get_sb: invalid root filehandle\n"); - kfree(server); - return ERR_PTR(-EINVAL); + dprintk("%s: invalid root filehandle\n", __FUNCTION__); + goto out_err; } memcpy(root->data, data->root.data, root->size); /* We now require that the mount process passes the remote address */ memcpy(&server->addr, &data->addr, sizeof(server->addr)); if (server->addr.sin_addr.s_addr == INADDR_ANY) { - printk("NFS: mount program didn't pass remote address!\n"); - kfree(server); - return ERR_PTR(-EINVAL); + dprintk("%s: mount program didn't pass remote address!\n", + __FUNCTION__); + goto out_err; } - s = sget(fs_type, nfs_compare_super, nfs_set_super, server); - - if (IS_ERR(s) || s->s_root) { - kfree(server); - return s; + /* Fire up rpciod if not yet running */ + s = ERR_PTR(rpciod_up()); + if (IS_ERR(s)) { + dprintk("%s: couldn't start rpciod! Error = %ld\n", + __FUNCTION__, PTR_ERR(s)); + goto out_err; } - s->s_flags = flags; + s = sget(fs_type, nfs_compare_super, nfs_set_super, server); + if (IS_ERR(s) || s->s_root) + goto out_rpciod_down; - /* Fire up rpciod if not yet running */ - if (rpciod_up() != 0) { - printk(KERN_WARNING "NFS: couldn't start rpciod!\n"); - kfree(server); - return ERR_PTR(-EIO); - } + s->s_flags = flags; error = nfs_fill_super(s, data, flags & MS_VERBOSE ? 
1 : 0); if (error) { @@ -1462,6 +1578,11 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, } s->s_flags |= MS_ACTIVE; return s; +out_rpciod_down: + rpciod_down(); +out_err: + kfree(server); + return s; } static void nfs_kill_super(struct super_block *s) @@ -1470,10 +1591,12 @@ static void nfs_kill_super(struct super_block *s) kill_anon_super(s); - if (server->client != NULL && !IS_ERR(server->client)) + if (!IS_ERR(server->client)) rpc_shutdown_client(server->client); - if (server->client_sys != NULL && !IS_ERR(server->client_sys)) + if (!IS_ERR(server->client_sys)) rpc_shutdown_client(server->client_sys); + if (!IS_ERR(server->client_acl)) + rpc_shutdown_client(server->client_acl); if (!(server->flags & NFS_MOUNT_NONLM)) lockd_down(); /* release rpc.lockd */ @@ -1594,15 +1717,19 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, clp = nfs4_get_client(&server->addr.sin_addr); if (!clp) { - printk(KERN_WARNING "NFS: failed to create NFS4 client.\n"); + dprintk("%s: failed to create NFS4 client.\n", __FUNCTION__); return -EIO; } /* Now create transport and client */ authflavour = RPC_AUTH_UNIX; if (data->auth_flavourlen != 0) { - if (data->auth_flavourlen > 1) - printk(KERN_INFO "NFS: cannot yet deal with multiple auth flavours.\n"); + if (data->auth_flavourlen != 1) { + dprintk("%s: Invalid number of RPC auth flavours %d.\n", + __FUNCTION__, data->auth_flavourlen); + err = -EINVAL; + goto out_fail; + } if (copy_from_user(&authflavour, data->auth_flavours, sizeof(authflavour))) { err = -EFAULT; goto out_fail; @@ -1610,21 +1737,22 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, } down_write(&clp->cl_sem); - if (clp->cl_rpcclient == NULL) { + if (IS_ERR(clp->cl_rpcclient)) { xprt = xprt_create_proto(proto, &server->addr, &timeparms); if (IS_ERR(xprt)) { up_write(&clp->cl_sem); - printk(KERN_WARNING "NFS: cannot create RPC transport.\n"); err = PTR_ERR(xprt); + dprintk("%s: 
cannot create RPC transport. Error = %d\n", + __FUNCTION__, err); goto out_fail; } clnt = rpc_create_client(xprt, server->hostname, &nfs_program, server->rpc_ops->version, authflavour); if (IS_ERR(clnt)) { up_write(&clp->cl_sem); - printk(KERN_WARNING "NFS: cannot create RPC client.\n"); - xprt_destroy(xprt); err = PTR_ERR(clnt); + dprintk("%s: cannot create RPC client. Error = %d\n", + __FUNCTION__, err); goto out_fail; } clnt->cl_intr = 1; @@ -1656,21 +1784,26 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, clp = NULL; if (IS_ERR(clnt)) { - printk(KERN_WARNING "NFS: cannot create RPC client.\n"); - return PTR_ERR(clnt); + err = PTR_ERR(clnt); + dprintk("%s: cannot create RPC client. Error = %d\n", + __FUNCTION__, err); + return err; } server->client = clnt; if (server->nfs4_state->cl_idmap == NULL) { - printk(KERN_WARNING "NFS: failed to create idmapper.\n"); + dprintk("%s: failed to create idmapper.\n", __FUNCTION__); return -ENOMEM; } if (clnt->cl_auth->au_flavor != authflavour) { - if (rpcauth_create(authflavour, clnt) == NULL) { - printk(KERN_WARNING "NFS: couldn't create credcache!\n"); - return -ENOMEM; + struct rpc_auth *auth; + + auth = rpcauth_create(authflavour, clnt); + if (IS_ERR(auth)) { + dprintk("%s: couldn't create credcache!\n", __FUNCTION__); + return PTR_ERR(auth); } } @@ -1730,8 +1863,12 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, struct nfs4_mount_data *data = raw_data; void *p; - if (!data) { - printk("nfs_read_super: missing data argument\n"); + if (data == NULL) { + dprintk("%s: missing data argument\n", __FUNCTION__); + return ERR_PTR(-EINVAL); + } + if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) { + dprintk("%s: bad mount version\n", __FUNCTION__); return ERR_PTR(-EINVAL); } @@ -1741,11 +1878,7 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, memset(server, 0, sizeof(struct nfs_server)); /* Zero out the NFS state stuff */ 
init_nfsv4_state(server); - - if (data->version != NFS4_MOUNT_VERSION) { - printk("nfs warning: mount version %s than kernel\n", - data->version < NFS4_MOUNT_VERSION ? "older" : "newer"); - } + server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); p = nfs_copy_user_string(NULL, &data->hostname, 256); if (IS_ERR(p)) @@ -1773,11 +1906,20 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, } if (server->addr.sin_family != AF_INET || server->addr.sin_addr.s_addr == INADDR_ANY) { - printk("NFS: mount program didn't pass remote IP address!\n"); + dprintk("%s: mount program didn't pass remote IP address!\n", + __FUNCTION__); s = ERR_PTR(-EINVAL); goto out_free; } + /* Fire up rpciod if not yet running */ + s = ERR_PTR(rpciod_up()); + if (IS_ERR(s)) { + dprintk("%s: couldn't start rpciod! Error = %ld\n", + __FUNCTION__, PTR_ERR(s)); + goto out_free; + } + s = sget(fs_type, nfs4_compare_super, nfs_set_super, server); if (IS_ERR(s) || s->s_root) @@ -1785,13 +1927,6 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, s->s_flags = flags; - /* Fire up rpciod if not yet running */ - if (rpciod_up() != 0) { - printk(KERN_WARNING "NFS: couldn't start rpciod!\n"); - s = ERR_PTR(-EIO); - goto out_free; - } - error = nfs4_fill_super(s, data, flags & MS_VERBOSE ? 
1 : 0); if (error) { up_write(&s->s_umount); @@ -1874,7 +2009,15 @@ static struct inode *nfs_alloc_inode(struct super_block *sb) nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL); if (!nfsi) return NULL; - nfsi->flags = 0; + nfsi->flags = 0UL; + nfsi->cache_validity = 0UL; +#ifdef CONFIG_NFS_V3_ACL + nfsi->acl_access = ERR_PTR(-EAGAIN); + nfsi->acl_default = ERR_PTR(-EAGAIN); +#endif +#ifdef CONFIG_NFS_V4 + nfsi->nfs4_acl = NULL; +#endif /* CONFIG_NFS_V4 */ return &nfsi->vfs_inode; } @@ -1899,7 +2042,6 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) nfsi->ndirty = 0; nfsi->ncommit = 0; nfsi->npages = 0; - init_waitqueue_head(&nfsi->nfs_i_wait); nfs4_init_once(nfsi); } } diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 9d3ddad96d9..0e82617f2de 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -80,9 +80,7 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, clnt = rpc_create_client(xprt, hostname, &mnt_program, version, RPC_AUTH_UNIX); - if (IS_ERR(clnt)) { - xprt_destroy(xprt); - } else { + if (!IS_ERR(clnt)) { clnt->cl_softrtry = 1; clnt->cl_chatty = 1; clnt->cl_oneshot = 1; diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c new file mode 100644 index 00000000000..6a5bbc0ae94 --- /dev/null +++ b/fs/nfs/nfs3acl.c @@ -0,0 +1,405 @@ +#include <linux/fs.h> +#include <linux/nfs.h> +#include <linux/nfs3.h> +#include <linux/nfs_fs.h> +#include <linux/posix_acl_xattr.h> +#include <linux/nfsacl.h> + +#define NFSDBG_FACILITY NFSDBG_PROC + +ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int pos=0, len=0; + +# define output(s) do { \ + if (pos + sizeof(s) <= size) { \ + memcpy(buffer + pos, s, sizeof(s)); \ + pos += sizeof(s); \ + } \ + len += sizeof(s); \ + } while(0) + + acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + 
output("system.posix_acl_access"); + posix_acl_release(acl); + } + + if (S_ISDIR(inode->i_mode)) { + acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + output("system.posix_acl_default"); + posix_acl_release(acl); + } + } + +# undef output + + if (!buffer || len <= size) + return len; + return -ERANGE; +} + +ssize_t nfs3_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error = 0; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + acl = nfs3_proc_getacl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + if (type == ACL_TYPE_ACCESS && acl->a_count == 0) + error = -ENODATA; + else + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + } else + error = -ENODATA; + + return error; +} + +int nfs3_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct posix_acl *acl; + int type, error; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + error = nfs3_proc_setacl(inode, type, acl); + posix_acl_release(acl); + + return error; +} + +int nfs3_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + int type; + + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) + type = ACL_TYPE_ACCESS; + else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) + type = ACL_TYPE_DEFAULT; + else + return -EOPNOTSUPP; + + return nfs3_proc_setacl(inode, type, NULL); +} + +static void 
__nfs3_forget_cached_acls(struct nfs_inode *nfsi) +{ + if (!IS_ERR(nfsi->acl_access)) { + posix_acl_release(nfsi->acl_access); + nfsi->acl_access = ERR_PTR(-EAGAIN); + } + if (!IS_ERR(nfsi->acl_default)) { + posix_acl_release(nfsi->acl_default); + nfsi->acl_default = ERR_PTR(-EAGAIN); + } +} + +void nfs3_forget_cached_acls(struct inode *inode) +{ + dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, + inode->i_ino); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct posix_acl *acl = ERR_PTR(-EINVAL); + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + acl = nfsi->acl_access; + break; + + case ACL_TYPE_DEFAULT: + acl = nfsi->acl_default; + break; + + default: + goto out; + } + if (IS_ERR(acl)) + acl = ERR_PTR(-EAGAIN); + else + acl = posix_acl_dup(acl); +out: + spin_unlock(&inode->i_lock); + dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, + inode->i_ino, type, acl); + return acl; +} + +static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id, + inode->i_ino, acl, dfacl); + spin_lock(&inode->i_lock); + __nfs3_forget_cached_acls(NFS_I(inode)); + nfsi->acl_access = posix_acl_dup(acl); + nfsi->acl_default = posix_acl_dup(dfacl); + spin_unlock(&inode->i_lock); +} + +struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr fattr; + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_getaclargs args = { + .fh = NFS_FH(inode), + /* The xdr layer may allocate pages here. 
*/ + .pages = pages, + }; + struct nfs3_getaclres res = { + .fattr = &fattr, + }; + struct posix_acl *acl; + int status, count; + + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + return ERR_PTR(-EOPNOTSUPP); + + status = nfs_revalidate_inode(server, inode); + if (status < 0) + return ERR_PTR(status); + acl = nfs3_get_cached_acl(inode, type); + if (acl != ERR_PTR(-EAGAIN)) + return acl; + acl = NULL; + + /* + * Only get the access acl when explicitly requested: We don't + * need it for access decisions, and only some applications use + * it. Applications which request the access acl first are not + * penalized from this optimization. + */ + if (type == ACL_TYPE_ACCESS) + args.mask |= NFS_ACLCNT|NFS_ACL; + if (S_ISDIR(inode->i_mode)) + args.mask |= NFS_DFACLCNT|NFS_DFACL; + if (args.mask == 0) + return NULL; + + dprintk("NFS call getacl\n"); + status = rpc_call(server->client_acl, ACLPROC3_GETACL, + &args, &res, 0); + dprintk("NFS reply getacl: %d\n", status); + + /* pages may have been allocated at the xdr layer. 
*/ + for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) + __free_page(args.pages[count]); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, &fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL extension not supported; disabling\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + default: + goto getout; + } + if ((args.mask & res.mask) != args.mask) { + status = -EIO; + goto getout; + } + + if (res.acl_access != NULL) { + if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) { + posix_acl_release(res.acl_access); + res.acl_access = NULL; + } + } + nfs3_cache_acls(inode, res.acl_access, res.acl_default); + + switch(type) { + case ACL_TYPE_ACCESS: + acl = res.acl_access; + res.acl_access = NULL; + break; + + case ACL_TYPE_DEFAULT: + acl = res.acl_default; + res.acl_default = NULL; + } + +getout: + posix_acl_release(res.acl_access); + posix_acl_release(res.acl_default); + + if (status != 0) { + posix_acl_release(acl); + acl = ERR_PTR(status); + } + return acl; +} + +static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_fattr fattr; + struct page *pages[NFSACL_MAXPAGES] = { }; + struct nfs3_setaclargs args = { + .inode = inode, + .mask = NFS_ACL, + .acl_access = acl, + .pages = pages, + }; + int status, count; + + status = -EOPNOTSUPP; + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) + goto out; + + /* We are doing this here, because XDR marshalling can only + return -ENOMEM. 
*/ + status = -ENOSPC; + if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES) + goto out; + if (S_ISDIR(inode->i_mode)) { + args.mask |= NFS_DFACL; + args.acl_default = dfacl; + } + + dprintk("NFS call setacl\n"); + nfs_begin_data_update(inode); + status = rpc_call(server->client_acl, ACLPROC3_SETACL, + &args, &fattr, 0); + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS; + spin_unlock(&inode->i_lock); + nfs_end_data_update(inode); + dprintk("NFS reply setacl: %d\n", status); + + /* pages may have been allocated at the xdr layer. */ + for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) + __free_page(args.pages[count]); + + switch (status) { + case 0: + status = nfs_refresh_inode(inode, &fattr); + break; + case -EPFNOSUPPORT: + case -EPROTONOSUPPORT: + dprintk("NFS_V3_ACL SETACL RPC not supported" + "(will not retry)\n"); + server->caps &= ~NFS_CAP_ACLS; + case -ENOTSUPP: + status = -EOPNOTSUPP; + } +out: + return status; +} + +int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl *alloc = NULL, *dfacl = NULL; + int status; + + if (S_ISDIR(inode->i_mode)) { + switch(type) { + case ACL_TYPE_ACCESS: + alloc = dfacl = nfs3_proc_getacl(inode, + ACL_TYPE_DEFAULT); + if (IS_ERR(alloc)) + goto fail; + break; + + case ACL_TYPE_DEFAULT: + dfacl = acl; + alloc = acl = nfs3_proc_getacl(inode, + ACL_TYPE_ACCESS); + if (IS_ERR(alloc)) + goto fail; + break; + + default: + return -EINVAL; + } + } else if (type != ACL_TYPE_ACCESS) + return -EINVAL; + + if (acl == NULL) { + alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + if (IS_ERR(alloc)) + goto fail; + } + status = nfs3_proc_setacls(inode, acl, dfacl); + posix_acl_release(alloc); + return status; + +fail: + return PTR_ERR(alloc); +} + +int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, + mode_t mode) +{ + struct posix_acl 
*dfacl, *acl; + int error = 0; + + dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(dfacl)) { + error = PTR_ERR(dfacl); + return (error == -EOPNOTSUPP) ? 0 : error; + } + if (!dfacl) + return 0; + acl = posix_acl_clone(dfacl, GFP_KERNEL); + error = -ENOMEM; + if (!acl) + goto out_release_dfacl; + error = posix_acl_create_masq(acl, &mode); + if (error < 0) + goto out_release_acl; + error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ? + dfacl : NULL); +out_release_acl: + posix_acl_release(acl); +out_release_dfacl: + posix_acl_release(dfacl); + return error; +} diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 3878494dfc2..2681485cf2d 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -17,6 +17,7 @@ #include <linux/nfs_page.h> #include <linux/lockd/bind.h> #include <linux/smp_lock.h> +#include <linux/nfs_mount.h> #define NFSDBG_FACILITY NFSDBG_PROC @@ -45,7 +46,7 @@ static inline int nfs3_rpc_call_wrapper(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags) { struct rpc_message msg = { - .rpc_proc = &nfs3_procedures[proc], + .rpc_proc = &clnt->cl_procinfo[proc], .rpc_argp = argp, .rpc_resp = resp, }; @@ -119,6 +120,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, dprintk("NFS call setattr\n"); fattr->valid = 0; status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); dprintk("NFS reply setattr: %d\n", status); return status; } @@ -313,7 +316,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, .fh = &fhandle, .fattr = &fattr }; - int status; + mode_t mode = sattr->ia_mode; + int status; dprintk("NFS call create %s\n", dentry->d_name.name); arg.createmode = NFS3_CREATE_UNCHECKED; @@ -323,6 +327,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, arg.verifier[1] = current->pid; } + sattr->ia_mode &= ~current->fs->umask; + again: dir_attr.valid = 0; 
fattr.valid = 0; @@ -366,9 +372,14 @@ again: * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ status = nfs3_proc_setattr(dentry, &fattr, sattr); + if (status == 0) + nfs_setattr_update_inode(dentry->d_inode, sattr); nfs_refresh_inode(dentry->d_inode, &fattr); dprintk("NFS reply setattr (post-create): %d\n", status); } + if (status != 0) + goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); out: dprintk("NFS reply create: %d\n", status); return status; @@ -538,15 +549,24 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) .fh = &fhandle, .fattr = &fattr }; - int status; + int mode = sattr->ia_mode; + int status; dprintk("NFS call mkdir %s\n", dentry->d_name.name); dir_attr.valid = 0; fattr.valid = 0; + + sattr->ia_mode &= ~current->fs->umask; + status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0); nfs_refresh_inode(dir, &dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, &fhandle, &fattr); + if (status != 0) + goto out; + status = nfs_instantiate(dentry, &fhandle, &fattr); + if (status != 0) + goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: dprintk("NFS reply mkdir: %d\n", status); return status; } @@ -641,6 +661,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, .fh = &fh, .fattr = &fattr }; + mode_t mode = sattr->ia_mode; int status; switch (sattr->ia_mode & S_IFMT) { @@ -653,12 +674,20 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); + + sattr->ia_mode &= ~current->fs->umask; + dir_attr.valid = 0; fattr.valid = 0; status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0); nfs_refresh_inode(dir, &dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, &fh, &fattr); + if (status != 0) + goto out; + status = nfs_instantiate(dentry, &fh, &fattr); + if 
(status != 0) + goto out; + status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode); +out: dprintk("NFS reply mknod: %d\n", status); return status; } @@ -825,7 +854,8 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) struct nfs_rpc_ops nfs_v3_clientops = { .version = 3, /* protocol version */ .dentry_ops = &nfs_dentry_operations, - .dir_inode_ops = &nfs_dir_inode_operations, + .dir_inode_ops = &nfs3_dir_inode_operations, + .file_inode_ops = &nfs3_file_inode_operations, .getroot = nfs3_proc_get_root, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, @@ -856,4 +886,5 @@ struct nfs_rpc_ops nfs_v3_clientops = { .file_open = nfs_open, .file_release = nfs_release, .lock = nfs3_proc_lock, + .clear_acl_cache = nfs3_forget_cached_acls, }; diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index a3593d47e5a..db4a904810a 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -21,6 +21,7 @@ #include <linux/nfs.h> #include <linux/nfs3.h> #include <linux/nfs_fs.h> +#include <linux/nfsacl.h> #define NFSDBG_FACILITY NFSDBG_XDR @@ -79,6 +80,11 @@ extern int nfs_stat_to_errno(int); #define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) +#define ACL3_getaclargs_sz (NFS3_fh_sz+1) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) + /* * Map file type to S_IFMT bits */ @@ -627,6 +633,74 @@ nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args) return 0; } +#ifdef CONFIG_NFS_V3_ACL +/* + * Encode GETACL arguments + */ +static int +nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p, + struct nfs3_getaclargs *args) +{ + struct rpc_auth *auth = req->rq_task->tk_auth; + unsigned int replen; + + p = xdr_encode_fhandle(p, args->fh); + *p++ = htonl(args->mask); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + + if (args->mask & (NFS_ACL | NFS_DFACL)) { + /* Inline the 
page array */ + replen = (RPC_REPHDRSIZE + auth->au_rslack + + ACL3_getaclres_sz) << 2; + xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0, + NFSACL_MAXPAGES << PAGE_SHIFT); + } + return 0; +} + +/* + * Encode SETACL arguments + */ +static int +nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p, + struct nfs3_setaclargs *args) +{ + struct xdr_buf *buf = &req->rq_snd_buf; + unsigned int base, len_in_head, len = nfsacl_size( + (args->mask & NFS_ACL) ? args->acl_access : NULL, + (args->mask & NFS_DFACL) ? args->acl_default : NULL); + int count, err; + + p = xdr_encode_fhandle(p, NFS_FH(args->inode)); + *p++ = htonl(args->mask); + base = (char *)p - (char *)buf->head->iov_base; + /* put as much of the acls into head as possible. */ + len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); + len -= len_in_head; + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); + + for (count = 0; (count << PAGE_SHIFT) < len; count++) { + args->pages[count] = alloc_page(GFP_KERNEL); + if (!args->pages[count]) { + while (count) + __free_page(args->pages[--count]); + return -ENOMEM; + } + } + xdr_encode_pages(buf, args->pages, 0, len); + + err = nfsacl_encode(buf, base, args->inode, + (args->mask & NFS_ACL) ? + args->acl_access : NULL, 1, 0); + if (err > 0) + err = nfsacl_encode(buf, base + err, args->inode, + (args->mask & NFS_DFACL) ? + args->acl_default : NULL, 1, + NFS_ACL_DEFAULT); + return (err > 0) ? 
0 : err; +} +#endif /* CONFIG_NFS_V3_ACL */ + /* * NFS XDR decode functions */ @@ -978,6 +1052,54 @@ nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res) return 0; } +#ifdef CONFIG_NFS_V3_ACL +/* + * Decode GETACL reply + */ +static int +nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p, + struct nfs3_getaclres *res) +{ + struct xdr_buf *buf = &req->rq_rcv_buf; + int status = ntohl(*p++); + struct posix_acl **acl; + unsigned int *aclcnt; + int err, base; + + if (status != 0) + return -nfs_stat_to_errno(status); + p = xdr_decode_post_op_attr(p, res->fattr); + res->mask = ntohl(*p++); + if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + return -EINVAL; + base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base; + + acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL; + aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL; + err = nfsacl_decode(buf, base, aclcnt, acl); + + acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL; + aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL; + if (err > 0) + err = nfsacl_decode(buf, base + err, aclcnt, acl); + return (err > 0) ? 0 : err; +} + +/* + * Decode setacl reply. + */ +static int +nfs3_xdr_setaclres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr) +{ + int status = ntohl(*p++); + + if (status) + return -nfs_stat_to_errno(status); + xdr_decode_post_op_attr(p, fattr); + return 0; +} +#endif /* CONFIG_NFS_V3_ACL */ + #ifndef MAX # define MAX(a, b) (((a) > (b))? 
(a) : (b)) #endif @@ -1021,3 +1143,28 @@ struct rpc_version nfs_version3 = { .procs = nfs3_procedures }; +#ifdef CONFIG_NFS_V3_ACL +static struct rpc_procinfo nfs3_acl_procedures[] = { + [ACLPROC3_GETACL] = { + .p_proc = ACLPROC3_GETACL, + .p_encode = (kxdrproc_t) nfs3_xdr_getaclargs, + .p_decode = (kxdrproc_t) nfs3_xdr_getaclres, + .p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2, + .p_timer = 1, + }, + [ACLPROC3_SETACL] = { + .p_proc = ACLPROC3_SETACL, + .p_encode = (kxdrproc_t) nfs3_xdr_setaclargs, + .p_decode = (kxdrproc_t) nfs3_xdr_setaclres, + .p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2, + .p_timer = 0, + }, +}; + +struct rpc_version nfsacl_version3 = { + .number = 3, + .nrprocs = sizeof(nfs3_acl_procedures)/ + sizeof(nfs3_acl_procedures[0]), + .procs = nfs3_acl_procedures, +}; +#endif /* CONFIG_NFS_V3_ACL */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h new file mode 100644 index 00000000000..ec1a22d7b87 --- /dev/null +++ b/fs/nfs/nfs4_fs.h @@ -0,0 +1,253 @@ +/* + * linux/fs/nfs/nfs4_fs.h + * + * Copyright (C) 2005 Trond Myklebust + * + * NFSv4-specific filesystem definitions and declarations + */ + +#ifndef __LINUX_FS_NFS_NFS4_FS_H +#define __LINUX_FS_NFS_NFS4_FS_H + +#ifdef CONFIG_NFS_V4 + +struct idmap; + +/* + * In a seqid-mutating op, this macro controls which error return + * values trigger incrementation of the seqid. + * + * from rfc 3010: + * The client MUST monotonically increment the sequence number for the + * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE + * operations. This is true even in the event that the previous + * operation that used the sequence number received an error. The only + * exception to this rule is if the previous operation received one of + * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID, + * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR, + * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE. 
+ * + */ +#define seqid_mutating_err(err) \ +(((err) != NFSERR_STALE_CLIENTID) && \ + ((err) != NFSERR_STALE_STATEID) && \ + ((err) != NFSERR_BAD_STATEID) && \ + ((err) != NFSERR_BAD_SEQID) && \ + ((err) != NFSERR_BAD_XDR) && \ + ((err) != NFSERR_RESOURCE) && \ + ((err) != NFSERR_NOFILEHANDLE)) + +enum nfs4_client_state { + NFS4CLNT_OK = 0, +}; + +/* + * The nfs4_client identifies our client state to the server. + */ +struct nfs4_client { + struct list_head cl_servers; /* Global list of servers */ + struct in_addr cl_addr; /* Server identifier */ + u64 cl_clientid; /* constant */ + nfs4_verifier cl_confirm; + unsigned long cl_state; + + u32 cl_lockowner_id; + + /* + * The following rwsem ensures exclusive access to the server + * while we recover the state following a lease expiration. + */ + struct rw_semaphore cl_sem; + + struct list_head cl_delegations; + struct list_head cl_state_owners; + struct list_head cl_unused; + int cl_nunused; + spinlock_t cl_lock; + atomic_t cl_count; + + struct rpc_clnt * cl_rpcclient; + struct rpc_cred * cl_cred; + + struct list_head cl_superblocks; /* List of nfs_server structs */ + + unsigned long cl_lease_time; + unsigned long cl_last_renewal; + struct work_struct cl_renewd; + struct work_struct cl_recoverd; + + wait_queue_head_t cl_waitq; + struct rpc_wait_queue cl_rpcwaitq; + + /* used for the setclientid verifier */ + struct timespec cl_boot_time; + + /* idmapper */ + struct idmap * cl_idmap; + + /* Our own IP address, as a null-terminated string. + * This is used to generate the clientid, and the callback address. + */ + char cl_ipaddr[16]; + unsigned char cl_id_uniquifier; +}; + +/* + * NFS4 state_owners and lock_owners are simply labels for ordered + * sequences of RPC calls. Their sole purpose is to provide once-only + * semantics by allowing the server to identify replayed requests. + * + * The ->so_sema is held during all state_owner seqid-mutating operations: + * OPEN, OPEN_DOWNGRADE, and CLOSE. 
Its purpose is to properly serialize + * so_seqid. + */ +struct nfs4_state_owner { + struct list_head so_list; /* per-clientid list of state_owners */ + struct nfs4_client *so_client; + u32 so_id; /* 32-bit identifier, unique */ + struct semaphore so_sema; + u32 so_seqid; /* protected by so_sema */ + atomic_t so_count; + + struct rpc_cred *so_cred; /* Associated cred */ + struct list_head so_states; + struct list_head so_delegations; +}; + +/* + * struct nfs4_state maintains the client-side state for a given + * (state_owner,inode) tuple (OPEN) or state_owner (LOCK). + * + * OPEN: + * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server, + * we need to know how many files are open for reading or writing on a + * given inode. This information too is stored here. + * + * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) + */ + +struct nfs4_lock_state { + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ + fl_owner_t ls_owner; /* POSIX lock owner */ +#define NFS_LOCK_INITIALIZED 1 + int ls_flags; + u32 ls_seqid; + u32 ls_id; + nfs4_stateid ls_stateid; + atomic_t ls_count; +}; + +/* bits for nfs4_state->flags */ +enum { + LK_STATE_IN_USE, + NFS_DELEGATED_STATE, +}; + +struct nfs4_state { + struct list_head open_states; /* List of states for the same state_owner */ + struct list_head inode_states; /* List of states for the same inode */ + struct list_head lock_states; /* List of subservient lock stateids */ + + struct nfs4_state_owner *owner; /* Pointer to the open owner */ + struct inode *inode; /* Pointer to the inode */ + + unsigned long flags; /* Do we hold any locks? 
*/ + struct semaphore lock_sema; /* Serializes file locking operations */ + spinlock_t state_lock; /* Protects the lock_states list */ + + nfs4_stateid stateid; + + unsigned int nreaders; + unsigned int nwriters; + int state; /* State on the server (R,W, or RW) */ + atomic_t count; +}; + + +struct nfs4_exception { + long timeout; + int retry; +}; + +struct nfs4_state_recovery_ops { + int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *); + int (*recover_lock)(struct nfs4_state *, struct file_lock *); +}; + +extern struct dentry_operations nfs4_dentry_operations; +extern struct inode_operations nfs4_dir_inode_operations; + +/* inode.c */ +extern ssize_t nfs4_getxattr(struct dentry *, const char *, void *, size_t); +extern int nfs4_setxattr(struct dentry *, const char *, const void *, size_t, int); +extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); + + +/* nfs4proc.c */ +extern int nfs4_map_errors(int err); +extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); +extern int nfs4_proc_setclientid_confirm(struct nfs4_client *); +extern int nfs4_proc_async_renew(struct nfs4_client *); +extern int nfs4_proc_renew(struct nfs4_client *); +extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode); +extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); +extern int nfs4_open_revalidate(struct inode *, struct dentry *, int); + +extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops; +extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops; + +extern const u32 nfs4_fattr_bitmap[2]; +extern const u32 nfs4_statfs_bitmap[2]; +extern const u32 nfs4_pathconf_bitmap[2]; +extern const u32 nfs4_fsinfo_bitmap[2]; + +/* nfs4renewd.c */ +extern void nfs4_schedule_state_renewal(struct nfs4_client *); +extern void nfs4_renewd_prepare_shutdown(struct nfs_server *); +extern void nfs4_kill_renewd(struct nfs4_client *); +extern void nfs4_renew_state(void 
*); + +/* nfs4state.c */ +extern void init_nfsv4_state(struct nfs_server *); +extern void destroy_nfsv4_state(struct nfs_server *); +extern struct nfs4_client *nfs4_get_client(struct in_addr *); +extern void nfs4_put_client(struct nfs4_client *clp); +extern int nfs4_init_client(struct nfs4_client *clp); +extern struct nfs4_client *nfs4_find_client(struct in_addr *); +extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *); + +extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); +extern void nfs4_put_state_owner(struct nfs4_state_owner *); +extern void nfs4_drop_state_owner(struct nfs4_state_owner *); +extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); +extern void nfs4_put_open_state(struct nfs4_state *); +extern void nfs4_close_state(struct nfs4_state *, mode_t); +extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode); +extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp); +extern void nfs4_schedule_state_recovery(struct nfs4_client *); +extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); +extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls); +extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); + +extern const nfs4_stateid zero_stateid; + +/* nfs4xdr.c */ +extern uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus); +extern struct rpc_procinfo nfs4_procedures[]; + +struct nfs4_mount_data; + +/* callback_xdr.c */ +extern struct svc_version nfs4_callback_version1; + +#else + +#define init_nfsv4_state(server) do { } while (0) +#define destroy_nfsv4_state(server) do { } while (0) +#define nfs4_put_state_owner(inode, owner) do { } while (0) +#define nfs4_put_open_state(state) do { } while (0) +#define nfs4_close_state(a, b) do { } while (0) + +#endif /* CONFIG_NFS_V4 */ +#endif /* __LINUX_FS_NFS_NFS4_FS.H */ diff --git 
a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1d5cb3e80c3..0c5a308e496 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -48,6 +48,7 @@ #include <linux/smp_lock.h> #include <linux/namei.h> +#include "nfs4_fs.h" #include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -62,8 +63,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; -extern nfs4_stateid zero_stateid; - /* Prevent leaks of NFSv4 errors into userland */ int nfs4_map_errors(int err) { @@ -104,7 +103,7 @@ const u32 nfs4_statfs_bitmap[2] = { | FATTR4_WORD1_SPACE_TOTAL }; -u32 nfs4_pathconf_bitmap[2] = { +const u32 nfs4_pathconf_bitmap[2] = { FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME, 0 @@ -124,7 +123,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, BUG_ON(readdir->count < 80); if (cookie > 2) { - readdir->cookie = (cookie > 2) ? 
cookie : 0; + readdir->cookie = cookie; memcpy(&readdir->verifier, verifier, sizeof(readdir->verifier)); return; } @@ -270,14 +269,9 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta int err; do { err = _nfs4_open_reclaim(sp, state); - switch (err) { - case 0: - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - return err; - } - err = nfs4_handle_exception(server, err, &exception); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); } while (exception.retry); return err; } @@ -509,6 +503,20 @@ out_stale: goto out_nodeleg; } +static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) +{ + struct nfs_server *server = NFS_SERVER(dentry->d_inode); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_open_expired(sp, state, dentry); + if (err == -NFS4ERR_DELAY) + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; +} + static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) { struct nfs_inode *nfsi = NFS_I(state->inode); @@ -521,7 +529,7 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta continue; get_nfs_open_context(ctx); spin_unlock(&state->inode->i_lock); - status = _nfs4_open_expired(sp, state, ctx->dentry); + status = nfs4_do_open_expired(sp, state, ctx->dentry); put_nfs_open_context(ctx); return status; } @@ -745,17 +753,18 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, .rpc_argp = &arg, .rpc_resp = &res, }; + int status; fattr->valid = 0; - if (state != NULL) + if (state != NULL) { msg.rpc_cred = state->owner->so_cred; - if (sattr->ia_valid & ATTR_SIZE) - nfs4_copy_stateid(&arg.stateid, state, NULL); - else + nfs4_copy_stateid(&arg.stateid, state, current->files); + } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); - return 
rpc_call_sync(server->client, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + return status; } static int nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, @@ -1116,47 +1125,33 @@ static int nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode * inode = dentry->d_inode; - int size_change = sattr->ia_valid & ATTR_SIZE; - struct nfs4_state *state = NULL; - int need_iput = 0; + struct rpc_cred *cred; + struct inode *inode = dentry->d_inode; + struct nfs4_state *state; int status; fattr->valid = 0; - if (size_change) { - struct rpc_cred *cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); - if (IS_ERR(cred)) - return PTR_ERR(cred); + cred = rpcauth_lookupcred(NFS_SERVER(inode)->client->cl_auth, 0); + if (IS_ERR(cred)) + return PTR_ERR(cred); + /* Search for an existing WRITE delegation first */ + state = nfs4_open_delegated(inode, FMODE_WRITE, cred); + if (!IS_ERR(state)) { + /* NB: nfs4_open_delegated() bumps the inode->i_count */ + iput(inode); + } else { + /* Search for an existing open(O_WRITE) stateid */ state = nfs4_find_state(inode, cred, FMODE_WRITE); - if (state == NULL) { - state = nfs4_open_delegated(dentry->d_inode, - FMODE_WRITE, cred); - if (IS_ERR(state)) - state = nfs4_do_open(dentry->d_parent->d_inode, - dentry, FMODE_WRITE, - NULL, cred); - need_iput = 1; - } - put_rpccred(cred); - if (IS_ERR(state)) - return PTR_ERR(state); - - if (state->inode != inode) { - printk(KERN_WARNING "nfs: raced in setattr (%p != %p), returning -EIO\n", inode, state->inode); - status = -EIO; - goto out; - } } + status = nfs4_do_setattr(NFS_SERVER(inode), fattr, NFS_FH(inode), sattr, state); -out: - if (state) { - inode = state->inode; + if (status == 0) + nfs_setattr_update_inode(inode, sattr); + if (state != NULL) nfs4_close_state(state, FMODE_WRITE); - if (need_iput) - iput(inode); - } + put_rpccred(cred); return status; } @@ -1458,8 +1453,10 @@ nfs4_proc_create(struct inode 
*dir, struct dentry *dentry, struct iattr *sattr, struct nfs_fattr fattr; status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, NFS_FH(state->inode), sattr, state); - if (status == 0) + if (status == 0) { + nfs_setattr_update_inode(state->inode, sattr); goto out; + } } else if (flags != 0) goto out; nfs4_close_state(state, flags); @@ -1731,6 +1728,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, }; int status; + dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __FUNCTION__, + dentry->d_parent->d_name.name, + dentry->d_name.name, + (unsigned long long)cookie); lock_kernel(); nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; @@ -1738,6 +1739,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, if (status == 0) memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); unlock_kernel(); + dprintk("%s: returns %d\n", __FUNCTION__, status); return status; } @@ -2163,6 +2165,193 @@ nfs4_proc_file_release(struct inode *inode, struct file *filp) return 0; } +static inline int nfs4_server_supports_acls(struct nfs_server *server) +{ + return (server->caps & NFS_CAP_ACLS) + && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL) + && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL); +} + +/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_CACHE_SIZE, and that + * it's OK to put sizeof(void) * (XATTR_SIZE_MAX/PAGE_CACHE_SIZE) bytes on + * the stack. 
+ */ +#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) + +static void buf_to_pages(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + const void *p = buf; + + *pgbase = offset_in_page(buf); + p -= *pgbase; + while (p < buf + buflen) { + *(pages++) = virt_to_page(p); + p += PAGE_CACHE_SIZE; + } +} + +struct nfs4_cached_acl { + int cached; + size_t len; + char data[0]; +}; + +static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + spin_lock(&inode->i_lock); + kfree(nfsi->nfs4_acl); + nfsi->nfs4_acl = acl; + spin_unlock(&inode->i_lock); +} + +static void nfs4_zap_acl_attr(struct inode *inode) +{ + nfs4_set_cached_acl(inode, NULL); +} + +static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs4_cached_acl *acl; + int ret = -ENOENT; + + spin_lock(&inode->i_lock); + acl = nfsi->nfs4_acl; + if (acl == NULL) + goto out; + if (buf == NULL) /* user is just asking for length */ + goto out_len; + if (acl->cached == 0) + goto out; + ret = -ERANGE; /* see getxattr(2) man page */ + if (acl->len > buflen) + goto out; + memcpy(buf, acl->data, acl->len); +out_len: + ret = acl->len; +out: + spin_unlock(&inode->i_lock); + return ret; +} + +static void nfs4_write_cached_acl(struct inode *inode, const char *buf, size_t acl_len) +{ + struct nfs4_cached_acl *acl; + + if (buf && acl_len <= PAGE_SIZE) { + acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 1; + memcpy(acl->data, buf, acl_len); + } else { + acl = kmalloc(sizeof(*acl), GFP_KERNEL); + if (acl == NULL) + goto out; + acl->cached = 0; + } + acl->len = acl_len; +out: + nfs4_set_cached_acl(inode, acl); +} + +static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct page *pages[NFS4ACL_MAXPAGES]; + struct nfs_getaclargs args = { + .fh = 
NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + size_t resp_len = buflen; + void *resp_buf; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL], + .rpc_argp = &args, + .rpc_resp = &resp_len, + }; + struct page *localpage = NULL; + int ret; + + if (buflen < PAGE_SIZE) { + /* As long as we're doing a round trip to the server anyway, + * let's be prepared for a page of acl data. */ + localpage = alloc_page(GFP_KERNEL); + resp_buf = page_address(localpage); + if (localpage == NULL) + return -ENOMEM; + args.acl_pages[0] = localpage; + args.acl_pgbase = 0; + args.acl_len = PAGE_SIZE; + } else { + resp_buf = buf; + buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); + } + ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (ret) + goto out_free; + if (resp_len > args.acl_len) + nfs4_write_cached_acl(inode, NULL, resp_len); + else + nfs4_write_cached_acl(inode, resp_buf, resp_len); + if (buf) { + ret = -ERANGE; + if (resp_len > buflen) + goto out_free; + if (localpage) + memcpy(buf, resp_buf, resp_len); + } + ret = resp_len; +out_free: + if (localpage) + __free_page(localpage); + return ret; +} + +static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + ret = nfs_revalidate_inode(server, inode); + if (ret < 0) + return ret; + ret = nfs4_read_cached_acl(inode, buf, buflen); + if (ret != -ENOENT) + return ret; + return nfs4_get_acl_uncached(inode, buf, buflen); +} + +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct page *pages[NFS4ACL_MAXPAGES]; + struct nfs_setaclargs arg = { + .fh = NFS_FH(inode), + .acl_pages = pages, + .acl_len = buflen, + }; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETACL], + .rpc_argp = &arg, + .rpc_resp = NULL, + }; + 
int ret; + + if (!nfs4_server_supports_acls(server)) + return -EOPNOTSUPP; + buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0); + if (ret == 0) + nfs4_write_cached_acl(inode, buf, buflen); + return ret; +} + static int nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server) { @@ -2448,14 +2637,11 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock down_read(&clp->cl_sem); nlo.clientid = clp->cl_clientid; down(&state->lock_sema); - lsp = nfs4_find_lock_state(state, request->fl_owner); - if (lsp) - nlo.id = lsp->ls_id; - else { - spin_lock(&clp->cl_lock); - nlo.id = nfs4_alloc_lockowner_id(clp); - spin_unlock(&clp->cl_lock); - } + status = nfs4_set_lock_state(state, request); + if (status != 0) + goto out; + lsp = request->fl_u.nfs4_fl.owner; + nlo.id = lsp->ls_id; arg.u.lockt = &nlo; status = rpc_call_sync(server->client, &msg, 0); if (!status) { @@ -2476,8 +2662,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock request->fl_pid = 0; status = 0; } - if (lsp) - nfs4_put_lock_state(lsp); +out: up(&state->lock_sema); up_read(&clp->cl_sem); return status; @@ -2537,28 +2722,26 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock }; struct nfs4_lock_state *lsp; struct nfs_locku_opargs luargs; - int status = 0; + int status; down_read(&clp->cl_sem); down(&state->lock_sema); - lsp = nfs4_find_lock_state(state, request->fl_owner); - if (!lsp) + status = nfs4_set_lock_state(state, request); + if (status != 0) goto out; + lsp = request->fl_u.nfs4_fl.owner; /* We might have lost the locks! 
*/ - if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) { - luargs.seqid = lsp->ls_seqid; - memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid)); - arg.u.locku = &luargs; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs4_increment_lock_seqid(status, lsp); - } + if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) + goto out; + luargs.seqid = lsp->ls_seqid; + memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid)); + arg.u.locku = &luargs; + status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); + nfs4_increment_lock_seqid(status, lsp); - if (status == 0) { + if (status == 0) memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(lsp->ls_stateid)); - nfs4_notify_unlck(state, request, lsp); - } - nfs4_put_lock_state(lsp); out: up(&state->lock_sema); if (status == 0) @@ -2584,7 +2767,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r { struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_lock_state *lsp; + struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; struct nfs_lockargs arg = { .fh = NFS_FH(inode), .type = nfs4_lck_type(cmd, request), @@ -2606,9 +2789,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r }; int status; - lsp = nfs4_get_lock_state(state, request->fl_owner); - if (lsp == NULL) - return -ENOMEM; if (!(lsp->ls_flags & NFS_LOCK_INITIALIZED)) { struct nfs4_state_owner *owner = state->owner; struct nfs_open_to_lock otl = { @@ -2630,38 +2810,57 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r * seqid mutating errors */ nfs4_increment_seqid(status, owner); up(&owner->so_sema); + if (status == 0) { + lsp->ls_flags |= NFS_LOCK_INITIALIZED; + lsp->ls_seqid++; + } } else { struct nfs_exist_lock el = { .seqid = lsp->ls_seqid, }; memcpy(&el.stateid, &lsp->ls_stateid, sizeof(el.stateid)); largs.u.exist_lock = &el; - largs.new_lock_owner = 0; arg.u.lock = &largs; status = 
rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); + /* increment seqid on success, and * seqid mutating errors*/ + nfs4_increment_lock_seqid(status, lsp); } - /* increment seqid on success, and * seqid mutating errors*/ - nfs4_increment_lock_seqid(status, lsp); /* save the returned stateid. */ - if (status == 0) { + if (status == 0) memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid)); - lsp->ls_flags |= NFS_LOCK_INITIALIZED; - if (!reclaim) - nfs4_notify_setlk(state, request, lsp); - } else if (status == -NFS4ERR_DENIED) + else if (status == -NFS4ERR_DENIED) status = -EAGAIN; - nfs4_put_lock_state(lsp); return status; } static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) { - return _nfs4_do_setlk(state, F_SETLK, request, 1); + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_do_setlk(state, F_SETLK, request, 1); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; } static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) { - return _nfs4_do_setlk(state, F_SETLK, request, 0); + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs4_do_setlk(state, F_SETLK, request, 0); + if (err != -NFS4ERR_DELAY) + break; + nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + return err; } static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) @@ -2671,7 +2870,9 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock down_read(&clp->cl_sem); down(&state->lock_sema); - status = _nfs4_do_setlk(state, cmd, request, 0); + status = nfs4_set_lock_state(state, request); + if (status == 0) + status = _nfs4_do_setlk(state, cmd, request, 0); up(&state->lock_sema); if (status == 0) { /* Note: we always want 
to sleep here! */ @@ -2729,10 +2930,53 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (signalled()) break; } while(status < 0); - return status; } + +#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" + +int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, + size_t buflen, int flags) +{ + struct inode *inode = dentry->d_inode; + + if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) + return -EOPNOTSUPP; + + if (!S_ISREG(inode->i_mode) && + (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + + return nfs4_proc_set_acl(inode, buf, buflen); +} + +/* The getxattr man page suggests returning -ENODATA for unknown attributes, + * and that's what we'll do for e.g. user attributes that haven't been set. + * But we'll follow ext2/ext3's lead by returning -EOPNOTSUPP for unsupported + * attributes in kernel-managed attribute namespaces. */ +ssize_t nfs4_getxattr(struct dentry *dentry, const char *key, void *buf, + size_t buflen) +{ + struct inode *inode = dentry->d_inode; + + if (strcmp(key, XATTR_NAME_NFSV4_ACL) != 0) + return -EOPNOTSUPP; + + return nfs4_proc_get_acl(inode, buf, buflen); +} + +ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen) +{ + size_t len = strlen(XATTR_NAME_NFSV4_ACL) + 1; + + if (buf && buflen < len) + return -ERANGE; + if (buf) + memcpy(buf, XATTR_NAME_NFSV4_ACL, len); + return len; +} + struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = { .recover_open = nfs4_open_reclaim, .recover_lock = nfs4_lock_reclaim, @@ -2743,10 +2987,20 @@ struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = { .recover_lock = nfs4_lock_expired, }; +static struct inode_operations nfs4_file_inode_operations = { + .permission = nfs_permission, + .getattr = nfs_getattr, + .setattr = nfs_setattr, + .getxattr = nfs4_getxattr, + .setxattr = nfs4_setxattr, + .listxattr = nfs4_listxattr, +}; + struct nfs_rpc_ops nfs_v4_clientops = { .version = 4, /* protocol version */ .dentry_ops 
= &nfs4_dentry_operations, .dir_inode_ops = &nfs4_dir_inode_operations, + .file_inode_ops = &nfs4_file_inode_operations, .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, @@ -2777,6 +3031,7 @@ struct nfs_rpc_ops nfs_v4_clientops = { .file_open = nfs4_proc_file_open, .file_release = nfs4_proc_file_release, .lock = nfs4_proc_lock, + .clear_acl_cache = nfs4_zap_acl_attr, }; /* diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 667e06f1c64..a3001628ad3 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -53,6 +53,7 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include "nfs4_fs.h" #define NFSDBG_FACILITY NFSDBG_PROC diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 231cebce3c8..afe587d82f1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -46,24 +46,18 @@ #include <linux/workqueue.h> #include <linux/bitops.h> +#include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #define OPENOWNER_POOL_SIZE 8 -static DEFINE_SPINLOCK(state_spinlock); - -nfs4_stateid zero_stateid; - -#if 0 -nfs4_stateid one_stateid = - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; -#endif +const nfs4_stateid zero_stateid; +static DEFINE_SPINLOCK(state_spinlock); static LIST_HEAD(nfs4_clientid_list); static void nfs4_recover_state(void *); -extern void nfs4_renew_state(void *); void init_nfsv4_state(struct nfs_server *server) @@ -116,6 +110,7 @@ nfs4_alloc_client(struct in_addr *addr) INIT_LIST_HEAD(&clp->cl_superblocks); init_waitqueue_head(&clp->cl_waitq); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); + clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_OK; return clp; @@ -137,7 +132,7 @@ nfs4_free_client(struct nfs4_client *clp) if (clp->cl_cred) put_rpccred(clp->cl_cred); nfs_idmap_delete(clp); - if (clp->cl_rpcclient) + if (!IS_ERR(clp->cl_rpcclient)) 
rpc_shutdown_client(clp->cl_rpcclient); kfree(clp); nfs_callback_down(); @@ -365,7 +360,7 @@ nfs4_alloc_open_state(void) atomic_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); init_MUTEX(&state->lock_sema); - rwlock_init(&state->state_lock); + spin_lock_init(&state->state_lock); return state; } @@ -547,16 +542,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) return NULL; } -struct nfs4_lock_state * -nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) -{ - struct nfs4_lock_state *lsp; - read_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); - read_unlock(&state->state_lock); - return lsp; -} - /* * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. @@ -573,14 +558,13 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f return NULL; lsp->ls_flags = 0; lsp->ls_seqid = 0; /* arbitrary */ - lsp->ls_id = -1; memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data)); atomic_set(&lsp->ls_count, 1); lsp->ls_owner = fl_owner; - INIT_LIST_HEAD(&lsp->ls_locks); spin_lock(&clp->cl_lock); lsp->ls_id = nfs4_alloc_lockowner_id(clp); spin_unlock(&clp->cl_lock); + INIT_LIST_HEAD(&lsp->ls_locks); return lsp; } @@ -590,121 +574,112 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f * * The caller must be holding state->lock_sema and clp->cl_sem */ -struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) { - struct nfs4_lock_state * lsp; + struct nfs4_lock_state *lsp, *new = NULL; - lsp = nfs4_find_lock_state(state, owner); - if (lsp == NULL) - lsp = nfs4_alloc_lock_state(state, owner); + for(;;) { + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, owner); + if (lsp != NULL) + break; + if (new != NULL) { + new->ls_state = state; + 
list_add(&new->ls_locks, &state->lock_states); + set_bit(LK_STATE_IN_USE, &state->flags); + lsp = new; + new = NULL; + break; + } + spin_unlock(&state->state_lock); + new = nfs4_alloc_lock_state(state, owner); + if (new == NULL) + return NULL; + } + spin_unlock(&state->state_lock); + kfree(new); return lsp; } /* - * Byte-range lock aware utility to initialize the stateid of read/write - * requests. + * Release reference to lock_state, and free it if we see that + * it is no longer in use */ -void -nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) +static void nfs4_put_lock_state(struct nfs4_lock_state *lsp) { - if (test_bit(LK_STATE_IN_USE, &state->flags)) { - struct nfs4_lock_state *lsp; + struct nfs4_state *state; - lsp = nfs4_find_lock_state(state, fl_owner); - if (lsp) { - memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); - nfs4_put_lock_state(lsp); - return; - } - } - memcpy(dst, &state->stateid, sizeof(*dst)); + if (lsp == NULL) + return; + state = lsp->ls_state; + if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + return; + list_del(&lsp->ls_locks); + if (list_empty(&state->lock_states)) + clear_bit(LK_STATE_IN_USE, &state->flags); + spin_unlock(&state->state_lock); + kfree(lsp); } -/* -* Called with state->lock_sema and clp->cl_sem held. -*/ -void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp) +static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - if (status == NFS_OK || seqid_mutating_err(-status)) - lsp->ls_seqid++; -} + struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; -/* -* Check to see if the request lock (type FL_UNLK) effects the fl lock. 
-* -* fl and request must have the same posix owner -* -* return: -* 0 -> fl not effected by request -* 1 -> fl consumed by request -*/ + dst->fl_u.nfs4_fl.owner = lsp; + atomic_inc(&lsp->ls_count); +} -static int -nfs4_check_unlock(struct file_lock *fl, struct file_lock *request) +static void nfs4_fl_release_lock(struct file_lock *fl) { - if (fl->fl_start >= request->fl_start && fl->fl_end <= request->fl_end) - return 1; - return 0; + nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner); } -/* - * Post an initialized lock_state on the state->lock_states list. - */ -void nfs4_notify_setlk(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp) +static struct file_lock_operations nfs4_fl_lock_ops = { + .fl_copy_lock = nfs4_fl_copy_lock, + .fl_release_private = nfs4_fl_release_lock, +}; + +int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) { - if (!list_empty(&lsp->ls_locks)) - return; - atomic_inc(&lsp->ls_count); - write_lock(&state->state_lock); - list_add(&lsp->ls_locks, &state->lock_states); - set_bit(LK_STATE_IN_USE, &state->flags); - write_unlock(&state->state_lock); + struct nfs4_lock_state *lsp; + + if (fl->fl_ops != NULL) + return 0; + lsp = nfs4_get_lock_state(state, fl->fl_owner); + if (lsp == NULL) + return -ENOMEM; + fl->fl_u.nfs4_fl.owner = lsp; + fl->fl_ops = &nfs4_fl_lock_ops; + return 0; } -/* - * to decide to 'reap' lock state: - * 1) search i_flock for file_locks with fl.lock_state = to ls. - * 2) determine if unlock will consume found lock. - * if so, reap - * - * else, don't reap. - * +/* + * Byte-range lock aware utility to initialize the stateid of read/write + * requests. 
*/ -void -nfs4_notify_unlck(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp) +void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) { - struct inode *inode = state->inode; - struct file_lock *fl; + struct nfs4_lock_state *lsp; - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!(fl->fl_flags & FL_POSIX)) - continue; - if (fl->fl_owner != lsp->ls_owner) - continue; - /* Exit if we find at least one lock which is not consumed */ - if (nfs4_check_unlock(fl,request) == 0) - return; - } + memcpy(dst, &state->stateid, sizeof(*dst)); + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) + return; - write_lock(&state->state_lock); - list_del_init(&lsp->ls_locks); - if (list_empty(&state->lock_states)) - clear_bit(LK_STATE_IN_USE, &state->flags); - write_unlock(&state->state_lock); + spin_lock(&state->state_lock); + lsp = __nfs4_find_lock_state(state, fl_owner); + if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); + spin_unlock(&state->state_lock); nfs4_put_lock_state(lsp); } /* - * Release reference to lock_state, and free it if we see that - * it is no longer in use - */ -void -nfs4_put_lock_state(struct nfs4_lock_state *lsp) +* Called with state->lock_sema and clp->cl_sem held. 
+*/ +void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp) { - if (!atomic_dec_and_test(&lsp->ls_count)) - return; - BUG_ON (!list_empty(&lsp->ls_locks)); - kfree(lsp); + if (status == NFS_OK || seqid_mutating_err(-status)) + lsp->ls_seqid++; } /* diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5f4de05763c..6c564ef9489 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -51,6 +51,7 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> +#include "nfs4_fs.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -82,12 +83,16 @@ static int nfs_stat_to_errno(int); #define encode_getfh_maxsz (op_encode_hdr_maxsz) #define decode_getfh_maxsz (op_decode_hdr_maxsz + 1 + \ ((3+NFS4_FHSIZE) >> 2)) -#define encode_getattr_maxsz (op_encode_hdr_maxsz + 3) +#define nfs4_fattr_bitmap_maxsz 3 +#define encode_getattr_maxsz (op_encode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) #define nfs4_name_maxsz (1 + ((3 + NFS4_MAXNAMLEN) >> 2)) #define nfs4_path_maxsz (1 + ((3 + NFS4_MAXPATHLEN) >> 2)) -#define nfs4_fattr_bitmap_maxsz (36 + 2 * nfs4_name_maxsz) -#define decode_getattr_maxsz (op_decode_hdr_maxsz + 3 + \ - nfs4_fattr_bitmap_maxsz) +/* This is based on getfattr, which uses the most attributes: */ +#define nfs4_fattr_value_maxsz (1 + (1 + 2 + 2 + 4 + 2 + 1 + 1 + 2 + 2 + \ + 3 + 3 + 3 + 2 * nfs4_name_maxsz)) +#define nfs4_fattr_maxsz (nfs4_fattr_bitmap_maxsz + \ + nfs4_fattr_value_maxsz) +#define decode_getattr_maxsz (op_decode_hdr_maxsz + nfs4_fattr_maxsz) #define encode_savefh_maxsz (op_encode_hdr_maxsz) #define decode_savefh_maxsz (op_decode_hdr_maxsz) #define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) @@ -122,11 +127,11 @@ static int nfs_stat_to_errno(int); #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ 1 + nfs4_name_maxsz + \ nfs4_path_maxsz + \ - nfs4_fattr_bitmap_maxsz) + nfs4_fattr_maxsz) #define decode_symlink_maxsz (op_decode_hdr_maxsz + 8) #define encode_create_maxsz (op_encode_hdr_maxsz + \ 2 + nfs4_name_maxsz + \ - 
nfs4_fattr_bitmap_maxsz) + nfs4_fattr_maxsz) #define decode_create_maxsz (op_decode_hdr_maxsz + 8) #define encode_delegreturn_maxsz (op_encode_hdr_maxsz + 4) #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) @@ -205,7 +210,7 @@ static int nfs_stat_to_errno(int); #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ op_encode_hdr_maxsz + 4 + \ - nfs4_fattr_bitmap_maxsz + \ + nfs4_fattr_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_setattr_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ @@ -360,6 +365,20 @@ static int nfs_stat_to_errno(int); encode_delegreturn_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ decode_delegreturn_maxsz) +#define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_getacl_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + op_decode_hdr_maxsz + \ + nfs4_fattr_bitmap_maxsz + 1) +#define NFS4_enc_setacl_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + op_encode_hdr_maxsz + 4 + \ + nfs4_fattr_bitmap_maxsz + 1) +#define NFS4_dec_setacl_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz) static struct { unsigned int mode; @@ -459,7 +478,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s * In the worst-case, this would be * 12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) * = 36 bytes, plus any contribution from variable-length fields - * such as owner/group/acl's. + * such as owner/group. 
*/ len = 16; @@ -660,8 +679,6 @@ static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1 static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) { - extern u32 nfs4_fattr_bitmap[]; - return encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0], bitmask[1] & nfs4_fattr_bitmap[1]); @@ -669,8 +686,6 @@ static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask) static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask) { - extern u32 nfs4_fsinfo_bitmap[]; - return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0], bitmask[1] & nfs4_fsinfo_bitmap[1]); } @@ -969,7 +984,6 @@ static int encode_putrootfh(struct xdr_stream *xdr) static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) { - extern nfs4_stateid zero_stateid; nfs4_stateid stateid; uint32_t *p; @@ -1000,6 +1014,10 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args) static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req) { struct rpc_auth *auth = req->rq_task->tk_auth; + uint32_t attrs[2] = { + FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID, + FATTR4_WORD1_MOUNTED_ON_FILEID, + }; int replen; uint32_t *p; @@ -1010,13 +1028,20 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ WRITE32(readdir->count); WRITE32(2); - if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) { - WRITE32(0); - WRITE32(FATTR4_WORD1_MOUNTED_ON_FILEID); - } else { - WRITE32(FATTR4_WORD0_FILEID); - WRITE32(0); - } + /* Switch to mounted_on_fileid if the server supports it */ + if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + attrs[0] &= ~FATTR4_WORD0_FILEID; + else + attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; + WRITE32(attrs[0] & readdir->bitmask[0]); + WRITE32(attrs[1] & readdir->bitmask[1]); + dprintk("%s: cookie = %Lu, verifier = 0x%x%x, 
bitmap = 0x%x%x\n", + __FUNCTION__, + (unsigned long long)readdir->cookie, + ((u32 *)readdir->verifier.data)[0], + ((u32 *)readdir->verifier.data)[1], + attrs[0] & readdir->bitmask[0], + attrs[1] & readdir->bitmask[1]); /* set up reply kvec * toplevel_status + taglen + rescount + OP_PUTFH + status @@ -1025,6 +1050,9 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg replen = (RPC_REPHDRSIZE + auth->au_rslack + 9) << 2; xdr_inline_pages(&req->rq_rcv_buf, replen, readdir->pages, readdir->pgbase, readdir->count); + dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", + __FUNCTION__, replen, readdir->pages, + readdir->pgbase, readdir->count); return 0; } @@ -1089,6 +1117,25 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client } static int +encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg) +{ + uint32_t *p; + + RESERVE_SPACE(4+sizeof(zero_stateid.data)); + WRITE32(OP_SETATTR); + WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data)); + RESERVE_SPACE(2*4); + WRITE32(1); + WRITE32(FATTR4_WORD0_ACL); + if (arg->acl_len % 4) + return -EINVAL; + RESERVE_SPACE(4); + WRITE32(arg->acl_len); + xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); + return 0; +} + +static int encode_savefh(struct xdr_stream *xdr) { uint32_t *p; @@ -1632,6 +1679,34 @@ out: } /* + * Encode a GETACL request + */ +static int +nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p, + struct nfs_getaclargs *args) +{ + struct xdr_stream xdr; + struct rpc_auth *auth = req->rq_task->tk_auth; + struct compound_hdr hdr = { + .nops = 2, + }; + int replen, status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + status = encode_putfh(&xdr, args->fh); + if (status) + goto out; + status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0); + /* set up reply buffer: */ + replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2; + xdr_inline_pages(&req->rq_rcv_buf, replen, + 
args->acl_pages, args->acl_pgbase, args->acl_len); +out: + return status; +} + +/* * Encode a WRITE request */ static int nfs4_xdr_enc_write(struct rpc_rqst *req, uint32_t *p, struct nfs_writeargs *args) @@ -1697,7 +1772,6 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs */ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct nfs4_pathconf_arg *args) { - extern u32 nfs4_pathconf_bitmap[2]; struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 2, @@ -1718,7 +1792,6 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct */ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct nfs4_statfs_arg *args) { - extern u32 nfs4_statfs_bitmap[]; struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 2, @@ -3003,6 +3076,11 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n return status; READ_BUF(8); COPYMEM(readdir->verifier.data, 8); + dprintk("%s: verifier = 0x%x%x\n", + __FUNCTION__, + ((u32 *)readdir->verifier.data)[0], + ((u32 *)readdir->verifier.data)[1]); + hdrlen = (char *) p - (char *) iov->iov_base; recvd = rcvbuf->len - hdrlen; @@ -3017,12 +3095,14 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n for (nr = 0; *p++; nr++) { if (p + 3 > end) goto short_pkt; + dprintk("cookie = %Lu, ", *((unsigned long long *)p)); p += 2; /* cookie */ len = ntohl(*p++); /* filename length */ if (len > NFS4_MAXNAMLEN) { printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); goto err_unmap; } + dprintk("filename = %*s\n", len, (char *)p); p += XDR_QUADLEN(len); if (p + 1 > end) goto short_pkt; @@ -3042,6 +3122,7 @@ out: kunmap_atomic(kaddr, KM_USER0); return 0; short_pkt: + dprintk("%s: short packet at entry %d\n", __FUNCTION__, nr); entry[0] = entry[1] = 0; /* truncate listing ? 
*/ if (!nr) { @@ -3127,6 +3208,47 @@ static int decode_renew(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_RENEW); } +static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, + size_t *acl_len) +{ + uint32_t *savep; + uint32_t attrlen, + bitmap[2] = {0}; + struct kvec *iov = req->rq_rcv_buf.head; + int status; + + *acl_len = 0; + if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) + goto out; + if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) + goto out; + if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) + goto out; + + if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U))) + return -EIO; + if (likely(bitmap[0] & FATTR4_WORD0_ACL)) { + int hdrlen, recvd; + + /* We ignore &savep and don't do consistency checks on + * the attr length. Let userspace figure it out.... */ + hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; + recvd = req->rq_rcv_buf.len - hdrlen; + if (attrlen > recvd) { + printk(KERN_WARNING "NFS: server cheating in getattr" + " acl reply: attrlen %u > recvd %u\n", + attrlen, recvd); + return -EINVAL; + } + if (attrlen <= *acl_len) + xdr_read_pages(xdr, attrlen); + *acl_len = attrlen; + } + +out: + return status; +} + static int decode_savefh(struct xdr_stream *xdr) { @@ -3418,6 +3540,71 @@ out: } +/* + * Encode an SETACL request + */ +static int +nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .nops = 2, + }; + int status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, &hdr); + status = encode_putfh(&xdr, args->fh); + if (status) + goto out; + status = encode_setacl(&xdr, args); +out: + return status; +} +/* + * Decode SETACL response + */ +static int +nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto 
out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_setattr(&xdr, res); +out: + return status; +} + +/* + * Decode GETACL response + */ +static int +nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, size_t *acl_len) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (status) + goto out; + status = decode_putfh(&xdr); + if (status) + goto out; + status = decode_getacl(&xdr, rqstp, acl_len); + +out: + return status; +} /* * Decode CLOSE response @@ -3895,6 +4082,12 @@ uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus) } len = XDR_QUADLEN(ntohl(*p++)); /* attribute buffer length */ if (len > 0) { + if (bitmap[0] & FATTR4_WORD0_RDATTR_ERROR) { + bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; + /* Ignore the return value of rdattr_error for now */ + p++; + len--; + } if (bitmap[0] == 0 && bitmap[1] == FATTR4_WORD1_MOUNTED_ON_FILEID) xdr_decode_hyper(p, &entry->ino); else if (bitmap[0] == FATTR4_WORD0_FILEID) @@ -3934,6 +4127,8 @@ static struct { { NFS4ERR_DQUOT, EDQUOT }, { NFS4ERR_STALE, ESTALE }, { NFS4ERR_BADHANDLE, EBADHANDLE }, + { NFS4ERR_BADOWNER, EINVAL }, + { NFS4ERR_BADNAME, EINVAL }, { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, { NFS4ERR_NOTSUPP, ENOTSUPP }, { NFS4ERR_TOOSMALL, ETOOSMALL }, @@ -4019,6 +4214,8 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(READDIR, enc_readdir, dec_readdir), PROC(SERVER_CAPS, enc_server_caps, dec_server_caps), PROC(DELEGRETURN, enc_delegreturn, dec_delegreturn), + PROC(GETACL, enc_getacl, dec_getacl), + PROC(SETACL, enc_setacl, dec_setacl), }; struct rpc_version nfs_version4 = { diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index fd5bc596fe8..1b272a135a3 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -124,6 +124,7 @@ enum { Opt_soft, Opt_hard, Opt_intr, Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, 
Opt_v2, Opt_v3, Opt_udp, Opt_tcp, + Opt_acl, Opt_noacl, /* Error token */ Opt_err }; @@ -158,6 +159,8 @@ static match_table_t __initdata tokens = { {Opt_udp, "udp"}, {Opt_tcp, "proto=tcp"}, {Opt_tcp, "tcp"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, {Opt_err, NULL} }; @@ -266,6 +269,12 @@ static int __init root_nfs_parse(char *name, char *buf) case Opt_tcp: nfs_data.flags |= NFS_MOUNT_TCP; break; + case Opt_acl: + nfs_data.flags &= ~NFS_MOUNT_NOACL; + break; + case Opt_noacl: + nfs_data.flags |= NFS_MOUNT_NOACL; + break; default : return 0; } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 4f1ba723848..d53857b148e 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -107,11 +107,38 @@ void nfs_unlock_request(struct nfs_page *req) smp_mb__before_clear_bit(); clear_bit(PG_BUSY, &req->wb_flags); smp_mb__after_clear_bit(); - wake_up_all(&req->wb_context->waitq); + wake_up_bit(&req->wb_flags, PG_BUSY); nfs_release_request(req); } /** + * nfs_set_page_writeback_locked - Lock a request for writeback + * @req: + */ +int nfs_set_page_writeback_locked(struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + + if (!nfs_lock_request(req)) + return 0; + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + return 1; +} + +/** + * nfs_clear_page_writeback - Unlock request and wake up sleepers + */ +void nfs_clear_page_writeback(struct nfs_page *req) +{ + struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode); + + spin_lock(&nfsi->req_lock); + radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK); + spin_unlock(&nfsi->req_lock); + nfs_unlock_request(req); +} + +/** * nfs_clear_request - Free up all resources allocated to the request * @req: * @@ -150,34 +177,15 @@ nfs_release_request(struct nfs_page *req) nfs_page_free(req); } -/** - * nfs_list_add_request - Insert a request into a sorted list - * @req: request - * @head: head of list into which to insert the 
request. - * - * Note that the wb_list is sorted by page index in order to facilitate - * coalescing of requests. - * We use an insertion sort that is optimized for the case of appended - * writes. - */ -void -nfs_list_add_request(struct nfs_page *req, struct list_head *head) +static int nfs_wait_bit_interruptible(void *word) { - struct list_head *pos; + int ret = 0; -#ifdef NFS_PARANOIA - if (!list_empty(&req->wb_list)) { - printk(KERN_ERR "NFS: Add to list failed!\n"); - BUG(); - } -#endif - list_for_each_prev(pos, head) { - struct nfs_page *p = nfs_list_entry(pos); - if (p->wb_index < req->wb_index) - break; - } - list_add(&req->wb_list, pos); - req->wb_list_head = head; + if (signal_pending(current)) + ret = -ERESTARTSYS; + else + schedule(); + return ret; } /** @@ -190,12 +198,22 @@ nfs_list_add_request(struct nfs_page *req, struct list_head *head) int nfs_wait_on_request(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; - struct rpc_clnt *clnt = NFS_CLIENT(inode); - - if (!NFS_WBACK_BUSY(req)) - return 0; - return nfs_wait_event(clnt, req->wb_context->waitq, !NFS_WBACK_BUSY(req)); + struct rpc_clnt *clnt = NFS_CLIENT(req->wb_context->dentry->d_inode); + sigset_t oldmask; + int ret = 0; + + if (!test_bit(PG_BUSY, &req->wb_flags)) + goto out; + /* + * Note: the call to rpc_clnt_sigmask() suffices to ensure that we + * are not interrupted if intr flag is not set + */ + rpc_clnt_sigmask(clnt, &oldmask); + ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY, + nfs_wait_bit_interruptible, TASK_INTERRUPTIBLE); + rpc_clnt_sigunmask(clnt, &oldmask); +out: + return ret; } /** @@ -243,6 +261,62 @@ nfs_coalesce_requests(struct list_head *head, struct list_head *dst, return npages; } +#define NFS_SCAN_MAXENTRIES 16 +/** + * nfs_scan_lock_dirty - Scan the radix tree for dirty requests + * @nfsi: NFS inode + * @dst: Destination list + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to 
scan. + * + * Moves elements from one of the inode request lists. + * If the number of requests is set to 0, the entire address_space + * starting at index idx_start, is scanned. + * The requests are *not* checked to ensure that they form a contiguous set. + * You must be holding the inode's req_lock when calling this function + */ +int +nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst, + unsigned long idx_start, unsigned int npages) +{ + struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES]; + struct nfs_page *req; + unsigned long idx_end; + int found, i; + int res; + + res = 0; + if (npages == 0) + idx_end = ~0; + else + idx_end = idx_start + npages - 1; + + for (;;) { + found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, + (void **)&pgvec[0], idx_start, NFS_SCAN_MAXENTRIES, + NFS_PAGE_TAG_DIRTY); + if (found <= 0) + break; + for (i = 0; i < found; i++) { + req = pgvec[i]; + if (req->wb_index > idx_end) + goto out; + + idx_start = req->wb_index + 1; + + if (nfs_set_page_writeback_locked(req)) { + radix_tree_tag_clear(&nfsi->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_DIRTY); + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + res++; + } + } + } +out: + return res; +} + /** * nfs_scan_list - Scan a list for matching requests * @head: One of the NFS inode request lists @@ -280,7 +354,7 @@ nfs_scan_list(struct list_head *head, struct list_head *dst, if (req->wb_index > idx_end) break; - if (!nfs_lock_request(req)) + if (!nfs_set_page_writeback_locked(req)) continue; nfs_list_remove_request(req); nfs_list_add_request(req, dst); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index d31b4d6e5a5..be23c3fb926 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -114,6 +114,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, dprintk("NFS call setattr\n"); fattr->valid = 0; status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); + if (status == 0) + nfs_setattr_update_inode(inode, sattr); dprintk("NFS reply setattr: 
%d\n", status); return status; } @@ -622,6 +624,7 @@ struct nfs_rpc_ops nfs_v2_clientops = { .version = 2, /* protocol version */ .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, + .file_inode_ops = &nfs_file_inode_operations, .getroot = nfs_proc_get_root, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a0042fb5863..6ceb1d471f2 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -140,7 +140,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, if (rdata->res.eof != 0 || result == 0) break; } while (count); - NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME; + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME; + spin_unlock(&inode->i_lock); if (count) memclear_highpage_flush(page, rdata->args.pgbase, count); @@ -173,7 +175,6 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); - nfs_lock_request(new); nfs_list_add_request(new, &one_request); nfs_pagein_one(&one_request, inode); return 0; @@ -185,7 +186,6 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_clear_request(req); nfs_release_request(req); - nfs_unlock_request(req); dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, @@ -475,7 +475,9 @@ void nfs_readpage_result(struct rpc_task *task) } task->tk_status = -EIO; } - NFS_FLAGS(data->inode) |= NFS_INO_INVALID_ATIME; + spin_lock(&data->inode->i_lock); + NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME; + spin_unlock(&data->inode->i_lock); data->complete(data, status); } @@ -553,7 +555,6 @@ readpage_async_filler(void *data, struct page *page) } if (len < PAGE_CACHE_SIZE) memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len); - nfs_lock_request(new); nfs_list_add_request(new, desc->head); return 0; } diff --git a/fs/nfs/symlink.c 
b/fs/nfs/symlink.c index 35f10659914..18dc95b0b64 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -27,26 +27,14 @@ /* Symlink caching in the page cache is even more simplistic * and straight-forward than readdir caching. - * - * At the beginning of the page we store pointer to struct page in question, - * simplifying nfs_put_link() (if inode got invalidated we can't find the page - * to be freed via pagecache lookup). - * The NUL-terminated string follows immediately thereafter. */ -struct nfs_symlink { - struct page *page; - char body[0]; -}; - static int nfs_symlink_filler(struct inode *inode, struct page *page) { - const unsigned int pgbase = offsetof(struct nfs_symlink, body); - const unsigned int pglen = PAGE_SIZE - pgbase; int error; lock_kernel(); - error = NFS_PROTO(inode)->readlink(inode, page, pgbase, pglen); + error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE); unlock_kernel(); if (error < 0) goto error; @@ -60,11 +48,10 @@ error: return -EIO; } -static int nfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; struct page *page; - struct nfs_symlink *p; void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode)); if (err) goto read_failed; @@ -78,28 +65,20 @@ static int nfs_follow_link(struct dentry *dentry, struct nameidata *nd) err = ERR_PTR(-EIO); goto getlink_read_error; } - p = kmap(page); - p->page = page; - nd_set_link(nd, p->body); - return 0; + nd_set_link(nd, kmap(page)); + return page; getlink_read_error: page_cache_release(page); read_failed: nd_set_link(nd, err); - return 0; + return NULL; } -static void nfs_put_link(struct dentry *dentry, struct nameidata *nd) +static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { - char *s = nd_get_link(nd); - if (!IS_ERR(s)) { - struct nfs_symlink *p; - struct page *page; - - p = container_of(s, struct nfs_symlink, body[0]); - 
page = p->page; - + if (cookie) { + struct page *page = cookie; kunmap(page); page_cache_release(page); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 6f7a4af3bc4..5130eda231d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -220,7 +220,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, ClearPageError(page); io_error: - nfs_end_data_update_defer(inode); + nfs_end_data_update(inode); nfs_writedata_free(wdata); return written ? written : result; } @@ -352,7 +352,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) if (err < 0) goto out; } - err = nfs_commit_inode(inode, 0, 0, wb_priority(wbc)); + err = nfs_commit_inode(inode, wb_priority(wbc)); if (err > 0) { wbc->nr_to_write -= err; err = 0; @@ -401,7 +401,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) nfsi->npages--; if (!nfsi->npages) { spin_unlock(&nfsi->req_lock); - nfs_end_data_update_defer(inode); + nfs_end_data_update(inode); iput(inode); } else spin_unlock(&nfsi->req_lock); @@ -446,6 +446,8 @@ nfs_mark_request_dirty(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&nfsi->req_lock); + radix_tree_tag_set(&nfsi->nfs_page_tree, + req->wb_index, NFS_PAGE_TAG_DIRTY); nfs_list_add_request(req, &nfsi->dirty); nfsi->ndirty++; spin_unlock(&nfsi->req_lock); @@ -503,13 +505,12 @@ nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int spin_lock(&nfsi->req_lock); next = idx_start; - while (radix_tree_gang_lookup(&nfsi->nfs_page_tree, (void **)&req, next, 1)) { + while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) { if (req->wb_index > idx_end) break; next = req->wb_index + 1; - if (!NFS_WBACK_BUSY(req)) - continue; + BUG_ON(!NFS_WBACK_BUSY(req)); atomic_inc(&req->wb_count); spin_unlock(&nfsi->req_lock); @@ -538,12 +539,15 @@ static int nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int 
npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res; - res = nfs_scan_list(&nfsi->dirty, dst, idx_start, npages); - nfsi->ndirty -= res; - sub_page_state(nr_dirty,res); - if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + int res = 0; + + if (nfsi->ndirty != 0) { + res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages); + nfsi->ndirty -= res; + sub_page_state(nr_dirty,res); + if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); + } return res; } @@ -562,11 +566,14 @@ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { struct nfs_inode *nfsi = NFS_I(inode); - int res; - res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); - nfsi->ncommit -= res; - if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + int res = 0; + + if (nfsi->ncommit != 0) { + res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages); + nfsi->ncommit -= res; + if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit)) + printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); + } return res; } #endif @@ -750,7 +757,7 @@ int nfs_updatepage(struct file *file, struct page *page, * is entirely in cache, it may be more efficient to avoid * fragmenting write requests. 
*/ - if (PageUptodate(page) && inode->i_flock == NULL) { + if (PageUptodate(page) && inode->i_flock == NULL && !(file->f_mode & O_SYNC)) { loff_t end_offs = i_size_read(inode) - 1; unsigned long end_index = end_offs >> PAGE_CACHE_SHIFT; @@ -821,7 +828,7 @@ out: #else nfs_inode_remove_request(req); #endif - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } static inline int flush_task_priority(int how) @@ -952,7 +959,7 @@ out_bad: nfs_writedata_free(data); } nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); return -ENOMEM; } @@ -1002,7 +1009,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how) struct nfs_page *req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return -ENOMEM; } @@ -1029,7 +1036,7 @@ nfs_flush_list(struct list_head *head, int wpages, int how) req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_dirty(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return error; } @@ -1121,7 +1128,7 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status) nfs_inode_remove_request(req); #endif next: - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } } @@ -1210,36 +1217,24 @@ static void nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data, int how) { struct rpc_task *task = &data->task; - struct nfs_page *first, *last; + struct nfs_page *first; struct inode *inode; - loff_t start, end, len; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ list_splice_init(head, &data->pages); first = nfs_list_entry(data->pages.next); - last = nfs_list_entry(data->pages.prev); inode = first->wb_context->dentry->d_inode; - /* - * Determine the offset range of requests in the COMMIT call. - * We rely on the fact that data->pages is an ordered list... 
- */ - start = req_offset(first); - end = req_offset(last) + last->wb_bytes; - len = end - start; - /* If 'len' is not a 32-bit quantity, pass '0' in the COMMIT call */ - if (end >= i_size_read(inode) || len < 0 || len > (~((u32)0) >> 1)) - len = 0; - data->inode = inode; data->cred = first->wb_context->cred; data->args.fh = NFS_FH(data->inode); - data->args.offset = start; - data->args.count = len; - data->res.count = len; + /* Note: we always request a commit of the entire inode */ + data->args.offset = 0; + data->args.count = 0; + data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -1278,7 +1273,7 @@ nfs_commit_list(struct list_head *head, int how) req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_mark_request_commit(req); - nfs_unlock_request(req); + nfs_clear_page_writeback(req); } return -ENOMEM; } @@ -1324,7 +1319,7 @@ nfs_commit_done(struct rpc_task *task) dprintk(" mismatch\n"); nfs_mark_request_dirty(req); next: - nfs_unlock_request(req); + nfs_clear_page_writeback(req); res++; } sub_page_state(nr_unstable,res); @@ -1342,16 +1337,23 @@ static int nfs_flush_inode(struct inode *inode, unsigned long idx_start, spin_lock(&nfsi->req_lock); res = nfs_scan_dirty(inode, &head, idx_start, npages); spin_unlock(&nfsi->req_lock); - if (res) - error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how); + if (res) { + struct nfs_server *server = NFS_SERVER(inode); + + /* For single writes, FLUSH_STABLE is more efficient */ + if (res == nfsi->npages && nfsi->npages <= server->wpages) { + if (res > 1 || nfs_list_entry(head.next)->wb_bytes <= server->wsize) + how |= FLUSH_STABLE; + } + error = nfs_flush_list(&head, server->wpages, how); + } if (error < 0) return error; return res; } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -int nfs_commit_inode(struct inode *inode, unsigned long idx_start, - unsigned int npages, int how) +int nfs_commit_inode(struct inode *inode, int how) { struct nfs_inode *nfsi = 
NFS_I(inode); LIST_HEAD(head); @@ -1359,15 +1361,13 @@ int nfs_commit_inode(struct inode *inode, unsigned long idx_start, error = 0; spin_lock(&nfsi->req_lock); - res = nfs_scan_commit(inode, &head, idx_start, npages); + res = nfs_scan_commit(inode, &head, 0, 0); + spin_unlock(&nfsi->req_lock); if (res) { - res += nfs_scan_commit(inode, &head, 0, 0); - spin_unlock(&nfsi->req_lock); error = nfs_commit_list(&head, how); - } else - spin_unlock(&nfsi->req_lock); - if (error < 0) - return error; + if (error < 0) + return error; + } return res; } #endif @@ -1389,7 +1389,7 @@ int nfs_sync_inode(struct inode *inode, unsigned long idx_start, error = nfs_flush_inode(inode, idx_start, npages, how); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) if (error == 0) - error = nfs_commit_inode(inode, idx_start, npages, how); + error = nfs_commit_inode(inode, how); #endif } while (error > 0); return error; diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile new file mode 100644 index 00000000000..f689ed82af3 --- /dev/null +++ b/fs/nfs_common/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for Linux filesystem routines that are shared by client and server. +# + +obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o + +nfs_acl-objs := nfsacl.o diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c new file mode 100644 index 00000000000..251e5a1bb1c --- /dev/null +++ b/fs/nfs_common/nfsacl.c @@ -0,0 +1,258 @@ +/* + * fs/nfs_common/nfsacl.c + * + * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> + */ + +/* + * The Solaris nfsacl protocol represents some ACLs slightly differently + * than POSIX 1003.1e draft 17 does (and we do): + * + * - Minimal ACLs always have an ACL_MASK entry, so they have + * four instead of three entries. + * - The ACL_MASK entry in such minimal ACLs always has the same + * permissions as the ACL_GROUP_OBJ entry. (In extended ACLs + * the ACL_MASK and ACL_GROUP_OBJ entries may differ.) 
+ * - The identifier fields of the ACL_USER_OBJ and ACL_GROUP_OBJ + * entries contain the identifiers of the owner and owning group. + * (In POSIX ACLs we always set them to ACL_UNDEFINED_ID). + * - ACL entries in the kernel are kept sorted in ascending order + * of (e_tag, e_id). Solaris ACLs are unsorted. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/sunrpc/xdr.h> +#include <linux/nfsacl.h> +#include <linux/nfs3.h> +#include <linux/sort.h> + +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(nfsacl_encode); +EXPORT_SYMBOL(nfsacl_decode); + +struct nfsacl_encode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; + int typeflag; + uid_t uid; + gid_t gid; +}; + +static int +xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_encode_desc *nfsacl_desc = + (struct nfsacl_encode_desc *) desc; + u32 *p = (u32 *) elem; + + if (nfsacl_desc->count < nfsacl_desc->acl->a_count) { + struct posix_acl_entry *entry = + &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + + *p++ = htonl(entry->e_tag | nfsacl_desc->typeflag); + switch(entry->e_tag) { + case ACL_USER_OBJ: + *p++ = htonl(nfsacl_desc->uid); + break; + case ACL_GROUP_OBJ: + *p++ = htonl(nfsacl_desc->gid); + break; + case ACL_USER: + case ACL_GROUP: + *p++ = htonl(entry->e_id); + break; + default: /* Solaris depends on that! 
*/ + *p++ = 0; + break; + } + *p++ = htonl(entry->e_perm & S_IRWXO); + } else { + const struct posix_acl_entry *pa, *pe; + int group_obj_perm = ACL_READ|ACL_WRITE|ACL_EXECUTE; + + FOREACH_ACL_ENTRY(pa, nfsacl_desc->acl, pe) { + if (pa->e_tag == ACL_GROUP_OBJ) { + group_obj_perm = pa->e_perm & S_IRWXO; + break; + } + } + /* fake up ACL_MASK entry */ + *p++ = htonl(ACL_MASK | nfsacl_desc->typeflag); + *p++ = htonl(0); + *p++ = htonl(group_obj_perm); + } + + return 0; +} + +unsigned int +nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, + struct posix_acl *acl, int encode_entries, int typeflag) +{ + int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0; + struct nfsacl_encode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .array_len = encode_entries ? entries : 0, + .xcode = xdr_nfsace_encode, + }, + .acl = acl, + .typeflag = typeflag, + .uid = inode->i_uid, + .gid = inode->i_gid, + }; + int err; + + if (entries > NFS_ACL_MAX_ENTRIES || + xdr_encode_word(buf, base, entries)) + return -EINVAL; + err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc); + if (!err) + err = 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; + return err; +} + +struct nfsacl_decode_desc { + struct xdr_array2_desc desc; + unsigned int count; + struct posix_acl *acl; +}; + +static int +xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem) +{ + struct nfsacl_decode_desc *nfsacl_desc = + (struct nfsacl_decode_desc *) desc; + u32 *p = (u32 *) elem; + struct posix_acl_entry *entry; + + if (!nfsacl_desc->acl) { + if (desc->array_len > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc->acl = posix_acl_alloc(desc->array_len, GFP_KERNEL); + if (!nfsacl_desc->acl) + return -ENOMEM; + nfsacl_desc->count = 0; + } + + entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++]; + entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT; + entry->e_id = ntohl(*p++); + entry->e_perm = ntohl(*p++); + + switch(entry->e_tag) { + case ACL_USER_OBJ: + 
case ACL_USER: + case ACL_GROUP_OBJ: + case ACL_GROUP: + case ACL_OTHER: + if (entry->e_perm & ~S_IRWXO) + return -EINVAL; + break; + case ACL_MASK: + /* Solaris sometimes sets additonal bits in the mask */ + entry->e_perm &= S_IRWXO; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int +cmp_acl_entry(const void *x, const void *y) +{ + const struct posix_acl_entry *a = x, *b = y; + + if (a->e_tag != b->e_tag) + return a->e_tag - b->e_tag; + else if (a->e_id > b->e_id) + return 1; + else if (a->e_id < b->e_id) + return -1; + else + return 0; +} + +/* + * Convert from a Solaris ACL to a POSIX 1003.1e draft 17 ACL. + */ +static int +posix_acl_from_nfsacl(struct posix_acl *acl) +{ + struct posix_acl_entry *pa, *pe, + *group_obj = NULL, *mask = NULL; + + if (!acl) + return 0; + + sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry), + cmp_acl_entry, NULL); + + /* Clear undefined identifier fields and find the ACL_GROUP_OBJ + and ACL_MASK entries. */ + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_id = ACL_UNDEFINED_ID; + break; + case ACL_GROUP_OBJ: + pa->e_id = ACL_UNDEFINED_ID; + group_obj = pa; + break; + case ACL_MASK: + mask = pa; + /* fall through */ + case ACL_OTHER: + pa->e_id = ACL_UNDEFINED_ID; + break; + } + } + if (acl->a_count == 4 && group_obj && mask && + mask->e_perm == group_obj->e_perm) { + /* remove bogus ACL_MASK entry */ + memmove(mask, mask+1, (3 - (mask - acl->a_entries)) * + sizeof(struct posix_acl_entry)); + acl->a_count = 3; + } + return 0; +} + +unsigned int +nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt, + struct posix_acl **pacl) +{ + struct nfsacl_decode_desc nfsacl_desc = { + .desc = { + .elem_size = 12, + .xcode = pacl ? 
xdr_nfsace_decode : NULL, + }, + }; + u32 entries; + int err; + + if (xdr_decode_word(buf, base, &entries) || + entries > NFS_ACL_MAX_ENTRIES) + return -EINVAL; + nfsacl_desc.desc.array_maxlen = entries; + err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc); + if (err) + return err; + if (pacl) { + if (entries != nfsacl_desc.desc.array_len || + posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) { + posix_acl_release(nfsacl_desc.acl); + return -EINVAL; + } + *pacl = nfsacl_desc.acl; + } + if (aclcnt) + *aclcnt = entries; + return 8 + nfsacl_desc.desc.elem_size * + nfsacl_desc.desc.array_len; +} diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index b8680a247f8..ce341dc76d5 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -6,7 +6,9 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o +nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o +nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ - nfs4acl.o nfs4callback.o + nfs4acl.o nfs4callback.o nfs4recover.o nfsd-objs := $(nfsd-y) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c new file mode 100644 index 00000000000..7cbf0682b2f --- /dev/null +++ b/fs/nfsd/nfs2acl.c @@ -0,0 +1,336 @@ +/* + * linux/fs/nfsd/nfsacl.c + * + * Process version 2 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> + */ + +#include <linux/sunrpc/svc.h> +#include <linux/nfs.h> +#include <linux/nfsd/nfsd.h> +#include <linux/nfsd/cache.h> +#include <linux/nfsd/xdr.h> +#include <linux/nfsd/xdr3.h> +#include <linux/posix_acl.h> +#include <linux/nfsacl.h> + +#define NFSDDBG_FACILITY NFSDDBG_PROC +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. 
+ */ +static int +nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. + */ +static int nfsacld_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + svc_fh *fh; + struct posix_acl *acl; + int nfserr = 0; + + dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + RETURN_STATUS(nfserr_inval); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + + struct inode *inode = fh->fh_dentry->d_inode; + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + + acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfssvc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. 
+ */ +static int nfsacld_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd_attrstat *resp) +{ + svc_fh *fh; + int nfserr = 0; + + dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh)); + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_ACCESS, argp->acl_access) ); + } + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + } + + /* argp->acl_{access,default} may have been allocated in + nfssvc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + return nfserr; +} + +/* + * Check file attributes + */ +static int nfsacld_proc_getattr(struct svc_rqst * rqstp, + struct nfsd_fhandle *argp, struct nfsd_attrstat *resp) +{ + dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + + fh_copy(&resp->fh, &argp->fh); + return fh_verify(rqstp, &resp->fh, 0, MAY_NOP); +} + +/* + * Check file access + */ +static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp, + struct nfsd3_accessres *resp) +{ + int nfserr; + + dprintk("nfsd: ACCESS(2acl) %s 0x%x\n", + SVCFH_fmt(&argp->fh), + argp->access); + + fh_copy(&resp->fh, &argp->fh); + resp->access = argp->access; + nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL); + return nfserr; +} + +/* + * XDR decode functions + */ +static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_getaclargs *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_setaclargs *argp) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->mask = ntohl(*p++); + if (argp->mask & 
~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (argp->mask & NFS_ACL) ? + &argp->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (argp->mask & NFS_DFACL) ? + &argp->acl_default : NULL); + return (n > 0); +} + +static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p, + struct nfsd_fhandle *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + return xdr_argsize_check(rqstp, p); +} + +static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_accessargs *argp) +{ + if (!(p = nfs2svc_decode_fh(p, &argp->fh))) + return 0; + argp->access = ntohl(*p++); + + return xdr_argsize_check(rqstp, p); +} + +/* + * XDR encode functions + */ + +/* GETACL */ +static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + struct inode *inode = dentry->d_inode; + int w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? 
resp->acl_default : NULL); + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + + if (dentry == NULL || dentry->d_inode == NULL) + return 0; + inode = dentry->d_inode; + + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w; + while (w > 0) { + if (!svc_take_res_page(rqstp)) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + return 1; +} + +static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p, + struct nfsd_attrstat *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + return xdr_ressize_check(rqstp, p); +} + +/* ACCESS */ +static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_accessres *resp) +{ + p = nfs2svc_encode_fattr(rqstp, p, &resp->fh); + *p++ = htonl(resp->access); + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, u32 *p, + struct nfsd_fhandle *resp) +{ + fh_put(&resp->fh); + return 1; +} + +#define nfsaclsvc_decode_voidargs NULL +#define nfsaclsvc_encode_voidres NULL +#define nfsaclsvc_release_void NULL +#define nfsd3_fhandleargs nfsd_fhandle +#define nfsd3_attrstatres nfsd_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsacld_proc_##name, \ + (kxdrproc_t) 
nfsaclsvc_decode_##argt##args, \ + (kxdrproc_t) nfsaclsvc_encode_##rest##res, \ + (kxdrproc_t) nfsaclsvc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures2[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, attrstat, fhandle, RC_NOCACHE, ST+AT), + PROC(getattr, fhandle, attrstat, fhandle, RC_NOCACHE, ST+AT), + PROC(access, access, access, fhandle, RC_NOCACHE, ST+AT+1), +}; + +struct svc_version nfsd_acl_version2 = { + .vs_vers = 2, + .vs_nproc = 5, + .vs_proc = nfsd_acl_procedures2, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c new file mode 100644 index 00000000000..64ba40572fe --- /dev/null +++ b/fs/nfsd/nfs3acl.c @@ -0,0 +1,267 @@ +/* + * linux/fs/nfsd/nfs3acl.c + * + * Process version 3 NFSACL requests. + * + * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de> + */ + +#include <linux/sunrpc/svc.h> +#include <linux/nfs3.h> +#include <linux/nfsd/nfsd.h> +#include <linux/nfsd/cache.h> +#include <linux/nfsd/xdr3.h> +#include <linux/posix_acl.h> +#include <linux/nfsacl.h> + +#define RETURN_STATUS(st) { resp->status = (st); return (st); } + +/* + * NULL call. + */ +static int +nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp) +{ + return nfs_ok; +} + +/* + * Get the Access and/or Default ACL of a file. 
+ */ +static int nfsd3_proc_getacl(struct svc_rqst * rqstp, + struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp) +{ + svc_fh *fh; + struct posix_acl *acl; + int nfserr = 0; + + fh = fh_copy(&resp->fh, &argp->fh); + if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP))) + RETURN_STATUS(nfserr_inval); + + if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) + RETURN_STATUS(nfserr_inval); + resp->mask = argp->mask; + + if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { + acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + if (acl == NULL) { + /* Solaris returns the inode's minimum ACL. */ + + struct inode *inode = fh->fh_dentry->d_inode; + acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + } + resp->acl_access = acl; + } + if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { + /* Check how Solaris handles requests for the Default ACL + of a non-directory! */ + + acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + int err = PTR_ERR(acl); + + if (err == -ENODATA || err == -EOPNOTSUPP) + acl = NULL; + else { + nfserr = nfserrno(err); + goto fail; + } + } + resp->acl_default = acl; + } + + /* resp->acl_{access,default} are released in nfs3svc_release_getacl. */ + RETURN_STATUS(0); + +fail: + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * Set the Access and/or Default ACL of a file. 
+ */ +static int nfsd3_proc_setacl(struct svc_rqst * rqstp, + struct nfsd3_setaclargs *argp, + struct nfsd3_attrstat *resp) +{ + svc_fh *fh; + int nfserr = 0; + + fh = fh_copy(&resp->fh, &argp->fh); + nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_ACCESS, argp->acl_access) ); + } + if (!nfserr) { + nfserr = nfserrno( nfsd_set_posix_acl( + fh, ACL_TYPE_DEFAULT, argp->acl_default) ); + } + + /* argp->acl_{access,default} may have been allocated in + nfs3svc_decode_setaclargs. */ + posix_acl_release(argp->acl_access); + posix_acl_release(argp->acl_default); + RETURN_STATUS(nfserr); +} + +/* + * XDR decode functions + */ +static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_getaclargs *args) +{ + if (!(p = nfs3svc_decode_fh(p, &args->fh))) + return 0; + args->mask = ntohl(*p); p++; + + return xdr_argsize_check(rqstp, p); +} + + +static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_setaclargs *args) +{ + struct kvec *head = rqstp->rq_arg.head; + unsigned int base; + int n; + + if (!(p = nfs3svc_decode_fh(p, &args->fh))) + return 0; + args->mask = ntohl(*p++); + if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) || + !xdr_argsize_check(rqstp, p)) + return 0; + + base = (char *)p - (char *)head->iov_base; + n = nfsacl_decode(&rqstp->rq_arg, base, NULL, + (args->mask & NFS_ACL) ? + &args->acl_access : NULL); + if (n > 0) + n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL, + (args->mask & NFS_DFACL) ? 
+ &args->acl_default : NULL); + return (n > 0); +} + +/* + * XDR encode functions + */ + +/* GETACL */ +static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_getaclres *resp) +{ + struct dentry *dentry = resp->fh.fh_dentry; + + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0 && dentry && dentry->d_inode) { + struct inode *inode = dentry->d_inode; + int w = nfsacl_size( + (resp->mask & NFS_ACL) ? resp->acl_access : NULL, + (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); + struct kvec *head = rqstp->rq_res.head; + unsigned int base; + int n; + + *p++ = htonl(resp->mask); + if (!xdr_ressize_check(rqstp, p)) + return 0; + base = (char *)p - (char *)head->iov_base; + + rqstp->rq_res.page_len = w; + while (w > 0) { + if (!svc_take_res_page(rqstp)) + return 0; + w -= PAGE_SIZE; + } + + n = nfsacl_encode(&rqstp->rq_res, base, inode, + resp->acl_access, + resp->mask & NFS_ACL, 0); + if (n > 0) + n = nfsacl_encode(&rqstp->rq_res, base + n, inode, + resp->acl_default, + resp->mask & NFS_DFACL, + NFS_ACL_DEFAULT); + if (n <= 0) + return 0; + } else + if (!xdr_ressize_check(rqstp, p)) + return 0; + + return 1; +} + +/* SETACL */ +static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_attrstat *resp) +{ + p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); + + return xdr_ressize_check(rqstp, p); +} + +/* + * XDR release functions + */ +static int nfs3svc_release_getacl(struct svc_rqst *rqstp, u32 *p, + struct nfsd3_getaclres *resp) +{ + fh_put(&resp->fh); + posix_acl_release(resp->acl_access); + posix_acl_release(resp->acl_default); + return 1; +} + +#define nfs3svc_decode_voidargs NULL +#define nfs3svc_release_void NULL +#define nfsd3_setaclres nfsd3_attrstat +#define nfsd3_voidres nfsd3_voidargs +struct nfsd3_voidargs { int dummy; }; + +#define PROC(name, argt, rest, relt, cache, respsize) \ + { (svc_procfunc) nfsd3_proc_##name, \ + (kxdrproc_t) nfs3svc_decode_##argt##args, \ + 
(kxdrproc_t) nfs3svc_encode_##rest##res, \ + (kxdrproc_t) nfs3svc_release_##relt, \ + sizeof(struct nfsd3_##argt##args), \ + sizeof(struct nfsd3_##rest##res), \ + 0, \ + cache, \ + respsize, \ + } + +#define ST 1 /* status*/ +#define AT 21 /* attributes */ +#define pAT (1+AT) /* post attributes - conditional */ +#define ACL (1+NFS_ACL_MAX_ENTRIES*3) /* Access Control List */ + +static struct svc_procedure nfsd_acl_procedures3[] = { + PROC(null, void, void, void, RC_NOCACHE, ST), + PROC(getacl, getacl, getacl, getacl, RC_NOCACHE, ST+1+2*(1+ACL)), + PROC(setacl, setacl, setacl, fhandle, RC_NOCACHE, ST+pAT), +}; + +struct svc_version nfsd_acl_version3 = { + .vs_vers = 3, + .vs_nproc = 3, + .vs_proc = nfsd_acl_procedures3, + .vs_dispatch = nfsd_dispatch, + .vs_xdrsize = NFS3_SVC_XDRSIZE, +}; + diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 11f806835c5..e0e134d6bab 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -71,6 +71,12 @@ decode_fh(u32 *p, struct svc_fh *fhp) return p + XDR_QUADLEN(size); } +/* Helper function for NFSv3 ACL code */ +u32 *nfs3svc_decode_fh(u32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + static inline u32 * encode_fh(u32 *p, struct svc_fh *fhp) { @@ -233,6 +239,13 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) return p; } +/* Helper for NFSv3 ACLs */ +u32 * +nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) +{ + return encode_post_op_attr(rqstp, p, fhp); +} + /* * Enocde weak cache consistency data */ diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 11ebf6c4aa5..4a2105552ac 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -125,7 +125,7 @@ static short ace2type(struct nfs4_ace *); static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int); int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t); -int nfs4_acl_split(struct 
nfs4_acl *, struct nfs4_acl *); +static int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *); struct nfs4_acl * nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl, @@ -775,7 +775,7 @@ out_err: return pacl; } -int +static int nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) { struct list_head *h, *n; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 1a55dfcb74b..583c0710e45 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -54,7 +54,6 @@ /* declarations */ static void nfs4_cb_null(struct rpc_task *task); -extern spinlock_t recall_lock; /* Index of predefined Linux callback client operations */ @@ -329,12 +328,12 @@ out: .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2, \ } -struct rpc_procinfo nfs4_cb_procedures[] = { +static struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NULL, NULL, enc_cb_null, dec_cb_null), PROC(CB_RECALL, COMPOUND, enc_cb_recall, dec_cb_recall), }; -struct rpc_version nfs_cb_version4 = { +static struct rpc_version nfs_cb_version4 = { .number = 1, .nrprocs = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]), .procs = nfs4_cb_procedures @@ -348,7 +347,7 @@ static struct rpc_version * nfs_cb_version[] = { /* * Use the SETCLIENTID credential */ -struct rpc_cred * +static struct rpc_cred * nfsd4_lookupcred(struct nfs4_client *clp, int taskflags) { struct auth_cred acred; @@ -387,9 +386,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) char hostname[32]; int status; - dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d\n", - cb->cb_parsed, atomic_read(&cb->cb_set)); - if (!cb->cb_parsed || atomic_read(&cb->cb_set)) + if (atomic_read(&cb->cb_set)) return; /* Initialize address */ @@ -427,10 +424,10 @@ nfsd4_probe_callback(struct nfs4_client *clp) * XXX AUTH_UNIX only - need AUTH_GSS.... 
*/ sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(addr.sin_addr.s_addr)); - clnt = rpc_create_client(xprt, hostname, program, 1, RPC_AUTH_UNIX); + clnt = rpc_new_client(xprt, hostname, program, 1, RPC_AUTH_UNIX); if (IS_ERR(clnt)) { dprintk("NFSD: couldn't create callback client\n"); - goto out_xprt; + goto out_err; } clnt->cl_intr = 0; clnt->cl_softrtry = 1; @@ -465,8 +462,6 @@ out_rpciod: out_clnt: rpc_shutdown_client(clnt); goto out_err; -out_xprt: - xprt_destroy(xprt); out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 4ba540841cf..5605a26efc5 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -104,7 +104,7 @@ ent_update(struct ent *new, struct ent *itm) ent_init(new, itm); } -void +static void ent_put(struct cache_head *ch, struct cache_detail *cd) { if (cache_put(ch, cd)) { @@ -186,7 +186,7 @@ warn_no_idmapd(struct cache_detail *detail) static int idtoname_parse(struct cache_detail *, char *, int); static struct ent *idtoname_lookup(struct ent *, int); -struct cache_detail idtoname_cache = { +static struct cache_detail idtoname_cache = { .hash_size = ENT_HASHMAX, .hash_table = idtoname_table, .name = "nfs4.idtoname", @@ -277,7 +277,7 @@ nametoid_hash(struct ent *ent) return hash_str(ent->name, ENT_HASHBITS); } -void +static void nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) { @@ -317,9 +317,9 @@ nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) } static struct ent *nametoid_lookup(struct ent *, int); -int nametoid_parse(struct cache_detail *, char *, int); +static int nametoid_parse(struct cache_detail *, char *, int); -struct cache_detail nametoid_cache = { +static struct cache_detail nametoid_cache = { .hash_size = ENT_HASHMAX, .hash_table = nametoid_table, .name = "nfs4.nametoid", @@ -330,7 +330,7 @@ struct cache_detail nametoid_cache = { .warn_no_listener = 
warn_no_idmapd, }; -int +static int nametoid_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e8158741e8b..e08edc17c6a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -45,6 +45,7 @@ #include <linux/param.h> #include <linux/major.h> #include <linux/slab.h> +#include <linux/file.h> #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> @@ -168,12 +169,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open (int)open->op_fname.len, open->op_fname.data, open->op_stateowner); - if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) - return nfserr_grace; - - if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) - return nfserr_no_grace; - /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; @@ -198,6 +193,11 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open if (status) goto out; switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + status = nfserr_inval; + if (open->op_create) + goto out; + /* fall through */ case NFS4_OPEN_CLAIM_NULL: /* * (1) set CURRENT_FH to the file being opened, @@ -220,7 +220,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open if (status) goto out; break; - case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_DELEGATE_PREV: printk("NFSD: unsupported OPEN claim type %d\n", open->op_claim_type); @@ -473,26 +472,27 @@ static inline int nfsd4_read(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_read *read) { int status; - struct file *filp = NULL; /* no need to check permission - this will be done in nfsd_read() */ + read->rd_filp = NULL; if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; nfs4_lock_state(); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(current_fh, &read->rd_stateid, - CHECK_FH | 
RD_STATE, &filp))) { + CHECK_FH | RD_STATE, &read->rd_filp))) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } + if (read->rd_filp) + get_file(read->rd_filp); status = nfs_ok; out: nfs4_unlock_state(); read->rd_rqstp = rqstp; read->rd_fhp = current_fh; - read->rd_filp = filp; return status; } @@ -532,6 +532,8 @@ nfsd4_remove(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_rem { int status; + if (nfs4_in_grace()) + return nfserr_grace; status = nfsd_unlink(rqstp, current_fh, 0, remove->rm_name, remove->rm_namelen); if (status == nfserr_symlink) return nfserr_notdir; @@ -550,6 +552,9 @@ nfsd4_rename(struct svc_rqst *rqstp, struct svc_fh *current_fh, if (!save_fh->fh_dentry) return status; + if (nfs4_in_grace() && !(save_fh->fh_export->ex_flags + & NFSEXP_NOSUBTREECHECK)) + return nfserr_grace; status = nfsd_rename(rqstp, save_fh, rename->rn_sname, rename->rn_snamelen, current_fh, rename->rn_tname, rename->rn_tnamelen); @@ -624,6 +629,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); goto out; } + if (filp) + get_file(filp); nfs4_unlock_state(); write->wr_bytes_written = write->wr_buflen; @@ -635,6 +642,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ status = nfsd_write(rqstp, current_fh, filp, write->wr_offset, write->wr_vec, write->wr_vlen, write->wr_buflen, &write->wr_how_written); + if (filp) + fput(filp); if (status == nfserr_symlink) status = nfserr_inval; @@ -923,6 +932,9 @@ encode_op: nfs4_put_stateowner(replay_owner); replay_owner = NULL; } + /* XXX Ugh, we need to get rid of this kind of special case: */ + if (op->opnum == OP_READ && op->u.read.rd_filp) + fput(op->u.read.rd_filp); } out: diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c new file mode 100644 index 00000000000..57ed50fe7f8 --- /dev/null +++ b/fs/nfsd/nfs4recover.c @@ -0,0 +1,421 @@ +/* +* linux/fs/nfsd/nfs4recover.c 
+* +* Copyright (c) 2004 The Regents of the University of Michigan. +* All rights reserved. +* +* Andy Adamson <andros@citi.umich.edu> +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* +* 1. Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* 2. Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* 3. Neither the name of the University nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED +* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +* DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+* +*/ + + +#include <linux/sunrpc/svc.h> +#include <linux/nfsd/nfsd.h> +#include <linux/nfs4.h> +#include <linux/nfsd/state.h> +#include <linux/nfsd/xdr4.h> +#include <linux/param.h> +#include <linux/file.h> +#include <linux/namei.h> +#include <asm/uaccess.h> +#include <asm/scatterlist.h> +#include <linux/crypto.h> + + +#define NFSDDBG_FACILITY NFSDDBG_PROC + +/* Globals */ +static struct nameidata rec_dir; +static int rec_dir_init = 0; + +static void +nfs4_save_user(uid_t *saveuid, gid_t *savegid) +{ + *saveuid = current->fsuid; + *savegid = current->fsgid; + current->fsuid = 0; + current->fsgid = 0; +} + +static void +nfs4_reset_user(uid_t saveuid, gid_t savegid) +{ + current->fsuid = saveuid; + current->fsgid = savegid; +} + +static void +md5_to_hex(char *out, char *md5) +{ + int i; + + for (i=0; i<16; i++) { + unsigned char c = md5[i]; + + *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + } + *out = '\0'; +} + +int +nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) +{ + struct xdr_netobj cksum; + struct crypto_tfm *tfm; + struct scatterlist sg[1]; + int status = nfserr_resource; + + dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", + clname->len, clname->data); + tfm = crypto_alloc_tfm("md5", 0); + if (tfm == NULL) + goto out; + cksum.len = crypto_tfm_alg_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + goto out; + crypto_digest_init(tfm); + + sg[0].page = virt_to_page(clname->data); + sg[0].offset = offset_in_page(clname->data); + sg[0].length = clname->len; + + crypto_digest_update(tfm, sg, 1); + crypto_digest_final(tfm, cksum.data); + + md5_to_hex(dname, cksum.data); + + kfree(cksum.data); + status = nfs_ok; +out: + if (tfm) + crypto_free_tfm(tfm); + return status; +} + +static void +nfsd4_sync_rec_dir(void) +{ + down(&rec_dir.dentry->d_inode->i_sem); + nfsd_sync_dir(rec_dir.dentry); + up(&rec_dir.dentry->d_inode->i_sem); +} + +int 
+nfsd4_create_clid_dir(struct nfs4_client *clp)
+{
+	char *dname = clp->cl_recdir;
+	struct dentry *dentry;
+	uid_t uid;
+	gid_t gid;
+	int status;
+
+	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
+
+	/* Nothing to do without a recovery directory, or when this client
+	 * is already flagged as having its first state recorded. */
+	if (!rec_dir_init || clp->cl_firststate)
+		return 0;
+
+	nfs4_save_user(&uid, &gid);
+
+	/* lock the parent */
+	down(&rec_dir.dentry->d_inode->i_sem);
+
+	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+	if (IS_ERR(dentry)) {
+		status = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+	status = -EEXIST;
+	if (dentry->d_inode) {
+		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
+		goto out_put;
+	}
+	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
+out_put:
+	dput(dentry);
+out_unlock:
+	up(&rec_dir.dentry->d_inode->i_sem);
+	if (status == 0) {
+		clp->cl_firststate = 1;
+		nfsd4_sync_rec_dir();
+	}
+	nfs4_reset_user(uid, gid);
+	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
+	return status;
+}
+
+/* Callback applied to each entry of a directory: (parent, child). */
+typedef int (recdir_func)(struct dentry *, struct dentry *);
+
+/* One directory entry collected by nfsd4_build_dentrylist(). */
+struct dentry_list {
+	struct dentry *dentry;
+	struct list_head list;
+};
+
+/* Cookie handed to vfs_readdir(): the output list plus the directory
+ * in which names are looked up. */
+struct dentry_list_arg {
+	struct list_head dentries;
+	struct dentry *parent;
+};
+
+/*
+ * Readdir callback: looks up each name (other than the dot entries) in
+ * dla->parent and appends the resulting dentry to dla->dentries.
+ * Returns 0 on success or a negative errno.  The reference obtained
+ * from lookup_one_len() belongs to the list entry on success and is
+ * dropped here on failure.
+ */
+static int
+nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
+		loff_t offset, ino_t ino, unsigned int d_type)
+{
+	struct dentry_list_arg *dla = arg;
+	struct list_head *dentries = &dla->dentries;
+	struct dentry *parent = dla->parent;
+	struct dentry *dentry;
+	struct dentry_list *child;
+
+	if (name && isdotent(name, namlen))
+		return nfs_ok;
+	dentry = lookup_one_len(name, parent, namlen);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	child = kmalloc(sizeof(*child), GFP_KERNEL);
+	if (child == NULL) {
+		/* Nobody else will ever see this dentry; put it now so
+		 * the reference is not leaked. */
+		dput(dentry);
+		return -ENOMEM;
+	}
+	child->dentry = dentry;
+	list_add(&child->list, dentries);
+	return 0;
+}
+
+static int
+nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
+{
+	struct file *filp;
+	struct dentry_list_arg dla = {
+		.parent = dir,
+	};
+	struct list_head *dentries =
&dla.dentries;
+	struct dentry_list *child;
+	uid_t uid;
+	gid_t gid;
+	int status;
+
+	if (!rec_dir_init)
+		return 0;
+
+	nfs4_save_user(&uid, &gid);
+
+	/* Initialise the list before anything can jump to "out": the
+	 * cleanup loop there walks it even when the open below fails. */
+	INIT_LIST_HEAD(dentries);
+
+	filp = dentry_open(dget(dir), mntget(rec_dir.mnt),
+			O_RDWR);
+	status = PTR_ERR(filp);
+	if (IS_ERR(filp))
+		goto out;
+	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
+	fput(filp);
+	/* Apply f to every collected entry, stopping at the first error. */
+	while (!list_empty(dentries)) {
+		child = list_entry(dentries->next, struct dentry_list, list);
+		status = f(dir, child->dentry);
+		if (status)
+			goto out;
+		list_del(&child->list);
+		dput(child->dentry);
+		kfree(child);
+	}
+out:
+	/* Release whatever is still queued (non-empty only after an error). */
+	while (!list_empty(dentries)) {
+		child = list_entry(dentries->next, struct dentry_list, list);
+		list_del(&child->list);
+		dput(child->dentry);
+		kfree(child);
+	}
+	nfs4_reset_user(uid, gid);
+	return status;
+}
+
+/*
+ * Unlink one entry (dentry) of a client directory (dir).  Only regular
+ * files are removed; anything else is reported and refused with -EINVAL.
+ */
+static int
+nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
+{
+	int status;
+
+	/* The type check must look at the entry itself: dir is the
+	 * containing directory and is never a regular file. */
+	if (!dentry->d_inode || !S_ISREG(dentry->d_inode->i_mode)) {
+		printk("nfsd4: non-file found in client recovery directory\n");
+		return -EINVAL;
+	}
+	down(&dir->d_inode->i_sem);
+	status = vfs_unlink(dir->d_inode, dentry);
+	up(&dir->d_inode->i_sem);
+	return status;
+}
+
+/*
+ * Empty the client directory (dentry) of regular files, then remove it
+ * from its parent (dir).  Returns the result of vfs_rmdir().
+ */
+static int
+nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
+{
+	int status;
+
+	/* For now this directory should already be empty, but we empty it of
+	 * any regular files anyway, just in case the directory was created by
+	 * a kernel from the future.... */
+	nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
+	down(&dir->d_inode->i_sem);
+	status = vfs_rmdir(dir->d_inode, dentry);
+	up(&dir->d_inode->i_sem);
+	return status;
+}
+
+static int
+nfsd4_unlink_clid_dir(char *name, int namlen)
+{
+	struct dentry *dentry;
+	int status;
+
+	dprintk("NFSD: nfsd4_unlink_clid_dir.
name %.*s\n", namlen, name); + + down(&rec_dir.dentry->d_inode->i_sem); + dentry = lookup_one_len(name, rec_dir.dentry, namlen); + up(&rec_dir.dentry->d_inode->i_sem); + if (IS_ERR(dentry)) { + status = PTR_ERR(dentry); + return status; + } + status = -ENOENT; + if (!dentry->d_inode) + goto out; + + status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry); +out: + dput(dentry); + return status; +} + +void +nfsd4_remove_clid_dir(struct nfs4_client *clp) +{ + uid_t uid; + gid_t gid; + int status; + + if (!rec_dir_init || !clp->cl_firststate) + return; + + clp->cl_firststate = 0; + nfs4_save_user(&uid, &gid); + status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); + nfs4_reset_user(uid, gid); + if (status == 0) + nfsd4_sync_rec_dir(); + if (status) + printk("NFSD: Failed to remove expired client state directory" + " %.*s\n", HEXDIR_LEN, clp->cl_recdir); + return; +} + +static int +purge_old(struct dentry *parent, struct dentry *child) +{ + int status; + + if (nfs4_has_reclaimed_state(child->d_name.name)) + return nfs_ok; + + status = nfsd4_clear_clid_dir(parent, child); + if (status) + printk("failed to remove client recovery directory %s\n", + child->d_name.name); + /* Keep trying, success or failure: */ + return nfs_ok; +} + +void +nfsd4_recdir_purge_old(void) { + int status; + + if (!rec_dir_init) + return; + status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); + if (status == 0) + nfsd4_sync_rec_dir(); + if (status) + printk("nfsd4: failed to purge old clients from recovery" + " directory %s\n", rec_dir.dentry->d_name.name); + return; +} + +static int +load_recdir(struct dentry *parent, struct dentry *child) +{ + if (child->d_name.len != HEXDIR_LEN - 1) { + printk("nfsd4: illegal name %s in recovery directory\n", + child->d_name.name); + /* Keep trying; maybe the others are OK: */ + return nfs_ok; + } + nfs4_client_to_reclaim(child->d_name.name); + return nfs_ok; +} + +int +nfsd4_recdir_load(void) { + int status; + + status = 
nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); + if (status) + printk("nfsd4: failed loading clients from recovery" + " directory %s\n", rec_dir.dentry->d_name.name); + return status; +} + +/* + * Hold reference to the recovery directory. + */ + +void +nfsd4_init_recdir(char *rec_dirname) +{ + uid_t uid = 0; + gid_t gid = 0; + int status; + + printk("NFSD: Using %s as the NFSv4 state recovery directory\n", + rec_dirname); + + BUG_ON(rec_dir_init); + + nfs4_save_user(&uid, &gid); + + status = path_lookup(rec_dirname, LOOKUP_FOLLOW, &rec_dir); + if (status == -ENOENT) + printk("NFSD: recovery directory %s doesn't exist\n", + rec_dirname); + + if (!status) + rec_dir_init = 1; + nfs4_reset_user(uid, gid); +} + +void +nfsd4_shutdown_recdir(void) +{ + if (!rec_dir_init) + return; + rec_dir_init = 0; + path_release(&rec_dir); +} diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 75e8b137580..b83f8fb441e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -48,39 +48,32 @@ #include <linux/nfs4.h> #include <linux/nfsd/state.h> #include <linux/nfsd/xdr4.h> +#include <linux/namei.h> #define NFSDDBG_FACILITY NFSDDBG_PROC /* Globals */ static time_t lease_time = 90; /* default lease time */ -static time_t old_lease_time = 90; /* past incarnation lease time */ -static u32 nfs4_reclaim_init = 0; -time_t boot_time; -static time_t grace_end = 0; +static time_t user_lease_time = 90; +static time_t boot_time; +static int in_grace = 1; static u32 current_clientid = 1; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; static u32 nfs4_init; -stateid_t zerostateid; /* bits all 0 */ -stateid_t onestateid; /* bits all 1 */ - -/* debug counters */ -u32 list_add_perfile = 0; -u32 list_del_perfile = 0; -u32 add_perclient = 0; -u32 del_perclient = 0; -u32 alloc_file = 0; -u32 free_file = 0; -u32 vfsopen = 0; -u32 vfsclose = 0; -u32 alloc_delegation= 0; -u32 free_delegation= 0; +static stateid_t zerostateid; /* bits all 0 */ 
+static stateid_t onestateid; /* bits all 1 */ + +#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) +#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) /* forward declarations */ -struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); +static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); static void release_stateid_lockowners(struct nfs4_stateid *open_stp); +static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; +static void nfs4_set_recdir(char *recdir); /* Locking: * @@ -90,6 +83,11 @@ static void release_stateid_lockowners(struct nfs4_stateid *open_stp); */ static DECLARE_MUTEX(client_sema); +static kmem_cache_t *stateowner_slab = NULL; +static kmem_cache_t *file_slab = NULL; +static kmem_cache_t *stateid_slab = NULL; +static kmem_cache_t *deleg_slab = NULL; + void nfs4_lock_state(void) { @@ -118,16 +116,36 @@ opaque_hashval(const void *ptr, int nbytes) /* forward declarations */ static void release_stateowner(struct nfs4_stateowner *sop); static void release_stateid(struct nfs4_stateid *stp, int flags); -static void release_file(struct nfs4_file *fp); /* * Delegation state */ /* recall_lock protects the del_recall_lru */ -spinlock_t recall_lock; +static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED; static struct list_head del_recall_lru; +static void +free_nfs4_file(struct kref *kref) +{ + struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref); + list_del(&fp->fi_hash); + iput(fp->fi_inode); + kmem_cache_free(file_slab, fp); +} + +static inline void +put_nfs4_file(struct nfs4_file *fi) +{ + kref_put(&fi->fi_ref, free_nfs4_file); +} + +static inline void +get_nfs4_file(struct nfs4_file *fi) +{ + kref_get(&fi->fi_ref); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 
type) { @@ -136,13 +154,14 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); - if ((dp = kmalloc(sizeof(struct nfs4_delegation), - GFP_KERNEL)) == NULL) + dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); + if (dp == NULL) return dp; - INIT_LIST_HEAD(&dp->dl_del_perfile); - INIT_LIST_HEAD(&dp->dl_del_perclnt); + INIT_LIST_HEAD(&dp->dl_perfile); + INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_client = clp; + get_nfs4_file(fp); dp->dl_file = fp; dp->dl_flock = NULL; get_file(stp->st_vfs_file); @@ -160,9 +179,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f current_fh->fh_handle.fh_size); dp->dl_time = 0; atomic_set(&dp->dl_count, 1); - list_add(&dp->dl_del_perfile, &fp->fi_del_perfile); - list_add(&dp->dl_del_perclnt, &clp->cl_del_perclnt); - alloc_delegation++; + list_add(&dp->dl_perfile, &fp->fi_delegations); + list_add(&dp->dl_perclnt, &clp->cl_delegations); return dp; } @@ -171,8 +189,8 @@ nfs4_put_delegation(struct nfs4_delegation *dp) { if (atomic_dec_and_test(&dp->dl_count)) { dprintk("NFSD: freeing dp %p\n",dp); - kfree(dp); - free_delegation++; + put_nfs4_file(dp->dl_file); + kmem_cache_free(deleg_slab, dp); } } @@ -193,15 +211,14 @@ nfs4_close_delegation(struct nfs4_delegation *dp) if (dp->dl_flock) setlease(filp, F_UNLCK, &dp->dl_flock); nfsd_close(filp); - vfsclose++; } /* Called under the state lock. 
*/ static void unhash_delegation(struct nfs4_delegation *dp) { - list_del_init(&dp->dl_del_perfile); - list_del_init(&dp->dl_del_perclnt); + list_del_init(&dp->dl_perfile); + list_del_init(&dp->dl_perclnt); spin_lock(&recall_lock); list_del_init(&dp->dl_recall_lru); spin_unlock(&recall_lock); @@ -220,8 +237,8 @@ unhash_delegation(struct nfs4_delegation *dp) #define clientid_hashval(id) \ ((id) & CLIENT_HASH_MASK) -#define clientstr_hashval(name, namelen) \ - (opaque_hashval((name), (namelen)) & CLIENT_HASH_MASK) +#define clientstr_hashval(name) \ + (opaque_hashval((name), 8) & CLIENT_HASH_MASK) /* * reclaim_str_hashtbl[] holds known client info from previous reset/reboot * used in reboot/reset lease grace period processing @@ -331,11 +348,11 @@ expire_client(struct nfs4_client *clp) INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); - while (!list_empty(&clp->cl_del_perclnt)) { - dp = list_entry(clp->cl_del_perclnt.next, struct nfs4_delegation, dl_del_perclnt); + while (!list_empty(&clp->cl_delegations)) { + dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); dprintk("NFSD: expire client. 
dp %p, fp %p\n", dp, dp->dl_flock); - list_del_init(&dp->dl_del_perclnt); + list_del_init(&dp->dl_perclnt); list_move(&dp->dl_recall_lru, &reaplist); } spin_unlock(&recall_lock); @@ -347,26 +364,26 @@ expire_client(struct nfs4_client *clp) list_del(&clp->cl_idhash); list_del(&clp->cl_strhash); list_del(&clp->cl_lru); - while (!list_empty(&clp->cl_perclient)) { - sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient); + while (!list_empty(&clp->cl_openowners)) { + sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); release_stateowner(sop); } put_nfs4_client(clp); } static struct nfs4_client * -create_client(struct xdr_netobj name) { +create_client(struct xdr_netobj name, char *recdir) { struct nfs4_client *clp; if (!(clp = alloc_client(name))) goto out; + memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_count, 1); atomic_set(&clp->cl_callback.cb_set, 0); - clp->cl_callback.cb_parsed = 0; INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_strhash); - INIT_LIST_HEAD(&clp->cl_perclient); - INIT_LIST_HEAD(&clp->cl_del_perclnt); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); out: return clp; @@ -392,11 +409,9 @@ copy_cred(struct svc_cred *target, struct svc_cred *source) { get_group_info(target->cr_group_info); } -static int -cmp_name(struct xdr_netobj *n1, struct xdr_netobj *n2) { - if (!n1 || !n2) - return 0; - return((n1->len == n2->len) && !memcmp(n1->data, n2->data, n2->len)); +static inline int +same_name(const char *n1, const char *n2) { + return 0 == memcmp(n1, n2, HEXDIR_LEN); } static int @@ -446,7 +461,7 @@ check_name(struct xdr_netobj name) { return 1; } -void +static void add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) { unsigned int idhashval; @@ -458,7 +473,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) clp->cl_time = get_seconds(); } -void +static void 
move_to_confirmed(struct nfs4_client *clp) { unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); @@ -468,8 +483,7 @@ move_to_confirmed(struct nfs4_client *clp) list_del_init(&clp->cl_strhash); list_del_init(&clp->cl_idhash); list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); - strhashval = clientstr_hashval(clp->cl_name.data, - clp->cl_name.len); + strhashval = clientstr_hashval(clp->cl_recdir); list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); renew_client(clp); } @@ -500,6 +514,30 @@ find_unconfirmed_client(clientid_t *clid) return NULL; } +static struct nfs4_client * +find_confirmed_client_by_str(const char *dname, unsigned int hashval) +{ + struct nfs4_client *clp; + + list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { + if (same_name(clp->cl_recdir, dname)) + return clp; + } + return NULL; +} + +static struct nfs4_client * +find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) +{ + struct nfs4_client *clp; + + list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { + if (same_name(clp->cl_recdir, dname)) + return clp; + } + return NULL; +} + /* a helper function for parse_callback */ static int parse_octet(unsigned int *lenp, char **addrp) @@ -534,7 +572,7 @@ parse_octet(unsigned int *lenp, char **addrp) } /* parse and set the setclientid ipv4 callback address */ -int +static int parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp) { int temp = 0; @@ -570,7 +608,7 @@ parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigne return 1; } -void +static void gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) { struct nfs4_callback *cb = &clp->cl_callback; @@ -584,14 +622,12 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) goto out_err; cb->cb_prog = se->se_callback_prog; cb->cb_ident = se->se_callback_ident; - cb->cb_parsed = 1; return; out_err: printk(KERN_INFO "NFSD: this client 
(clientid %08x/%08x) " "will not receive delegations\n", clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); - cb->cb_parsed = 0; return; } @@ -638,59 +674,43 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) }; nfs4_verifier clverifier = setclid->se_verf; unsigned int strhashval; - struct nfs4_client * conf, * unconf, * new, * clp; + struct nfs4_client *conf, *unconf, *new; int status; + char dname[HEXDIR_LEN]; status = nfserr_inval; if (!check_name(clname)) goto out; + status = nfs4_make_rec_clidname(dname, &clname); + if (status) + goto out; + /* * XXX The Duplicate Request Cache (DRC) has been checked (??) * We get here on a DRC miss. */ - strhashval = clientstr_hashval(clname.data, clname.len); + strhashval = clientstr_hashval(dname); - conf = NULL; nfs4_lock_state(); - list_for_each_entry(clp, &conf_str_hashtbl[strhashval], cl_strhash) { - if (!cmp_name(&clp->cl_name, &clname)) - continue; + conf = find_confirmed_client_by_str(dname, strhashval); + if (conf) { /* * CASE 0: * clname match, confirmed, different principal * or different ip_address */ status = nfserr_clid_inuse; - if (!cmp_creds(&clp->cl_cred,&rqstp->rq_cred)) { + if (!cmp_creds(&conf->cl_cred, &rqstp->rq_cred) + || conf->cl_addr != ip_addr) { printk("NFSD: setclientid: string in use by client" "(clientid %08x/%08x)\n", - clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); - goto out; - } - if (clp->cl_addr != ip_addr) { - printk("NFSD: setclientid: string in use by client" - "(clientid %08x/%08x)\n", - clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); + conf->cl_clientid.cl_boot, conf->cl_clientid.cl_id); goto out; } - - /* - * cl_name match from a previous SETCLIENTID operation - * XXX check for additional matches? 
- */ - conf = clp; - break; - } - unconf = NULL; - list_for_each_entry(clp, &unconf_str_hashtbl[strhashval], cl_strhash) { - if (!cmp_name(&clp->cl_name, &clname)) - continue; - /* cl_name match from a previous SETCLIENTID operation */ - unconf = clp; - break; } + unconf = find_unconfirmed_client_by_str(dname, strhashval); status = nfserr_resource; if (!conf) { /* @@ -699,7 +719,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) */ if (unconf) expire_client(unconf); - if (!(new = create_client(clname))) + new = create_client(clname, dname); + if (new == NULL) goto out; copy_verf(new, &clverifier); new->cl_addr = ip_addr; @@ -722,12 +743,16 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) * nfs4_client, but with the new callback info and a * new cl_confirm */ - if ((unconf) && - cmp_verf(&unconf->cl_verifier, &conf->cl_verifier) && - cmp_clid(&unconf->cl_clientid, &conf->cl_clientid)) { - expire_client(unconf); + if (unconf) { + /* Note this is removing unconfirmed {*x***}, + * which is stronger than RFC recommended {vxc**}. + * This has the advantage that there is at most + * one {*x***} in either list at any time. + */ + expire_client(unconf); } - if (!(new = create_client(clname))) + new = create_client(clname, dname); + if (new == NULL) goto out; copy_verf(new,&conf->cl_verifier); new->cl_addr = ip_addr; @@ -745,7 +770,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) * using input clverifier, clname, and callback info * and generate a new cl_clientid and cl_confirm. 
*/ - if (!(new = create_client(clname))) + new = create_client(clname, dname); + if (new == NULL) goto out; copy_verf(new,&clverifier); new->cl_addr = ip_addr; @@ -771,7 +797,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid) * new cl_verifier and a new cl_confirm */ expire_client(unconf); - if (!(new = create_client(clname))) + new = create_client(clname, dname); + if (new == NULL) goto out; copy_verf(new,&clverifier); new->cl_addr = ip_addr; @@ -807,7 +834,7 @@ int nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confirm *setclientid_confirm) { u32 ip_addr = rqstp->rq_addr.sin_addr.s_addr; - struct nfs4_client *clp, *conf = NULL, *unconf = NULL; + struct nfs4_client *conf, *unconf; nfs4_verifier confirm = setclientid_confirm->sc_confirm; clientid_t * clid = &setclientid_confirm->sc_clientid; int status; @@ -820,102 +847,91 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confi */ nfs4_lock_state(); - clp = find_confirmed_client(clid); - if (clp) { - status = nfserr_inval; - /* - * Found a record for this clientid. If the IP addresses - * don't match, return ERR_INVAL just as if the record had - * not been found. - */ - if (clp->cl_addr != ip_addr) { - printk("NFSD: setclientid: string in use by client" - "(clientid %08x/%08x)\n", - clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); - goto out; - } - conf = clp; - } - clp = find_unconfirmed_client(clid); - if (clp) { - status = nfserr_inval; - if (clp->cl_addr != ip_addr) { - printk("NFSD: setclientid: string in use by client" - "(clientid %08x/%08x)\n", - clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); - goto out; - } - unconf = clp; - } - /* CASE 1: - * unconf record that matches input clientid and input confirm. - * conf record that matches input clientid. 
- * conf and unconf records match names, verifiers - */ + + conf = find_confirmed_client(clid); + unconf = find_unconfirmed_client(clid); + + status = nfserr_clid_inuse; + if (conf && conf->cl_addr != ip_addr) + goto out; + if (unconf && unconf->cl_addr != ip_addr) + goto out; + if ((conf && unconf) && (cmp_verf(&unconf->cl_confirm, &confirm)) && (cmp_verf(&conf->cl_verifier, &unconf->cl_verifier)) && - (cmp_name(&conf->cl_name,&unconf->cl_name)) && + (same_name(conf->cl_recdir,unconf->cl_recdir)) && (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm))) { + /* CASE 1: + * unconf record that matches input clientid and input confirm. + * conf record that matches input clientid. + * conf and unconf records match names, verifiers + */ if (!cmp_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { - expire_client(conf); - clp = unconf; - move_to_confirmed(unconf); + /* XXX: We just turn off callbacks until we can handle + * change request correctly. */ + atomic_set(&conf->cl_callback.cb_set, 0); + gen_confirm(conf); + nfsd4_remove_clid_dir(unconf); + expire_client(unconf); status = nfs_ok; + } - goto out; - } - /* CASE 2: - * conf record that matches input clientid. - * if unconf record that matches input clientid, then unconf->cl_name - * or unconf->cl_verifier don't match the conf record. - */ - if ((conf && !unconf) || + } else if ((conf && !unconf) || ((conf && unconf) && (!cmp_verf(&conf->cl_verifier, &unconf->cl_verifier) || - !cmp_name(&conf->cl_name, &unconf->cl_name)))) { - if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) { + !same_name(conf->cl_recdir, unconf->cl_recdir)))) { + /* CASE 2: + * conf record that matches input clientid. + * if unconf record matches input clientid, then + * unconf->cl_name or unconf->cl_verifier don't match the + * conf record. 
+ */ + if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) status = nfserr_clid_inuse; - } else { - clp = conf; + else status = nfs_ok; - } - goto out; - } - /* CASE 3: - * conf record not found. - * unconf record found. - * unconf->cl_confirm matches input confirm - */ - if (!conf && unconf && cmp_verf(&unconf->cl_confirm, &confirm)) { + } else if (!conf && unconf + && cmp_verf(&unconf->cl_confirm, &confirm)) { + /* CASE 3: + * conf record not found. + * unconf record found. + * unconf->cl_confirm matches input confirm + */ if (!cmp_creds(&unconf->cl_cred, &rqstp->rq_cred)) { status = nfserr_clid_inuse; } else { - status = nfs_ok; - clp = unconf; + unsigned int hash = + clientstr_hashval(unconf->cl_recdir); + conf = find_confirmed_client_by_str(unconf->cl_recdir, + hash); + if (conf) { + nfsd4_remove_clid_dir(conf); + expire_client(conf); + } move_to_confirmed(unconf); + conf = unconf; + status = nfs_ok; } - goto out; - } - /* CASE 4: - * conf record not found, or if conf, then conf->cl_confirm does not - * match input confirm. - * unconf record not found, or if unconf, then unconf->cl_confirm - * does not match input confirm. - */ - if ((!conf || (conf && !cmp_verf(&conf->cl_confirm, &confirm))) && - (!unconf || (unconf && !cmp_verf(&unconf->cl_confirm, &confirm)))) { + } else if ((!conf || (conf && !cmp_verf(&conf->cl_confirm, &confirm))) + && (!unconf || (unconf && !cmp_verf(&unconf->cl_confirm, + &confirm)))) { + /* CASE 4: + * conf record not found, or if conf, conf->cl_confirm does not + * match input confirm. + * unconf record not found, or if unconf, unconf->cl_confirm + * does not match input confirm. 
+ */ status = nfserr_stale_clientid; - goto out; + } else { + /* check that we have hit one of the cases...*/ + status = nfserr_clid_inuse; } - /* check that we have hit one of the cases...*/ - status = nfserr_inval; - goto out; out: if (!status) - nfsd4_probe_callback(clp); + nfsd4_probe_callback(conf); nfs4_unlock_state(); return status; } @@ -961,60 +977,65 @@ alloc_init_file(struct inode *ino) struct nfs4_file *fp; unsigned int hashval = file_hashval(ino); - if ((fp = kmalloc(sizeof(struct nfs4_file),GFP_KERNEL))) { + fp = kmem_cache_alloc(file_slab, GFP_KERNEL); + if (fp) { + kref_init(&fp->fi_ref); INIT_LIST_HEAD(&fp->fi_hash); - INIT_LIST_HEAD(&fp->fi_perfile); - INIT_LIST_HEAD(&fp->fi_del_perfile); + INIT_LIST_HEAD(&fp->fi_stateids); + INIT_LIST_HEAD(&fp->fi_delegations); list_add(&fp->fi_hash, &file_hashtbl[hashval]); fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; - alloc_file++; return fp; } return NULL; } static void -release_all_files(void) +nfsd4_free_slab(kmem_cache_t **slab) { - int i; - struct nfs4_file *fp; + int status; - for (i=0;i<FILE_HASH_SIZE;i++) { - while (!list_empty(&file_hashtbl[i])) { - fp = list_entry(file_hashtbl[i].next, struct nfs4_file, fi_hash); - /* this should never be more than once... 
*/ - if (!list_empty(&fp->fi_perfile) || !list_empty(&fp->fi_del_perfile)) { - printk("ERROR: release_all_files: file %p is open, creating dangling state !!!\n",fp); - } - release_file(fp); - } - } + if (*slab == NULL) + return; + status = kmem_cache_destroy(*slab); + *slab = NULL; + WARN_ON(status); } -kmem_cache_t *stateowner_slab = NULL; +static void +nfsd4_free_slabs(void) +{ + nfsd4_free_slab(&stateowner_slab); + nfsd4_free_slab(&file_slab); + nfsd4_free_slab(&stateid_slab); + nfsd4_free_slab(&deleg_slab); +} static int nfsd4_init_slabs(void) { stateowner_slab = kmem_cache_create("nfsd4_stateowners", sizeof(struct nfs4_stateowner), 0, 0, NULL, NULL); - if (stateowner_slab == NULL) { - dprintk("nfsd4: out of memory while initializing nfsv4\n"); - return -ENOMEM; - } + if (stateowner_slab == NULL) + goto out_nomem; + file_slab = kmem_cache_create("nfsd4_files", + sizeof(struct nfs4_file), 0, 0, NULL, NULL); + if (file_slab == NULL) + goto out_nomem; + stateid_slab = kmem_cache_create("nfsd4_stateids", + sizeof(struct nfs4_stateid), 0, 0, NULL, NULL); + if (stateid_slab == NULL) + goto out_nomem; + deleg_slab = kmem_cache_create("nfsd4_delegations", + sizeof(struct nfs4_delegation), 0, 0, NULL, NULL); + if (deleg_slab == NULL) + goto out_nomem; return 0; -} - -static void -nfsd4_free_slabs(void) -{ - int status = 0; - - if (stateowner_slab) - status = kmem_cache_destroy(stateowner_slab); - stateowner_slab = NULL; - BUG_ON(status); +out_nomem: + nfsd4_free_slabs(); + dprintk("nfsd4: out of memory while initializing nfsv4\n"); + return -ENOMEM; } void @@ -1055,14 +1076,13 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str INIT_LIST_HEAD(&sop->so_idhash); INIT_LIST_HEAD(&sop->so_strhash); INIT_LIST_HEAD(&sop->so_perclient); - INIT_LIST_HEAD(&sop->so_perfilestate); - INIT_LIST_HEAD(&sop->so_perlockowner); /* not used */ + INIT_LIST_HEAD(&sop->so_stateids); + INIT_LIST_HEAD(&sop->so_perstateid); /* not used */ 
INIT_LIST_HEAD(&sop->so_close_lru); sop->so_time = 0; list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]); list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perclient, &clp->cl_perclient); - add_perclient++; + list_add(&sop->so_perclient, &clp->cl_openowners); sop->so_is_open_owner = 1; sop->so_id = current_ownerid++; sop->so_client = clp; @@ -1080,10 +1100,10 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp) { struct nfs4_stateowner *lock_sop; - while (!list_empty(&open_stp->st_perlockowner)) { - lock_sop = list_entry(open_stp->st_perlockowner.next, - struct nfs4_stateowner, so_perlockowner); - /* list_del(&open_stp->st_perlockowner); */ + while (!list_empty(&open_stp->st_lockowners)) { + lock_sop = list_entry(open_stp->st_lockowners.next, + struct nfs4_stateowner, so_perstateid); + /* list_del(&open_stp->st_lockowners); */ BUG_ON(lock_sop->so_is_open_owner); release_stateowner(lock_sop); } @@ -1096,14 +1116,12 @@ unhash_stateowner(struct nfs4_stateowner *sop) list_del(&sop->so_idhash); list_del(&sop->so_strhash); - if (sop->so_is_open_owner) { + if (sop->so_is_open_owner) list_del(&sop->so_perclient); - del_perclient++; - } - list_del(&sop->so_perlockowner); - while (!list_empty(&sop->so_perfilestate)) { - stp = list_entry(sop->so_perfilestate.next, - struct nfs4_stateid, st_perfilestate); + list_del(&sop->so_perstateid); + while (!list_empty(&sop->so_stateids)) { + stp = list_entry(sop->so_stateids.next, + struct nfs4_stateid, st_perstateowner); if (sop->so_is_open_owner) release_stateid(stp, OPEN_STATE); else @@ -1125,14 +1143,14 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); INIT_LIST_HEAD(&stp->st_hash); - INIT_LIST_HEAD(&stp->st_perfilestate); - INIT_LIST_HEAD(&stp->st_perlockowner); + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); INIT_LIST_HEAD(&stp->st_perfile); 
list_add(&stp->st_hash, &stateid_hashtbl[hashval]); - list_add(&stp->st_perfilestate, &sop->so_perfilestate); - list_add_perfile++; - list_add(&stp->st_perfile, &fp->fi_perfile); + list_add(&stp->st_perstateowner, &sop->so_stateids); + list_add(&stp->st_perfile, &fp->fi_stateids); stp->st_stateowner = sop; + get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; @@ -1142,6 +1160,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_deny_bmap = 0; __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); + stp->st_openstp = NULL; } static void @@ -1150,30 +1169,20 @@ release_stateid(struct nfs4_stateid *stp, int flags) struct file *filp = stp->st_vfs_file; list_del(&stp->st_hash); - list_del_perfile++; list_del(&stp->st_perfile); - list_del(&stp->st_perfilestate); + list_del(&stp->st_perstateowner); if (flags & OPEN_STATE) { release_stateid_lockowners(stp); stp->st_vfs_file = NULL; nfsd_close(filp); - vfsclose++; } else if (flags & LOCK_STATE) locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner); - kfree(stp); + put_nfs4_file(stp->st_file); + kmem_cache_free(stateid_slab, stp); stp = NULL; } static void -release_file(struct nfs4_file *fp) -{ - free_file++; - list_del(&fp->fi_hash); - iput(fp->fi_inode); - kfree(fp); -} - -void move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); @@ -1183,11 +1192,10 @@ move_to_close_lru(struct nfs4_stateowner *sop) sop->so_time = get_seconds(); } -void +static void release_state_owner(struct nfs4_stateid *stp, int flag) { struct nfs4_stateowner *sop = stp->st_stateowner; - struct nfs4_file *fp = stp->st_file; dprintk("NFSD: release_state_owner\n"); release_stateid(stp, flag); @@ -1196,12 +1204,8 @@ release_state_owner(struct nfs4_stateid *stp, int flag) * released by the laundromat service after the lease period * to 
enable us to handle CLOSE replay */ - if (sop->so_confirmed && list_empty(&sop->so_perfilestate)) + if (sop->so_confirmed && list_empty(&sop->so_stateids)) move_to_close_lru(sop); - /* unused nfs4_file's are releseed. XXX slab cache? */ - if (list_empty(&fp->fi_perfile) && list_empty(&fp->fi_del_perfile)) { - release_file(fp); - } } static int @@ -1231,8 +1235,10 @@ find_file(struct inode *ino) struct nfs4_file *fp; list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { - if (fp->fi_inode == ino) + if (fp->fi_inode == ino) { + get_nfs4_file(fp); return fp; + } } return NULL; } @@ -1240,7 +1246,7 @@ find_file(struct inode *ino) #define TEST_ACCESS(x) ((x > 0 || x < 4)?1:0) #define TEST_DENY(x) ((x >= 0 || x < 5)?1:0) -void +static void set_access(unsigned int *access, unsigned long bmap) { int i; @@ -1251,7 +1257,7 @@ set_access(unsigned int *access, unsigned long bmap) { } } -void +static void set_deny(unsigned int *deny, unsigned long bmap) { int i; @@ -1277,25 +1283,30 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid */ -int +static int nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) { struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_file *fp; struct nfs4_stateid *stp; + int ret; dprintk("NFSD: nfs4_share_conflict\n"); fp = find_file(ino); - if (fp) { + if (!fp) + return nfs_ok; + ret = nfserr_locked; /* Search for conflicting share reservations */ - list_for_each_entry(stp, &fp->fi_perfile, st_perfile) { - if (test_bit(deny_type, &stp->st_deny_bmap) || - test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap)) - return nfserr_share_denied; - } + list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { + if (test_bit(deny_type, &stp->st_deny_bmap) || + test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap)) + goto out; } - return nfs_ok; + ret = nfs_ok; +out: + put_nfs4_file(fp); + return ret; } static inline void 
@@ -1427,7 +1438,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) return -EAGAIN; } -struct lock_manager_operations nfsd_lease_mng_ops = { +static struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, .fl_release_private = nfsd_release_deleg_cb, .fl_copy_lock = nfsd_copy_lock_deleg_cb, @@ -1473,7 +1484,7 @@ nfsd4_process_open1(struct nfsd4_open *open) if (sop) { open->op_stateowner = sop; /* check for replay */ - if (open->op_seqid == sop->so_seqid){ + if (open->op_seqid == sop->so_seqid - 1){ if (sop->so_replay.rp_buflen) return NFSERR_REPLAY_ME; else { @@ -1488,7 +1499,7 @@ nfsd4_process_open1(struct nfsd4_open *open) goto renew; } } else if (sop->so_confirmed) { - if (open->op_seqid == sop->so_seqid + 1) + if (open->op_seqid == sop->so_seqid) goto renew; status = nfserr_bad_seqid; goto out; @@ -1521,11 +1532,54 @@ renew: status = nfs_ok; renew_client(sop->so_client); out: - if (status && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) - status = nfserr_reclaim_bad; return status; } +static inline int +nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) +{ + if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) + return nfserr_openmode; + else + return nfs_ok; +} + +static struct nfs4_delegation * +find_delegation_file(struct nfs4_file *fp, stateid_t *stid) +{ + struct nfs4_delegation *dp; + + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { + if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) + return dp; + } + return NULL; +} + +static int +nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, + struct nfs4_delegation **dp) +{ + int flags; + int status = nfserr_bad_stateid; + + *dp = find_delegation_file(fp, &open->op_delegate_stateid); + if (*dp == NULL) + goto out; + flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ? 
+ RD_STATE : WR_STATE; + status = nfs4_check_delegmode(*dp, flags); + if (status) + *dp = NULL; +out: + if (open->op_claim_type != NFS4_OPEN_CLAIM_DELEGATE_CUR) + return nfs_ok; + if (status) + return status; + open->op_stateowner->so_confirmed = 1; + return nfs_ok; +} + static int nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_stateid **stpp) { @@ -1533,7 +1587,7 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_state int status = nfserr_share_denied; struct nfs4_stateowner *sop = open->op_stateowner; - list_for_each_entry(local, &fp->fi_perfile, st_perfile) { + list_for_each_entry(local, &fp->fi_stateids, st_perfile) { /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; @@ -1549,25 +1603,37 @@ out: return status; } +static inline struct nfs4_stateid * +nfs4_alloc_stateid(void) +{ + return kmem_cache_alloc(stateid_slab, GFP_KERNEL); +} + static int nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, + struct nfs4_delegation *dp, struct svc_fh *cur_fh, int flags) { struct nfs4_stateid *stp; - int status; - stp = kmalloc(sizeof(struct nfs4_stateid), GFP_KERNEL); + stp = nfs4_alloc_stateid(); if (stp == NULL) return nfserr_resource; - status = nfsd_open(rqstp, cur_fh, S_IFREG, flags, &stp->st_vfs_file); - if (status) { - if (status == nfserr_dropit) - status = nfserr_jukebox; - kfree(stp); - return status; + if (dp) { + get_file(dp->dl_vfs_file); + stp->st_vfs_file = dp->dl_vfs_file; + } else { + int status; + status = nfsd_open(rqstp, cur_fh, S_IFREG, flags, + &stp->st_vfs_file); + if (status) { + if (status == nfserr_dropit) + status = nfserr_jukebox; + kmem_cache_free(stateid_slab, stp); + return status; + } } - vfsopen++; *stpp = stp; return 0; } @@ -1619,18 +1685,11 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta } -/* decrement seqid on successful reclaim, it will be bumped in encode_open */ static void 
-nfs4_set_claim_prev(struct nfsd4_open *open, int *status) +nfs4_set_claim_prev(struct nfsd4_open *open) { - if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) { - if (*status) - *status = nfserr_reclaim_bad; - else { - open->op_stateowner->so_confirmed = 1; - open->op_stateowner->so_seqid--; - } - } + open->op_stateowner->so_confirmed = 1; + open->op_stateowner->so_client->cl_firststate = 1; } /* @@ -1646,14 +1705,30 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta int status, flag = 0; flag = NFS4_OPEN_DELEGATE_NONE; - if (open->op_claim_type != NFS4_OPEN_CLAIM_NULL - || !atomic_read(&cb->cb_set) || !sop->so_confirmed) - goto out; - - if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - flag = NFS4_OPEN_DELEGATE_WRITE; - else - flag = NFS4_OPEN_DELEGATE_READ; + open->op_recall = 0; + switch (open->op_claim_type) { + case NFS4_OPEN_CLAIM_PREVIOUS: + if (!atomic_read(&cb->cb_set)) + open->op_recall = 1; + flag = open->op_delegate_type; + if (flag == NFS4_OPEN_DELEGATE_NONE) + goto out; + break; + case NFS4_OPEN_CLAIM_NULL: + /* Let's not give out any delegations till everyone's + * had the chance to reclaim theirs.... 
*/ + if (nfs4_in_grace()) + goto out; + if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) + goto out; + if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) + flag = NFS4_OPEN_DELEGATE_WRITE; + else + flag = NFS4_OPEN_DELEGATE_READ; + break; + default: + goto out; + } dp = alloc_init_deleg(sop->so_client, stp, fh, flag); if (dp == NULL) { @@ -1687,6 +1762,10 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta dp->dl_stateid.si_fileid, dp->dl_stateid.si_generation); out: + if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS + && flag == NFS4_OPEN_DELEGATE_NONE + && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) + printk("NFSD: WARNING: refusing delegation reclaim\n"); open->op_delegate_type = flag; } @@ -1699,8 +1778,15 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_file *fp = NULL; struct inode *ino = current_fh->fh_dentry->d_inode; struct nfs4_stateid *stp = NULL; + struct nfs4_delegation *dp = NULL; int status; + if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_grace; + + if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + return nfserr_no_grace; + status = nfserr_inval; if (!TEST_ACCESS(open->op_share_access) || !TEST_DENY(open->op_share_deny)) goto out; @@ -1713,7 +1799,13 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (fp) { if ((status = nfs4_check_open(fp, open, &stp))) goto out; + status = nfs4_check_deleg(fp, open, &dp); + if (status) + goto out; } else { + status = nfserr_bad_stateid; + if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR) + goto out; status = nfserr_resource; fp = alloc_init_file(ino); if (fp == NULL) @@ -1729,6 +1821,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_upgrade_open(rqstp, current_fh, stp, open); if (status) goto out; + update_stateid(&stp->st_stateid); } else { /* Stateid was not 
found, this is a new OPEN */ int flags = 0; @@ -1736,7 +1829,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf flags = MAY_WRITE; else flags = MAY_READ; - if ((status = nfs4_new_open(rqstp, &stp, current_fh, flags))) + status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags); + if (status) goto out; init_stateid(stp, fp, open); status = nfsd4_truncate(rqstp, current_fh, open); @@ -1759,12 +1853,10 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid, stp->st_stateid.si_fileid, stp->st_stateid.si_generation); out: - /* take the opportunity to clean up unused state */ - if (fp && list_empty(&fp->fi_perfile) && list_empty(&fp->fi_del_perfile)) - release_file(fp); - - /* CLAIM_PREVIOUS has different error returns */ - nfs4_set_claim_prev(open, &status); + if (fp) + put_nfs4_file(fp); + if (status == 0 && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + nfs4_set_claim_prev(open); /* * To finish the open response, we just need to set the rflags. 
*/ @@ -1775,6 +1867,7 @@ out: return status; } +static struct workqueue_struct *laundry_wq; static struct work_struct laundromat_work; static void laundromat_main(void *); static DECLARE_WORK(laundromat_work, laundromat_main, NULL); @@ -1800,7 +1893,7 @@ nfsd4_renew(clientid_t *clid) } renew_client(clp); status = nfserr_cb_path_down; - if (!list_empty(&clp->cl_del_perclnt) + if (!list_empty(&clp->cl_delegations) && !atomic_read(&clp->cl_callback.cb_set)) goto out; status = nfs_ok; @@ -1809,7 +1902,15 @@ out: return status; } -time_t +static void +end_grace(void) +{ + dprintk("NFSD: end of grace period\n"); + nfsd4_recdir_purge_old(); + in_grace = 0; +} + +static time_t nfs4_laundromat(void) { struct nfs4_client *clp; @@ -1823,6 +1924,8 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); + if (in_grace) + end_grace(); list_for_each_safe(pos, next, &client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -1833,6 +1936,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused client (clientid %08x)\n", clp->cl_clientid.cl_id); + nfsd4_remove_clid_dir(clp); expire_client(clp); } INIT_LIST_HEAD(&reaplist); @@ -1882,17 +1986,14 @@ laundromat_main(void *not_used) t = nfs4_laundromat(); dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); - schedule_delayed_work(&laundromat_work, t*HZ); + queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); } -/* search ownerid_hashtbl[] and close_lru for stateid owner - * (stateid->si_stateownerid) - */ -struct nfs4_stateowner * -find_openstateowner_id(u32 st_id, int flags) { +static struct nfs4_stateowner * +search_close_lru(u32 st_id, int flags) +{ struct nfs4_stateowner *local = NULL; - dprintk("NFSD: find_openstateowner_id %d\n", st_id); if (flags & CLOSE_STATE) { list_for_each_entry(local, &close_lru, so_close_lru) { if (local->so_id == st_id) @@ -1949,15 +2050,6 @@ out: } static inline int 
-nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) -{ - if ((flags & WR_STATE) && (dp->dl_type == NFS4_OPEN_DELEGATE_READ)) - return nfserr_openmode; - else - return nfs_ok; -} - -static inline int check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) { /* Trying to call delegreturn with a special stateid? Yuch: */ @@ -2067,14 +2159,19 @@ out: return status; } +static inline int +setlkflg (int type) +{ + return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? + RD_STATE : WR_STATE; +} /* * Checks for sequence id mutating operations. */ -int -nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, clientid_t *lockclid) +static int +nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) { - int status; struct nfs4_stateid *stp; struct nfs4_stateowner *sop; @@ -2082,53 +2179,65 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei "stateid = (%08x/%08x/%08x/%08x)\n", seqid, stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid, stateid->si_generation); - + *stpp = NULL; *sopp = NULL; - status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { printk("NFSD: preprocess_seqid_op: magic stateid!\n"); - goto out; + return nfserr_bad_stateid; } - status = nfserr_stale_stateid; if (STALE_STATEID(stateid)) - goto out; + return nfserr_stale_stateid; /* * We return BAD_STATEID if filehandle doesn't match stateid, * the confirmed flag is incorrecly set, or the generation * number is incorrect. - * If there is no entry in the openfile table for this id, - * we can't always return BAD_STATEID; - * this might be a retransmitted CLOSE which has arrived after - * the openfile has been released. 
*/ - if (!(stp = find_stateid(stateid, flags))) - goto no_nfs4_stateid; - - status = nfserr_bad_stateid; + stp = find_stateid(stateid, flags); + if (stp == NULL) { + /* + * Also, we should make sure this isn't just the result of + * a replayed close: + */ + sop = search_close_lru(stateid->si_stateownerid, flags); + if (sop == NULL) + return nfserr_bad_stateid; + *sopp = sop; + goto check_replay; + } - /* for new lock stateowners: - * check that the lock->v.new.open_stateid - * refers to an open stateowner - * - * check that the lockclid (nfs4_lock->v.new.clientid) is the same - * as the open_stateid->st_stateowner->so_client->clientid - */ - if (lockclid) { + if (lock) { struct nfs4_stateowner *sop = stp->st_stateowner; + clientid_t *lockclid = &lock->v.new.clientid; struct nfs4_client *clp = sop->so_client; + int lkflg = 0; + int status; + + lkflg = setlkflg(lock->lk_type); + + if (lock->lk_is_new) { + if (!sop->so_is_open_owner) + return nfserr_bad_stateid; + if (!cmp_clid(&clp->cl_clientid, lockclid)) + return nfserr_bad_stateid; + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; + } else { + /* stp is the lock stateid */ + status = nfs4_check_openmode(stp->st_openstp, lkflg); + if (status) + return status; + } - if (!sop->so_is_open_owner) - goto out; - if (!cmp_clid(&clp->cl_clientid, lockclid)) - goto out; } if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) { printk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); - goto out; + return nfserr_bad_stateid; } *stpp = stp; @@ -2139,63 +2248,41 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei * For the moment, we ignore the possibility of * generation number wraparound. 
*/ - if (seqid != sop->so_seqid + 1) + if (seqid != sop->so_seqid) goto check_replay; - if (sop->so_confirmed) { - if (flags & CONFIRM) { - printk("NFSD: preprocess_seqid_op: expected unconfirmed stateowner!\n"); - goto out; - } + if (sop->so_confirmed && flags & CONFIRM) { + printk("NFSD: preprocess_seqid_op: expected" + " unconfirmed stateowner!\n"); + return nfserr_bad_stateid; } - else { - if (!(flags & CONFIRM)) { - printk("NFSD: preprocess_seqid_op: stateowner not confirmed yet!\n"); - goto out; - } + if (!sop->so_confirmed && !(flags & CONFIRM)) { + printk("NFSD: preprocess_seqid_op: stateowner not" + " confirmed yet!\n"); + return nfserr_bad_stateid; } if (stateid->si_generation > stp->st_stateid.si_generation) { printk("NFSD: preprocess_seqid_op: future stateid?!\n"); - goto out; + return nfserr_bad_stateid; } - status = nfserr_old_stateid; if (stateid->si_generation < stp->st_stateid.si_generation) { printk("NFSD: preprocess_seqid_op: old stateid!\n"); - goto out; - } - /* XXX renew the client lease here */ - status = nfs_ok; - -out: - return status; - -no_nfs4_stateid: - - /* - * We determine whether this is a bad stateid or a replay, - * starting by trying to look up the stateowner. - * If stateowner is not found - stateid is bad. 
- */ - if (!(sop = find_openstateowner_id(stateid->si_stateownerid, flags))) { - printk("NFSD: preprocess_seqid_op: no stateowner or nfs4_stateid!\n"); - status = nfserr_bad_stateid; - goto out; + return nfserr_old_stateid; } - *sopp = sop; + renew_client(sop->so_client); + return nfs_ok; check_replay: - if (seqid == sop->so_seqid) { + if (seqid == sop->so_seqid - 1) { printk("NFSD: preprocess_seqid_op: retransmission?\n"); /* indicate replay to calling function */ - status = NFSERR_REPLAY_ME; - } else { - printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d\n", sop->so_seqid +1, seqid); - - *sopp = NULL; - status = nfserr_bad_seqid; + return NFSERR_REPLAY_ME; } - goto out; + printk("NFSD: preprocess_seqid_op: bad seqid (expected %d, got %d)\n", + sop->so_seqid, seqid); + *sopp = NULL; + return nfserr_bad_seqid; } int @@ -2230,6 +2317,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs stp->st_stateid.si_stateownerid, stp->st_stateid.si_fileid, stp->st_stateid.si_generation); + + nfsd4_create_clid_dir(sop->so_client); out: if (oc->oc_stateowner) nfs4_get_stateowner(oc->oc_stateowner); @@ -2387,7 +2476,7 @@ static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE]; static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; -struct nfs4_stateid * +static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags) { struct nfs4_stateid *local = NULL; @@ -2419,25 +2508,19 @@ find_stateid(stateid_t *stid, int flags) static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid) { - struct nfs4_delegation *dp = NULL; - struct nfs4_file *fp = NULL; - u32 st_id; + struct nfs4_file *fp; + struct nfs4_delegation *dl; dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n", stid->si_boot, stid->si_stateownerid, stid->si_fileid, stid->si_generation); - st_id = stid->si_stateownerid; fp = find_file(ino); - if (fp) { 
- list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) { - if(dp->dl_stateid.si_stateownerid == st_id) { - dprintk("NFSD: find_delegation dp %p\n",dp); - return dp; - } - } - } - return NULL; + if (!fp) + return NULL; + dl = find_delegation_file(fp, stid); + put_nfs4_file(fp); + return dl; } /* @@ -2457,7 +2540,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -int +static int nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) { struct nfs4_stateowner *local = NULL; @@ -2498,22 +2581,6 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) } static struct nfs4_stateowner * -find_lockstateowner(struct xdr_netobj *owner, clientid_t *clid) -{ - struct nfs4_stateowner *local = NULL; - int i; - - for (i = 0; i < LOCK_HASH_SIZE; i++) { - list_for_each_entry(local, &lock_ownerid_hashtbl[i], so_idhash) { - if (!cmp_owner_str(local, owner, clid)) - continue; - return local; - } - } - return NULL; -} - -static struct nfs4_stateowner * find_lockstateowner_str(struct inode *inode, clientid_t *clid, struct xdr_netobj *owner) { @@ -2533,7 +2600,6 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid, * occured. 
* * strhashval = lock_ownerstr_hashval - * so_seqid = lock->lk_new_lock_seqid - 1: it gets bumped in encode */ static struct nfs4_stateowner * @@ -2548,17 +2614,17 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, str INIT_LIST_HEAD(&sop->so_idhash); INIT_LIST_HEAD(&sop->so_strhash); INIT_LIST_HEAD(&sop->so_perclient); - INIT_LIST_HEAD(&sop->so_perfilestate); - INIT_LIST_HEAD(&sop->so_perlockowner); + INIT_LIST_HEAD(&sop->so_stateids); + INIT_LIST_HEAD(&sop->so_perstateid); INIT_LIST_HEAD(&sop->so_close_lru); /* not used */ sop->so_time = 0; list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]); list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]); - list_add(&sop->so_perlockowner, &open_stp->st_perlockowner); + list_add(&sop->so_perstateid, &open_stp->st_lockowners); sop->so_is_open_owner = 0; sop->so_id = current_ownerid++; sop->so_client = clp; - sop->so_seqid = lock->lk_new_lock_seqid - 1; + sop->so_seqid = lock->lk_new_lock_seqid; sop->so_confirmed = 1; rp = &sop->so_replay; rp->rp_status = NFSERR_SERVERFAULT; @@ -2567,24 +2633,24 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, str return sop; } -struct nfs4_stateid * +static struct nfs4_stateid * alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp) { struct nfs4_stateid *stp; unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id); - if ((stp = kmalloc(sizeof(struct nfs4_stateid), - GFP_KERNEL)) == NULL) + stp = nfs4_alloc_stateid(); + if (stp == NULL) goto out; INIT_LIST_HEAD(&stp->st_hash); INIT_LIST_HEAD(&stp->st_perfile); - INIT_LIST_HEAD(&stp->st_perfilestate); - INIT_LIST_HEAD(&stp->st_perlockowner); /* not used */ + INIT_LIST_HEAD(&stp->st_perstateowner); + INIT_LIST_HEAD(&stp->st_lockowners); /* not used */ list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]); - list_add(&stp->st_perfile, &fp->fi_perfile); - list_add_perfile++; - list_add(&stp->st_perfilestate, 
&sop->so_perfilestate); + list_add(&stp->st_perfile, &fp->fi_stateids); + list_add(&stp->st_perstateowner, &sop->so_stateids); stp->st_stateowner = sop; + get_nfs4_file(fp); stp->st_file = fp; stp->st_stateid.si_boot = boot_time; stp->st_stateid.si_stateownerid = sop->so_id; @@ -2593,12 +2659,13 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */ stp->st_access_bmap = open_stp->st_access_bmap; stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; out: return stp; } -int +static int check_lock_length(u64 offset, u64 length) { return ((length == 0) || ((length != ~(u64)0) && @@ -2611,7 +2678,7 @@ check_lock_length(u64 offset, u64 length) int nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock *lock) { - struct nfs4_stateowner *lock_sop = NULL, *open_sop = NULL; + struct nfs4_stateowner *open_sop = NULL; struct nfs4_stateid *lock_stp; struct file *filp; struct file_lock file_lock; @@ -2623,22 +2690,17 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock (long long) lock->lk_offset, (long long) lock->lk_length); - if (nfs4_in_grace() && !lock->lk_reclaim) - return nfserr_grace; - if (!nfs4_in_grace() && lock->lk_reclaim) - return nfserr_no_grace; - if (check_lock_length(lock->lk_offset, lock->lk_length)) return nfserr_inval; nfs4_lock_state(); if (lock->lk_is_new) { - /* - * Client indicates that this is a new lockowner. - * Use open owner and open stateid to create lock owner and lock - * stateid. - */ + /* + * Client indicates that this is a new lockowner. + * Use open owner and open stateid to create lock owner and + * lock stateid. + */ struct nfs4_stateid *open_stp = NULL; struct nfs4_file *fp; @@ -2648,38 +2710,22 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock goto out; } - /* is the new lock seqid presented by the client zero? 
*/ - status = nfserr_bad_seqid; - if (lock->v.new.lock_seqid != 0) - goto out; - /* validate and update open stateid and open seqid */ status = nfs4_preprocess_seqid_op(current_fh, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, CHECK_FH | OPEN_STATE, - &open_sop, &open_stp, - &lock->v.new.clientid); - if (status) { - if (lock->lk_reclaim) - status = nfserr_reclaim_bad; + &open_sop, &open_stp, lock); + if (status) goto out; - } /* create lockowner and lock stateid */ fp = open_stp->st_file; strhashval = lock_ownerstr_hashval(fp->fi_inode, open_sop->so_client->cl_clientid.cl_id, &lock->v.new.owner); - /* - * If we already have this lock owner, the client is in - * error (or our bookeeping is wrong!) - * for asking for a 'new lock'. - */ - status = nfserr_bad_stateid; - lock_sop = find_lockstateowner(&lock->v.new.owner, - &lock->v.new.clientid); - if (lock_sop) - goto out; + /* XXX: Do we need to check for duplicate stateowners on + * the same file, or should they just be allowed (and + * create new stateids)? 
*/ status = nfserr_resource; if (!(lock->lk_stateowner = alloc_init_lock_stateowner(strhashval, open_sop->so_client, open_stp, lock))) goto out; @@ -2697,7 +2743,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, CHECK_FH | LOCK_STATE, - &lock->lk_stateowner, &lock_stp, NULL); + &lock->lk_stateowner, &lock_stp, lock); if (status) goto out; } @@ -2709,6 +2755,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock goto out; } + status = nfserr_grace; + if (nfs4_in_grace() && !lock->lk_reclaim) + goto out; + status = nfserr_no_grace; + if (!nfs4_in_grace() && lock->lk_reclaim) + goto out; + locks_init_lock(&file_lock); switch (lock->lk_type) { case NFS4_READ_LT: @@ -2775,10 +2828,10 @@ conflicting_lock: out_destroy_new_stateid: if (lock->lk_is_new) { dprintk("NFSD: nfsd4_lock: destroy new stateid!\n"); - /* - * An error encountered after instantiation of the new - * stateid has forced us to destroy it. - */ + /* + * An error encountered after instantiation of the new + * stateid has forced us to destroy it. 
+ */ if (!seqid_mutating_err(status)) open_sop->so_seqid--; @@ -2970,8 +3023,11 @@ int nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *rlockowner) { clientid_t *clid = &rlockowner->rl_clientid; - struct nfs4_stateowner *local = NULL; + struct nfs4_stateowner *sop; + struct nfs4_stateid *stp; struct xdr_netobj *owner = &rlockowner->rl_owner; + struct list_head matches; + int i; int status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", @@ -2987,22 +3043,37 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner * nfs4_lock_state(); - status = nfs_ok; - local = find_lockstateowner(owner, clid); - if (local) { - struct nfs4_stateid *stp; - - /* check for any locks held by any stateid - * associated with the (lock) stateowner */ - status = nfserr_locks_held; - list_for_each_entry(stp, &local->so_perfilestate, - st_perfilestate) { - if (check_for_locks(stp->st_vfs_file, local)) - goto out; + status = nfserr_locks_held; + /* XXX: we're doing a linear search through all the lockowners. + * Yipes! For now we'll just hope clients aren't really using + * release_lockowner much, but eventually we have to fix these + * data structures. */ + INIT_LIST_HEAD(&matches); + for (i = 0; i < LOCK_HASH_SIZE; i++) { + list_for_each_entry(sop, &lock_ownerid_hashtbl[i], so_idhash) { + if (!cmp_owner_str(sop, owner, clid)) + continue; + list_for_each_entry(stp, &sop->so_stateids, + st_perstateowner) { + if (check_for_locks(stp->st_vfs_file, sop)) + goto out; + /* Note: so_perclient unused for lockowners, + * so it's OK to fool with here. */ + list_add(&sop->so_perclient, &matches); + } } - /* no locks held by (lock) stateowner */ - status = nfs_ok; - release_stateowner(local); + } + /* Clients probably won't expect us to return with some (but not all) + * of the lockowner state released; so don't release any until all + * have been checked. 
*/ + status = nfs_ok; + while (!list_empty(&matches)) { + sop = list_entry(matches.next, struct nfs4_stateowner, + so_perclient); + /* unhash_stateowner deletes so_perclient only + * for openowners. */ + list_del(&sop->so_perclient); + release_stateowner(sop); } out: nfs4_unlock_state(); @@ -3010,39 +3081,38 @@ out: } static inline struct nfs4_client_reclaim * -alloc_reclaim(int namelen) +alloc_reclaim(void) { - struct nfs4_client_reclaim *crp = NULL; + return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); +} - crp = kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); - if (!crp) - return NULL; - crp->cr_name.data = kmalloc(namelen, GFP_KERNEL); - if (!crp->cr_name.data) { - kfree(crp); - return NULL; - } - return crp; +int +nfs4_has_reclaimed_state(const char *name) +{ + unsigned int strhashval = clientstr_hashval(name); + struct nfs4_client *clp; + + clp = find_confirmed_client_by_str(name, strhashval); + return clp ? 1 : 0; } /* * failure => all reset bets are off, nfserr_no_grace... 
*/ -static int -nfs4_client_to_reclaim(char *name, int namlen) +int +nfs4_client_to_reclaim(const char *name) { unsigned int strhashval; struct nfs4_client_reclaim *crp = NULL; - dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", namlen, name); - crp = alloc_reclaim(namlen); + dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); + crp = alloc_reclaim(); if (!crp) return 0; - strhashval = clientstr_hashval(name, namlen); + strhashval = clientstr_hashval(name); INIT_LIST_HEAD(&crp->cr_strhash); list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); - memcpy(crp->cr_name.data, name, namlen); - crp->cr_name.len = namlen; + memcpy(crp->cr_recdir, name, HEXDIR_LEN); reclaim_str_hashtbl_size++; return 1; } @@ -3053,13 +3123,11 @@ nfs4_release_reclaim(void) struct nfs4_client_reclaim *crp = NULL; int i; - BUG_ON(!nfs4_reclaim_init); for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&reclaim_str_hashtbl[i])) { crp = list_entry(reclaim_str_hashtbl[i].next, struct nfs4_client_reclaim, cr_strhash); list_del(&crp->cr_strhash); - kfree(crp->cr_name.data); kfree(crp); reclaim_str_hashtbl_size--; } @@ -3069,7 +3137,7 @@ nfs4_release_reclaim(void) /* * called from OPEN, CLAIM_PREVIOUS with a new clientid. 
*/ -struct nfs4_client_reclaim * +static struct nfs4_client_reclaim * nfs4_find_reclaim_client(clientid_t *clid) { unsigned int strhashval; @@ -3082,13 +3150,14 @@ nfs4_find_reclaim_client(clientid_t *clid) if (clp == NULL) return NULL; - dprintk("NFSD: nfs4_find_reclaim_client for %.*s\n", - clp->cl_name.len, clp->cl_name.data); + dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", + clp->cl_name.len, clp->cl_name.data, + clp->cl_recdir); /* find clp->cl_name in reclaim_str_hashtbl */ - strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len); + strhashval = clientstr_hashval(clp->cl_recdir); list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { - if (cmp_name(&crp->cr_name, &clp->cl_name)) { + if (same_name(crp->cr_recdir, clp->cl_recdir)) { return crp; } } @@ -3101,30 +3170,16 @@ nfs4_find_reclaim_client(clientid_t *clid) int nfs4_check_open_reclaim(clientid_t *clid) { - struct nfs4_client_reclaim *crp; - - if ((crp = nfs4_find_reclaim_client(clid)) == NULL) - return nfserr_reclaim_bad; - return nfs_ok; + return nfs4_find_reclaim_client(clid) ? 
nfs_ok : nfserr_reclaim_bad; } +/* initialization to perform at module load time: */ -/* - * Start and stop routines - */ - -static void -__nfs4_state_init(void) +void +nfs4_state_init(void) { int i; - time_t grace_time; - if (!nfs4_reclaim_init) { - for (i = 0; i < CLIENT_HASH_SIZE; i++) - INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); - reclaim_str_hashtbl_size = 0; - nfs4_reclaim_init = 1; - } for (i = 0; i < CLIENT_HASH_SIZE; i++) { INIT_LIST_HEAD(&conf_id_hashtbl[i]); INIT_LIST_HEAD(&conf_str_hashtbl[i]); @@ -3146,26 +3201,46 @@ __nfs4_state_init(void) INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]); INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]); } - memset(&zerostateid, 0, sizeof(stateid_t)); memset(&onestateid, ~0, sizeof(stateid_t)); - INIT_LIST_HEAD(&close_lru); INIT_LIST_HEAD(&client_lru); INIT_LIST_HEAD(&del_recall_lru); - spin_lock_init(&recall_lock); + for (i = 0; i < CLIENT_HASH_SIZE; i++) + INIT_LIST_HEAD(&reclaim_str_hashtbl[i]); + reclaim_str_hashtbl_size = 0; +} + +static void +nfsd4_load_reboot_recovery_data(void) +{ + int status; + + nfs4_lock_state(); + nfsd4_init_recdir(user_recovery_dirname); + status = nfsd4_recdir_load(); + nfs4_unlock_state(); + if (status) + printk("NFSD: Failure reading reboot recovery data\n"); +} + +/* initialization to perform when the nfsd service is started: */ + +static void +__nfs4_state_start(void) +{ + time_t grace_time; + boot_time = get_seconds(); - grace_time = max(old_lease_time, lease_time); - if (reclaim_str_hashtbl_size == 0) - grace_time = 0; - if (grace_time) - printk("NFSD: starting %ld-second grace period\n", grace_time); - grace_end = boot_time + grace_time; - INIT_WORK(&laundromat_work,laundromat_main, NULL); - schedule_delayed_work(&laundromat_work, NFSD_LEASE_TIME*HZ); + grace_time = max(user_lease_time, lease_time); + lease_time = user_lease_time; + in_grace = 1; + printk("NFSD: starting %ld-second grace period\n", grace_time); + laundry_wq = create_singlethread_workqueue("nfsd4"); + 
queue_delayed_work(laundry_wq, &laundromat_work, grace_time*HZ); } int -nfs4_state_init(void) +nfs4_state_start(void) { int status; @@ -3174,7 +3249,8 @@ nfs4_state_init(void) status = nfsd4_init_slabs(); if (status) return status; - __nfs4_state_init(); + nfsd4_load_reboot_recovery_data(); + __nfs4_state_start(); nfs4_init = 1; return 0; } @@ -3182,14 +3258,7 @@ nfs4_state_init(void) int nfs4_in_grace(void) { - return get_seconds() < grace_end; -} - -void -set_no_grace(void) -{ - printk("NFSD: ERROR in reboot recovery. State reclaims will fail.\n"); - grace_end = get_seconds(); + return in_grace; } time_t @@ -3236,21 +3305,11 @@ __nfs4_state_shutdown(void) unhash_delegation(dp); } - release_all_files(); cancel_delayed_work(&laundromat_work); - flush_scheduled_work(); + flush_workqueue(laundry_wq); + destroy_workqueue(laundry_wq); + nfsd4_shutdown_recdir(); nfs4_init = 0; - dprintk("NFSD: list_add_perfile %d list_del_perfile %d\n", - list_add_perfile, list_del_perfile); - dprintk("NFSD: add_perclient %d del_perclient %d\n", - add_perclient, del_perclient); - dprintk("NFSD: alloc_file %d free_file %d\n", - alloc_file, free_file); - dprintk("NFSD: vfsopen %d vfsclose %d\n", - vfsopen, vfsclose); - dprintk("NFSD: alloc_delegation %d free_delegation %d\n", - alloc_delegation, free_delegation); - } void @@ -3263,56 +3322,48 @@ nfs4_state_shutdown(void) nfs4_unlock_state(); } +static void +nfs4_set_recdir(char *recdir) +{ + nfs4_lock_state(); + strcpy(user_recovery_dirname, recdir); + nfs4_unlock_state(); +} + +/* + * Change the NFSv4 recovery directory to recdir. + */ +int +nfs4_reset_recoverydir(char *recdir) +{ + int status; + struct nameidata nd; + + status = path_lookup(recdir, LOOKUP_FOLLOW, &nd); + if (status) + return status; + status = -ENOTDIR; + if (S_ISDIR(nd.dentry->d_inode->i_mode)) { + nfs4_set_recdir(recdir); + status = 0; + } + path_release(&nd); + return status; +} + /* * Called when leasetime is changed. 
* - * if nfsd is not started, simply set the global lease. - * - * if nfsd(s) are running, lease change requires nfsv4 state to be reset. - * e.g: boot_time is reset, existing nfs4_client structs are - * used to fill reclaim_str_hashtbl, then all state (except for the - * reclaim_str_hashtbl) is re-initialized. - * - * if the old lease time is greater than the new lease time, the grace - * period needs to be set to the old lease time to allow clients to reclaim - * their state. XXX - we may want to set the grace period == lease time - * after an initial grace period == old lease time - * - * if an error occurs in this process, the new lease is set, but the server - * will not honor OPEN or LOCK reclaims, and will return nfserr_no_grace - * which means OPEN/LOCK/READ/WRITE will fail during grace period. - * - * clients will attempt to reset all state with SETCLIENTID/CONFIRM, and - * OPEN and LOCK reclaims. + * The only way the protocol gives us to handle on-the-fly lease changes is to + * simulate a reboot. Instead of doing that, we just wait till the next time + * we start to register any changes in lease time. If the administrator + * really wants to change the lease time *now*, they can go ahead and bring + * nfsd down and then back up again after changing the lease time. 
*/ void nfs4_reset_lease(time_t leasetime) { - struct nfs4_client *clp; - int i; - - printk("NFSD: New leasetime %ld\n",leasetime); - if (!nfs4_init) - return; - nfs4_lock_state(); - old_lease_time = lease_time; - lease_time = leasetime; - - nfs4_release_reclaim(); - - /* populate reclaim_str_hashtbl with current confirmed nfs4_clientid */ - for (i = 0; i < CLIENT_HASH_SIZE; i++) { - list_for_each_entry(clp, &conf_id_hashtbl[i], cl_idhash) { - if (!nfs4_client_to_reclaim(clp->cl_name.data, - clp->cl_name.len)) { - nfs4_release_reclaim(); - goto init_state; - } - } - } -init_state: - __nfs4_state_shutdown(); - __nfs4_state_init(); - nfs4_unlock_state(); + lock_kernel(); + user_lease_time = leasetime; + unlock_kernel(); } - diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 36a058a112d..4c414635023 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -136,7 +136,7 @@ xdr_error: \ } \ } while (0) -u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) +static u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) { /* We want more bytes than seem to be available. * Maybe we need a new page, maybe we have just run out @@ -190,7 +190,7 @@ defer_free(struct nfsd4_compoundargs *argp, return 0; } -char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes) +static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes) { void *new = NULL; if (p == argp->tmp) { @@ -1210,16 +1210,15 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) save = resp->p; /* - * Routine for encoding the result of a - * "seqid-mutating" NFSv4 operation. This is - * where seqids are incremented, and the - * replay cache is filled. + * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This + * is where sequence id's are incremented, and the replay cache is filled. + * Note that we increment sequence id's here, at the last moment, so we're sure + * we know whether the error to be returned is a sequence id mutating error. 
*/ #define ENCODE_SEQID_OP_TAIL(stateowner) do { \ if (seqid_mutating_err(nfserr) && stateowner) { \ - if (stateowner->so_confirmed) \ - stateowner->so_seqid++; \ + stateowner->so_seqid++; \ stateowner->so_replay.rp_status = nfserr; \ stateowner->so_replay.rp_buflen = \ (((char *)(resp)->p - (char *)save)); \ @@ -1366,7 +1365,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32( NFS4_FH_NOEXPIRE_WITH_OPEN | NFS4_FH_VOL_RENAME ); + if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) + WRITE32(NFS4_FH_PERSISTENT); + else + WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME); } if (bmval0 & FATTR4_WORD0_CHANGE) { /* @@ -1969,7 +1971,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open case NFS4_OPEN_DELEGATE_READ: RESERVE_SPACE(20 + sizeof(stateid_t)); WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); - WRITE32(0); + WRITE32(open->op_recall); /* * TODO: ACE's in delegations diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 161afdcb8f7..841c562991e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -51,6 +51,7 @@ enum { NFSD_Fh, NFSD_Threads, NFSD_Leasetime, + NFSD_RecoveryDir, }; /* @@ -66,6 +67,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size); static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_threads(struct file *file, char *buf, size_t size); static ssize_t write_leasetime(struct file *file, char *buf, size_t size); +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Svc] = write_svc, @@ -78,6 +80,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Fh] = write_filehandle, [NFSD_Threads] = write_threads, [NFSD_Leasetime] = write_leasetime, + [NFSD_RecoveryDir] = write_recoverydir, }; static ssize_t nfsctl_transaction_write(struct file 
*file, const char __user *buf, size_t size, loff_t *pos) @@ -349,6 +352,25 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) return strlen(buf); } +static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + char *recdir; + int len, status; + + if (size > PATH_MAX || buf[size-1] != '\n') + return -EINVAL; + buf[size-1] = 0; + + recdir = mesg; + len = qword_get(&mesg, recdir, size); + if (len <= 0) + return -EINVAL; + + status = nfs4_reset_recoverydir(recdir); + return strlen(buf); +} + /*----------------------------------------------------------------------------*/ /* * populating the filesystem. @@ -369,6 +391,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, #endif /* last one */ {""} }; @@ -397,9 +420,8 @@ static int __init init_nfsd(void) nfsd_cache_init(); /* RPC reply cache */ nfsd_export_init(); /* Exports table */ nfsd_lockd_init(); /* lockd->nfsd callbacks */ -#ifdef CONFIG_NFSD_V4 + nfs4_state_init(); /* NFSv4 locking state */ nfsd_idmap_init(); /* Name to ID mapping */ -#endif /* CONFIG_NFSD_V4 */ if (proc_mkdir("fs/nfs", NULL)) { struct proc_dir_entry *entry; entry = create_proc_entry("fs/nfs/exports", 0, NULL); @@ -426,9 +448,7 @@ static void __exit exit_nfsd(void) remove_proc_entry("fs/nfs", NULL); nfsd_stat_shutdown(); nfsd_lockd_shutdown(); -#ifdef CONFIG_NFSD_V4 nfsd_idmap_shutdown(); -#endif /* CONFIG_NFSD_V4 */ unregister_filesystem(&nfsd_fs_type); } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 757f9d20803..0aa1b9603d7 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -591,6 +591,7 @@ nfserrno (int errno) { nfserr_dropit, -ENOMEM }, { nfserr_badname, -ESRCH }, { nfserr_io, -ETXTBSY }, + { 
nfserr_notsupp, -EOPNOTSUPP }, { -1, -EIO } }; int i; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 02ded7cfbdc..1697539a717 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -31,6 +31,7 @@ #include <linux/nfsd/stats.h> #include <linux/nfsd/cache.h> #include <linux/lockd/bind.h> +#include <linux/nfsacl.h> #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -94,7 +95,7 @@ nfsd_svc(unsigned short port, int nrservs) error = nfsd_racache_init(2*nrservs); if (error<0) goto out; - error = nfs4_state_init(); + error = nfs4_state_start(); if (error<0) goto out; if (!nfsd_serv) { @@ -286,6 +287,7 @@ out: svc_exit_thread(rqstp); /* Release module */ + unlock_kernel(); module_put_and_exit(0); } @@ -362,6 +364,32 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) return 1; } +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static struct svc_version * nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0])) +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_version, + .pg_name = "nfsd", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; + +#define nfsd_acl_program_p &nfsd_acl_program +#else +#define nfsd_acl_program_p NULL +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + extern struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; static struct svc_version * nfsd_version[] = { @@ -376,6 +404,7 @@ static struct svc_version * nfsd_version[] = { #define NFSD_NRVERS (sizeof(nfsd_version)/sizeof(nfsd_version[0])) struct svc_program nfsd_program = { + .pg_next = nfsd_acl_program_p, .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ .pg_vers = nfsd_version, 
/* version table */ diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 948b08287c9..b45999ff33e 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -49,6 +49,12 @@ decode_fh(u32 *p, struct svc_fh *fhp) return p + (NFS_FHSIZE >> 2); } +/* Helper function for NFSv2 ACL code */ +u32 *nfs2svc_decode_fh(u32 *p, struct svc_fh *fhp) +{ + return decode_fh(p, fhp); +} + static inline u32 * encode_fh(u32 *p, struct svc_fh *fhp) { @@ -190,6 +196,11 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) return p; } +/* Helper function for NFSv2 ACL code */ +u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) +{ + return encode_fattr(rqstp, p, fhp); +} /* * XDR decode functions diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index e3e9d217236..4f2cd3d2756 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -45,11 +45,10 @@ #endif /* CONFIG_NFSD_V3 */ #include <linux/nfsd/nfsfh.h> #include <linux/quotaops.h> -#include <linux/dnotify.h> -#ifdef CONFIG_NFSD_V4 +#include <linux/fsnotify.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> -#include <linux/xattr_acl.h> +#ifdef CONFIG_NFSD_V4 #include <linux/xattr.h> #include <linux/nfs4.h> #include <linux/nfs4_acl.h> @@ -424,13 +423,13 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; if (pacl) { - error = set_nfsv4_acl_one(dentry, pacl, XATTR_NAME_ACL_ACCESS); + error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); if (error < 0) goto out_nfserr; } if (dpacl) { - error = set_nfsv4_acl_one(dentry, dpacl, XATTR_NAME_ACL_DEFAULT); + error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); if (error < 0) goto out_nfserr; } @@ -497,7 +496,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac struct posix_acl *pacl = NULL, *dpacl = NULL; unsigned int flags = 0; - pacl = _get_posix_acl(dentry, XATTR_NAME_ACL_ACCESS); + pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); if (IS_ERR(pacl) && 
PTR_ERR(pacl) == -ENODATA) pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(pacl)) { @@ -507,7 +506,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac } if (S_ISDIR(inode->i_mode)) { - dpacl = _get_posix_acl(dentry, XATTR_NAME_ACL_DEFAULT); + dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) dpacl = NULL; else if (IS_ERR(dpacl)) { @@ -734,7 +733,7 @@ nfsd_sync(struct file *filp) up(&inode->i_sem); } -static void +void nfsd_sync_dir(struct dentry *dp) { nfsd_dosync(NULL, dp, dp->d_inode->i_fop); @@ -861,7 +860,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_read += err; *count = err; err = 0; - dnotify_parent(file->f_dentry, DN_ACCESS); + fsnotify_access(file->f_dentry); } else err = nfserrno(err); out: @@ -917,7 +916,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, set_fs(oldfs); if (err >= 0) { nfsdstats.io_write += cnt; - dnotify_parent(file->f_dentry, DN_MODIFY); + fsnotify_modify(file->f_dentry); } /* clear setuid/setgid flag after write */ @@ -1857,3 +1856,107 @@ nfsd_racache_init(int cache_size) nfsdstats.ra_size = cache_size; return 0; } + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +struct posix_acl * +nfsd_get_posix_acl(struct svc_fh *fhp, int type) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + char *name; + void *value = NULL; + ssize_t size; + struct posix_acl *acl; + + if (!IS_POSIXACL(inode) || !inode->i_op || !inode->i_op->getxattr) + return ERR_PTR(-EOPNOTSUPP); + switch(type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + + size = inode->i_op->getxattr(fhp->fh_dentry, name, NULL, 0); + + if (size < 0) { + acl = ERR_PTR(size); + goto getout; + } else if (size > 0) { + value = kmalloc(size, GFP_KERNEL); 
+ if (!value) { + acl = ERR_PTR(-ENOMEM); + goto getout; + } + size = inode->i_op->getxattr(fhp->fh_dentry, name, value, size); + if (size < 0) { + acl = ERR_PTR(size); + goto getout; + } + } + acl = posix_acl_from_xattr(value, size); + +getout: + kfree(value); + return acl; +} + +int +nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) +{ + struct inode *inode = fhp->fh_dentry->d_inode; + char *name; + void *value = NULL; + size_t size; + int error; + + if (!IS_POSIXACL(inode) || !inode->i_op || + !inode->i_op->setxattr || !inode->i_op->removexattr) + return -EOPNOTSUPP; + switch(type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return -EOPNOTSUPP; + } + + if (acl && acl->a_count) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + size = posix_acl_to_xattr(acl, value, size); + if (size < 0) { + error = size; + goto getout; + } + } else + size = 0; + + if (!fhp->fh_locked) + fh_lock(fhp); /* unlocking is done automatically */ + if (size) + error = inode->i_op->setxattr(fhp->fh_dentry, name, + value, size, 0); + else { + if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) + error = 0; + else { + error = inode->i_op->removexattr(fhp->fh_dentry, name); + if (error == -ENODATA) + error = 0; + } + } + +getout: + kfree(value); + return error; +} +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 1d2ad15f153..9eecc9939df 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -1,21 +1,18 @@ ToDo/Notes: - Find and fix bugs. - - Checkpoint or disable the user space journal ($UsnJrnl). - In between ntfs_prepare/commit_write, need exclusion between - simultaneous file extensions. 
Need perhaps an NInoResizeUnderway() - flag which we can set in ntfs_prepare_write() and clear again in - ntfs_commit_write(). Just have to be careful in readpage/writepage, - as well as in truncate, that we play nice... We might need to have - a data_size field in the ntfs_inode to store the real attribute - length. Also need to be careful with initialized_size extention in + simultaneous file extensions. This is given to us by holding i_sem + on the inode. The only places in the kernel when a file is resized + are prepare/commit write and truncate for both of which i_sem is + held. Just have to be careful in readpage/writepage and all other + helpers not running under i_sem that we play nice... + Also need to be careful with initialized_size extention in ntfs_prepare_write. Basically, just be _very_ careful in this code... - OTOH, perhaps i_sem, which is held accross generic_file_write is - sufficient for synchronisation here. We then just need to make sure - ntfs_readpage/writepage/truncate interoperate properly with us. - UPDATE: The above is all ok as it is due to i_sem held. The only - thing that needs to be checked is ntfs_writepage() which does not - hold i_sem. It cannot change i_size but it needs to cope with a - concurrent i_size change. + UPDATE: The only things that need to be checked are read/writepage + which do not hold i_sem. Note writepage cannot change i_size but it + needs to cope with a concurrent i_size change, just like readpage. + Also both need to cope with concurrent changes to the other sizes, + i.e. initialized/allocated/compressed size, as well. - Implement mft.c::sync_mft_mirror_umount(). We currently will just leave the volume dirty on umount if the final iput(vol->mft_ino) causes a write of any mirrored mft records due to the mft mirror @@ -25,12 +22,161 @@ ToDo/Notes: - Enable the code for setting the NT4 compatibility flag when we start making NTFS 1.2 specific modifications. 
-2.1.23-WIP +2.1.23 - Implement extension of resident files and make writing safe as well as + many bug fixes, cleanups, and enhancements... - Add printk rate limiting for ntfs_warning() and ntfs_error() when compiled without debug. This avoids a possible denial of service attack. Thanks to Carl-Daniel Hailfinger from SuSE for pointing this out. + - Fix compilation warnings on ia64. (Randy Dunlap) + - Use i_size_{read,write}() instead of reading i_size by hand and cache + the value where apropriate. + - Add size_lock to the ntfs_inode structure. This is an rw spinlock + and it locks against access to the inode sizes. Note, ->size_lock + is also accessed from irq context so you must use the _irqsave and + _irqrestore lock and unlock functions, respectively. Protect all + accesses to allocated_size, initialized_size, and compressed_size. + - Minor optimization to fs/ntfs/super.c::ntfs_statfs() and its helpers. + - Implement extension of resident files in the regular file write code + paths (fs/ntfs/aops.c::ntfs_{prepare,commit}_write()). At present + this only works until the data attribute becomes too big for the mft + record after which we abort the write returning -EOPNOTSUPP from + ntfs_prepare_write(). + - Add disable_sparse mount option together with a per volume sparse + enable bit which is set appropriately and a per inode sparse disable + bit which is preset on some system file inodes as appropriate. + - Enforce that sparse support is disabled on NTFS volumes pre 3.0. + - Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in + the creation of the unmapped runlist element for the base attribute + extent. + - Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking + helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist(). + This allows us to map runlist fragments with the runlist lock already + held without having to drop and reacquire it around the call. Adapt + all callers. 
+ - Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked + runlist. This allows us to find runlist elements with the runlist + lock already held without having to drop and reacquire it around the + call. Adapt all callers. + - Change time to u64 in time.h::ntfs2utc() as it otherwise generates a + warning in the do_div() call on sparc32. Thanks to Meelis Roos for + the report and analysis of the warning. + - Fix a nasty runlist merge bug when merging two holes. + - Set the ntfs_inode->allocated_size to the real allocated size in the + mft record for resident attributes (fs/ntfs/inode.c). + - Small readability cleanup to use "a" instead of "ctx->attr" + everywhere (fs/ntfs/inode.c). + - Make fs/ntfs/namei.c::ntfs_get_{parent,dentry} static and move the + definition of ntfs_export_ops from fs/ntfs/super.c to namei.c. Also, + declare ntfs_export_ops in fs/ntfs/ntfs.h. + - Correct sparse file handling. The compressed values need to be + checked and set in the ntfs inode as done for compressed files and + the compressed size needs to be used for vfs inode->i_blocks instead + of the allocated size, again, as done for compressed files. + - Add AT_EA in addition to AT_DATA to whitelist for being allowed to be + non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident(). + - Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new + write code. + - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after + dropping the read lock and taking the write lock we were not checking + whether someone else did not already do the work we wanted to do. + - Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to + ntfs_attr_find_vcn_nolock() and update all callers. + - Add fs/ntfs/attrib.[hc]::ntfs_attr_make_non_resident(). + - Fix sign of various error return values to be negative in + fs/ntfs/lcnalloc.c. 
+ - Modify ->readpage and ->writepage (fs/ntfs/aops.c) so they detect and + handle the case where an attribute is converted from resident to + non-resident by a concurrent file write. + - Remove checks for NULL before calling kfree() since kfree() does the + checking itself. (Jesper Juhl) + - Some utilities modify the boot sector but do not update the checksum. + Thus, relax the checking in fs/ntfs/super.c::is_boot_sector_ntfs() to + only emit a warning when the checksum is incorrect rather than + refusing the mount. Thanks to Bernd Casimir for pointing this + problem out. + - Update attribute definition handling. + - Add NTFS_MAX_CLUSTER_SIZE and NTFS_MAX_PAGES_PER_CLUSTER constants. + - Use NTFS_MAX_CLUSTER_SIZE in super.c instead of hard coding 0x10000. + - Use MAX_BUF_PER_PAGE instead of variable sized array allocation for + better code generation and one less sparse warning in fs/ntfs/aops.c. + - Remove spurious void pointer casts from fs/ntfs/. (Pekka Enberg) + - Use C99 style structure initialization after memory allocation where + possible (fs/ntfs/{attrib.c,index.c,super.c}). Thanks to Al Viro and + Pekka Enberg. + - Stamp the transaction log ($UsnJrnl), aka user space journal, if it + is active on the volume and we are mounting read-write or remounting + from read-only to read-write. + - Fix a bug in address space operations error recovery code paths where + if the runlist was not mapped at all and a mapping error occured we + would leave the runlist locked on exit to the function so that the + next access to the same file would try to take the lock and deadlock. + - Detect the case when Windows has been suspended to disk on the volume + to be mounted and if this is the case do not allow (re)mounting + read-write. This is done by parsing hiberfil.sys if present. + - Fix several occurences of a bug where we would perform 'var & ~const' + with a 64-bit variable and a int, i.e. 32-bit, constant. 
This causes + the higher order 32-bits of the 64-bit variable to be zeroed. To fix + this cast the 'const' to the same 64-bit type as 'var'. + - Change the runlist terminator of the newly allocated cluster(s) to + LCN_ENOENT in ntfs_attr_make_non_resident(). Otherwise the runlist + code gets confused. + - Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs() + and ntfs_mapping_pairs_build() to allow the runlist encoding to be + partial which is desirable when filling holes in sparse attributes. + Update all callers. + - Change ntfs_map_runlist_nolock() to only decompress the mapping pairs + if the requested vcn is inside it. Otherwise we get into problems + when we try to map an out of bounds vcn because we then try to map + the already mapped runlist fragment which causes + ntfs_mapping_pairs_decompress() to fail and return error. Update + ntfs_attr_find_vcn_nolock() accordingly. + - Fix a nasty deadlock that appeared in recent kernels. + The situation: VFS inode X on a mounted ntfs volume is dirty. For + same inode X, the ntfs_inode is dirty and thus corresponding on-disk + inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging + to the table of inodes, i.e. $MFT, inode 0. + What happens: + Process 1: sys_sync()/umount()/whatever... calls + __sync_single_inode() for $MFT -> do_writepages() -> write_page for + the dirty page containing the on-disk inode X, the page is now locked + -> ntfs_write_mst_block() which clears PageUptodate() on the page to + prevent anyone else getting hold of it whilst it does the write out. + This is necessary as the on-disk inode needs "fixups" applied before + the write to disk which are removed again after the write and + PageUptodate is then set again. It then analyses the page looking + for dirty on-disk inodes and when it finds one it calls + ntfs_may_write_mft_record() to see if it is safe to write this + on-disk inode. 
This then calls ilookup5() to check if the + corresponding VFS inode is in icache(). This in turn calls ifind() + which waits on the inode lock via wait_on_inode whilst holding the + global inode_lock. + Process 2: pdflush results in a call to __sync_single_inode for the + same VFS inode X on the ntfs volume. This locks the inode (I_LOCK) + then calls write-inode -> ntfs_write_inode -> map_mft_record() -> + read_cache_page() for the page (in page cache of table of inodes + $MFT, inode 0) containing the on-disk inode. This page has + PageUptodate() clear because of Process 1 (see above) so + read_cache_page() blocks when it tries to take the page lock for the + page so it can call ntfs_read_page(). + Thus Process 1 is holding the page lock on the page containing the + on-disk inode X and it is waiting on the inode X to be unlocked in + ifind() so it can write the page out and then unlock the page. + And Process 2 is holding the inode lock on inode X and is waiting for + the page to be unlocked so it can call ntfs_readpage() or discover + that Process 1 set PageUptodate() again and use the page. + Thus we have a deadlock due to ifind() waiting on the inode lock. + The solution: The fix is to use the newly introduced + ilookup5_nowait() which does not wait on the inode's lock and hence + avoids the deadlock. This is safe as we do not care about the VFS + inode and only use the fact that it is in the VFS inode cache and the + fact that the vfs and ntfs inodes are one struct in memory to find + the ntfs inode in memory if present. Also, the ntfs inode has its + own locking so it does not matter if the vfs inode is locked. + - Fix bug in mft record writing where we forgot to set the device in + the buffers when mapping them after the VM had discarded them. + Thanks to Martin MOKREJŠ for the bug report. 2.1.22 - Many bug and race fixes and error handling improvements.
@@ -1037,7 +1183,7 @@ tng-0.0.8 - 08/03/2002 - Now using BitKeeper, http://linux-ntfs.bkbits.net/ - Further runlist merging work. (Richard Russon) - Backwards compatibility for gcc-2.95. (Richard Russon) - Update to kernel 2.5.5-pre1 and rediff the now tiny patch. - - Convert to new file system declaration using ->ntfs_get_sb() and + - Convert to new filesystem declaration using ->ntfs_get_sb() and replacing ntfs_read_super() with ntfs_fill_super(). - Set s_maxbytes to MAX_LFS_FILESIZE to avoid page cache page index overflow on 32-bit architectures. @@ -1333,7 +1479,7 @@ tng-0.0.1 - The first useful version. The driver is now actually useful! Yey. (-: It undoubtedly has got bugs though and it doesn't implement accesssing compressed files yet. Also, accessing files with attribute list attributes is not implemented yet - either. But for small or simple file systems it should work and allow + either. But for small or simple filesystems it should work and allow you to list directories, use stat on directory entries and the file system, open, read, mmap and llseek around in files. A big mile stone has been reached! @@ -1341,7 +1487,7 @@ tng-0.0.1 - The first useful version. tng-0.0.0 - Initial version tag. Initial driver implementation. The driver can mount and umount simple - NTFS file systems (i.e. ones without attribute lists in the system + NTFS filesystems (i.e. ones without attribute lists in the system files). If the mount fails there might be problems in the error handling code paths, so be warned. 
Otherwise it seems to be loading the system files nicely and the mft record read mapping/unmapping seems to be diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index 7b66381a0b0..f083f27d8b6 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.22\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.23\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG @@ -15,5 +15,5 @@ endif ifeq ($(CONFIG_NTFS_RW),y) EXTRA_CFLAGS += -DNTFS_RW -ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o +ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o endif diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 45d56e41ed9..78adad7a988 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -2,7 +2,7 @@ * aops.c - NTFS kernel address space operations and page cache handling. * Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -66,19 +66,22 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) ni = NTFS_I(page->mapping->host); if (likely(uptodate)) { - s64 file_ofs; + s64 file_ofs, initialized_size; set_buffer_uptodate(bh); file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); /* Check for the current buffer head overflowing. 
*/ - if (file_ofs + bh->b_size > ni->initialized_size) { + if (file_ofs + bh->b_size > initialized_size) { char *addr; int ofs = 0; - if (file_ofs < ni->initialized_size) - ofs = ni->initialized_size - file_ofs; + if (file_ofs < initialized_size) + ofs = initialized_size - file_ofs; addr = kmap_atomic(page, KM_BIO_SRC_IRQ); memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs); flush_dcache_page(page); @@ -132,7 +135,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) i * rec_size), rec_size); flush_dcache_page(page); kunmap_atomic(addr, KM_BIO_SRC_IRQ); - if (likely(!PageError(page) && page_uptodate)) + if (likely(page_uptodate && !PageError(page))) SetPageUptodate(page); } unlock_page(page); @@ -168,6 +171,7 @@ static int ntfs_read_block(struct page *page) runlist_element *rl; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; sector_t iblock, lblock, zblock; + unsigned long flags; unsigned int blocksize, vcn_ofs; int i, nr; unsigned char blocksize_bits; @@ -190,8 +194,10 @@ static int ntfs_read_block(struct page *page) } iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; zblock = (ni->initialized_size + blocksize - 1) >> blocksize_bits; + read_unlock_irqrestore(&ni->size_lock, flags); /* Loop through all the buffers in the page. */ rl = NULL; @@ -258,7 +264,8 @@ lock_retry_remap: goto lock_retry_remap; rl = NULL; lcn = err; - } + } else if (!rl) + up_read(&ni->runlist.lock); /* Hard error, zero out region. 
*/ bh->b_blocknr = -1; SetPageError(page); @@ -341,14 +348,15 @@ handle_zblock: */ static int ntfs_readpage(struct file *file, struct page *page) { - loff_t i_size; ntfs_inode *ni, *base_ni; u8 *kaddr; ntfs_attr_search_ctx *ctx; MFT_RECORD *mrec; + unsigned long flags; u32 attr_len; int err = 0; +retry_readpage: BUG_ON(!PageLocked(page)); /* * This can potentially happen because we clear PageUptodate() during @@ -383,9 +391,9 @@ static int ntfs_readpage(struct file *file, struct page *page) * Attribute is resident, implying it is not compressed or encrypted. * This also means the attribute is smaller than an mft record and * hence smaller than a page, so can simply zero out any pages with - * index above 0. We can also do this if the file size is 0. + * index above 0. */ - if (unlikely(page->index > 0 || !i_size_read(VFS_I(ni)))) { + if (unlikely(page->index > 0)) { kaddr = kmap_atomic(page, KM_USER0); memset(kaddr, 0, PAGE_CACHE_SIZE); flush_dcache_page(page); @@ -402,6 +410,14 @@ static int ntfs_readpage(struct file *file, struct page *page) err = PTR_ERR(mrec); goto err_out; } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the readpage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_readpage; + } ctx = ntfs_attr_get_search_ctx(base_ni, mrec); if (unlikely(!ctx)) { err = -ENOMEM; @@ -412,9 +428,10 @@ static int ntfs_readpage(struct file *file, struct page *page) if (unlikely(err)) goto put_unm_err_out; attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - i_size = i_size_read(VFS_I(ni)); - if (unlikely(attr_len > i_size)) - attr_len = i_size; + read_lock_irqsave(&ni->size_lock, flags); + if (unlikely(attr_len > ni->initialized_size)) + attr_len = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); kaddr = kmap_atomic(page, KM_USER0); /* Copy the data to the page. 
*/ memcpy(kaddr, (u8*)ctx->attr + @@ -463,12 +480,15 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) { VCN vcn; LCN lcn; + s64 initialized_size; + loff_t i_size; sector_t block, dblock, iblock; struct inode *vi; ntfs_inode *ni; ntfs_volume *vol; runlist_element *rl; struct buffer_head *bh, *head; + unsigned long flags; unsigned int blocksize, vcn_ofs; int err; BOOL need_end_writeback; @@ -510,11 +530,16 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) /* The first block in the page. */ block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(vi); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* The first out of bounds block for the data size. */ - dblock = (vi->i_size + blocksize - 1) >> blocksize_bits; + dblock = (i_size + blocksize - 1) >> blocksize_bits; /* The last (fully or partially) initialized block. */ - iblock = ni->initialized_size >> blocksize_bits; + iblock = initialized_size >> blocksize_bits; /* * Be very careful. We have no exclusion from __set_page_dirty_buffers @@ -559,7 +584,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) /* Make sure we have enough initialized size. */ if (unlikely((block >= iblock) && - (ni->initialized_size < vi->i_size))) { + (initialized_size < i_size))) { /* * If this page is fully outside initialized size, zero * out all pages between the current initialized size @@ -666,7 +691,8 @@ lock_retry_remap: goto lock_retry_remap; rl = NULL; lcn = err; - } + } else if (!rl) + up_read(&ni->runlist.lock); /* Failed to map the buffer, even after retrying. 
*/ bh->b_blocknr = -1; ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " @@ -801,17 +827,15 @@ static int ntfs_write_mst_block(struct page *page, ntfs_inode *ni = NTFS_I(vi); ntfs_volume *vol = ni->vol; u8 *kaddr; - unsigned char bh_size_bits = vi->i_blkbits; - unsigned int bh_size = 1 << bh_size_bits; unsigned int rec_size = ni->itype.index.block_size; ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size]; struct buffer_head *bh, *head, *tbh, *rec_start_bh; - int max_bhs = PAGE_CACHE_SIZE / bh_size; - struct buffer_head *bhs[max_bhs]; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; runlist_element *rl; - int i, nr_locked_nis, nr_recs, nr_bhs, bhs_per_rec, err, err2; - unsigned rec_size_bits; + int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; + unsigned bh_size, rec_size_bits; BOOL sync, is_mft, page_is_dirty, rec_is_dirty; + unsigned char bh_size_bits; ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " "0x%lx.", vi->i_ino, ni->type, page->index); @@ -826,7 +850,11 @@ static int ntfs_write_mst_block(struct page *page, */ BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); + bh_size_bits = vi->i_blkbits; + bh_size = 1 << bh_size_bits; + max_bhs = PAGE_CACHE_SIZE / bh_size; BUG_ON(!max_bhs); + BUG_ON(max_bhs > MAX_BUF_PER_PAGE); /* Were we called for sync purposes? */ sync = (wbc->sync_mode == WB_SYNC_ALL); @@ -846,7 +874,7 @@ static int ntfs_write_mst_block(struct page *page, (PAGE_CACHE_SHIFT - bh_size_bits); /* The first out of bounds block for the data size. 
*/ - dblock = (vi->i_size + bh_size - 1) >> bh_size_bits; + dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; rl = NULL; err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; @@ -858,6 +886,7 @@ static int ntfs_write_mst_block(struct page *page, if (likely(block < rec_block)) { if (unlikely(block >= dblock)) { clear_buffer_dirty(bh); + set_buffer_uptodate(bh); continue; } /* @@ -895,6 +924,7 @@ static int ntfs_write_mst_block(struct page *page, LCN lcn; unsigned int vcn_ofs; + bh->b_bdev = vol->sb->s_bdev; /* Obtain the vcn and offset of the current block. */ vcn = (VCN)block << bh_size_bits; vcn_ofs = vcn & vol->cluster_size_mask; @@ -938,8 +968,11 @@ lock_retry_remap: if (err2 == -ENOMEM) page_is_dirty = TRUE; lcn = err2; - } else + } else { err2 = -EIO; + if (!rl) + up_read(&ni->runlist.lock); + } /* Hard error. Abort writing this record. */ if (!err || err == -ENOMEM) err = err2; @@ -949,7 +982,8 @@ lock_retry_remap: "attribute type 0x%x) because " "its location on disk could " "not be determined (error " - "code %lli).", (s64)block << + "code %lli).", + (long long)block << bh_size_bits >> vol->mft_record_size_bits, ni->mft_no, ni->type, @@ -1223,19 +1257,17 @@ done: static int ntfs_writepage(struct page *page, struct writeback_control *wbc) { loff_t i_size; - struct inode *vi; - ntfs_inode *ni, *base_ni; + struct inode *vi = page->mapping->host; + ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); char *kaddr; - ntfs_attr_search_ctx *ctx; - MFT_RECORD *m; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; u32 attr_len; int err; +retry_writepage: BUG_ON(!PageLocked(page)); - - vi = page->mapping->host; i_size = i_size_read(vi); - /* Is the page fully outside i_size? 
(truncate in progress) */ if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)) { @@ -1248,8 +1280,6 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) ntfs_debug("Write outside i_size - truncated?"); return 0; } - ni = NTFS_I(vi); - /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* @@ -1326,6 +1356,14 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) ctx = NULL; goto err_out; } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the writepage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_writepage; + } ctx = ntfs_attr_get_search_ctx(base_ni, m); if (unlikely(!ctx)) { err = -ENOMEM; @@ -1367,15 +1405,12 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) */ attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); - i_size = i_size_read(VFS_I(ni)); - kaddr = kmap_atomic(page, KM_USER0); + i_size = i_size_read(vi); if (unlikely(attr_len > i_size)) { - /* Zero out of bounds area in the mft record. */ - memset((u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset) + - i_size, 0, attr_len - i_size); attr_len = i_size; + ctx->attr->data.resident.value_length = cpu_to_le32(attr_len); } + kaddr = kmap_atomic(page, KM_USER0); /* Copy the data from the page to the mft record. */ memcpy((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset), @@ -1405,8 +1440,10 @@ err_out: err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " - "error %i. 
Setting page error flag.", err); + "error %i.", err); SetPageError(page); + NVolSetErrors(ni->vol); + make_bad_inode(vi); } unlock_page(page); if (ctx) @@ -1425,12 +1462,15 @@ static int ntfs_prepare_nonresident_write(struct page *page, { VCN vcn; LCN lcn; + s64 initialized_size; + loff_t i_size; sector_t block, ablock, iblock; struct inode *vi; ntfs_inode *ni; ntfs_volume *vol; runlist_element *rl; struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + unsigned long flags; unsigned int vcn_ofs, block_start, block_end, blocksize; int err; BOOL is_retry; @@ -1462,16 +1502,20 @@ static int ntfs_prepare_nonresident_write(struct page *page, /* The first block in the page. */ block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); /* - * The first out of bounds block for the allocated size. No need to + * The first out of bounds block for the allocated size. No need to * round up as allocated_size is in multiples of cluster size and the * minimum cluster size is 512 bytes, which is equal to the smallest * blocksize. */ ablock = ni->allocated_size >> blocksize_bits; + i_size = i_size_read(vi); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); /* The last (fully or partially) initialized block. */ - iblock = ni->initialized_size >> blocksize_bits; + iblock = initialized_size >> blocksize_bits; /* Loop through all the buffers in the page. */ block_start = 0; @@ -1518,7 +1562,7 @@ static int ntfs_prepare_nonresident_write(struct page *page, * request, i.e. block < ablock is true. */ if (unlikely((block >= iblock) && - (ni->initialized_size < vi->i_size))) { + (initialized_size < i_size))) { /* * If this page is fully outside initialized size, zero * out all pages between the current initialized size @@ -1622,6 +1666,8 @@ lock_retry_remap: "not supported yet. 
" "Sorry."); err = -EOPNOTSUPP; + if (!rl) + up_read(&ni->runlist.lock); goto err_out; } else if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { @@ -1636,7 +1682,8 @@ lock_retry_remap: goto lock_retry_remap; rl = NULL; lcn = err; - } + } else if (!rl) + up_read(&ni->runlist.lock); /* * Failed to map the buffer, even after * retrying. @@ -1797,6 +1844,7 @@ static int ntfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { s64 new_size; + loff_t i_size; struct inode *vi = page->mapping->host; ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); ntfs_volume *vol = ni->vol; @@ -1868,14 +1916,8 @@ static int ntfs_prepare_write(struct file *file, struct page *page, BUG_ON(page_has_buffers(page)); new_size = ((s64)page->index << PAGE_CACHE_SHIFT) + to; /* If we do not need to resize the attribute allocation we are done. */ - if (new_size <= vi->i_size) + if (new_size <= i_size_read(vi)) goto done; - - // FIXME: We abort for now as this code is not safe. - ntfs_error(vi->i_sb, "Changing the file size is not supported yet. " - "Sorry."); - return -EOPNOTSUPP; - /* Map, pin, and lock the (base) mft record. */ if (!NInoAttr(ni)) base_ni = ni; @@ -1904,7 +1946,15 @@ static int ntfs_prepare_write(struct file *file, struct page *page, a = ctx->attr; /* The total length of the attribute value. */ attr_len = le32_to_cpu(a->data.resident.value_length); - BUG_ON(vi->i_size != attr_len); + /* Fix an eventual previous failure of ntfs_commit_write(). */ + i_size = i_size_read(vi); + if (unlikely(attr_len > i_size)) { + attr_len = i_size; + a->data.resident.value_length = cpu_to_le32(attr_len); + } + /* If we do not need to resize the attribute allocation we are done. */ + if (new_size <= attr_len) + goto done_unm; /* Check if new size is allowed in $AttrDef. 
*/ err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); if (unlikely(err)) { @@ -1962,6 +2012,7 @@ static int ntfs_prepare_write(struct file *file, struct page *page, } flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); +done_unm: ntfs_attr_put_search_ctx(ctx); unmap_mft_record(base_ni); /* @@ -2047,7 +2098,7 @@ static int ntfs_commit_nonresident_write(struct page *page, * now we know ntfs_prepare_write() would have failed in the write * exceeds i_size case, so this will never trigger which is fine. */ - if (pos > vi->i_size) { + if (pos > i_size_read(vi)) { ntfs_error(vi->i_sb, "Writing beyond the existing file size is " "not supported yet. Sorry."); return -EOPNOTSUPP; @@ -2183,9 +2234,13 @@ static int ntfs_commit_write(struct file *file, struct page *page, } kunmap_atomic(kaddr, KM_USER0); /* Update i_size if necessary. */ - if (vi->i_size < attr_len) { + if (i_size_read(vi) < attr_len) { + unsigned long flags; + + write_lock_irqsave(&ni->size_lock, flags); ni->allocated_size = ni->initialized_size = attr_len; i_size_write(vi, attr_len); + write_unlock_irqrestore(&ni->size_lock, flags); } /* Mark the mft record dirty, so it gets written back. */ flush_dcache_mft_record_page(ctx->ntfs_ino); diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 1ff7f90a18b..cd0f9e740b1 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1,7 +1,7 @@ /** * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. 
* - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -21,88 +21,217 @@ */ #include <linux/buffer_head.h> +#include <linux/swap.h> #include "attrib.h" #include "debug.h" #include "layout.h" +#include "lcnalloc.h" +#include "malloc.h" #include "mft.h" #include "ntfs.h" #include "types.h" /** - * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode + * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode * @ni: ntfs inode for which to map (part of) a runlist * @vcn: map runlist part containing this vcn * * Map the part of a runlist containing the @vcn of the ntfs inode @ni. * - * Return 0 on success and -errno on error. + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. * - * Locking: - The runlist must be unlocked on entry and is unlocked on return. - * - This function takes the lock for writing and modifies the runlist. + * Locking: - The runlist must be locked for writing. + * - This function modifies the runlist. 
*/ -int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn) { + VCN end_vcn; ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; ntfs_attr_search_ctx *ctx; - MFT_RECORD *mrec; + runlist_element *rl; int err = 0; ntfs_debug("Mapping runlist part containing vcn 0x%llx.", (unsigned long long)vcn); - if (!NInoAttr(ni)) base_ni = ni; else base_ni = ni->ext.base_ntfs_ino; - - mrec = map_mft_record(base_ni); - if (IS_ERR(mrec)) - return PTR_ERR(mrec); - ctx = ntfs_attr_get_search_ctx(base_ni, mrec); + m = map_mft_record(base_ni); + if (IS_ERR(m)) + return PTR_ERR(m); + ctx = ntfs_attr_get_search_ctx(base_ni, m); if (unlikely(!ctx)) { err = -ENOMEM; goto err_out; } err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, vcn, NULL, 0, ctx); - if (unlikely(err)) - goto put_err_out; + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + /* + * Only decompress the mapping pairs if @vcn is inside it. Otherwise + * we get into problems when we try to map an out of bounds vcn because + * we then try to map the already mapped runlist fragment and + * ntfs_mapping_pairs_decompress() fails. + */ + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; + if (unlikely(!a->data.non_resident.lowest_vcn && end_vcn <= 1)) + end_vcn = ni->allocated_size >> ni->vol->cluster_size_bits; + if (unlikely(vcn >= end_vcn)) { + err = -ENOENT; + goto err_out; + } + rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); + if (IS_ERR(rl)) + err = PTR_ERR(rl); + else + ni->runlist.rl = rl; +err_out: + if (likely(ctx)) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + return err; +} + +/** + * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. 
+ * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Locking: - The runlist must be unlocked on entry and is unlocked on return. + * - This function takes the runlist lock for writing and modifies the + * runlist. + */ +int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +{ + int err = 0; down_write(&ni->runlist.lock); /* Make sure someone else didn't do the work while we were sleeping. */ if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= - LCN_RL_NOT_MAPPED)) { - runlist_element *rl; + LCN_RL_NOT_MAPPED)) + err = ntfs_map_runlist_nolock(ni, vcn); + up_write(&ni->runlist.lock); + return err; +} - rl = ntfs_mapping_pairs_decompress(ni->vol, ctx->attr, - ni->runlist.rl); - if (IS_ERR(rl)) - err = PTR_ERR(rl); - else - ni->runlist.rl = rl; +/** + * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode + * @ni: ntfs inode of the attribute whose runlist to search + * @vcn: vcn to convert + * @write_locked: true if the runlist is locked for writing + * + * Find the virtual cluster number @vcn in the runlist of the ntfs attribute + * described by the ntfs inode @ni and return the corresponding logical cluster + * number (lcn). + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @write_locked is true the caller has locked the runlist for writing and + * if false for reading. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ========================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. + * LCN_ENOMEM Not enough memory to map runlist. + * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). 
+ * + * Locking: - The runlist must be locked on entry and is left locked on return. + * - If @write_locked is FALSE, i.e. the runlist is locked for reading, + * the lock may be dropped inside the function so you cannot rely on + * the runlist still being the same when this function returns. + */ +LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const BOOL write_locked) +{ + LCN lcn; + BOOL is_retry = FALSE; + + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", + ni->mft_no, (unsigned long long)vcn, + write_locked ? "write" : "read"); + BUG_ON(!ni); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); +retry_remap: + /* Convert vcn to lcn. If that fails map the runlist and retry once. */ + lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); + if (likely(lcn >= LCN_HOLE)) { + ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); + return lcn; } - up_write(&ni->runlist.lock); + if (lcn != LCN_RL_NOT_MAPPED) { + if (lcn != LCN_ENOENT) + lcn = LCN_EIO; + } else if (!is_retry) { + int err; -put_err_out: - ntfs_attr_put_search_ctx(ctx); -err_out: - unmap_mft_record(base_ni); - return err; + if (!write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != + LCN_RL_NOT_MAPPED)) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + goto retry_remap; + } + } + err = ntfs_map_runlist_nolock(ni, vcn); + if (!write_locked) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + } + if (likely(!err)) { + is_retry = TRUE; + goto retry_remap; + } + if (err == -ENOENT) + lcn = LCN_ENOENT; + else if (err == -ENOMEM) + lcn = LCN_ENOMEM; + else + lcn = LCN_EIO; + } + if (lcn != LCN_ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %lli.", + (long long)lcn); + return lcn; } /** - * ntfs_find_vcn - find a vcn in the runlist described by an ntfs inode - * @ni: ntfs inode describing the runlist to search - * @vcn: vcn to find - * @need_write: if false, lock for 
reading and if true, lock for writing + * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode + * @ni: ntfs inode describing the runlist to search + * @vcn: vcn to find + * @write_locked: true if the runlist is locked for writing * * Find the virtual cluster number @vcn in the runlist described by the ntfs * inode @ni and return the address of the runlist element containing the @vcn. - * The runlist is left locked and the caller has to unlock it. If @need_write - * is true, the runlist is locked for writing and if @need_write is false, the - * runlist is locked for reading. In the error case, the runlist is not left - * locked. + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @write_locked is true the caller has locked the runlist for writing and + * if false for reading. * * Note you need to distinguish between the lcn of the returned runlist element * being >= 0 and LCN_HOLE. In the latter case you have to return zeroes on @@ -118,34 +247,29 @@ err_out: * -ENOMEM - Not enough memory to map runlist. * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). * - * Locking: - The runlist must be unlocked on entry. - * - On failing return, the runlist is unlocked. - * - On successful return, the runlist is locked. If @need_write us - * true, it is locked for writing. Otherwise is is locked for - * reading. + * Locking: - The runlist must be locked on entry and is left locked on return. + * - If @write_locked is FALSE, i.e. the runlist is locked for reading, + * the lock may be dropped inside the function so you cannot rely on + * the runlist still being the same when this function returns. 
*/ -runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn, - const BOOL need_write) +runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, + const BOOL write_locked) { runlist_element *rl; int err = 0; BOOL is_retry = FALSE; - ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, lock for %sing.", + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", ni->mft_no, (unsigned long long)vcn, - !need_write ? "read" : "writ"); + write_locked ? "write" : "read"); BUG_ON(!ni); BUG_ON(!NInoNonResident(ni)); BUG_ON(vcn < 0); -lock_retry_remap: - if (!need_write) - down_read(&ni->runlist.lock); - else - down_write(&ni->runlist.lock); +retry_remap: rl = ni->runlist.rl; if (likely(rl && vcn >= rl[0].vcn)) { while (likely(rl->length)) { - if (likely(vcn < rl[1].vcn)) { + if (unlikely(vcn < rl[1].vcn)) { if (likely(rl->lcn >= LCN_HOLE)) { ntfs_debug("Done."); return rl; @@ -161,30 +285,41 @@ lock_retry_remap: err = -EIO; } } - if (!need_write) - up_read(&ni->runlist.lock); - else - up_write(&ni->runlist.lock); if (!err && !is_retry) { /* * The @vcn is in an unmapped region, map the runlist and * retry. */ - err = ntfs_map_runlist(ni, vcn); + if (!write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != + LCN_RL_NOT_MAPPED)) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + goto retry_remap; + } + } + err = ntfs_map_runlist_nolock(ni, vcn); + if (!write_locked) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + } if (likely(!err)) { is_retry = TRUE; - goto lock_retry_remap; + goto retry_remap; } /* - * -EINVAL and -ENOENT coming from a failed mapping attempt are - * equivalent to i/o errors for us as they should not happen in - * our code paths. + * -EINVAL coming from a failed mapping attempt is equivalent + * to i/o error for us as it should not happen in our code + * paths. 
*/ - if (err == -EINVAL || err == -ENOENT) + if (err == -EINVAL) err = -EIO; } else if (!err) err = -EIO; - ntfs_error(ni->vol->sb, "Failed with error code %i.", err); + if (err != -ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %i.", err); return ERR_PTR(err); } @@ -870,15 +1005,14 @@ int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, ntfs_inode *ni, MFT_RECORD *mrec) { - ctx->mrec = mrec; - /* Sanity checks are performed elsewhere. */ - ctx->attr = (ATTR_RECORD*)((u8*)mrec + le16_to_cpu(mrec->attrs_offset)); - ctx->is_first = TRUE; - ctx->ntfs_ino = ni; - ctx->al_entry = NULL; - ctx->base_ntfs_ino = NULL; - ctx->base_mrec = NULL; - ctx->base_attr = NULL; + *ctx = (ntfs_attr_search_ctx) { + .mrec = mrec, + /* Sanity checks are performed elsewhere. */ + .attr = (ATTR_RECORD*)((u8*)mrec + + le16_to_cpu(mrec->attrs_offset)), + .is_first = TRUE, + .ntfs_ino = ni, + }; } /** @@ -945,6 +1079,8 @@ void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) return; } +#ifdef NTFS_RW + /** * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file * @vol: ntfs volume to which the attribute belongs @@ -1024,27 +1160,21 @@ int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, * Check whether the attribute of @type on the ntfs volume @vol is allowed to * be non-resident. This information is obtained from $AttrDef system file. * - * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, or + * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and * -ENOENT if the attribute is not listed in $AttrDef. */ int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) { ATTR_DEF *ad; - /* - * $DATA is always allowed to be non-resident even if $AttrDef does not - * specify this in the flags of the $DATA attribute definition record. 
- */ - if (type == AT_DATA) - return 0; /* Find the attribute definition record in $AttrDef. */ ad = ntfs_attr_find_in_attrdef(vol, type); if (unlikely(!ad)) return -ENOENT; /* Check the flags and return the result. */ - if (ad->flags & CAN_BE_NON_RESIDENT) - return 0; - return -EPERM; + if (ad->flags & ATTR_DEF_RESIDENT) + return -EPERM; + return 0; } /** @@ -1067,9 +1197,9 @@ int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) */ int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) { - if (type != AT_INDEX_ALLOCATION && type != AT_EA) - return 0; - return -EPERM; + if (type == AT_INDEX_ALLOCATION || type == AT_EA) + return -EPERM; + return 0; } /** @@ -1117,6 +1247,328 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) } /** + * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute + * @ni: ntfs inode describing the attribute to convert + * + * Convert the resident ntfs attribute described by the ntfs inode @ni to a + * non-resident one. + * + * Return 0 on success and -errno on error. The following error return codes + * are defined: + * -EPERM - The attribute is not allowed to be non-resident. + * -ENOMEM - Not enough memory. + * -ENOSPC - Not enough disk space. + * -EINVAL - Attribute not defined on the volume. + * -EIO - I/o error or other error. + * Note that -ENOSPC is also returned in the case that there is not enough + * space in the mft record to do the conversion. This can happen when the mft + * record is already very full. The caller is responsible for trying to make + * space in the mft record and trying again. FIXME: Do we need a separate + * error return code for this kind of -ENOSPC or is it always worth trying + * again in case the attribute may then fit in a resident state so no need to + * make it non-resident at all? Ho-hum... 
(AIA) + * + * NOTE to self: No changes in the attribute list are required to move from + * a resident to a non-resident attribute. + * + * Locking: - The caller must hold i_sem on the inode. + */ +int ntfs_attr_make_non_resident(ntfs_inode *ni) +{ + s64 new_size; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + struct page *page; + runlist_element *rl; + u8 *kaddr; + unsigned long flags; + int mp_size, mp_ofs, name_ofs, arec_size, err, err2; + u32 attr_size; + u8 old_res_attr_flags; + + /* Check that the attribute is allowed to be non-resident. */ + err = ntfs_attr_can_be_non_resident(vol, ni->type); + if (unlikely(err)) { + if (err == -EPERM) + ntfs_debug("Attribute is not allowed to be " + "non-resident."); + else + ntfs_debug("Attribute not defined on the NTFS " + "volume!"); + return err; + } + /* + * The size needs to be aligned to a cluster boundary for allocation + * purposes. + */ + new_size = (i_size_read(vi) + vol->cluster_size - 1) & + ~(vol->cluster_size - 1); + if (new_size > 0) { + runlist_element *rl2; + + /* + * Will need the page later and since the page lock nests + * outside all ntfs locks, we need to get the page now. + */ + page = find_or_create_page(vi->i_mapping, 0, + mapping_gfp_mask(vi->i_mapping)); + if (unlikely(!page)) + return -ENOMEM; + /* Start by allocating clusters to hold the attribute value. */ + rl = ntfs_cluster_alloc(vol, 0, new_size >> + vol->cluster_size_bits, -1, DATA_ZONE); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + ntfs_debug("Failed to allocate cluster%s, error code " + "%i.", (new_size >> + vol->cluster_size_bits) > 1 ? "s" : "", + err); + goto page_err_out; + } + /* Change the runlist terminator to LCN_ENOENT. */ + rl2 = rl; + while (rl2->length) + rl2++; + BUG_ON(rl2->lcn != LCN_RL_NOT_MAPPED); + rl2->lcn = LCN_ENOENT; + } else { + rl = NULL; + page = NULL; + } + /* Determine the size of the mapping pairs array. 
*/ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error " + "code %i.", err); + goto rl_err_out; + } + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(NInoNonResident(ni)); + BUG_ON(a->non_resident); + /* + * Calculate new offsets for the name and the mapping pairs array. + * We assume the attribute is not compressed or sparse. + */ + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + 7) & ~7; + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + /* + * Determine the size of the resident part of the now non-resident + * attribute record. + */ + arec_size = (mp_ofs + mp_size + 7) & ~7; + /* + * If the page is not uptodate bring it uptodate by copying from the + * attribute value. + */ + attr_size = le32_to_cpu(a->data.resident.value_length); + BUG_ON(attr_size != i_size_read(vi)); + if (page && !PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, (u8*)a + + le16_to_cpu(a->data.resident.value_offset), + attr_size); + memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* Backup the attribute flag. */ + old_res_attr_flags = a->data.resident.flags; + /* Resize the resident part of the attribute record. 
*/ + err = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err)) + goto err_out; + /* + * Convert the resident part of the attribute record to describe a + * non-resident attribute. + */ + a->non_resident = 1; + /* Move the attribute name if it exists and update the offset. */ + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + a->name_offset = cpu_to_le16(name_ofs); + /* + * FIXME: For now just clear all of these as we do not support them + * when writing. + */ + a->flags &= cpu_to_le16(0xffff & ~le16_to_cpu(ATTR_IS_SPARSE | + ATTR_IS_ENCRYPTED | ATTR_COMPRESSION_MASK)); + /* Setup the fields specific to non-resident attributes. */ + a->data.non_resident.lowest_vcn = 0; + a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> + vol->cluster_size_bits); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); + a->data.non_resident.compression_unit = 0; + memset(&a->data.non_resident.reserved, 0, + sizeof(a->data.non_resident.reserved)); + a->data.non_resident.allocated_size = cpu_to_sle64(new_size); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size = + cpu_to_sle64(attr_size); + /* Generate the mapping pairs array into the attribute record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, + arec_size - mp_ofs, rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_debug("Failed to build mapping pairs, error code %i.", + err); + goto undo_err_out; + } + /* Setup the in-memory attribute structure to be non-resident. */ + /* + * FIXME: For now just clear all of these as we do not support them + * when writing. 
+ */ + NInoClearSparse(ni); + NInoClearEncrypted(ni); + NInoClearCompressed(ni); + ni->runlist.rl = rl; + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_size; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * This needs to be last since the address space operations ->readpage + * and ->writepage can run concurrently with us as they are not + * serialized on i_sem. Note, we are not allowed to fail once we flip + * this switch, which is another reason to do this last. + */ + NInoSetNonResident(ni); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + if (page) { + set_page_dirty(page); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + } + ntfs_debug("Done."); + return 0; +undo_err_out: + /* Convert the attribute back into a resident attribute. */ + a->non_resident = 0; + /* Move the attribute name if it exists and update the offset. */ + name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + + sizeof(a->data.resident.reserved) + 7) & ~7; + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + a->name_offset = cpu_to_le16(name_ofs); + arec_size = (mp_ofs + attr_size + 7) & ~7; + /* Resize the resident part of the attribute record. */ + err2 = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err2)) { + /* + * This cannot happen (well if memory corruption is at work it + * could happen in theory), but deal with it as well as we can. + * If the old size is too small, truncate the attribute, + * otherwise simply give it a larger allocated size. + * FIXME: Should check whether chkdsk complains when the + * allocated size is much bigger than the resident value size. 
+ */ + arec_size = le32_to_cpu(a->length); + if ((mp_ofs + attr_size) > arec_size) { + err2 = attr_size; + attr_size = arec_size - mp_ofs; + ntfs_error(vol->sb, "Failed to undo partial resident " + "to non-resident attribute " + "conversion. Truncating inode 0x%lx, " + "attribute type 0x%x from %i bytes to " + "%i bytes to maintain metadata " + "consistency. THIS MEANS YOU ARE " + "LOSING %i BYTES DATA FROM THIS %s.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err2, attr_size, err2 - attr_size, + ((ni->type == AT_DATA) && + !ni->name_len) ? "FILE": "ATTRIBUTE"); + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = attr_size; + i_size_write(vi, attr_size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + } + /* Setup the fields specific to resident attributes. */ + a->data.resident.value_length = cpu_to_le32(attr_size); + a->data.resident.value_offset = cpu_to_le16(mp_ofs); + a->data.resident.flags = old_res_attr_flags; + memset(&a->data.resident.reserved, 0, + sizeof(a->data.resident.reserved)); + /* Copy the data from the page back to the attribute value. */ + if (page) { + kaddr = kmap_atomic(page, KM_USER0); + memcpy((u8*)a + mp_ofs, kaddr, attr_size); + kunmap_atomic(kaddr, KM_USER0); + } + /* Setup the allocated size in the ntfs inode in case it changed. */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = arec_size - mp_ofs; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ni->runlist.rl = NULL; + up_write(&ni->runlist.lock); +rl_err_out: + if (rl) { + if (ntfs_cluster_free_from_rl(vol, rl) < 0) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. 
Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl); +page_err_out: + unlock_page(page); + page_cache_release(page); + } + if (err == -EINVAL) + err = -EIO; + return err; +} + +/** * ntfs_attr_set - fill (a part of) an attribute with a byte * @ni: ntfs inode describing the attribute to fill * @ofs: offset inside the attribute at which to start to fill @@ -1127,6 +1579,10 @@ int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) * byte offset @ofs inside the attribute with the constant byte @val. * * This function is effectively like memset() applied to an ntfs attribute. + * Note this function actually only operates on the page cache pages belonging + * to the ntfs attribute and it marks them dirty after doing the memset(). + * Thus it relies on the vm dirty page write code paths to cause the modified + * pages to be written to the mft record/disk. * * Return 0 on success and -errno on error. An error code of -ESPIPE means * that @ofs + @cnt were outside the end of the attribute and no write was @@ -1155,7 +1611,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) end = ofs + cnt; end_ofs = end & ~PAGE_CACHE_MASK; /* If the end is outside the inode size return -ESPIPE. */ - if (unlikely(end > VFS_I(ni)->i_size)) { + if (unlikely(end > i_size_read(VFS_I(ni)))) { ntfs_error(vol->sb, "Request exceeds end of attribute."); return -ESPIPE; } @@ -1256,3 +1712,5 @@ done: ntfs_debug("Done."); return 0; } + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h index e0c2c6c81bc..0e4ac6d3c0e 100644 --- a/fs/ntfs/attrib.h +++ b/fs/ntfs/attrib.h @@ -2,7 +2,7 @@ * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. * Part of the Linux-NTFS project. 
* - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -60,10 +60,14 @@ typedef struct { ATTR_RECORD *base_attr; } ntfs_attr_search_ctx; +extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn); extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); -extern runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn, - const BOOL need_write); +extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const BOOL write_locked); + +extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, + const VCN vcn, const BOOL write_locked); int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, const u32 name_len, const IGNORE_CASE_BOOL ic, @@ -85,6 +89,8 @@ extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec); extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); +#ifdef NTFS_RW + extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, const s64 size); extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, @@ -94,7 +100,11 @@ extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); +extern int ntfs_attr_make_non_resident(ntfs_inode *ni); + extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val); +#endif /* NTFS_RW */ + #endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index ee5ae706f86..6d265cfd49a 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -96,13 +96,14 @@ void free_compression_buffers(void) /** * zero_partial_compressed_page - zero out of bounds compressed page region */ -static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page) +static void zero_partial_compressed_page(struct page *page, + const s64 initialized_size) 
{ u8 *kp = page_address(page); unsigned int kp_ofs; ntfs_debug("Zeroing page region outside initialized size."); - if (((s64)page->index << PAGE_CACHE_SHIFT) >= ni->initialized_size) { + if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) { /* * FIXME: Using clear_page() will become wrong when we get * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem. @@ -110,7 +111,7 @@ static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page) clear_page(kp); return; } - kp_ofs = ni->initialized_size & ~PAGE_CACHE_MASK; + kp_ofs = initialized_size & ~PAGE_CACHE_MASK; memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs); return; } @@ -118,12 +119,12 @@ static void zero_partial_compressed_page(ntfs_inode *ni, struct page *page) /** * handle_bounds_compressed_page - test for&handle out of bounds compressed page */ -static inline void handle_bounds_compressed_page(ntfs_inode *ni, - struct page *page) +static inline void handle_bounds_compressed_page(struct page *page, + const loff_t i_size, const s64 initialized_size) { - if ((page->index >= (ni->initialized_size >> PAGE_CACHE_SHIFT)) && - (ni->initialized_size < VFS_I(ni)->i_size)) - zero_partial_compressed_page(ni, page); + if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) && + (initialized_size < i_size)) + zero_partial_compressed_page(page, initialized_size); return; } @@ -138,6 +139,8 @@ static inline void handle_bounds_compressed_page(ntfs_inode *ni, * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT) * @cb_start: compression block to decompress (IN) * @cb_size: size of compression block @cb_start in bytes (IN) + * @i_size: file size when we started the read (IN) + * @initialized_size: initialized file size when we started the read (IN) * * The caller must have disabled preemption. ntfs_decompress() reenables it when * the critical section is finished. 
@@ -165,7 +168,8 @@ static inline void handle_bounds_compressed_page(ntfs_inode *ni, static int ntfs_decompress(struct page *dest_pages[], int *dest_index, int *dest_ofs, const int dest_max_index, const int dest_max_ofs, const int xpage, char *xpage_done, u8 *const cb_start, - const u32 cb_size) + const u32 cb_size, const loff_t i_size, + const s64 initialized_size) { /* * Pointers into the compressed data, i.e. the compression block (cb), @@ -219,9 +223,6 @@ return_error: spin_unlock(&ntfs_cb_lock); /* Second stage: finalize completed pages. */ if (nr_completed_pages > 0) { - struct page *page = dest_pages[completed_pages[0]]; - ntfs_inode *ni = NTFS_I(page->mapping->host); - for (i = 0; i < nr_completed_pages; i++) { int di = completed_pages[i]; @@ -230,7 +231,8 @@ return_error: * If we are outside the initialized size, zero * the out of bounds page range. */ - handle_bounds_compressed_page(ni, dp); + handle_bounds_compressed_page(dp, i_size, + initialized_size); flush_dcache_page(dp); kunmap(dp); SetPageUptodate(dp); @@ -478,12 +480,14 @@ return_overflow: */ int ntfs_read_compressed_block(struct page *page) { + loff_t i_size; + s64 initialized_size; struct address_space *mapping = page->mapping; ntfs_inode *ni = NTFS_I(mapping->host); ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; runlist_element *rl; - unsigned long block_size = sb->s_blocksize; + unsigned long flags, block_size = sb->s_blocksize; unsigned char block_size_bits = sb->s_blocksize_bits; u8 *cb, *cb_pos, *cb_end; struct buffer_head **bhs; @@ -552,8 +556,12 @@ int ntfs_read_compressed_block(struct page *page) * The remaining pages need to be allocated and inserted into the page * cache, alignment guarantees keep all the below much simpler. 
(-8 */ - max_page = ((VFS_I(ni)->i_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT) - offset; + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(VFS_I(ni)); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + offset; if (nr_pages < max_page) max_page = nr_pages; for (i = 0; i < max_page; i++, offset++) { @@ -824,7 +832,8 @@ lock_retry_remap: * If we are outside the initialized size, zero * the out of bounds page range. */ - handle_bounds_compressed_page(ni, page); + handle_bounds_compressed_page(page, i_size, + initialized_size); flush_dcache_page(page); kunmap(page); SetPageUptodate(page); @@ -847,7 +856,8 @@ lock_retry_remap: ntfs_debug("Found compressed compression block."); err = ntfs_decompress(pages, &cur_page, &cur_ofs, cb_max_page, cb_max_ofs, xpage, &xpage_done, - cb_pos, cb_size - (cb_pos - cb)); + cb_pos, cb_size - (cb_pos - cb), i_size, + initialized_size); /* * We can sleep from now on, lock already dropped by * ntfs_decompress(). diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index 6fb6bb5e372..807150e2c2b 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -164,14 +164,17 @@ void ntfs_debug_dump_runlist(const runlist_element *rl) if (index > -LCN_ENOENT - 1) index = 3; printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n", - (rl + i)->vcn, lcn_str[index], - (rl + i)->length, (rl + i)->length ? - "" : " (runlist end)"); + (long long)(rl + i)->vcn, lcn_str[index], + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); } else printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n", - (rl + i)->vcn, (rl + i)->lcn, - (rl + i)->length, (rl + i)->length ? - "" : " (runlist end)"); + (long long)(rl + i)->vcn, + (long long)(rl + i)->lcn, + (long long)(rl + i)->length, + (rl + i)->length ? 
"" : + " (runlist end)"); if (!(rl + i)->length) break; } diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 93577561cdb..46779471c54 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1,7 +1,7 @@ /** * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -183,8 +183,7 @@ found_it: name->len = 0; *res = name; } else { - if (name) - kfree(name); + kfree(name); *res = NULL; } mref = le64_to_cpu(ie->data.dir.indexed_file); @@ -444,8 +443,7 @@ found_it2: name->len = 0; *res = name; } else { - if (name) - kfree(name); + kfree(name); *res = NULL; } mref = le64_to_cpu(ie->data.dir.indexed_file); @@ -610,7 +608,7 @@ dir_err_out: // TODO: (AIA) // The algorithm embedded in this code will be required for the time when we // want to support adding of entries to directories, where we require correct -// collation of file names in order not to cause corruption of the file system. +// collation of file names in order not to cause corruption of the filesystem. /** * ntfs_lookup_inode_by_name - find an inode in a directory given its name @@ -1101,7 +1099,7 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; - loff_t fpos; + loff_t fpos, i_size; struct inode *bmp_vi, *vdir = filp->f_dentry->d_inode; struct super_block *sb = vdir->i_sb; ntfs_inode *ndir = NTFS_I(vdir); @@ -1122,7 +1120,8 @@ static int ntfs_readdir(struct file *filp, void *dirent, filldir_t filldir) vdir->i_ino, fpos); rc = err = 0; /* Are we at end of dir yet? */ - if (fpos >= vdir->i_size + vol->mft_record_size) + i_size = i_size_read(vdir); + if (fpos >= i_size + vol->mft_record_size) goto done; /* Emulate . and .. for all directories. 
*/ if (!fpos) { @@ -1264,7 +1263,7 @@ skip_index_root: bmp_mapping = bmp_vi->i_mapping; /* Get the starting bitmap bit position and sanity check it. */ bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; - if (unlikely(bmp_pos >> 3 >= bmp_vi->i_size)) { + if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { ntfs_error(sb, "Current index allocation position exceeds " "index bitmap size."); goto err_out; @@ -1301,7 +1300,7 @@ find_next_index_buffer: goto get_next_bmp_page; } /* If we have reached the end of the bitmap, we are done. */ - if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= vdir->i_size)) + if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) goto unm_EOD; ia_pos = (bmp_pos + cur_bmp_pos) << ndir->itype.index.block_size_bits; @@ -1309,7 +1308,8 @@ find_next_index_buffer: ntfs_debug("Handling index buffer 0x%llx.", (unsigned long long)bmp_pos + cur_bmp_pos); /* If the current index buffer is in the same page we reuse the page. */ - if ((prev_ia_pos & PAGE_CACHE_MASK) != (ia_pos & PAGE_CACHE_MASK)) { + if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) != + (ia_pos & (s64)PAGE_CACHE_MASK)) { prev_ia_pos = ia_pos; if (likely(ia_page != NULL)) { unlock_page(ia_page); @@ -1441,7 +1441,7 @@ unm_EOD: ntfs_unmap_page(bmp_page); EOD: /* We are finished, set fpos to EOD. 
*/ - fpos = vdir->i_size + vol->mft_record_size; + fpos = i_size + vol->mft_record_size; abort: kfree(name); done: @@ -1461,10 +1461,8 @@ err_out: unlock_page(ia_page); ntfs_unmap_page(ia_page); } - if (ir) - kfree(ir); - if (name) - kfree(name); + kfree(ir); + kfree(name); if (ctx) ntfs_attr_put_search_ctx(ctx); if (m) @@ -1495,7 +1493,7 @@ err_out: static int ntfs_dir_open(struct inode *vi, struct file *filp) { if (sizeof(unsigned long) < 8) { - if (vi->i_size > MAX_LFS_FILESIZE) + if (i_size_read(vi) > MAX_LFS_FILESIZE) return -EFBIG; } return 0; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index db8713ea0d2..e0f530ce6b9 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -47,7 +47,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) { if (sizeof(unsigned long) < 8) { - if (vi->i_size > MAX_LFS_FILESIZE) + if (i_size_read(vi) > MAX_LFS_FILESIZE) return -EFBIG; } return generic_file_open(vi, filp); diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c index 71bd2cd7a4d..11fd5307d78 100644 --- a/fs/ntfs/index.c +++ b/fs/ntfs/index.c @@ -1,7 +1,7 @@ /* * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. 
* - * Copyright (c) 2004 Anton Altaparmakov + * Copyright (c) 2004-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -39,18 +39,8 @@ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) ntfs_index_context *ictx; ictx = kmem_cache_alloc(ntfs_index_ctx_cache, SLAB_NOFS); - if (ictx) { - ictx->idx_ni = idx_ni; - ictx->entry = NULL; - ictx->data = NULL; - ictx->data_len = 0; - ictx->is_in_root = 0; - ictx->ir = NULL; - ictx->actx = NULL; - ictx->base_ni = NULL; - ictx->ia = NULL; - ictx->page = NULL; - } + if (ictx) + *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; return ictx; } diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 31840ba0b38..886214a77f9 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1,7 +1,7 @@ /** * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -174,7 +174,7 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) vi = iget5_locked(sb, mft_no, (test_t)ntfs_test_inode, (set_t)ntfs_init_locked_inode, &na); - if (!vi) + if (unlikely(!vi)) return ERR_PTR(-ENOMEM); err = 0; @@ -188,7 +188,7 @@ struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) * There is no point in keeping bad inodes around if the failure was * due to ENOMEM. We want to be able to retry again later. 
*/ - if (err == -ENOMEM) { + if (unlikely(err == -ENOMEM)) { iput(vi); vi = ERR_PTR(err); } @@ -235,7 +235,7 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, (set_t)ntfs_init_locked_inode, &na); - if (!vi) + if (unlikely(!vi)) return ERR_PTR(-ENOMEM); err = 0; @@ -250,7 +250,7 @@ struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, * simplifies things in that we never need to check for bad attribute * inodes elsewhere. */ - if (err) { + if (unlikely(err)) { iput(vi); vi = ERR_PTR(err); } @@ -290,7 +290,7 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, vi = iget5_locked(base_vi->i_sb, na.mft_no, (test_t)ntfs_test_inode, (set_t)ntfs_init_locked_inode, &na); - if (!vi) + if (unlikely(!vi)) return ERR_PTR(-ENOMEM); err = 0; @@ -305,7 +305,7 @@ struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, * simplifies things in that we never need to check for bad index * inodes elsewhere. 
*/ - if (err) { + if (unlikely(err)) { iput(vi); vi = ERR_PTR(err); } @@ -317,8 +317,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb) ntfs_inode *ni; ntfs_debug("Entering."); - ni = (ntfs_inode *)kmem_cache_alloc(ntfs_big_inode_cache, - SLAB_NOFS); + ni = kmem_cache_alloc(ntfs_big_inode_cache, SLAB_NOFS); if (likely(ni != NULL)) { ni->state = 0; return VFS_I(ni); @@ -343,7 +342,7 @@ static inline ntfs_inode *ntfs_alloc_extent_inode(void) ntfs_inode *ni; ntfs_debug("Entering."); - ni = (ntfs_inode *)kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS); + ni = kmem_cache_alloc(ntfs_inode_cache, SLAB_NOFS); if (likely(ni != NULL)) { ni->state = 0; return ni; @@ -376,6 +375,7 @@ static void ntfs_destroy_extent_inode(ntfs_inode *ni) void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) { ntfs_debug("Entering."); + rwlock_init(&ni->size_lock); ni->initialized_size = ni->allocated_size = 0; ni->seq_no = 0; atomic_set(&ni->count, 1); @@ -524,6 +524,7 @@ static int ntfs_read_locked_inode(struct inode *vi) ntfs_volume *vol = NTFS_SB(vi->i_sb); ntfs_inode *ni; MFT_RECORD *m; + ATTR_RECORD *a; STANDARD_INFORMATION *si; ntfs_attr_search_ctx *ctx; int err = 0; @@ -632,9 +633,10 @@ static int ntfs_read_locked_inode(struct inode *vi) } goto unm_err_out; } + a = ctx->attr; /* Get the standard information attribute value. */ - si = (STANDARD_INFORMATION*)((char*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); + si = (STANDARD_INFORMATION*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); /* Transfer information from the standard information into vi. 
*/ /* @@ -673,15 +675,16 @@ static int ntfs_read_locked_inode(struct inode *vi) goto skip_attr_list_load; ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); NInoSetAttrList(ni); - if (ctx->attr->flags & ATTR_IS_ENCRYPTED || - ctx->attr->flags & ATTR_COMPRESSION_MASK || - ctx->attr->flags & ATTR_IS_SPARSE) { + a = ctx->attr; + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_SPARSE) { ntfs_error(vi->i_sb, "Attribute list attribute is " "compressed/encrypted/sparse."); goto unm_err_out; } /* Now allocate memory for the attribute list. */ - ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr); + ni->attr_list_size = (u32)ntfs_attr_size(a); ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(vi->i_sb, "Not enough memory to allocate " @@ -689,9 +692,9 @@ static int ntfs_read_locked_inode(struct inode *vi) err = -ENOMEM; goto unm_err_out; } - if (ctx->attr->non_resident) { + if (a->non_resident) { NInoSetAttrListNonResident(ni); - if (ctx->attr->data.non_resident.lowest_vcn) { + if (a->data.non_resident.lowest_vcn) { ntfs_error(vi->i_sb, "Attribute list has non " "zero lowest_vcn."); goto unm_err_out; @@ -701,7 +704,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * exclusive access to the inode at this time. */ ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - ctx->attr, NULL); + a, NULL); if (IS_ERR(ni->attr_list_rl.rl)) { err = PTR_ERR(ni->attr_list_rl.rl); ni->attr_list_rl.rl = NULL; @@ -712,27 +715,26 @@ static int ntfs_read_locked_inode(struct inode *vi) /* Now load the attribute list. */ if ((err = load_attribute_list(vol, &ni->attr_list_rl, ni->attr_list, ni->attr_list_size, - sle64_to_cpu(ctx->attr->data. - non_resident.initialized_size)))) { + sle64_to_cpu(a->data.non_resident. 
+ initialized_size)))) { ntfs_error(vi->i_sb, "Failed to load " "attribute list attribute."); goto unm_err_out; } - } else /* if (!ctx.attr->non_resident) */ { - if ((u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset) + - le32_to_cpu( - ctx->attr->data.resident.value_length) > + } else /* if (!a->non_resident) */ { + if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > (u8*)ctx->mrec + vol->mft_record_size) { ntfs_error(vi->i_sb, "Corrupt attribute list " "in inode."); goto unm_err_out; } /* Now copy the attribute list. */ - memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset), + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), le32_to_cpu( - ctx->attr->data.resident.value_length)); + a->data.resident.value_length)); } } skip_attr_list_load: @@ -741,10 +743,11 @@ skip_attr_list_load: * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. */ if (S_ISDIR(vi->i_mode)) { + loff_t bvi_size; struct inode *bvi; ntfs_inode *bni; INDEX_ROOT *ir; - char *ir_end, *index_end; + u8 *ir_end, *index_end; /* It is a directory, find index root attribute. */ ntfs_attr_reinit_search_ctx(ctx); @@ -760,17 +763,16 @@ skip_attr_list_load: } goto unm_err_out; } + a = ctx->attr; /* Set up the state. */ - if (unlikely(ctx->attr->non_resident)) { + if (unlikely(a->non_resident)) { ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " "resident."); goto unm_err_out; } /* Ensure the attribute name is placed before the value. */ - if (unlikely(ctx->attr->name_length && - (le16_to_cpu(ctx->attr->name_offset) >= - le16_to_cpu(ctx->attr->data.resident. - value_offset)))) { + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " "placed after the attribute value."); goto unm_err_out; @@ -781,28 +783,27 @@ skip_attr_list_load: * encrypted. 
However index root cannot be both compressed and * encrypted. */ - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) + if (a->flags & ATTR_COMPRESSION_MASK) NInoSetCompressed(ni); - if (ctx->attr->flags & ATTR_IS_ENCRYPTED) { - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) { + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(vi->i_sb, "Found encrypted and " "compressed attribute."); goto unm_err_out; } NInoSetEncrypted(ni); } - if (ctx->attr->flags & ATTR_IS_SPARSE) + if (a->flags & ATTR_IS_SPARSE) NInoSetSparse(ni); - ir = (INDEX_ROOT*)((char*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset)); - ir_end = (char*)ir + le32_to_cpu( - ctx->attr->data.resident.value_length); - if (ir_end > (char*)ctx->mrec + vol->mft_record_size) { + ir = (INDEX_ROOT*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " "corrupt."); goto unm_err_out; } - index_end = (char*)&ir->index + + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); if (index_end > ir_end) { ntfs_error(vi->i_sb, "Directory index is corrupt."); @@ -889,7 +890,8 @@ skip_attr_list_load: "attribute."); goto unm_err_out; } - if (!ctx->attr->non_resident) { + a = ctx->attr; + if (!a->non_resident) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " "is resident."); goto unm_err_out; @@ -898,42 +900,40 @@ skip_attr_list_load: * Ensure the attribute name is placed before the mapping pairs * array. */ - if (unlikely(ctx->attr->name_length && - (le16_to_cpu(ctx->attr->name_offset) >= - le16_to_cpu(ctx->attr->data.non_resident. 
- mapping_pairs_offset)))) { + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " "is placed after the mapping pairs " "array."); goto unm_err_out; } - if (ctx->attr->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_IS_ENCRYPTED) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " "is encrypted."); goto unm_err_out; } - if (ctx->attr->flags & ATTR_IS_SPARSE) { + if (a->flags & ATTR_IS_SPARSE) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " "is sparse."); goto unm_err_out; } - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " "is compressed."); goto unm_err_out; } - if (ctx->attr->data.non_resident.lowest_vcn) { + if (a->data.non_resident.lowest_vcn) { ntfs_error(vi->i_sb, "First extent of " "$INDEX_ALLOCATION attribute has non " "zero lowest_vcn."); goto unm_err_out; } - vi->i_size = sle64_to_cpu( - ctx->attr->data.non_resident.data_size); + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); ni->initialized_size = sle64_to_cpu( - ctx->attr->data.non_resident.initialized_size); + a->data.non_resident.initialized_size); ni->allocated_size = sle64_to_cpu( - ctx->attr->data.non_resident.allocated_size); + a->data.non_resident.allocated_size); /* * We are done with the mft record, so we release it. Otherwise * we would deadlock in ntfs_attr_iget(). @@ -958,11 +958,12 @@ skip_attr_list_load: goto unm_err_out; } /* Consistency check bitmap size vs. index allocation size. 
*/ - if ((bvi->i_size << 3) < (vi->i_size >> + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " "for index allocation (0x%llx).", - bvi->i_size << 3, vi->i_size); + bvi_size << 3, vi->i_size); goto unm_err_out; } skip_large_dir_stuff: @@ -1010,87 +1011,92 @@ skip_large_dir_stuff: ntfs_error(vi->i_sb, "$DATA attribute is missing."); goto unm_err_out; } + a = ctx->attr; /* Setup the state. */ - if (ctx->attr->non_resident) { + if (a->non_resident) { NInoSetNonResident(ni); - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) { - NInoSetCompressed(ni); - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found " - "compressed data but " - "compression is disabled due " - "to cluster size (%i) > 4kiB.", - vol->cluster_size); - goto unm_err_out; - } - if ((ctx->attr->flags & ATTR_COMPRESSION_MASK) - != ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found " - "unknown compression method or " - "corrupt file."); - goto unm_err_out; + if (a->flags & (ATTR_COMPRESSION_MASK | + ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found " + "compressed data but " + "compression is " + "disabled due to " + "cluster size (%i) > " + "4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) + != ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found " + "unknown compression " + "method or corrupt " + "file."); + goto unm_err_out; + } } - ni->itype.compressed.block_clusters = 1U << - ctx->attr->data.non_resident. - compression_unit; - if (ctx->attr->data.non_resident. - compression_unit != 4) { + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + if (a->data.non_resident.compression_unit != + 4) { ntfs_error(vi->i_sb, "Found " "nonstandard compression unit " "(%u instead of 4). Cannot " "handle this.", - ctx->attr->data.non_resident. 
+ a->data.non_resident. compression_unit); err = -EOPNOTSUPP; goto unm_err_out; } + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident. + compression_unit; ni->itype.compressed.block_size = 1U << ( - ctx->attr->data.non_resident. + a->data.non_resident. compression_unit + vol->cluster_size_bits); ni->itype.compressed.block_size_bits = ffs( - ni->itype.compressed.block_size) - 1; + ni->itype.compressed. + block_size) - 1; + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident. + compressed_size); } - if (ctx->attr->flags & ATTR_IS_ENCRYPTED) { - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) { + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(vi->i_sb, "Found encrypted " "and compressed data."); goto unm_err_out; } NInoSetEncrypted(ni); } - if (ctx->attr->flags & ATTR_IS_SPARSE) - NInoSetSparse(ni); - if (ctx->attr->data.non_resident.lowest_vcn) { + if (a->data.non_resident.lowest_vcn) { ntfs_error(vi->i_sb, "First extent of $DATA " "attribute has non zero " "lowest_vcn."); goto unm_err_out; } - /* Setup all the sizes. */ vi->i_size = sle64_to_cpu( - ctx->attr->data.non_resident.data_size); + a->data.non_resident.data_size); ni->initialized_size = sle64_to_cpu( - ctx->attr->data.non_resident. - initialized_size); + a->data.non_resident.initialized_size); ni->allocated_size = sle64_to_cpu( - ctx->attr->data.non_resident. - allocated_size); - if (NInoCompressed(ni)) { - ni->itype.compressed.size = sle64_to_cpu( - ctx->attr->data.non_resident. - compressed_size); - } + a->data.non_resident.allocated_size); } else { /* Resident attribute. */ - /* - * Make all sizes equal for simplicity in read code - * paths. FIXME: Need to keep this in mind when - * converting to non-resident attribute in write code - * path. (Probably only affects truncate().) 
- */ - vi->i_size = ni->initialized_size = ni->allocated_size = - le32_to_cpu( - ctx->attr->data.resident.value_length); + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu( + a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident data attribute " + "is corrupt (size exceeds " + "allocation)."); + goto unm_err_out; + } } no_data_attr_special_case: /* We are done with the mft record, so we release it. */ @@ -1117,11 +1123,10 @@ no_data_attr_special_case: * sizes of all non-resident attributes present to give us the Linux * correct size that should go into i_blocks (after division by 512). */ - if (S_ISDIR(vi->i_mode) || !NInoCompressed(ni)) - vi->i_blocks = ni->allocated_size >> 9; - else + if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) vi->i_blocks = ni->itype.compressed.size >> 9; - + else + vi->i_blocks = ni->allocated_size >> 9; ntfs_debug("Done."); return 0; @@ -1166,6 +1171,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) ntfs_volume *vol = NTFS_SB(vi->i_sb); ntfs_inode *ni, *base_ni; MFT_RECORD *m; + ATTR_RECORD *a; ntfs_attr_search_ctx *ctx; int err = 0; @@ -1200,24 +1206,21 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) err = -ENOMEM; goto unm_err_out; } - /* Find the attribute. */ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); if (unlikely(err)) goto unm_err_out; - - if (!ctx->attr->non_resident) { + a = ctx->attr; + if (!a->non_resident) { /* Ensure the attribute name is placed before the value. */ - if (unlikely(ctx->attr->name_length && - (le16_to_cpu(ctx->attr->name_offset) >= - le16_to_cpu(ctx->attr->data.resident. 
- value_offset)))) { + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { ntfs_error(vol->sb, "Attribute name is placed after " "the attribute value."); goto unm_err_out; } - if (NInoMstProtected(ni) || ctx->attr->flags) { + if (NInoMstProtected(ni) || a->flags) { ntfs_error(vi->i_sb, "Found mst protected attribute " "or attribute with non-zero flags but " "the attribute is resident. Please " @@ -1225,85 +1228,95 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) "linux-ntfs-dev@lists.sourceforge.net"); goto unm_err_out; } - /* - * Resident attribute. Make all sizes equal for simplicity in - * read code paths. - */ - vi->i_size = ni->initialized_size = ni->allocated_size = - le32_to_cpu(ctx->attr->data.resident.value_length); + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident attribute is corrupt " + "(size exceeds allocation)."); + goto unm_err_out; + } } else { NInoSetNonResident(ni); /* * Ensure the attribute name is placed before the mapping pairs * array. */ - if (unlikely(ctx->attr->name_length && - (le16_to_cpu(ctx->attr->name_offset) >= - le16_to_cpu(ctx->attr->data.non_resident. - mapping_pairs_offset)))) { + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { ntfs_error(vol->sb, "Attribute name is placed after " "the mapping pairs array."); goto unm_err_out; } - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) { + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if ((ni->type != AT_DATA) || (ni->type == + AT_DATA && ni->name_len)) { + ntfs_error(vi->i_sb, "Found compressed " + "non-data or named " + "data attribute. 
" + "Please report you " + "saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto unm_err_out; + } + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found compressed " + "attribute but " + "compression is " + "disabled due to " + "cluster size (%i) > " + "4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) != + ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method."); + goto unm_err_out; + } + } if (NInoMstProtected(ni)) { ntfs_error(vi->i_sb, "Found mst protected " "attribute but the attribute " - "is compressed. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." - "sourceforge.net"); - goto unm_err_out; - } - NInoSetCompressed(ni); - if ((ni->type != AT_DATA) || (ni->type == AT_DATA && - ni->name_len)) { - ntfs_error(vi->i_sb, "Found compressed " - "non-data or named data " - "attribute. Please report " - "you saw this message to " + "is %s. Please report you " + "saw this message to " "linux-ntfs-dev@lists." - "sourceforge.net"); - goto unm_err_out; - } - if (vol->cluster_size > 4096) { - ntfs_error(vi->i_sb, "Found compressed " - "attribute but compression is " - "disabled due to cluster size " - "(%i) > 4kiB.", - vol->cluster_size); + "sourceforge.net", + NInoCompressed(ni) ? + "compressed" : "sparse"); goto unm_err_out; } - if ((ctx->attr->flags & ATTR_COMPRESSION_MASK) - != ATTR_IS_COMPRESSED) { - ntfs_error(vi->i_sb, "Found unknown " - "compression method."); - goto unm_err_out; - } - ni->itype.compressed.block_clusters = 1U << - ctx->attr->data.non_resident. - compression_unit; - if (ctx->attr->data.non_resident.compression_unit != - 4) { + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + if (a->data.non_resident.compression_unit != 4) { ntfs_error(vi->i_sb, "Found nonstandard " "compression unit (%u instead " "of 4). Cannot handle this.", - ctx->attr->data.non_resident. + a->data.non_resident. 
compression_unit); err = -EOPNOTSUPP; goto unm_err_out; } + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident.compression_unit; ni->itype.compressed.block_size = 1U << ( - ctx->attr->data.non_resident. - compression_unit + + a->data.non_resident.compression_unit + vol->cluster_size_bits); ni->itype.compressed.block_size_bits = ffs( - ni->itype.compressed.block_size) - 1; + ni->itype.compressed.block_size) - 1; + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident.compressed_size); } - if (ctx->attr->flags & ATTR_IS_ENCRYPTED) { - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) { - ntfs_error(vi->i_sb, "Found encrypted " - "and compressed data."); + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed data."); goto unm_err_out; } if (NInoMstProtected(ni)) { @@ -1317,37 +1330,17 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) } NInoSetEncrypted(ni); } - if (ctx->attr->flags & ATTR_IS_SPARSE) { - if (NInoMstProtected(ni)) { - ntfs_error(vi->i_sb, "Found mst protected " - "attribute but the attribute " - "is sparse. Please report " - "you saw this message to " - "linux-ntfs-dev@lists." - "sourceforge.net"); - goto unm_err_out; - } - NInoSetSparse(ni); - } - if (ctx->attr->data.non_resident.lowest_vcn) { + if (a->data.non_resident.lowest_vcn) { ntfs_error(vi->i_sb, "First extent of attribute has " "non-zero lowest_vcn."); goto unm_err_out; } - /* Setup all the sizes. */ - vi->i_size = sle64_to_cpu( - ctx->attr->data.non_resident.data_size); + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); ni->initialized_size = sle64_to_cpu( - ctx->attr->data.non_resident.initialized_size); + a->data.non_resident.initialized_size); ni->allocated_size = sle64_to_cpu( - ctx->attr->data.non_resident.allocated_size); - if (NInoCompressed(ni)) { - ni->itype.compressed.size = sle64_to_cpu( - ctx->attr->data.non_resident. 
- compressed_size); - } + a->data.non_resident.allocated_size); } - /* Setup the operations for this attribute inode. */ vi->i_op = NULL; vi->i_fop = NULL; @@ -1355,12 +1348,10 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_mapping->a_ops = &ntfs_mst_aops; else vi->i_mapping->a_ops = &ntfs_aops; - - if (!NInoCompressed(ni)) - vi->i_blocks = ni->allocated_size >> 9; - else + if (NInoCompressed(ni) || NInoSparse(ni)) vi->i_blocks = ni->itype.compressed.size >> 9; - + else + vi->i_blocks = ni->allocated_size >> 9; /* * Make sure the base inode doesn't go away and attach it to the * attribute inode. @@ -1429,10 +1420,12 @@ err_out: */ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) { + loff_t bvi_size; ntfs_volume *vol = NTFS_SB(vi->i_sb); ntfs_inode *ni, *base_ni, *bni; struct inode *bvi; MFT_RECORD *m; + ATTR_RECORD *a; ntfs_attr_search_ctx *ctx; INDEX_ROOT *ir; u8 *ir_end, *index_end; @@ -1474,30 +1467,28 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) "missing."); goto unm_err_out; } + a = ctx->attr; /* Set up the state. */ - if (unlikely(ctx->attr->non_resident)) { + if (unlikely(a->non_resident)) { ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); goto unm_err_out; } /* Ensure the attribute name is placed before the value. */ - if (unlikely(ctx->attr->name_length && - (le16_to_cpu(ctx->attr->name_offset) >= - le16_to_cpu(ctx->attr->data.resident. - value_offset)))) { + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " "after the attribute value."); goto unm_err_out; } /* Compressed/encrypted/sparse index root is not allowed. 
*/ - if (ctx->attr->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | ATTR_IS_SPARSE)) { ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " "root attribute."); goto unm_err_out; } - ir = (INDEX_ROOT*)((u8*)ctx->attr + - le16_to_cpu(ctx->attr->data.resident.value_offset)); - ir_end = (u8*)ir + le32_to_cpu(ctx->attr->data.resident.value_length); + ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); goto unm_err_out; @@ -1570,7 +1561,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) "$INDEX_ALLOCATION attribute."); goto unm_err_out; } - if (!ctx->attr->non_resident) { + if (!a->non_resident) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " "resident."); goto unm_err_out; @@ -1578,37 +1569,36 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) /* * Ensure the attribute name is placed before the mapping pairs array. 
*/ - if (unlikely(ctx->attr->name_length && (le16_to_cpu( - ctx->attr->name_offset) >= le16_to_cpu( - ctx->attr->data.non_resident.mapping_pairs_offset)))) { + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " "placed after the mapping pairs array."); goto unm_err_out; } - if (ctx->attr->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_IS_ENCRYPTED) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " "encrypted."); goto unm_err_out; } - if (ctx->attr->flags & ATTR_IS_SPARSE) { + if (a->flags & ATTR_IS_SPARSE) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); goto unm_err_out; } - if (ctx->attr->flags & ATTR_COMPRESSION_MASK) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " "compressed."); goto unm_err_out; } - if (ctx->attr->data.non_resident.lowest_vcn) { + if (a->data.non_resident.lowest_vcn) { ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " "attribute has non zero lowest_vcn."); goto unm_err_out; } - vi->i_size = sle64_to_cpu(ctx->attr->data.non_resident.data_size); + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); ni->initialized_size = sle64_to_cpu( - ctx->attr->data.non_resident.initialized_size); - ni->allocated_size = sle64_to_cpu( - ctx->attr->data.non_resident.allocated_size); + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); /* * We are done with the mft record, so we release it. Otherwise * we would deadlock in ntfs_attr_iget(). @@ -1632,10 +1622,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) goto iput_unm_err_out; } /* Consistency check bitmap size vs. index allocation size. 
*/ - if ((bvi->i_size << 3) < (vi->i_size >> - ni->itype.index.block_size_bits)) { + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for " - "index allocation (0x%llx).", bvi->i_size << 3, + "index allocation (0x%llx).", bvi_size << 3, vi->i_size); goto iput_unm_err_out; } @@ -1646,7 +1636,6 @@ skip_large_index_stuff: vi->i_fop = NULL; vi->i_mapping->a_ops = &ntfs_mst_aops; vi->i_blocks = ni->allocated_size >> 9; - /* * Make sure the base inode doesn't go away and attach it to the * index inode. @@ -1712,7 +1701,7 @@ int ntfs_read_inode_mount(struct inode *vi) struct buffer_head *bh; ntfs_inode *ni; MFT_RECORD *m = NULL; - ATTR_RECORD *attr; + ATTR_RECORD *a; ntfs_attr_search_ctx *ctx; unsigned int i, nr_blocks; int err; @@ -1727,10 +1716,10 @@ int ntfs_read_inode_mount(struct inode *vi) /* Setup the data attribute. It is special as it is mst protected. */ NInoSetNonResident(ni); NInoSetMstProtected(ni); + NInoSetSparseDisabled(ni); ni->type = AT_DATA; ni->name = NULL; ni->name_len = 0; - /* * This sets up our little cheat allowing us to reuse the async read io * completion handler for directories. @@ -1808,9 +1797,10 @@ int ntfs_read_inode_mount(struct inode *vi) ntfs_debug("Attribute list attribute found in $MFT."); NInoSetAttrList(ni); - if (ctx->attr->flags & ATTR_IS_ENCRYPTED || - ctx->attr->flags & ATTR_COMPRESSION_MASK || - ctx->attr->flags & ATTR_IS_SPARSE) { + a = ctx->attr; + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_SPARSE) { ntfs_error(sb, "Attribute list attribute is " "compressed/encrypted/sparse. Not " "allowed. $MFT is corrupt. You should " @@ -1818,16 +1808,16 @@ int ntfs_read_inode_mount(struct inode *vi) goto put_err_out; } /* Now allocate memory for the attribute list. 
*/ - ni->attr_list_size = (u32)ntfs_attr_size(ctx->attr); + ni->attr_list_size = (u32)ntfs_attr_size(a); ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); if (!ni->attr_list) { ntfs_error(sb, "Not enough memory to allocate buffer " "for attribute list."); goto put_err_out; } - if (ctx->attr->non_resident) { + if (a->non_resident) { NInoSetAttrListNonResident(ni); - if (ctx->attr->data.non_resident.lowest_vcn) { + if (a->data.non_resident.lowest_vcn) { ntfs_error(sb, "Attribute list has non zero " "lowest_vcn. $MFT is corrupt. " "You should run chkdsk."); @@ -1835,7 +1825,7 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Setup the runlist. */ ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, - ctx->attr, NULL); + a, NULL); if (IS_ERR(ni->attr_list_rl.rl)) { err = PTR_ERR(ni->attr_list_rl.rl); ni->attr_list_rl.rl = NULL; @@ -1847,7 +1837,7 @@ int ntfs_read_inode_mount(struct inode *vi) /* Now load the attribute list. */ if ((err = load_attribute_list(vol, &ni->attr_list_rl, ni->attr_list, ni->attr_list_size, - sle64_to_cpu(ctx->attr->data. + sle64_to_cpu(a->data. non_resident.initialized_size)))) { ntfs_error(sb, "Failed to load attribute list " "attribute with error code %i.", @@ -1855,20 +1845,20 @@ int ntfs_read_inode_mount(struct inode *vi) goto put_err_out; } } else /* if (!ctx.attr->non_resident) */ { - if ((u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset) + + if ((u8*)a + le16_to_cpu( + a->data.resident.value_offset) + le32_to_cpu( - ctx->attr->data.resident.value_length) > + a->data.resident.value_length) > (u8*)ctx->mrec + vol->mft_record_size) { ntfs_error(sb, "Corrupt attribute list " "attribute."); goto put_err_out; } /* Now copy the attribute list. 
*/ - memcpy(ni->attr_list, (u8*)ctx->attr + le16_to_cpu( - ctx->attr->data.resident.value_offset), + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), le32_to_cpu( - ctx->attr->data.resident.value_length)); + a->data.resident.value_length)); } /* The attribute list is now setup in memory. */ /* @@ -1934,25 +1924,25 @@ int ntfs_read_inode_mount(struct inode *vi) ntfs_attr_reinit_search_ctx(ctx); /* Now load all attribute extents. */ - attr = NULL; + a = NULL; next_vcn = last_vcn = highest_vcn = 0; while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, ctx))) { runlist_element *nrl; /* Cache the current attribute. */ - attr = ctx->attr; + a = ctx->attr; /* $MFT must be non-resident. */ - if (!attr->non_resident) { + if (!a->non_resident) { ntfs_error(sb, "$MFT must be non-resident but a " "resident extent was found. $MFT is " "corrupt. Run chkdsk."); goto put_err_out; } /* $MFT must be uncompressed and unencrypted. */ - if (attr->flags & ATTR_COMPRESSION_MASK || - attr->flags & ATTR_IS_ENCRYPTED || - attr->flags & ATTR_IS_SPARSE) { + if (a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { ntfs_error(sb, "$MFT must be uncompressed, " "non-sparse, and unencrypted but a " "compressed/sparse/encrypted extent " @@ -1966,7 +1956,7 @@ int ntfs_read_inode_mount(struct inode *vi) * as we have exclusive access to the inode at this time and we * are a mount in progress task, too. */ - nrl = ntfs_mapping_pairs_decompress(vol, attr, ni->runlist.rl); + nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); if (IS_ERR(nrl)) { ntfs_error(sb, "ntfs_mapping_pairs_decompress() " "failed with error code %ld. $MFT is " @@ -1977,7 +1967,7 @@ int ntfs_read_inode_mount(struct inode *vi) /* Are we in the first extent? 
*/ if (!next_vcn) { - if (attr->data.non_resident.lowest_vcn) { + if (a->data.non_resident.lowest_vcn) { ntfs_error(sb, "First extent of $DATA " "attribute has non zero " "lowest_vcn. $MFT is corrupt. " @@ -1986,15 +1976,15 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Get the last vcn in the $DATA attribute. */ last_vcn = sle64_to_cpu( - attr->data.non_resident.allocated_size) + a->data.non_resident.allocated_size) >> vol->cluster_size_bits; /* Fill in the inode size. */ vi->i_size = sle64_to_cpu( - attr->data.non_resident.data_size); - ni->initialized_size = sle64_to_cpu(attr->data. - non_resident.initialized_size); + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); ni->allocated_size = sle64_to_cpu( - attr->data.non_resident.allocated_size); + a->data.non_resident.allocated_size); /* * Verify the number of mft records does not exceed * 2^32 - 1. @@ -2051,7 +2041,7 @@ int ntfs_read_inode_mount(struct inode *vi) } /* Get the lowest vcn for the next extent. */ - highest_vcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); next_vcn = highest_vcn + 1; /* Only one extent or error, which we catch below. */ @@ -2060,7 +2050,7 @@ int ntfs_read_inode_mount(struct inode *vi) /* Avoid endless loops due to corruption. */ if (next_vcn < sle64_to_cpu( - attr->data.non_resident.lowest_vcn)) { + a->data.non_resident.lowest_vcn)) { ntfs_error(sb, "$MFT has corrupt attribute list " "attribute. Run chkdsk."); goto put_err_out; @@ -2071,7 +2061,7 @@ int ntfs_read_inode_mount(struct inode *vi) "$MFT is corrupt. Run chkdsk."); goto put_err_out; } - if (!attr) { + if (!a) { ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is " "corrupt. 
Run chkdsk."); goto put_err_out; @@ -2275,6 +2265,8 @@ int ntfs_show_options(struct seq_file *sf, struct vfsmount *mnt) seq_printf(sf, ",case_sensitive"); if (NVolShowSystemFiles(vol)) seq_printf(sf, ",show_sys_files"); + if (!NVolSparseEnabled(vol)) + seq_printf(sf, ",disable_sparse"); for (i = 0; on_errors_arr[i].val; i++) { if (on_errors_arr[i].val & vol->on_errors) seq_printf(sf, ",errors=%s", on_errors_arr[i].str); @@ -2311,6 +2303,7 @@ int ntfs_truncate(struct inode *vi) ntfs_volume *vol = ni->vol; ntfs_attr_search_ctx *ctx; MFT_RECORD *m; + ATTR_RECORD *a; const char *te = " Leaving file length out of sync with i_size."; int err; @@ -2347,14 +2340,15 @@ int ntfs_truncate(struct inode *vi) vi->i_ino, err); goto err_out; } + a = ctx->attr; /* If the size has not changed there is nothing to do. */ - if (ntfs_attr_size(ctx->attr) == i_size_read(vi)) + if (ntfs_attr_size(a) == i_size_read(vi)) goto done; // TODO: Implement the truncate... ntfs_error(vi->i_sb, "Inode size has changed but this is not " "implemented yet. Resetting inode size to old value. 
" " This is most likely a bug in the ntfs driver!"); - i_size_write(vi, ntfs_attr_size(ctx->attr)); + i_size_write(vi, ntfs_attr_size(a)); done: ntfs_attr_put_search_ctx(ctx); unmap_mft_record(ni); @@ -2515,18 +2509,18 @@ int ntfs_write_inode(struct inode *vi, int sync) nt = utc2ntfs(vi->i_mtime); if (si->last_data_change_time != nt) { ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, + "new = 0x%llx", vi->i_ino, (long long) sle64_to_cpu(si->last_data_change_time), - sle64_to_cpu(nt)); + (long long)sle64_to_cpu(nt)); si->last_data_change_time = nt; modified = TRUE; } nt = utc2ntfs(vi->i_ctime); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " - "new = 0x%llx", vi->i_ino, + "new = 0x%llx", vi->i_ino, (long long) sle64_to_cpu(si->last_mft_change_time), - sle64_to_cpu(nt)); + (long long)sle64_to_cpu(nt)); si->last_mft_change_time = nt; modified = TRUE; } @@ -2534,8 +2528,8 @@ int ntfs_write_inode(struct inode *vi, int sync) if (si->last_access_time != nt) { ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, - sle64_to_cpu(si->last_access_time), - sle64_to_cpu(nt)); + (long long)sle64_to_cpu(si->last_access_time), + (long long)sle64_to_cpu(nt)); si->last_access_time = nt; modified = TRUE; } diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 99580455f2e..3de5c023196 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -2,7 +2,7 @@ * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of * the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -44,6 +44,7 @@ typedef struct _ntfs_inode ntfs_inode; * fields already provided in the VFS inode. */ struct _ntfs_inode { + rwlock_t size_lock; /* Lock serializing access to inode sizes. 
*/ s64 initialized_size; /* Copy from the attribute record. */ s64 allocated_size; /* Copy from the attribute record. */ unsigned long state; /* NTFS specific flags describing this inode. @@ -109,7 +110,7 @@ struct _ntfs_inode { u8 block_size_bits; /* Log2 of the above. */ u8 vcn_size_bits; /* Log2 of the above. */ } index; - struct { /* It is a compressed file or an attribute inode. */ + struct { /* It is a compressed/sparse file/attribute inode. */ s64 size; /* Copy of compressed_size from $DATA. */ u32 block_size; /* Size of a compression block @@ -165,6 +166,7 @@ typedef enum { NI_Sparse, /* 1: Unnamed data attr is sparse (f). 1: Create sparse files by default (d). 1: Attribute is sparse (a). */ + NI_SparseDisabled, /* 1: May not create sparse regions. */ NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ } ntfs_inode_state_bits; @@ -217,6 +219,7 @@ NINO_FNS(IndexAllocPresent) NINO_FNS(Compressed) NINO_FNS(Encrypted) NINO_FNS(Sparse) +NINO_FNS(SparseDisabled) NINO_FNS(TruncateFailed) /* diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 47b33899992..609ad1728ce 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -2,7 +2,7 @@ * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS * project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -547,26 +547,44 @@ enum { COLLATION_NTOFS_ULONG = const_cpu_to_le32(0x10), COLLATION_NTOFS_SID = const_cpu_to_le32(0x11), COLLATION_NTOFS_SECURITY_HASH = const_cpu_to_le32(0x12), - COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13) + COLLATION_NTOFS_ULONGS = const_cpu_to_le32(0x13), }; typedef le32 COLLATION_RULE; /* * The flags (32-bit) describing attribute properties in the attribute - * definition structure. 
FIXME: This information is from Regis's information - * and, according to him, it is not certain and probably incomplete. - * The INDEXABLE flag is fairly certainly correct as only the file name - * attribute has this flag set and this is the only attribute indexed in NT4. + * definition structure. FIXME: This information is based on Regis's + * information and, according to him, it is not certain and probably + * incomplete. The INDEXABLE flag is fairly certainly correct as only the file + * name attribute has this flag set and this is the only attribute indexed in + * NT4. */ enum { - INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be - indexed. */ - NEED_TO_REGENERATE = const_cpu_to_le32(0x40), /* Need to regenerate - during regeneration - phase. */ - CAN_BE_NON_RESIDENT = const_cpu_to_le32(0x80), /* Attribute can be - non-resident. */ + ATTR_DEF_INDEXABLE = const_cpu_to_le32(0x02), /* Attribute can be + indexed. */ + ATTR_DEF_MULTIPLE = const_cpu_to_le32(0x04), /* Attribute type + can be present multiple times in the + mft records of an inode. */ + ATTR_DEF_NOT_ZERO = const_cpu_to_le32(0x08), /* Attribute value + must contain at least one non-zero + byte. */ + ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be + indexed and the attribute value must be + unique for the attribute type in all of + the mft records of an inode. */ + ATTR_DEF_NAMED_UNIQUE = const_cpu_to_le32(0x20), /* Attribute must be + named and the name must be unique for + the attribute type in all of the mft + records of an inode. */ + ATTR_DEF_RESIDENT = const_cpu_to_le32(0x40), /* Attribute must be + resident. */ + ATTR_DEF_ALWAYS_LOG = const_cpu_to_le32(0x80), /* Always log + modifications to this attribute, + regardless of whether it is resident or + non-resident. Without this, only log + modifications if the attribute is + resident. */ }; typedef le32 ATTR_DEF_FLAGS; @@ -749,10 +767,11 @@ typedef struct { record header aligned to 8-byte boundary. 
*/ /* 34*/ u8 compression_unit; /* The compression unit expressed as the log to the base 2 of the number of - clusters in a compression unit. 0 means not - compressed. (This effectively limits the + clusters in a compression unit. 0 means not + compressed. (This effectively limits the compression unit size to be a power of two - clusters.) WinNT4 only uses a value of 4. */ + clusters.) WinNT4 only uses a value of 4. + Sparse files also have this set to 4. */ /* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ /* The sizes below are only used when lowest_vcn is zero, as otherwise it would be difficult to keep them up-to-date.*/ @@ -772,10 +791,10 @@ typedef struct { data_size. */ /* sizeof(uncompressed attr) = 64*/ /* 64*/ sle64 compressed_size; /* Byte size of the attribute - value after compression. Only present when - compressed. Always is a multiple of the - cluster size. Represents the actual amount of - disk space being used on the disk. */ + value after compression. Only present when + compressed or sparse. Always is a multiple of + the cluster size. Represents the actual amount + of disk space being used on the disk. */ /* sizeof(compressed attr) = 72*/ } __attribute__ ((__packed__)) non_resident; } __attribute__ ((__packed__)) data; @@ -834,7 +853,7 @@ enum { /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this file has a view index present (eg. object id index, quota index, one of the security indexes or the encrypting - file system related indexes). */ + filesystem related indexes). */ }; typedef le32 FILE_ATTR_FLAGS; @@ -917,20 +936,12 @@ typedef struct { /* 56*/ le64 quota_charged; /* Byte size of the charge to the quota for all streams of the file. Note: Is zero if quotas are disabled. */ - /* 64*/ le64 usn; /* Last update sequence number - of the file. This is a direct index into the - change (aka usn) journal file. It is zero if - the usn journal is disabled. 
- NOTE: To disable the journal need to delete - the journal file itself and to then walk the - whole mft and set all Usn entries in all mft - records to zero! (This can take a while!) - The journal is FILE_Extend/$UsnJrnl. Win2k - will recreate the journal and initiate - logging if necessary when mounting the - partition. This, in contrast to disabling the - journal is a very fast process, so the user - won't even notice it. */ + /* 64*/ leUSN usn; /* Last update sequence number + of the file. This is a direct index into the + transaction log file ($UsnJrnl). It is zero if + the usn journal is disabled or this file has + not been subject to logging yet. See usnjrnl.h + for details. */ } __attribute__ ((__packed__)) v3; /* sizeof() = 72 bytes (NTFS 3.x) */ } __attribute__ ((__packed__)) ver; @@ -1893,7 +1904,7 @@ enum { VOLUME_FLAGS_MASK = const_cpu_to_le16(0x803f), /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0x8037), + VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0x8027), } __attribute__ ((__packed__)); typedef le16 VOLUME_FLAGS; diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c index 23fd911078b..a4bc07616e5 100644 --- a/fs/ntfs/lcnalloc.c +++ b/fs/ntfs/lcnalloc.c @@ -1,7 +1,7 @@ /* * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. 
* - * Copyright (c) 2004 Anton Altaparmakov + * Copyright (c) 2004-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -60,7 +60,7 @@ int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, if (rl->lcn < 0) continue; err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); - if (unlikely(err && (!ret || ret == ENOMEM) && ret != err)) + if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) ret = err; } ntfs_debug("Done."); @@ -140,6 +140,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; s64 clusters; + loff_t i_size; struct inode *lcnbmp_vi; runlist_element *rl = NULL; struct address_space *mapping; @@ -249,6 +250,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, clusters = count; rlpos = rlsize = 0; mapping = lcnbmp_vi->i_mapping; + i_size = i_size_read(lcnbmp_vi); while (1) { ntfs_debug("Start of outer while loop: done_zones 0x%x, " "search_zone %i, pass %i, zone_start 0x%llx, " @@ -263,7 +265,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, last_read_pos = bmp_pos >> 3; ntfs_debug("last_read_pos 0x%llx.", (unsigned long long)last_read_pos); - if (last_read_pos > lcnbmp_vi->i_size) { + if (last_read_pos > i_size) { ntfs_debug("End of attribute reached. 
" "Skipping to zone_pass_done."); goto zone_pass_done; @@ -287,11 +289,11 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, buf_size = last_read_pos & ~PAGE_CACHE_MASK; buf = page_address(page) + buf_size; buf_size = PAGE_CACHE_SIZE - buf_size; - if (unlikely(last_read_pos + buf_size > lcnbmp_vi->i_size)) - buf_size = lcnbmp_vi->i_size - last_read_pos; + if (unlikely(last_read_pos + buf_size > i_size)) + buf_size = i_size - last_read_pos; buf_size <<= 3; lcn = bmp_pos & 7; - bmp_pos &= ~7; + bmp_pos &= ~(LCN)7; ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " "bmp_pos 0x%llx, need_writeback %i.", buf_size, (unsigned long long)lcn, @@ -309,7 +311,7 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, (unsigned int)*byte); /* Skip full bytes. */ if (*byte == 0xff) { - lcn = (lcn + 8) & ~7; + lcn = (lcn + 8) & ~(LCN)7; ntfs_debug("Continuing while loop 1."); continue; } @@ -691,7 +693,7 @@ switch_to_data1_zone: search_zone = 2; if (zone == MFT_ZONE || mft_zone_size <= 0) { ntfs_debug("No free clusters left, going to out."); /* Really no more space left on device. */ - err = ENOSPC; + err = -ENOSPC; goto out; } /* zone == DATA_ZONE && mft_zone_size > 0 */ ntfs_debug("Shrinking mft zone."); @@ -755,13 +757,13 @@ out: if (rl) { int err2; - if (err == ENOSPC) + if (err == -ENOSPC) ntfs_debug("Not enough space to complete allocation, " - "err ENOSPC, first free lcn 0x%llx, " + "err -ENOSPC, first free lcn 0x%llx, " "could allocate up to 0x%llx " "clusters.", (unsigned long long)rl[0].lcn, - (unsigned long long)count - clusters); + (unsigned long long)(count - clusters)); /* Deallocate all allocated clusters. */ ntfs_debug("Attempting rollback..."); err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); @@ -773,10 +775,10 @@ out: } /* Free the runlist. 
*/ ntfs_free(rl); - } else if (err == ENOSPC) - ntfs_debug("No space left at all, err = ENOSPC, " - "first free lcn = 0x%llx.", - (unsigned long long)vol->data1_zone_pos); + } else if (err == -ENOSPC) + ntfs_debug("No space left at all, err = -ENOSPC, first free " + "lcn = 0x%llx.", + (long long)vol->data1_zone_pos); up_write(&vol->lcnbmp_lock); return ERR_PTR(err); } @@ -846,8 +848,8 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count, total_freed = real_freed = 0; - /* This returns with ni->runlist locked for reading on success. */ - rl = ntfs_find_vcn(ni, start_vcn, FALSE); + down_read(&ni->runlist.lock); + rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, FALSE); if (IS_ERR(rl)) { if (!is_rollback) ntfs_error(vol->sb, "Failed to find first runlist " @@ -861,7 +863,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count, ntfs_error(vol->sb, "First runlist element has " "invalid lcn, aborting."); err = -EIO; - goto unl_err_out; + goto err_out; } /* Find the starting cluster inside the run that needs freeing. */ delta = start_vcn - rl->vcn; @@ -879,7 +881,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count, if (!is_rollback) ntfs_error(vol->sb, "Failed to clear first run " "(error %i), aborting.", err); - goto unl_err_out; + goto err_out; } /* We have freed @to_free real clusters. */ real_freed = to_free; @@ -899,30 +901,15 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count, if (unlikely(rl->lcn < LCN_HOLE)) { VCN vcn; - /* - * Attempt to map runlist, dropping runlist lock for - * the duration. - */ + /* Attempt to map runlist. */ vcn = rl->vcn; - up_read(&ni->runlist.lock); - err = ntfs_map_runlist(ni, vcn); - if (err) { - if (!is_rollback) - ntfs_error(vol->sb, "Failed to map " - "runlist fragment."); - if (err == -EINVAL || err == -ENOENT) - err = -EIO; - goto err_out; - } - /* - * This returns with ni->runlist locked for reading on - * success. 
- */ - rl = ntfs_find_vcn(ni, vcn, FALSE); + rl = ntfs_attr_find_vcn_nolock(ni, vcn, FALSE); if (IS_ERR(rl)) { err = PTR_ERR(rl); if (!is_rollback) - ntfs_error(vol->sb, "Failed to find " + ntfs_error(vol->sb, "Failed to map " + "runlist fragment or " + "failed to find " "subsequent runlist " "element."); goto err_out; @@ -935,7 +922,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count, (unsigned long long) rl->lcn); err = -EIO; - goto unl_err_out; + goto err_out; } } /* The number of clusters in this run that need freeing. */ @@ -951,7 +938,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count, if (!is_rollback) ntfs_error(vol->sb, "Failed to clear " "subsequent run."); - goto unl_err_out; + goto err_out; } /* We have freed @to_free real clusters. */ real_freed += to_free; @@ -972,9 +959,8 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count, /* We are done. Return the number of actually freed clusters. */ ntfs_debug("Done."); return real_freed; -unl_err_out: - up_read(&ni->runlist.lock); err_out: + up_read(&ni->runlist.lock); if (is_rollback) return err; /* If no real clusters were freed, no need to rollback. */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 5e280abafab..8edb8e20fb0 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -1,7 +1,7 @@ /* * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. 
* - * Copyright (c) 2002-2004 Anton Altaparmakov + * Copyright (c) 2002-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -410,7 +410,7 @@ err_out: } /** - * ntfs_ckeck_logfile - check in the journal if the volume is consistent + * ntfs_check_logfile - check the journal for consistency * @log_vi: struct inode of loaded journal $LogFile to check * * Check the $LogFile journal for consistency and return TRUE if it is @@ -443,7 +443,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi) /* An empty $LogFile must have been clean before it got emptied. */ if (NVolLogFileEmpty(vol)) goto is_empty; - size = log_vi->i_size; + size = i_size_read(log_vi); /* Make sure the file doesn't exceed the maximum allowed size. */ if (size > MaxLogFileSize) size = MaxLogFileSize; @@ -464,7 +464,7 @@ BOOL ntfs_check_logfile(struct inode *log_vi) * optimize log_page_size and log_page_bits into constants. */ log_page_bits = generic_ffs(log_page_size) - 1; - size &= ~(log_page_size - 1); + size &= ~(s64)(log_page_size - 1); /* * Ensure the log file is big enough to store at least the two restart * pages and the minimum number of log record pages. @@ -689,7 +689,8 @@ BOOL ntfs_empty_logfile(struct inode *log_vi) if (!NVolLogFileEmpty(vol)) { int err; - err = ntfs_attr_set(NTFS_I(log_vi), 0, log_vi->i_size, 0xff); + err = ntfs_attr_set(NTFS_I(log_vi), 0, i_size_read(log_vi), + 0xff); if (unlikely(err)) { ntfs_error(vol->sb, "Failed to fill $LogFile with " "0xff bytes (error code %i).", err); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index dfa85ac2f8b..317f7c679fd 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,7 +1,7 @@ /** * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. 
* - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -45,6 +45,7 @@ */ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) { + loff_t i_size; ntfs_volume *vol = ni->vol; struct inode *mft_vi = vol->mft_ino; struct page *page; @@ -60,13 +61,14 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; + i_size = i_size_read(mft_vi); /* The maximum valid index into the page cache for $MFT's data. */ - end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_CACHE_SHIFT; /* If the wanted index is out of bounds the mft record doesn't exist. */ if (unlikely(index >= end_index)) { - if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) < - ofs + vol->mft_record_size) { + if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs + + vol->mft_record_size) { page = ERR_PTR(-ENOENT); ntfs_error(vol->sb, "Attemt to read mft record 0x%lx, " "which is beyond the end of the mft. " @@ -285,7 +287,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, } unmap_mft_record(ni); ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt file system. " + "reference! Corrupt filesystem. " "Run chkdsk."); return ERR_PTR(-EIO); } @@ -316,7 +318,7 @@ map_err_out: /* Verify the sequence number if it is present. */ if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { ntfs_error(base_ni->vol->sb, "Found stale extent mft " - "reference! Corrupt file system. Run chkdsk."); + "reference! Corrupt filesystem. 
Run chkdsk."); destroy_ni = TRUE; m = ERR_PTR(-EIO); goto unm_err_out; @@ -531,6 +533,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, LCN lcn; unsigned int vcn_ofs; + bh->b_bdev = vol->sb->s_bdev; /* Obtain the vcn and offset of the current block. */ vcn = ((VCN)mft_no << vol->mft_record_size_bits) + (block_start - m_start); @@ -723,6 +726,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) LCN lcn; unsigned int vcn_ofs; + bh->b_bdev = vol->sb->s_bdev; /* Obtain the vcn and offset of the current block. */ vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + (block_start - m_start); @@ -946,20 +950,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, na.name_len = 0; na.type = AT_UNUSED; /* - * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or - * we deadlock because the inode is already locked by the kernel - * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits - * until the inode is unlocked before returning it and it never gets - * unlocked because ntfs_should_write_mft_record() never returns. )-: - * Fortunately, we have inode 0 pinned in icache for the duration of - * the mount so we can access it directly. + * Optimize inode 0, i.e. $MFT itself, since we have it in memory and + * we get here for it rather often. */ if (!mft_no) { /* Balance the below iput(). */ vi = igrab(mft_vi); BUG_ON(vi != mft_vi); - } else - vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na); + } else { + /* + * Have to use ilookup5_nowait() since ilookup5() waits for the + * inode lock which causes ntfs to deadlock when a concurrent + * inode write via the inode dirty code paths and the page + * dirty code path of the inode dirty code path when writing + * $MFT occurs. + */ + vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na); + } if (vi) { ntfs_debug("Base inode 0x%lx is in icache.", mft_no); /* The inode is in icache. 
*/ @@ -1014,7 +1021,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, na.mft_no = MREF_LE(m->base_mft_record); ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " "inode 0x%lx in icache.", mft_no, na.mft_no); - vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na); + if (!na.mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else + vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode, + &na); if (!vi) { /* * The base inode is not in icache, write this extent mft @@ -1121,6 +1134,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, ntfs_inode *base_ni) { s64 pass_end, ll, data_pos, pass_start, ofs, bit; + unsigned long flags; struct address_space *mftbmp_mapping; u8 *buf, *byte; struct page *page; @@ -1134,9 +1148,13 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, * Set the end of the pass making sure we do not overflow the mft * bitmap. */ + read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); pass_end = NTFS_I(vol->mft_ino)->allocated_size >> vol->mft_record_size_bits; + read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); + read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; + read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); if (pass_end > ll) pass_end = ll; pass = 1; @@ -1263,6 +1281,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) { LCN lcn; s64 ll; + unsigned long flags; struct page *page; ntfs_inode *mft_ni, *mftbmp_ni; runlist_element *rl, *rl2 = NULL; @@ -1284,17 +1303,20 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) /* * Determine the last lcn of the mft bitmap. The allocated size of the * mft bitmap cannot be zero so we are ok to do this. - * ntfs_find_vcn() returns the runlist locked on success. 
*/ - rl = ntfs_find_vcn(mftbmp_ni, (mftbmp_ni->allocated_size - 1) >> - vol->cluster_size_bits, TRUE); + down_write(&mftbmp_ni->runlist.lock); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ll = mftbmp_ni->allocated_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, + (ll - 1) >> vol->cluster_size_bits, TRUE); if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mftbmp_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft bitmap attribute."); - if (!IS_ERR(rl)) { - up_write(&mftbmp_ni->runlist.lock); + if (!IS_ERR(rl)) ret = -EIO; - } else + else ret = PTR_ERR(rl); return ret; } @@ -1396,7 +1418,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) BUG_ON(ll < rl2->vcn); BUG_ON(ll >= rl2->vcn + rl2->length); /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll); + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); if (unlikely(mp_size <= 0)) { ntfs_error(vol->sb, "Get size for mapping pairs failed for " "mft bitmap attribute extent."); @@ -1418,6 +1440,8 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) // TODO: Deal with this by moving this extent to a new mft // record or by starting a new extent in a new mft record or by // moving other attributes out of this mft record. + // Note: It will need to be a special mft record and if none of + // those are available it gets rather complicated... ntfs_error(vol->sb, "Not enough space in this mft record to " "accomodate extended mft bitmap attribute " "extent. Cannot handle this yet."); @@ -1428,7 +1452,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) /* Generate the mapping pairs array directly into the attr record. 
*/ ret = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, NULL); + mp_size, rl2, ll, -1, NULL); if (unlikely(ret)) { ntfs_error(vol->sb, "Failed to build mapping pairs array for " "mft bitmap attribute."); @@ -1458,9 +1482,11 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) } a = ctx->attr; } + write_lock_irqsave(&mftbmp_ni->size_lock, flags); mftbmp_ni->allocated_size += vol->cluster_size; a->data.non_resident.allocated_size = cpu_to_sle64(mftbmp_ni->allocated_size); + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); /* Ensure the changes make it to disk. */ flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); @@ -1476,7 +1502,9 @@ restore_undo_alloc: 0, ctx)) { ntfs_error(vol->sb, "Failed to find last attribute extent of " "mft bitmap attribute.%s", es); + write_lock_irqsave(&mftbmp_ni->size_lock, flags); mftbmp_ni->allocated_size += vol->cluster_size; + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); ntfs_attr_put_search_ctx(ctx); unmap_mft_record(mft_ni); up_write(&mftbmp_ni->runlist.lock); @@ -1512,7 +1540,7 @@ undo_alloc: a->data.non_resident.mapping_pairs_offset), old_alen - le16_to_cpu( a->data.non_resident.mapping_pairs_offset), - rl2, ll, NULL)) { + rl2, ll, -1, NULL)) { ntfs_error(vol->sb, "Failed to restore mapping pairs " "array.%s", es); NVolSetErrors(vol); @@ -1550,6 +1578,7 @@ undo_alloc: static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) { s64 old_data_size, old_initialized_size; + unsigned long flags; struct inode *mftbmp_vi; ntfs_inode *mft_ni, *mftbmp_ni; ntfs_attr_search_ctx *ctx; @@ -1583,7 +1612,8 @@ static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) goto put_err_out; } a = ctx->attr; - old_data_size = mftbmp_vi->i_size; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = i_size_read(mftbmp_vi); old_initialized_size = mftbmp_ni->initialized_size; /* * We 
can simply update the initialized_size before filling the space @@ -1593,11 +1623,12 @@ static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) mftbmp_ni->initialized_size += 8; a->data.non_resident.initialized_size = cpu_to_sle64(mftbmp_ni->initialized_size); - if (mftbmp_ni->initialized_size > mftbmp_vi->i_size) { - mftbmp_vi->i_size = mftbmp_ni->initialized_size; + if (mftbmp_ni->initialized_size > old_data_size) { + i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); a->data.non_resident.data_size = - cpu_to_sle64(mftbmp_vi->i_size); + cpu_to_sle64(mftbmp_ni->initialized_size); } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); /* Ensure the changes make it to disk. */ flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); @@ -1636,22 +1667,28 @@ unm_err_out: goto err_out; } a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); mftbmp_ni->initialized_size = old_initialized_size; a->data.non_resident.initialized_size = cpu_to_sle64(old_initialized_size); - if (mftbmp_vi->i_size != old_data_size) { - mftbmp_vi->i_size = old_data_size; + if (i_size_read(mftbmp_vi) != old_data_size) { + i_size_write(mftbmp_vi, old_data_size); a->data.non_resident.data_size = cpu_to_sle64(old_data_size); } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); ntfs_attr_put_search_ctx(ctx); unmap_mft_record(mft_ni); +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " "data_size 0x%llx, initialized_size 0x%llx.", (long long)mftbmp_ni->allocated_size, - (long long)mftbmp_vi->i_size, + (long long)i_size_read(mftbmp_vi), (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ err_out: return ret; } @@ -1679,7 +1716,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) { LCN lcn; VCN 
old_last_vcn; - s64 min_nr, nr, ll = 0; + s64 min_nr, nr, ll; + unsigned long flags; ntfs_inode *mft_ni; runlist_element *rl, *rl2; ntfs_attr_search_ctx *ctx = NULL; @@ -1695,23 +1733,25 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) * Determine the preferred allocation location, i.e. the last lcn of * the mft data attribute. The allocated size of the mft data * attribute cannot be zero so we are ok to do this. - * ntfs_find_vcn() returns the runlist locked on success. */ - rl = ntfs_find_vcn(mft_ni, (mft_ni->allocated_size - 1) >> - vol->cluster_size_bits, TRUE); + down_write(&mft_ni->runlist.lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mft_ni, + (ll - 1) >> vol->cluster_size_bits, TRUE); if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + up_write(&mft_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft data attribute."); - if (!IS_ERR(rl)) { - up_write(&mft_ni->runlist.lock); + if (!IS_ERR(rl)) ret = -EIO; - } else + else ret = PTR_ERR(rl); return ret; } lcn = rl->lcn + rl->length; - ntfs_debug("Last lcn of mft data attribute is 0x%llx.", - (long long)lcn); + ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); /* Minimum allocation is one mft record worth of clusters. */ min_nr = vol->mft_record_size >> vol->cluster_size_bits; if (!min_nr) @@ -1721,12 +1761,13 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) if (!nr) nr = min_nr; /* Ensure we do not go above 2^32-1 mft records. 
*/ - if (unlikely((mft_ni->allocated_size + - (nr << vol->cluster_size_bits)) >> + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> vol->mft_record_size_bits >= (1ll << 32))) { nr = min_nr; - if (unlikely((mft_ni->allocated_size + - (nr << vol->cluster_size_bits)) >> + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> vol->mft_record_size_bits >= (1ll << 32))) { ntfs_warning(vol->sb, "Cannot allocate mft record " "because the maximum number of inodes " @@ -1772,7 +1813,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) return PTR_ERR(rl); } mft_ni->runlist.rl = rl; - ntfs_debug("Allocated %lli clusters.", nr); + ntfs_debug("Allocated %lli clusters.", (long long)nr); /* Find the last run in the new runlist. */ for (; rl[1].length; rl++) ; @@ -1808,7 +1849,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) BUG_ON(ll < rl2->vcn); BUG_ON(ll >= rl2->vcn + rl2->length); /* Get the size for the new mapping pairs array for this extent. */ - mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll); + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); if (unlikely(mp_size <= 0)) { ntfs_error(vol->sb, "Get size for mapping pairs failed for " "mft data attribute extent."); @@ -1832,7 +1873,11 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) // moving other attributes out of this mft record. // Note: Use the special reserved mft records and ensure that // this extent is not required to find the mft record in - // question. + // question. If no free special records left we would need to + // move an existing record away, insert ours in its place, and + // then place the moved record into the newly allocated space + // and we would then need to update all references to this mft + // record appropriately. This is rather complicated... 
ntfs_error(vol->sb, "Not enough space in this mft record to " "accomodate extended mft data attribute " "extent. Cannot handle this yet."); @@ -1843,7 +1888,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) /* Generate the mapping pairs array directly into the attr record. */ ret = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), - mp_size, rl2, ll, NULL); + mp_size, rl2, ll, -1, NULL); if (unlikely(ret)) { ntfs_error(vol->sb, "Failed to build mapping pairs array of " "mft data attribute."); @@ -1875,9 +1920,11 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) } a = ctx->attr; } + write_lock_irqsave(&mft_ni->size_lock, flags); mft_ni->allocated_size += nr << vol->cluster_size_bits; a->data.non_resident.allocated_size = cpu_to_sle64(mft_ni->allocated_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); /* Ensure the changes make it to disk. */ flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); @@ -1892,7 +1939,9 @@ restore_undo_alloc: CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { ntfs_error(vol->sb, "Failed to find last attribute extent of " "mft data attribute.%s", es); + write_lock_irqsave(&mft_ni->size_lock, flags); mft_ni->allocated_size += nr << vol->cluster_size_bits; + write_unlock_irqrestore(&mft_ni->size_lock, flags); ntfs_attr_put_search_ctx(ctx); unmap_mft_record(mft_ni); up_write(&mft_ni->runlist.lock); @@ -1921,7 +1970,7 @@ undo_alloc: a->data.non_resident.mapping_pairs_offset), old_alen - le16_to_cpu( a->data.non_resident.mapping_pairs_offset), - rl2, ll, NULL)) { + rl2, ll, -1, NULL)) { ntfs_error(vol->sb, "Failed to restore mapping pairs " "array.%s", es); NVolSetErrors(vol); @@ -1991,7 +2040,7 @@ static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, "reports this as corruption, please email " "linux-ntfs-dev@lists.sourceforge.net stating " "that you saw this message and that the " - "modified file 
system created was corrupt. " + "modified filesystem created was corrupt. " "Thank you."); } /* Set the update sequence number to 1. */ @@ -2036,6 +2085,7 @@ static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, */ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) { + loff_t i_size; struct inode *mft_vi = vol->mft_ino; struct page *page; MFT_RECORD *m; @@ -2051,10 +2101,11 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT; ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; /* The maximum valid index into the page cache for $MFT's data. */ - end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT; + i_size = i_size_read(mft_vi); + end_index = i_size >> PAGE_CACHE_SHIFT; if (unlikely(index >= end_index)) { if (unlikely(index > end_index || ofs + vol->mft_record_size >= - (mft_vi->i_size & ~PAGE_CACHE_MASK))) { + (i_size & ~PAGE_CACHE_MASK))) { ntfs_error(vol->sb, "Tried to format non-existing mft " "record 0x%llx.", (long long)mft_no); return -ENOENT; @@ -2188,6 +2239,7 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, ntfs_inode *base_ni, MFT_RECORD **mrec) { s64 ll, bit, old_data_initialized, old_data_size; + unsigned long flags; struct inode *vi; struct page *page; ntfs_inode *mft_ni, *mftbmp_ni, *ni; @@ -2237,9 +2289,13 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, * the first 24 mft records as they are special and whilst they may not * be in use, we do not allocate from them. 
*/ + read_lock_irqsave(&mft_ni->size_lock, flags); ll = mft_ni->initialized_size >> vol->mft_record_size_bits; - if (mftbmp_ni->initialized_size << 3 > ll && - mftbmp_ni->initialized_size > 3) { + read_unlock_irqrestore(&mft_ni->size_lock, flags); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_initialized = mftbmp_ni->initialized_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized << 3 > ll && old_data_initialized > 3) { bit = ll; if (bit < 24) bit = 24; @@ -2254,15 +2310,18 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, * mft record that we can allocate. * Note: The smallest mft record we allocate is mft record 24. */ - bit = mftbmp_ni->initialized_size << 3; + bit = old_data_initialized << 3; if (unlikely(bit >= (1ll << 32))) goto max_err_out; + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = mftbmp_ni->allocated_size; ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " "data_size 0x%llx, initialized_size 0x%llx.", - (long long)mftbmp_ni->allocated_size, - (long long)vol->mftbmp_ino->i_size, - (long long)mftbmp_ni->initialized_size); - if (mftbmp_ni->initialized_size + 8 > mftbmp_ni->allocated_size) { + (long long)old_data_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)old_data_initialized); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized + 8 > old_data_size) { /* Need to extend bitmap by one more cluster. 
*/ ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); err = ntfs_mft_bitmap_extend_allocation_nolock(vol); @@ -2270,12 +2329,16 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, up_write(&vol->mftbmp_lock); goto err_out; } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); ntfs_debug("Status of mftbmp after allocation extension: " "allocated_size 0x%llx, data_size 0x%llx, " "initialized_size 0x%llx.", (long long)mftbmp_ni->allocated_size, - (long long)vol->mftbmp_ino->i_size, + (long long)i_size_read(vol->mftbmp_ino), (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ } /* * We now have sufficient allocated space, extend the initialized_size @@ -2287,12 +2350,16 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, up_write(&vol->mftbmp_lock); goto err_out; } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); ntfs_debug("Status of mftbmp after initialized extention: " "allocated_size 0x%llx, data_size 0x%llx, " "initialized_size 0x%llx.", (long long)mftbmp_ni->allocated_size, - (long long)vol->mftbmp_ino->i_size, + (long long)i_size_read(vol->mftbmp_ino), (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); found_free_rec: /* @bit is the found free mft record, allocate it in the mft bitmap. */ @@ -2314,7 +2381,10 @@ have_alloc_rec: * parallel allocation could allocate the same mft record as this one. 
*/ ll = (bit + 1) << vol->mft_record_size_bits; - if (ll <= mft_ni->initialized_size) { + read_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (ll <= old_data_initialized) { ntfs_debug("Allocated mft record already initialized."); goto mft_rec_already_initialized; } @@ -2325,26 +2395,30 @@ have_alloc_rec: * actually traversed more than once when a freshly formatted volume is * first written to so it optimizes away nicely in the common case. */ + read_lock_irqsave(&mft_ni->size_lock, flags); ntfs_debug("Status of mft data before extension: " "allocated_size 0x%llx, data_size 0x%llx, " "initialized_size 0x%llx.", (long long)mft_ni->allocated_size, - (long long)vol->mft_ino->i_size, + (long long)i_size_read(vol->mft_ino), (long long)mft_ni->initialized_size); while (ll > mft_ni->allocated_size) { + read_unlock_irqrestore(&mft_ni->size_lock, flags); err = ntfs_mft_data_extend_allocation_nolock(vol); if (unlikely(err)) { ntfs_error(vol->sb, "Failed to extend mft data " "allocation."); goto undo_mftbmp_alloc_nolock; } + read_lock_irqsave(&mft_ni->size_lock, flags); ntfs_debug("Status of mft data after allocation extension: " "allocated_size 0x%llx, data_size 0x%llx, " "initialized_size 0x%llx.", (long long)mft_ni->allocated_size, - (long long)vol->mft_ino->i_size, + (long long)i_size_read(vol->mft_ino), (long long)mft_ni->initialized_size); } + read_unlock_irqrestore(&mft_ni->size_lock, flags); /* * Extend mft data initialized size (and data size of course) to reach * the allocated mft record, formatting the mft records allong the way. @@ -2352,6 +2426,7 @@ have_alloc_rec: * needed by ntfs_mft_record_format(). We will update the attribute * record itself in one fell swoop later on. 
*/ + write_lock_irqsave(&mft_ni->size_lock, flags); old_data_initialized = mft_ni->initialized_size; old_data_size = vol->mft_ino->i_size; while (ll > mft_ni->initialized_size) { @@ -2360,8 +2435,9 @@ have_alloc_rec: new_initialized_size = mft_ni->initialized_size + vol->mft_record_size; mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; - if (new_initialized_size > vol->mft_ino->i_size) - vol->mft_ino->i_size = new_initialized_size; + if (new_initialized_size > i_size_read(vol->mft_ino)) + i_size_write(vol->mft_ino, new_initialized_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); ntfs_debug("Initializing mft record 0x%llx.", (long long)mft_no); err = ntfs_mft_record_format(vol, mft_no); @@ -2369,8 +2445,10 @@ have_alloc_rec: ntfs_error(vol->sb, "Failed to format mft record."); goto undo_data_init; } + write_lock_irqsave(&mft_ni->size_lock, flags); mft_ni->initialized_size = new_initialized_size; } + write_unlock_irqrestore(&mft_ni->size_lock, flags); record_formatted = TRUE; /* Update the mft data attribute record to reflect the new sizes. */ m = map_mft_record(mft_ni); @@ -2396,22 +2474,27 @@ have_alloc_rec: goto undo_data_init; } a = ctx->attr; + read_lock_irqsave(&mft_ni->size_lock, flags); a->data.non_resident.initialized_size = cpu_to_sle64(mft_ni->initialized_size); - a->data.non_resident.data_size = cpu_to_sle64(vol->mft_ino->i_size); + a->data.non_resident.data_size = + cpu_to_sle64(i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); /* Ensure the changes make it to disk. 
*/ flush_dcache_mft_record_page(ctx->ntfs_ino); mark_mft_record_dirty(ctx->ntfs_ino); ntfs_attr_put_search_ctx(ctx); unmap_mft_record(mft_ni); + read_lock_irqsave(&mft_ni->size_lock, flags); ntfs_debug("Status of mft data after mft record initialization: " "allocated_size 0x%llx, data_size 0x%llx, " "initialized_size 0x%llx.", (long long)mft_ni->allocated_size, - (long long)vol->mft_ino->i_size, + (long long)i_size_read(vol->mft_ino), (long long)mft_ni->initialized_size); - BUG_ON(vol->mft_ino->i_size > mft_ni->allocated_size); - BUG_ON(mft_ni->initialized_size > vol->mft_ino->i_size); + BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); + BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); mft_rec_already_initialized: /* * We can finally drop the mft bitmap lock as the mft data attribute @@ -2652,8 +2735,10 @@ mft_rec_already_initialized: *mrec = m; return ni; undo_data_init: + write_lock_irqsave(&mft_ni->size_lock, flags); mft_ni->initialized_size = old_data_initialized; - vol->mft_ino->i_size = old_data_size; + i_size_write(vol->mft_ino, old_data_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); goto undo_mftbmp_alloc_nolock; undo_mftbmp_alloc: down_write(&vol->mftbmp_lock); diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 7c7e13b43b2..351dbc3b6e4 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -153,8 +153,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " "error code %li.", dent_ino, PTR_ERR(dent_inode)); - if (name) - kfree(name); + kfree(name); /* Return the error code. */ return (struct dentry *)dent_inode; } @@ -380,7 +379,7 @@ struct inode_operations ntfs_dir_inode_ops = { * Return the dentry of the parent directory on success or the error code on * error (IS_ERR() is true). 
*/ -struct dentry *ntfs_get_parent(struct dentry *child_dent) +static struct dentry *ntfs_get_parent(struct dentry *child_dent) { struct inode *vi = child_dent->d_inode; ntfs_inode *ni = NTFS_I(vi); @@ -465,7 +464,7 @@ try_next: * * Return the dentry on success or the error code on error (IS_ERR() is true). */ -struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh) +static struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh) { struct inode *vi; struct dentry *dent; @@ -496,3 +495,30 @@ struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh) ntfs_debug("Done for inode 0x%lx, generation 0x%x.", ino, gen); return dent; } + +/** + * Export operations allowing NFS exporting of mounted NTFS partitions. + * + * We use the default ->decode_fh() and ->encode_fh() for now. Note that they + * use 32 bits to store the inode number which is an unsigned long so on 64-bit + * architectures is usually 64 bits so it would all fail horribly on huge + * volumes. I guess we need to define our own encode and decode fh functions + * that store 64-bit inode numbers at some point but for now we will ignore the + * problem... + * + * We also use the default ->get_name() helper (used by ->decode_fh() via + * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs + * independent. + * + * The default ->get_parent() just returns -EACCES so we have to provide our + * own and the default ->get_dentry() is incompatible with NTFS due to not + * allowing the inode number 0 which is used in NTFS for the system file $MFT + * and due to using iget() whereas NTFS needs ntfs_iget(). + */ +struct export_operations ntfs_export_ops = { + .get_parent = ntfs_get_parent, /* Find the parent of a given + directory. */ + .get_dentry = ntfs_get_dentry, /* Find a dentry for the inode + given a file handle + sub-fragment. 
*/ +}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h index 720ffb71bab..446b5014115 100644 --- a/fs/ntfs/ntfs.h +++ b/fs/ntfs/ntfs.h @@ -2,7 +2,7 @@ * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS * project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (C) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -31,6 +31,7 @@ #include <linux/fs.h> #include <linux/nls.h> #include <linux/smp.h> +#include <linux/pagemap.h> #include "types.h" #include "volume.h" @@ -41,6 +42,9 @@ typedef enum { NTFS_BLOCK_SIZE_BITS = 9, NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ NTFS_MAX_NAME_LEN = 255, + NTFS_MAX_ATTR_NAME_LEN = 255, + NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ + NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE, } NTFS_CONSTANTS; /* Global variables. */ @@ -65,6 +69,8 @@ extern struct inode_operations ntfs_dir_inode_ops; extern struct file_operations ntfs_empty_file_ops; extern struct inode_operations ntfs_empty_inode_ops; +extern struct export_operations ntfs_export_ops; + /** * NTFS_SB - return the ntfs volume given a vfs super block * @sb: VFS super block diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 8438fb1da21..758855b0414 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -1,7 +1,7 @@ /** * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -59,7 +59,7 @@ static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, * * As the runlists grow, more memory will be required. To prevent the * kernel having to allocate and reallocate large numbers of small bits of - * memory, this function returns and entire page of memory. 
+ * memory, this function returns an entire page of memory. * * It is up to the caller to serialize access to the runlist @rl. * @@ -113,8 +113,11 @@ static inline BOOL ntfs_are_rl_mergeable(runlist_element *dst, BUG_ON(!dst); BUG_ON(!src); - if ((dst->lcn < 0) || (src->lcn < 0)) /* Are we merging holes? */ + if ((dst->lcn < 0) || (src->lcn < 0)) { /* Are we merging holes? */ + if (dst->lcn == LCN_HOLE && src->lcn == LCN_HOLE) + return TRUE; return FALSE; + } if ((dst->lcn + dst->length) != src->lcn) /* Are the runs contiguous? */ return FALSE; if ((dst->vcn + dst->length) != src->vcn) /* Are the runs misaligned? */ @@ -855,30 +858,42 @@ mpa_err: if (!attr->data.non_resident.lowest_vcn) { VCN max_cluster; - max_cluster = (sle64_to_cpu( + max_cluster = ((sle64_to_cpu( attr->data.non_resident.allocated_size) + vol->cluster_size - 1) >> - vol->cluster_size_bits; + vol->cluster_size_bits) - 1; /* - * If there is a difference between the highest_vcn and the - * highest cluster, the runlist is either corrupt or, more - * likely, there are more extents following this one. + * A highest_vcn of zero means this is a single extent + * attribute so simply terminate the runlist with LCN_ENOENT). */ - if (deltaxcn < --max_cluster) { - ntfs_debug("More extents to follow; deltaxcn = 0x%llx, " - "max_cluster = 0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long)max_cluster); - rl[rlpos].vcn = vcn; - vcn += rl[rlpos].length = max_cluster - deltaxcn; - rl[rlpos].lcn = LCN_RL_NOT_MAPPED; - rlpos++; - } else if (unlikely(deltaxcn > max_cluster)) { - ntfs_error(vol->sb, "Corrupt attribute. deltaxcn = " - "0x%llx, max_cluster = 0x%llx", - (unsigned long long)deltaxcn, - (unsigned long long)max_cluster); - goto mpa_err; + if (deltaxcn) { + /* + * If there is a difference between the highest_vcn and + * the highest cluster, the runlist is either corrupt + * or, more likely, there are more extents following + * this one. 
+ */ + if (deltaxcn < max_cluster) { + ntfs_debug("More extents to follow; deltaxcn " + "= 0x%llx, max_cluster = " + "0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + rl[rlpos].vcn = vcn; + vcn += rl[rlpos].length = max_cluster - + deltaxcn; + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + rlpos++; + } else if (unlikely(deltaxcn > max_cluster)) { + ntfs_error(vol->sb, "Corrupt attribute. " + "deltaxcn = 0x%llx, " + "max_cluster = 0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + goto mpa_err; + } } rl[rlpos].lcn = LCN_ENOENT; } else /* Not the base extent. There may be more extents to follow. */ @@ -918,17 +933,18 @@ err_out: * * It is up to the caller to serialize access to the runlist @rl. * - * Since lcns must be >= 0, we use negative return values with special meaning: + * Since lcns must be >= 0, we use negative return codes with special meaning: * - * Return value Meaning / Description + * Return code Meaning / Description * ================================================== - * -1 = LCN_HOLE Hole / not allocated on disk. - * -2 = LCN_RL_NOT_MAPPED This is part of the runlist which has not been - * inserted into the runlist yet. - * -3 = LCN_ENOENT There is no such vcn in the attribute. + * LCN_HOLE Hole / not allocated on disk. + * LCN_RL_NOT_MAPPED This is part of the runlist which has not been + * inserted into the runlist yet. + * LCN_ENOENT There is no such vcn in the attribute. * * Locking: - The caller must have locked the runlist (for reading or writing). - * - This function does not touch the lock. + * - This function does not touch the lock, nor does it modify the + * runlist. 
*/ LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) { @@ -964,6 +980,39 @@ LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) return LCN_ENOENT; } +#ifdef NTFS_RW + +/** + * ntfs_rl_find_vcn_nolock - find a vcn in a runlist + * @rl: runlist to search + * @vcn: vcn to find + * + * Find the virtual cluster number @vcn in the runlist @rl and return the + * address of the runlist element containing the @vcn on success. + * + * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of + * the runlist. + * + * Locking: The runlist must be locked on entry. + */ +runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) +{ + BUG_ON(vcn < 0); + if (unlikely(!rl || vcn < rl[0].vcn)) + return NULL; + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) + return rl; + return NULL; + } + rl++; + } + if (likely(rl->lcn == LCN_ENOENT)) + return rl; + return NULL; +} + /** * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number * @n: number for which to get the number of bytes for @@ -999,10 +1048,17 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n) * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array * @vol: ntfs volume (needed for the ntfs version) * @rl: locked runlist to determine the size of the mapping pairs of - * @start_vcn: vcn at which to start the mapping pairs array + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array * * Walk the locked runlist @rl and calculate the size in bytes of the mapping - * pairs array corresponding to the runlist @rl, starting at vcn @start_vcn. + * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and + * finishing with vcn @last_vcn. 
+ * + * A @last_vcn of -1 means end of runlist and in that case the size of the + * mapping pairs array corresponding to the runlist starting at vcn @first_vcn + * and finishing at the end of the runlist is determined. + * * This for example allows us to allocate a buffer of the right size when * building the mapping pairs array. * @@ -1018,34 +1074,50 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n) * remains locked throughout, and is left locked upon return. */ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN start_vcn) + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn) { LCN prev_lcn; int rls; + BOOL the_end = FALSE; - BUG_ON(start_vcn < 0); + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); if (!rl) { - BUG_ON(start_vcn); + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); return 1; } - /* Skip to runlist element containing @start_vcn. */ - while (rl->length && start_vcn >= rl[1].vcn) + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) rl++; - if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn) + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) return -EINVAL; prev_lcn = 0; /* Always need the termining zero byte. */ rls = 1; /* Do the first partial run if present. */ - if (start_vcn > rl->vcn) { - s64 delta; + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; /* We know rl->length != 0 already. */ - if (rl->length < 0 || rl->lcn < LCN_HOLE) + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) goto err_out; - delta = start_vcn - rl->vcn; + /* + * If @last_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = TRUE; + } + delta = first_vcn - rl->vcn; /* Header byte + length. 
*/ - rls += 1 + ntfs_get_nr_significant_bytes(rl->length - delta); + rls += 1 + ntfs_get_nr_significant_bytes(length - delta); /* * If the logical cluster number (lcn) denotes a hole and we * are on NTFS 3.0+, we don't store it at all, i.e. we need @@ -1053,9 +1125,9 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, * Note: this assumes that on NTFS 1.2-, holes are stored with * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). */ - if (rl->lcn >= 0 || vol->major_ver < 3) { + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { prev_lcn = rl->lcn; - if (rl->lcn >= 0) + if (likely(rl->lcn >= 0)) prev_lcn += delta; /* Change in lcn. */ rls += ntfs_get_nr_significant_bytes(prev_lcn); @@ -1064,11 +1136,23 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, rl++; } /* Do the full runs. */ - for (; rl->length; rl++) { - if (rl->length < 0 || rl->lcn < LCN_HOLE) + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) goto err_out; + /* + * If @last_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = TRUE; + } /* Header byte + length. */ - rls += 1 + ntfs_get_nr_significant_bytes(rl->length); + rls += 1 + ntfs_get_nr_significant_bytes(length); /* * If the logical cluster number (lcn) denotes a hole and we * are on NTFS 3.0+, we don't store it at all, i.e. we need @@ -1076,7 +1160,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, * Note: this assumes that on NTFS 1.2-, holes are stored with * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). */ - if (rl->lcn >= 0 || vol->major_ver < 3) { + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { /* Change in lcn. 
*/ rls += ntfs_get_nr_significant_bytes(rl->lcn - prev_lcn); @@ -1119,7 +1203,7 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, i = 0; do { - if (dst > dst_max) + if (unlikely(dst > dst_max)) goto err_out; *dst++ = l & 0xffll; l >>= 8; @@ -1128,12 +1212,12 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, j = (n >> 8 * (i - 1)) & 0xff; /* If the sign bit is wrong, we need an extra byte. */ if (n < 0 && j >= 0) { - if (dst > dst_max) + if (unlikely(dst > dst_max)) goto err_out; i++; *dst = (s8)-1; } else if (n > 0 && j < 0) { - if (dst > dst_max) + if (unlikely(dst > dst_max)) goto err_out; i++; *dst = (s8)0; @@ -1149,13 +1233,18 @@ err_out: * @dst: destination buffer to which to write the mapping pairs array * @dst_len: size of destination buffer @dst in bytes * @rl: locked runlist for which to build the mapping pairs array - * @start_vcn: vcn at which to start the mapping pairs array + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC * * Create the mapping pairs array from the locked runlist @rl, starting at vcn - * @start_vcn and save the array in @dst. @dst_len is the size of @dst in - * bytes and it should be at least equal to the value obtained by calling - * ntfs_get_size_for_mapping_pairs(). + * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. + * @dst_len is the size of @dst in bytes and it should be at least equal to the + * value obtained by calling ntfs_get_size_for_mapping_pairs(). + * + * A @last_vcn of -1 means end of runlist and in that case the mapping pairs + * array corresponding to the runlist starting at vcn @first_vcn and finishing + * at the end of the runlist is created. * * If @rl is NULL, just write a single terminator byte to @dst. 
* @@ -1164,7 +1253,7 @@ err_out: * been filled with all the mapping pairs that will fit, thus it can be treated * as partial success, in that a new attribute extent needs to be created or * the next extent has to be used and the mapping pairs build has to be - * continued with @start_vcn set to *@stop_vcn. + * continued with @first_vcn set to *@stop_vcn. * * Return 0 on success and -errno on error. The following error codes are * defined: @@ -1178,27 +1267,32 @@ err_out: */ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, const int dst_len, const runlist_element *rl, - const VCN start_vcn, VCN *const stop_vcn) + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) { LCN prev_lcn; s8 *dst_max, *dst_next; int err = -ENOSPC; + BOOL the_end = FALSE; s8 len_len, lcn_len; - BUG_ON(start_vcn < 0); + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); BUG_ON(dst_len < 1); if (!rl) { - BUG_ON(start_vcn); + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); if (stop_vcn) *stop_vcn = 0; /* Terminator byte. */ *dst = 0; return 0; } - /* Skip to runlist element containing @start_vcn. */ - while (rl->length && start_vcn >= rl[1].vcn) + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) rl++; - if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn) + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) return -EINVAL; /* * @dst_max is used for bounds checking in @@ -1207,17 +1301,27 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, dst_max = dst + dst_len - 1; prev_lcn = 0; /* Do the first partial run if present. */ - if (start_vcn > rl->vcn) { - s64 delta; + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; /* We know rl->length != 0 already. 
*/ - if (rl->length < 0 || rl->lcn < LCN_HOLE) + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) goto err_out; - delta = start_vcn - rl->vcn; + /* + * If @last_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = TRUE; + } + delta = first_vcn - rl->vcn; /* Write length. */ len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - rl->length - delta); - if (len_len < 0) + length - delta); + if (unlikely(len_len < 0)) goto size_err; /* * If the logical cluster number (lcn) denotes a hole and we @@ -1228,19 +1332,19 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, * case on NT4. - We assume that we just need to write the lcn * change until someone tells us otherwise... (AIA) */ - if (rl->lcn >= 0 || vol->major_ver < 3) { + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { prev_lcn = rl->lcn; - if (rl->lcn >= 0) + if (likely(rl->lcn >= 0)) prev_lcn += delta; /* Write change in lcn. */ lcn_len = ntfs_write_significant_bytes(dst + 1 + len_len, dst_max, prev_lcn); - if (lcn_len < 0) + if (unlikely(lcn_len < 0)) goto size_err; } else lcn_len = 0; dst_next = dst + len_len + lcn_len + 1; - if (dst_next > dst_max) + if (unlikely(dst_next > dst_max)) goto size_err; /* Update header byte. */ *dst = lcn_len << 4 | len_len; @@ -1250,13 +1354,25 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, rl++; } /* Do the full runs. */ - for (; rl->length; rl++) { - if (rl->length < 0 || rl->lcn < LCN_HOLE) + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) goto err_out; + /* + * If @last_vcn is given and finishes inside this run, cap the + * run length. 
+ */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = TRUE; + } /* Write length. */ len_len = ntfs_write_significant_bytes(dst + 1, dst_max, - rl->length); - if (len_len < 0) + length); + if (unlikely(len_len < 0)) goto size_err; /* * If the logical cluster number (lcn) denotes a hole and we @@ -1267,17 +1383,17 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, * case on NT4. - We assume that we just need to write the lcn * change until someone tells us otherwise... (AIA) */ - if (rl->lcn >= 0 || vol->major_ver < 3) { + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { /* Write change in lcn. */ lcn_len = ntfs_write_significant_bytes(dst + 1 + len_len, dst_max, rl->lcn - prev_lcn); - if (lcn_len < 0) + if (unlikely(lcn_len < 0)) goto size_err; prev_lcn = rl->lcn; } else lcn_len = 0; dst_next = dst + len_len + lcn_len + 1; - if (dst_next > dst_max) + if (unlikely(dst_next > dst_max)) goto size_err; /* Update header byte. */ *dst = lcn_len << 4 | len_len; @@ -1436,3 +1552,5 @@ int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, ntfs_debug("Done."); return 0; } + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h index 7107fde59df..aa0ee6540e7 100644 --- a/fs/ntfs/runlist.h +++ b/fs/ntfs/runlist.h @@ -2,7 +2,7 @@ * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. * Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -66,6 +66,8 @@ typedef enum { LCN_HOLE = -1, /* Keep this as highest value or die! 
*/ LCN_RL_NOT_MAPPED = -2, LCN_ENOENT = -3, + LCN_ENOMEM = -4, + LCN_EIO = -5, } LCN_SPECIAL_VALUES; extern runlist_element *ntfs_runlists_merge(runlist_element *drl, @@ -76,14 +78,22 @@ extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); +#ifdef NTFS_RW + +extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, + const VCN vcn); + extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, - const runlist_element *rl, const VCN start_vcn); + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn); extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, const int dst_len, const runlist_element *rl, - const VCN start_vcn, VCN *const stop_vcn); + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, const s64 new_length); +#endif /* NTFS_RW */ + #endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 212a3d0f207..41aa8eb6755 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -34,14 +34,16 @@ #include "sysctl.h" #include "logfile.h" #include "quota.h" +#include "usnjrnl.h" #include "dir.h" #include "debug.h" #include "index.h" #include "aops.h" +#include "layout.h" #include "malloc.h" #include "ntfs.h" -/* Number of mounted file systems which have compression enabled. */ +/* Number of mounted filesystems which have compression enabled. */ static unsigned long ntfs_nr_compression_users; /* A global default upcase table and a corresponding reference count. 
*/ @@ -102,7 +104,7 @@ static BOOL parse_options(ntfs_volume *vol, char *opt) gid_t gid = (gid_t)-1; mode_t fmask = (mode_t)-1, dmask = (mode_t)-1; int mft_zone_multiplier = -1, on_errors = -1; - int show_sys_files = -1, case_sensitive = -1; + int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; struct nls_table *nls_map = NULL, *old_nls; /* I am lazy... (-8 */ @@ -162,6 +164,7 @@ static BOOL parse_options(ntfs_volume *vol, char *opt) else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, TRUE) else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) + else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, on_errors_arr) else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) @@ -291,6 +294,21 @@ no_mount_options: else NVolClearCaseSensitive(vol); } + if (disable_sparse != -1) { + if (disable_sparse) + NVolClearSparseEnabled(vol); + else { + if (!NVolSparseEnabled(vol) && + vol->major_ver && vol->major_ver < 3) + ntfs_warning(vol->sb, "Not enabling sparse " + "support due to NTFS volume " + "version %i.%i (need at least " + "version 3.0).", vol->major_ver, + vol->minor_ver); + else + NVolSetSparseEnabled(vol); + } + } return TRUE; needs_arg: ntfs_error(vol->sb, "The %s option requires an argument.", p); @@ -480,6 +498,12 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) NVolSetErrors(vol); return -EROFS; } + if (!ntfs_stamp_usnjrnl(vol)) { + ntfs_error(sb, "Failed to stamp transation log " + "($UsnJrnl)%s", es); + NVolSetErrors(vol); + return -EROFS; + } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { /* Remounting read-only. */ if (!NVolErrors(vol)) { @@ -516,16 +540,19 @@ static BOOL is_boot_sector_ntfs(const struct super_block *sb, { /* * Check that checksum == sum of u32 values from b to the checksum - * field. If checksum is zero, no checking is done. + * field. If checksum is zero, no checking is done. 
We will work when + * the checksum test fails, since some utilities update the boot sector + * ignoring the checksum which leaves the checksum out-of-date. We + * report a warning if this is the case. */ - if ((void*)b < (void*)&b->checksum && b->checksum) { + if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { le32 *u; u32 i; for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) i += le32_to_cpup(u); if (le32_to_cpu(b->checksum) != i) - goto not_ntfs; + ntfs_warning(sb, "Invalid boot sector checksum."); } /* Check OEMidentifier is "NTFS " */ if (b->oem_id != magicNTFS) @@ -541,9 +568,9 @@ static BOOL is_boot_sector_ntfs(const struct super_block *sb, default: goto not_ntfs; } - /* Check the cluster size is not above 65536 bytes. */ + /* Check the cluster size is not above the maximum (64kiB). */ if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * - b->bpb.sectors_per_cluster > 0x10000) + b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) goto not_ntfs; /* Check reserved/unused fields are really zero. */ if (le16_to_cpu(b->bpb.reserved_sectors) || @@ -575,7 +602,7 @@ static BOOL is_boot_sector_ntfs(const struct super_block *sb, * many BIOSes will refuse to boot from a bootsector if the magic is * incorrect, so we emit a warning. */ - if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) + if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55)) ntfs_warning(sb, "Invalid end of sector marker."); return TRUE; not_ntfs: @@ -967,6 +994,7 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol) tmp_ni = NTFS_I(tmp_ino); /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ NInoSetMstProtected(tmp_ni); + NInoSetSparseDisabled(tmp_ni); /* * Set up our little cheat allowing us to reuse the async read io * completion handler for directories. 
@@ -990,12 +1018,12 @@ static BOOL load_and_init_mft_mirror(ntfs_volume *vol) */ static BOOL check_mft_mirror(ntfs_volume *vol) { - unsigned long index; struct super_block *sb = vol->sb; ntfs_inode *mirr_ni; struct page *mft_page, *mirr_page; u8 *kmft, *kmirr; runlist_element *rl, rl2[2]; + pgoff_t index; int mrecs_per_page, i; ntfs_debug("Entering."); @@ -1122,11 +1150,130 @@ static BOOL load_and_check_logfile(ntfs_volume *vol) /* ntfs_check_logfile() will have displayed error output. */ return FALSE; } + NInoSetSparseDisabled(NTFS_I(tmp_ino)); vol->logfile_ino = tmp_ino; ntfs_debug("Done."); return TRUE; } +#define NTFS_HIBERFIL_HEADER_SIZE 4096 + +/** + * check_windows_hibernation_status - check if Windows is suspended on a volume + * @vol: ntfs super block of device to check + * + * Check if Windows is hibernated on the ntfs volume @vol. This is done by + * looking for the file hiberfil.sys in the root directory of the volume. If + * the file is not present Windows is definitely not suspended. + * + * If hiberfil.sys exists and is less than 4kiB in size it means Windows is + * definitely suspended (this volume is not the system volume). Caveat: on a + * system with many volumes it is possible that the < 4kiB check is bogus but + * for now this should do fine. + * + * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the + * hiberfil header (which is the first 4kiB). If this begins with "hibr", + * Windows is definitely suspended. If it is completely full of zeroes, + * Windows is definitely not hibernated. Any other case is treated as if + * Windows is suspended. This caters for the above mentioned caveat of a + * system with many volumes where no "hibr" magic would be present and there is + * no zero header. + * + * Return 0 if Windows is not hibernated on the volume, >0 if Windows is + * hibernated on the volume, and -errno on error. 
+ */ +static int check_windows_hibernation_status(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *vi; + ntfs_inode *ni; + struct page *page; + u32 *kaddr, *kend; + ntfs_name *name = NULL; + int ret = 1; + static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'), + const_cpu_to_le16('i'), const_cpu_to_le16('b'), + const_cpu_to_le16('e'), const_cpu_to_le16('r'), + const_cpu_to_le16('f'), const_cpu_to_le16('i'), + const_cpu_to_le16('l'), const_cpu_to_le16('.'), + const_cpu_to_le16('s'), const_cpu_to_le16('y'), + const_cpu_to_le16('s'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the hibernation file by looking up the + * filename hiberfil.sys in the root directory. + */ + down(&vol->root_ino->i_sem); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, + &name); + up(&vol->root_ino->i_sem); + if (IS_ERR_MREF(mref)) { + ret = MREF_ERR(mref); + /* If the file does not exist, Windows is not hibernated. */ + if (ret == -ENOENT) { + ntfs_debug("hiberfil.sys not present. Windows is not " + "hibernated on the volume."); + return 0; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "hiberfil.sys."); + return ret; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + vi = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(vi) || is_bad_inode(vi)) { + if (!IS_ERR(vi)) + iput(vi); + ntfs_error(vol->sb, "Failed to load hiberfil.sys."); + return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; + } + if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { + ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " + "Windows is hibernated on the volume. 
This " + "is not the system volume.", i_size_read(vi)); + goto iput_out; + } + ni = NTFS_I(vi); + page = ntfs_map_page(vi->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); + ret = PTR_ERR(page); + goto iput_out; + } + kaddr = (u32*)page_address(page); + if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) { + ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " + "hibernated on the volume. This is the " + "system volume."); + goto unm_iput_out; + } + kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); + do { + if (unlikely(*kaddr)) { + ntfs_debug("hiberfil.sys is larger than 4kiB " + "(0x%llx), does not contain the " + "\"hibr\" magic, and does not have a " + "zero header. Windows is hibernated " + "on the volume. This is not the " + "system volume.", i_size_read(vi)); + goto unm_iput_out; + } + } while (++kaddr < kend); + ntfs_debug("hiberfil.sys contains a zero header. Windows is not " + "hibernated on the volume. This is the system " + "volume."); + ret = 0; +unm_iput_out: + ntfs_unmap_page(page); +iput_out: + iput(vi); + return ret; +} + /** * load_and_init_quota - load and setup the quota file for a volume if present * @vol: ntfs super block describing device whose quota file to load @@ -1175,8 +1322,7 @@ static BOOL load_and_init_quota(ntfs_volume *vol) return FALSE; } /* We do not care for the type of match that was found. */ - if (name) - kfree(name); + kfree(name); /* Get the inode. */ tmp_ino = ntfs_iget(vol->sb, MREF(mref)); if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { @@ -1198,6 +1344,167 @@ static BOOL load_and_init_quota(ntfs_volume *vol) } /** + * load_and_init_usnjrnl - load and setup the transaction log if present + * @vol: ntfs super block describing device whose usnjrnl file to load + * + * Return TRUE on success or FALSE on error. + * + * If $UsnJrnl is not present or in the process of being disabled, we set + * NVolUsnJrnlStamped() and return success. 
+ * + * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, + * i.e. transaction logging has only just been enabled or the journal has been + * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() + * and return success. + */ +static BOOL load_and_init_usnjrnl(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + struct page *page; + ntfs_name *name = NULL; + USN_HEADER *uh; + static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'), + const_cpu_to_le16('U'), const_cpu_to_le16('s'), + const_cpu_to_le16('n'), const_cpu_to_le16('J'), + const_cpu_to_le16('r'), const_cpu_to_le16('n'), + const_cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { const_cpu_to_le16('$'), + const_cpu_to_le16('M'), const_cpu_to_le16('a'), + const_cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { const_cpu_to_le16('$'), + const_cpu_to_le16('J'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the transaction log file by looking up the + * filename $UsnJrnl in the extended system files directory $Extend. + */ + down(&vol->extend_ino->i_sem); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, + &name); + up(&vol->extend_ino->i_sem); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, transaction logging is disabled, + * just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$UsnJrnl not present. Volume does not " + "have transaction logging enabled."); +not_enabled: + /* + * No need to try to stamp the transaction log if + * transaction logging is not enabled. + */ + NVolSetUsnJrnlStamped(vol); + return TRUE; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "$UsnJrnl."); + return FALSE; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. 
*/ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $UsnJrnl."); + return FALSE; + } + vol->usnjrnl_ino = tmp_ino; + /* + * If the transaction log is in the process of being deleted, we can + * ignore it. + */ + if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { + ntfs_debug("$UsnJrnl in the process of being disabled. " + "Volume does not have transaction logging " + "enabled."); + goto not_enabled; + } + /* Get the $DATA/$Max attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " + "attribute."); + return FALSE; + } + vol->usnjrnl_max_ino = tmp_ino; + if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { + ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " + "attribute (size is 0x%llx but should be at " + "least 0x%x bytes).", i_size_read(tmp_ino), + sizeof(USN_HEADER)); + return FALSE; + } + /* Get the $DATA/$J attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " + "attribute."); + return FALSE; + } + vol->usnjrnl_j_ino = tmp_ino; + /* Verify $J is non-resident and sparse. */ + tmp_ni = NTFS_I(vol->usnjrnl_j_ino); + if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { + ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " + "and/or not sparse."); + return FALSE; + } + /* Read the USN_HEADER from $DATA/$Max. */ + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " + "attribute."); + return FALSE; + } + uh = (USN_HEADER*)page_address(page); + /* Sanity check the $Max. 
*/ + if (unlikely(sle64_to_cpu(uh->allocation_delta) > + sle64_to_cpu(uh->maximum_size))) { + ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " + "maximum size (0x%llx). $UsnJrnl is corrupt.", + (long long)sle64_to_cpu(uh->allocation_delta), + (long long)sle64_to_cpu(uh->maximum_size)); + ntfs_unmap_page(page); + return FALSE; + } + /* + * If the transaction log has been stamped and nothing has been written + * to it since, we do not need to stamp it. + */ + if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= + i_size_read(vol->usnjrnl_j_ino))) { + if (likely(sle64_to_cpu(uh->lowest_valid_usn) == + i_size_read(vol->usnjrnl_j_ino))) { + ntfs_unmap_page(page); + ntfs_debug("$UsnJrnl is enabled but nothing has been " + "logged since it was last stamped. " + "Treating this as if the volume does " + "not have transaction logging " + "enabled."); + goto not_enabled; + } + ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " + "which is out of bounds (0x%llx). $UsnJrnl " + "is corrupt.", + (long long)sle64_to_cpu(uh->lowest_valid_usn), + i_size_read(vol->usnjrnl_j_ino)); + ntfs_unmap_page(page); + return FALSE; + } + ntfs_unmap_page(page); + ntfs_debug("Done."); + return TRUE; +} + +/** * load_and_init_attrdef - load the attribute definitions table for a volume * @vol: ntfs super block describing device whose attrdef to load * @@ -1205,10 +1512,11 @@ static BOOL load_and_init_quota(ntfs_volume *vol) */ static BOOL load_and_init_attrdef(ntfs_volume *vol) { + loff_t i_size; struct super_block *sb = vol->sb; struct inode *ino; struct page *page; - unsigned long index, max_index; + pgoff_t index, max_index; unsigned int size; ntfs_debug("Entering."); @@ -1219,14 +1527,16 @@ static BOOL load_and_init_attrdef(ntfs_volume *vol) iput(ino); goto failed; } + NInoSetSparseDisabled(NTFS_I(ino)); /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. 
*/ - if (!ino->i_size || ino->i_size > 0x7fffffff) + i_size = i_size_read(ino); + if (i_size <= 0 || i_size > 0x7fffffff) goto iput_failed; - vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(ino->i_size); + vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); if (!vol->attrdef) goto iput_failed; index = 0; - max_index = ino->i_size >> PAGE_CACHE_SHIFT; + max_index = i_size >> PAGE_CACHE_SHIFT; size = PAGE_CACHE_SIZE; while (index < max_index) { /* Read the attrdef table and copy it into the linear buffer. */ @@ -1239,12 +1549,12 @@ read_partial_attrdef_page: ntfs_unmap_page(page); }; if (size == PAGE_CACHE_SIZE) { - size = ino->i_size & ~PAGE_CACHE_MASK; + size = i_size & ~PAGE_CACHE_MASK; if (size) goto read_partial_attrdef_page; } - vol->attrdef_size = ino->i_size; - ntfs_debug("Read %llu bytes from $AttrDef.", ino->i_size); + vol->attrdef_size = i_size; + ntfs_debug("Read %llu bytes from $AttrDef.", i_size); iput(ino); return TRUE; free_iput_failed: @@ -1267,10 +1577,11 @@ failed: */ static BOOL load_and_init_upcase(ntfs_volume *vol) { + loff_t i_size; struct super_block *sb = vol->sb; struct inode *ino; struct page *page; - unsigned long index, max_index; + pgoff_t index, max_index; unsigned int size; int i, max; @@ -1286,14 +1597,15 @@ static BOOL load_and_init_upcase(ntfs_volume *vol) * The upcase size must not be above 64k Unicode characters, must not * be zero and must be a multiple of sizeof(ntfschar). 
*/ - if (!ino->i_size || ino->i_size & (sizeof(ntfschar) - 1) || - ino->i_size > 64ULL * 1024 * sizeof(ntfschar)) + i_size = i_size_read(ino); + if (!i_size || i_size & (sizeof(ntfschar) - 1) || + i_size > 64ULL * 1024 * sizeof(ntfschar)) goto iput_upcase_failed; - vol->upcase = (ntfschar*)ntfs_malloc_nofs(ino->i_size); + vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); if (!vol->upcase) goto iput_upcase_failed; index = 0; - max_index = ino->i_size >> PAGE_CACHE_SHIFT; + max_index = i_size >> PAGE_CACHE_SHIFT; size = PAGE_CACHE_SIZE; while (index < max_index) { /* Read the upcase table and copy it into the linear buffer. */ @@ -1306,13 +1618,13 @@ read_partial_upcase_page: ntfs_unmap_page(page); }; if (size == PAGE_CACHE_SIZE) { - size = ino->i_size & ~PAGE_CACHE_MASK; + size = i_size & ~PAGE_CACHE_MASK; if (size) goto read_partial_upcase_page; } - vol->upcase_len = ino->i_size >> UCHAR_T_SIZE_BITS; + vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", - ino->i_size, 64 * 1024 * sizeof(ntfschar)); + i_size, 64 * 1024 * sizeof(ntfschar)); iput(ino); down(&ntfs_lock); if (!default_upcase) { @@ -1376,6 +1688,9 @@ static BOOL load_system_files(ntfs_volume *vol) MFT_RECORD *m; VOLUME_INFORMATION *vi; ntfs_attr_search_ctx *ctx; +#ifdef NTFS_RW + int err; +#endif /* NTFS_RW */ ntfs_debug("Entering."); #ifdef NTFS_RW @@ -1435,7 +1750,8 @@ static BOOL load_system_files(ntfs_volume *vol) iput(vol->lcnbmp_ino); goto bitmap_failed; } - if ((vol->nr_clusters + 7) >> 3 > vol->lcnbmp_ino->i_size) { + NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); + if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { iput(vol->lcnbmp_ino); bitmap_failed: ntfs_error(sb, "Failed to load $Bitmap."); @@ -1486,6 +1802,12 @@ get_ctx_vol_failed: unmap_mft_record(NTFS_I(vol->vol_ino)); printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver, vol->minor_ver); + if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { + 
ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " + "volume version %i.%i (need at least version " + "3.0).", vol->major_ver, vol->minor_ver); + NVolClearSparseEnabled(vol); + } #ifdef NTFS_RW /* Make sure that no unsupported volume flags are set. */ if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { @@ -1545,6 +1867,50 @@ get_ctx_vol_failed: /* This will prevent a read-write remount. */ NVolSetErrors(vol); } +#endif /* NTFS_RW */ + /* Get the root directory inode so we can do path lookups. */ + vol->root_ino = ntfs_iget(sb, FILE_root); + if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { + if (!IS_ERR(vol->root_ino)) + iput(vol->root_ino); + ntfs_error(sb, "Failed to load root directory."); + goto iput_logfile_err_out; + } +#ifdef NTFS_RW + /* + * Check if Windows is suspended to disk on the target volume. If it + * is hibernated, we must not write *anything* to the disk so set + * NVolErrors() without setting the dirty volume flag and mount + * read-only. This will prevent read-write remounting and it will also + * prevent all writes. + */ + err = check_windows_hibernation_status(vol); + if (unlikely(err)) { + static const char *es1a = "Failed to determine if Windows is " + "hibernated"; + static const char *es1b = "Windows is hibernated"; + static const char *es2 = ". Run chkdsk."; + const char *es1; + + es1 = err < 0 ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. 
*/ + NVolSetErrors(vol); + } /* If (still) a read-write mount, mark the volume dirty. */ if (!(sb->s_flags & MS_RDONLY) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { @@ -1558,7 +1924,7 @@ get_ctx_vol_failed: ntfs_error(sb, "%s and neither on_errors=continue nor " "on_errors=remount-ro was specified%s", es1, es2); - goto iput_logfile_err_out; + goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; @@ -1585,7 +1951,7 @@ get_ctx_vol_failed: ntfs_error(sb, "%s and neither on_errors=continue nor " "on_errors=remount-ro was specified%s", es1, es2); - goto iput_logfile_err_out; + goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; @@ -1604,23 +1970,15 @@ get_ctx_vol_failed: ntfs_error(sb, "%s and neither on_errors=continue nor " "on_errors=remount-ro was specified%s", es1, es2); - goto iput_logfile_err_out; + goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; NVolSetErrors(vol); } #endif /* NTFS_RW */ - /* Get the root directory inode. */ - vol->root_ino = ntfs_iget(sb, FILE_root); - if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { - if (!IS_ERR(vol->root_ino)) - iput(vol->root_ino); - ntfs_error(sb, "Failed to load root directory."); - goto iput_logfile_err_out; - } /* If on NTFS versions before 3.0, we are done. */ - if (vol->major_ver < 3) + if (unlikely(vol->major_ver < 3)) return TRUE; /* NTFS 3.0+ specific initialization. */ /* Get the security descriptors inode. */ @@ -1631,7 +1989,7 @@ get_ctx_vol_failed: ntfs_error(sb, "Failed to load $Secure."); goto iput_root_err_out; } - // FIXME: Initialize security. + // TODO: Initialize security. /* Get the extended system files' directory inode. 
*/ vol->extend_ino = ntfs_iget(sb, FILE_Extend); if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) { @@ -1682,10 +2040,60 @@ get_ctx_vol_failed: sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; NVolSetErrors(vol); } - // TODO: Delete or checkpoint the $UsnJrnl if it exists. + /* + * Find the transaction log file ($UsnJrnl), load it if present, check + * it, and set it up. + */ + if (!load_and_init_usnjrnl(vol)) { + static const char *es1 = "Failed to load $UsnJrnl"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!(sb->s_flags & MS_RDONLY)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, stamp the transaction log. */ + if (!(sb->s_flags & MS_RDONLY) && !ntfs_stamp_usnjrnl(vol)) { + static const char *es1 = "Failed to stamp transaction log " + "($UsnJrnl)"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); + sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + NVolSetErrors(vol); + } #endif /* NTFS_RW */ return TRUE; #ifdef NTFS_RW +iput_usnjrnl_err_out: + if (vol->usnjrnl_j_ino) + iput(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + iput(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + iput(vol->usnjrnl_ino); iput_quota_err_out: if (vol->quota_q_ino) iput(vol->quota_q_ino); @@ -1759,6 +2167,12 @@ static void ntfs_put_super(struct super_block *sb) /* NTFS 3.0+ specific. */ if (vol->major_ver >= 3) { + if (vol->usnjrnl_j_ino) + ntfs_commit_inode(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + ntfs_commit_inode(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + ntfs_commit_inode(vol->usnjrnl_ino); if (vol->quota_q_ino) ntfs_commit_inode(vol->quota_q_ino); if (vol->quota_ino) @@ -1814,6 +2228,18 @@ static void ntfs_put_super(struct super_block *sb) /* NTFS 3.0+ specific clean up. */ if (vol->major_ver >= 3) { #ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } if (vol->quota_q_ino) { iput(vol->quota_q_ino); vol->quota_q_ino = NULL; @@ -1959,8 +2385,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) struct address_space *mapping = vol->lcnbmp_ino->i_mapping; filler_t *readpage = (filler_t*)mapping->a_ops->readpage; struct page *page; - unsigned long index, max_index; - unsigned int max_size; + pgoff_t index, max_index; ntfs_debug("Entering."); /* Serialize accesses to the cluster bitmap. */ @@ -1972,11 +2397,10 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) */ max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* Use multiples of 4 bytes. 
*/ - max_size = PAGE_CACHE_SIZE >> 2; - ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%x.", - max_index, max_size); - for (index = 0UL; index < max_index; index++) { + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ + ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", + max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { unsigned int i; /* * Read the page from page cache, getting it from backing store @@ -2008,7 +2432,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) * the result as all out of range bytes are set to zero by * ntfs_readpage(). */ - for (i = 0; i < max_size; i++) + for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) nr_free -= (s64)hweight32(kaddr[i]); kunmap_atomic(kaddr, KM_USER0); page_cache_release(page); @@ -2031,6 +2455,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) /** * __get_nr_free_mft_records - return the number of free inodes on a volume * @vol: ntfs volume for which to obtain free inode count + * @nr_free: number of mft records in filesystem + * @max_index: maximum number of pages containing set bits * * Calculate the number of free mft records (inodes) on the mounted NTFS * volume @vol. We actually calculate the number of mft records in use instead @@ -2043,32 +2469,20 @@ static s64 get_nr_free_clusters(ntfs_volume *vol) * * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. */ -static unsigned long __get_nr_free_mft_records(ntfs_volume *vol) +static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, + s64 nr_free, const pgoff_t max_index) { - s64 nr_free; u32 *kaddr; struct address_space *mapping = vol->mftbmp_ino->i_mapping; filler_t *readpage = (filler_t*)mapping->a_ops->readpage; struct page *page; - unsigned long index, max_index; - unsigned int max_size; + pgoff_t index; ntfs_debug("Entering."); - /* Number of mft records in file system (at this point in time). 
*/ - nr_free = vol->mft_ino->i_size >> vol->mft_record_size_bits; - /* - * Convert the maximum number of set bits into bytes rounded up, then - * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we - * have one full and one partial page max_index = 2. - */ - max_index = ((((NTFS_I(vol->mft_ino)->initialized_size >> - vol->mft_record_size_bits) + 7) >> 3) + - PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* Use multiples of 4 bytes. */ - max_size = PAGE_CACHE_SIZE >> 2; + /* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */ ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " - "0x%x.", max_index, max_size); - for (index = 0UL; index < max_index; index++) { + "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); + for (index = 0; index < max_index; index++) { unsigned int i; /* * Read the page from page cache, getting it from backing store @@ -2100,7 +2514,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol) * the result as all out of range bytes are set to zero by * ntfs_readpage(). */ - for (i = 0; i < max_size; i++) + for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) nr_free -= (s64)hweight32(kaddr[i]); kunmap_atomic(kaddr, KM_USER0); page_cache_release(page); @@ -2134,8 +2548,11 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol) */ static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs) { - ntfs_volume *vol = NTFS_SB(sb); s64 size; + ntfs_volume *vol = NTFS_SB(sb); + ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); + pgoff_t max_index; + unsigned long flags; ntfs_debug("Entering."); /* Type of filesystem. */ @@ -2143,13 +2560,13 @@ static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs) /* Optimal transfer block size. */ sfs->f_bsize = PAGE_CACHE_SIZE; /* - * Total data blocks in file system in units of f_bsize and since + * Total data blocks in filesystem in units of f_bsize and since * inodes are also stored in data blocs ($MFT is a file) this is just * the total clusters. 
*/ sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> PAGE_CACHE_SHIFT; - /* Free data blocks in file system in units of f_bsize. */ + /* Free data blocks in filesystem in units of f_bsize. */ size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> PAGE_CACHE_SHIFT; if (size < 0LL) @@ -2158,17 +2575,27 @@ static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs) sfs->f_bavail = sfs->f_bfree = size; /* Serialize accesses to the inode bitmap. */ down_read(&vol->mftbmp_lock); - /* Number of inodes in file system (at this point in time). */ - sfs->f_files = vol->mft_ino->i_size >> vol->mft_record_size_bits; + read_lock_irqsave(&mft_ni->size_lock, flags); + size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; + /* + * Convert the maximum number of set bits into bytes rounded up, then + * convert into multiples of PAGE_CACHE_SIZE, rounding up so that if we + * have one full and one partial page max_index = 2. + */ + max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) + + 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Number of inodes in filesystem (at this point in time). */ + sfs->f_files = size; /* Free inodes in fs (based on current total count). */ - sfs->f_ffree = __get_nr_free_mft_records(vol); + sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); up_read(&vol->mftbmp_lock); /* * File system id. This is extremely *nix flavour dependent and even * within Linux itself all fs do their own thing. I interpret this to * mean a unique id associated with the mounted fs and not the id - * associated with the file system driver, the latter is already given - * by the file system type in sfs->f_type. Thus we use the 64-bit + * associated with the filesystem driver, the latter is already given + * by the filesystem type in sfs->f_type. Thus we use the 64-bit * volume serial number splitting it into two 32-bit parts. 
We enter * the least significant 32-bits in f_fsid[0] and the most significant * 32-bits in f_fsid[1]. @@ -2219,53 +2646,19 @@ static struct super_operations ntfs_sops = { proc. */ }; - /** - * Declarations for NTFS specific export operations (fs/ntfs/namei.c). - */ -extern struct dentry *ntfs_get_parent(struct dentry *child_dent); -extern struct dentry *ntfs_get_dentry(struct super_block *sb, void *fh); - -/** - * Export operations allowing NFS exporting of mounted NTFS partitions. - * - * We use the default ->decode_fh() and ->encode_fh() for now. Note that they - * use 32 bits to store the inode number which is an unsigned long so on 64-bit - * architectures is usually 64 bits so it would all fail horribly on huge - * volumes. I guess we need to define our own encode and decode fh functions - * that store 64-bit inode numbers at some point but for now we will ignore the - * problem... - * - * We also use the default ->get_name() helper (used by ->decode_fh() via - * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs - * independent. - * - * The default ->get_parent() just returns -EACCES so we have to provide our - * own and the default ->get_dentry() is incompatible with NTFS due to not - * allowing the inode number 0 which is used in NTFS for the system file $MFT - * and due to using iget() whereas NTFS needs ntfs_iget(). - */ -static struct export_operations ntfs_export_ops = { - .get_parent = ntfs_get_parent, /* Find the parent of a given - directory. */ - .get_dentry = ntfs_get_dentry, /* Find a dentry for the inode - given a file handle - sub-fragment. 
*/ -}; - -/** - * ntfs_fill_super - mount an ntfs files system - * @sb: super block of ntfs file system to mount + * ntfs_fill_super - mount an ntfs filesystem + * @sb: super block of ntfs filesystem to mount * @opt: string containing the mount options * @silent: silence error output * * ntfs_fill_super() is called by the VFS to mount the device described by @sb - * with the mount otions in @data with the NTFS file system. + * with the mount otions in @data with the NTFS filesystem. * * If @silent is true, remain silent even if errors are detected. This is used - * during bootup, when the kernel tries to mount the root file system with all - * registered file systems one after the other until one succeeds. This implies - * that all file systems except the correct one will quite correctly and + * during bootup, when the kernel tries to mount the root filesystem with all + * registered filesystems one after the other until one succeeds. This implies + * that all filesystems except the correct one will quite correctly and * expectedly return an error, but nobody wants to see error messages when in * fact this is what is supposed to happen. * @@ -2292,39 +2685,25 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) return -ENOMEM; } /* Initialize ntfs_volume structure. */ - memset(vol, 0, sizeof(ntfs_volume)); - vol->sb = sb; - vol->upcase = NULL; - vol->attrdef = NULL; - vol->mft_ino = NULL; - vol->mftbmp_ino = NULL; + *vol = (ntfs_volume) { + .sb = sb, + /* + * Default is group and other don't have any access to files or + * directories while owner has full access. Further, files by + * default are not executable but directories are of course + * browseable. 
+ */ + .fmask = 0177, + .dmask = 0077, + }; init_rwsem(&vol->mftbmp_lock); -#ifdef NTFS_RW - vol->mftmirr_ino = NULL; - vol->logfile_ino = NULL; -#endif /* NTFS_RW */ - vol->lcnbmp_ino = NULL; init_rwsem(&vol->lcnbmp_lock); - vol->vol_ino = NULL; - vol->root_ino = NULL; - vol->secure_ino = NULL; - vol->extend_ino = NULL; -#ifdef NTFS_RW - vol->quota_ino = NULL; - vol->quota_q_ino = NULL; -#endif /* NTFS_RW */ - vol->nls_map = NULL; - - /* - * Default is group and other don't have any access to files or - * directories while owner has full access. Further, files by default - * are not executable but directories are of course browseable. - */ - vol->fmask = 0177; - vol->dmask = 0077; unlock_kernel(); + /* By default, enable sparse support. */ + NVolSetSparseEnabled(vol); + /* Important to get the mount options dealt with now. */ if (!parse_options(vol, (char*)opt)) goto err_out_now; @@ -2347,7 +2726,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) } /* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */ - vol->nr_blocks = sb->s_bdev->bd_inode->i_size >> NTFS_BLOCK_SIZE_BITS; + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + NTFS_BLOCK_SIZE_BITS; /* Read the boot sector and return unlocked buffer head to it. */ if (!(bh = read_ntfs_boot_sector(sb, silent))) { @@ -2476,6 +2856,18 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) /* NTFS 3.0+ specific clean up. */ if (vol->major_ver >= 3) { #ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } if (vol->quota_q_ino) { iput(vol->quota_q_ino); vol->quota_q_ino = NULL; @@ -2581,7 +2973,7 @@ err_out_now: */ kmem_cache_t *ntfs_name_cache; -/* Slab caches for efficient allocation/deallocation of of inodes. 
*/ +/* Slab caches for efficient allocation/deallocation of inodes. */ kmem_cache_t *ntfs_inode_cache; kmem_cache_t *ntfs_big_inode_cache; @@ -2705,7 +3097,7 @@ static int __init init_ntfs_fs(void) ntfs_debug("NTFS driver registered successfully."); return 0; /* Success! */ } - printk(KERN_CRIT "NTFS: Failed to register NTFS file system driver!\n"); + printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n"); sysctl_err_out: kmem_cache_destroy(ntfs_big_inode_cache); @@ -2719,7 +3111,7 @@ actx_err_out: kmem_cache_destroy(ntfs_index_ctx_cache); ictx_err_out: if (!err) { - printk(KERN_CRIT "NTFS: Aborting NTFS file system driver " + printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver " "registration...\n"); err = -ENOMEM; } @@ -2759,7 +3151,7 @@ static void __exit exit_ntfs_fs(void) } MODULE_AUTHOR("Anton Altaparmakov <aia21@cantab.net>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2004 Anton Altaparmakov"); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2005 Anton Altaparmakov"); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 75067e4f303..1c23138d00b 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -3,7 +3,7 @@ * the Linux-NTFS project. Adapted from the old NTFS driver, * Copyright (C) 1997 Martin von Löwis, Régis Duchesne * - * Copyright (c) 2002-2004 Anton Altaparmakov + * Copyright (c) 2002-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -67,7 +67,7 @@ int ntfs_sysctl(int add) return -ENOMEM; #ifdef CONFIG_PROC_FS /* - * If the proc file system is in use and we are a module, need + * If the proc filesystem is in use and we are a module, need * to set the owner of our proc entry to our module. In the * non-modular case, THIS_MODULE is NULL, so this is ok. 
*/ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h index df749cc0aac..c8064cae8f1 100644 --- a/fs/ntfs/sysctl.h +++ b/fs/ntfs/sysctl.h @@ -26,7 +26,7 @@ #include <linux/config.h> -#if (DEBUG && CONFIG_SYSCTL) +#if defined(DEBUG) && defined(CONFIG_SYSCTL) extern int ntfs_sysctl(int add); diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h index a09a51dabe4..01233989d5d 100644 --- a/fs/ntfs/time.h +++ b/fs/ntfs/time.h @@ -1,7 +1,7 @@ /* * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -87,7 +87,7 @@ static inline struct timespec ntfs2utc(const sle64 time) struct timespec ts; /* Subtract the NTFS time offset. */ - s64 t = sle64_to_cpu(time) - NTFS_TIME_OFFSET; + u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); /* * Convert the time to 1-second intervals and the remainder to * 1-nano-second intervals. diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h index 08a55aa53d4..6e4a7e3343f 100644 --- a/fs/ntfs/types.h +++ b/fs/ntfs/types.h @@ -2,7 +2,7 @@ * types.h - Defines for NTFS Linux kernel driver specific types. * Part of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -53,6 +53,14 @@ typedef sle64 leLCN; typedef s64 LSN; typedef sle64 leLSN; +/* + * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. + * We define our own type USN, to allow for type checking and better code + * readability. 
+ */ +typedef s64 USN; +typedef sle64 leUSN; + typedef enum { FALSE = 0, TRUE = 1 diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c index 560b0ea255b..19c42e231b4 100644 --- a/fs/ntfs/unistr.c +++ b/fs/ntfs/unistr.c @@ -264,7 +264,7 @@ int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, /* We don't trust outside sources. */ if (ins) { - ucs = (ntfschar*)kmem_cache_alloc(ntfs_name_cache, SLAB_NOFS); + ucs = kmem_cache_alloc(ntfs_name_cache, SLAB_NOFS); if (ucs) { for (i = o = 0; i < ins_len; i += wc_len) { wc_len = nls->char2uni(ins + i, ins_len - i, diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c new file mode 100644 index 00000000000..77773240d13 --- /dev/null +++ b/fs/ntfs/usnjrnl.c @@ -0,0 +1,84 @@ +/* + * usnjrnl.c - NTFS kernel transaction log ($UsnJrnl) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifdef NTFS_RW + +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/mm.h> + +#include "aops.h" +#include "debug.h" +#include "endian.h" +#include "time.h" +#include "types.h" +#include "usnjrnl.h" +#include "volume.h" + +/** + * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume + * @vol: ntfs volume on which to stamp the transaction log + * + * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return + * TRUE on success and FALSE on error. + * + * This function assumes that the transaction log has already been loaded and + * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). + */ +BOOL ntfs_stamp_usnjrnl(ntfs_volume *vol) +{ + ntfs_debug("Entering."); + if (likely(!NVolUsnJrnlStamped(vol))) { + sle64 stamp; + struct page *page; + USN_HEADER *uh; + + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from " + "$UsnJrnl/$DATA/$Max attribute."); + return FALSE; + } + uh = (USN_HEADER*)page_address(page); + stamp = get_current_ntfs_time(); + ntfs_debug("Stamping transaction log ($UsnJrnl): old " + "journal_id 0x%llx, old lowest_valid_usn " + "0x%llx, new journal_id 0x%llx, new " + "lowest_valid_usn 0x%llx.", + (long long)sle64_to_cpu(uh->journal_id), + (long long)sle64_to_cpu(uh->lowest_valid_usn), + (long long)sle64_to_cpu(stamp), + i_size_read(vol->usnjrnl_j_ino)); + uh->lowest_valid_usn = + cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); + uh->journal_id = stamp; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + /* Set the flag so we do not have to do it again on remount. 
*/ + NVolSetUsnJrnlStamped(vol); + } + ntfs_debug("Done."); + return TRUE; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h new file mode 100644 index 00000000000..ff988b0deb4 --- /dev/null +++ b/fs/ntfs/usnjrnl.h @@ -0,0 +1,205 @@ +/* + * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + * + * This program/include file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as published + * by the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program/include file is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program (in the main directory of the Linux-NTFS + * distribution in the file COPYING); if not, write to the Free Software + * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _LINUX_NTFS_USNJRNL_H +#define _LINUX_NTFS_USNJRNL_H + +#ifdef NTFS_RW + +#include "types.h" +#include "endian.h" +#include "layout.h" +#include "volume.h" + +/* + * Transaction log ($UsnJrnl) organization: + * + * The transaction log records whenever a file is modified in any way. So for + * example it will record that file "blah" was written to at a particular time + * but not what was written. It will record that a file was deleted or + * created, that a file was truncated, etc. See below for all the reason + * codes used. + * + * The transaction log is in the $Extend directory which is in the root + * directory of each volume. If it is not present it means transaction + * logging is disabled. 
If it is present it means transaction logging is + * either enabled or in the process of being disabled in which case we can + * ignore it as it will go away as soon as Windows gets its hands on it. + * + * To determine whether the transaction logging is enabled or in the process + * of being disabled, we need to check the volume flags in the + * $VOLUME_INFORMATION attribute in the $Volume system file (which is present + * in the root directory and has a fixed mft record number, see layout.h). + * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log + * is in the process of being disabled and if this flag is clear it means the + * transaction log is enabled. + * + * The transaction log consists of two parts; the $DATA/$Max attribute as well + * as the $DATA/$J attribute. $Max is a header describing the transaction + * log whilst $J is the transaction log data itself as a sequence of variable + * sized USN_RECORDs (see below for all the structures). + * + * We do not care about transaction logging at this point in time but we still + * need to let Windows know that the transaction log is out of date. To do + * this we need to stamp the transaction log. This involves setting the + * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used + * for the next added USN_RECORD to the $DATA/$J attribute as well as + * generating a new journal_id in $DATA/$Max. + * + * The journal_id is as of the current version (2.0) of the transaction log + * simply the 64-bit timestamp of when the journal was either created or last + * stamped. + * + * To determine the next usn there are two ways. The first is to parse + * $DATA/$J and to find the last USN_RECORD in it and to add its record_length + * to its usn (which is the byte offset in the $DATA/$J attribute). The + * second is simply to take the data size of the attribute. Since the usns + * are simply byte offsets into $DATA/$J, this is exactly the next usn. 
For + * obvious reasons we use the second method as it is much simpler and faster. + * + * As an aside, note that to actually disable the transaction log, one would + * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go + * through all the mft records on the volume and set the usn field in their + * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need + * to delete the transaction log file, i.e. \$Extend\$UsnJrnl, and finally, + * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. + * + * Note that if a volume is unmounted whilst the transaction log is being + * disabled, the process will continue the next time the volume is mounted. + * This is why we can safely mount read-write when we see a transaction log + * in the process of being deleted. + */ + +/* Some $UsnJrnl related constants. */ +#define UsnJrnlMajorVer 2 +#define UsnJrnlMinorVer 0 + +/* + * $DATA/$Max attribute. This is (always?) resident and has a fixed size of + * 32 bytes. It contains the header describing the transaction log. + */ +typedef struct { +/*Ofs*/ +/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J + attribute. */ +/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the + size of the $DATA/$J attribute. */ +/*0x10*/sle64 journal_id; /* Current id of the transaction log. */ +/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the + current journal_id. */ +/* sizeof() = 32 (0x20) bytes */ +} __attribute__ ((__packed__)) USN_HEADER; + +/* + * Reason flags (32-bit). Cumulative flags describing the change(s) to the + * file since it was last opened. 
I think the names speak for themselves but + * if you disagree check out the descriptions in the Linux NTFS project NTFS + * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + */ +enum { + USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), + USN_REASON_DATA_EXTEND = const_cpu_to_le32(0x00000002), + USN_REASON_DATA_TRUNCATION = const_cpu_to_le32(0x00000004), + USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010), + USN_REASON_NAMED_DATA_EXTEND = const_cpu_to_le32(0x00000020), + USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040), + USN_REASON_FILE_CREATE = const_cpu_to_le32(0x00000100), + USN_REASON_FILE_DELETE = const_cpu_to_le32(0x00000200), + USN_REASON_EA_CHANGE = const_cpu_to_le32(0x00000400), + USN_REASON_SECURITY_CHANGE = const_cpu_to_le32(0x00000800), + USN_REASON_RENAME_OLD_NAME = const_cpu_to_le32(0x00001000), + USN_REASON_RENAME_NEW_NAME = const_cpu_to_le32(0x00002000), + USN_REASON_INDEXABLE_CHANGE = const_cpu_to_le32(0x00004000), + USN_REASON_BASIC_INFO_CHANGE = const_cpu_to_le32(0x00008000), + USN_REASON_HARD_LINK_CHANGE = const_cpu_to_le32(0x00010000), + USN_REASON_COMPRESSION_CHANGE = const_cpu_to_le32(0x00020000), + USN_REASON_ENCRYPTION_CHANGE = const_cpu_to_le32(0x00040000), + USN_REASON_OBJECT_ID_CHANGE = const_cpu_to_le32(0x00080000), + USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000), + USN_REASON_STREAM_CHANGE = const_cpu_to_le32(0x00200000), + USN_REASON_CLOSE = const_cpu_to_le32(0x80000000), +}; + +typedef le32 USN_REASON_FLAGS; + +/* + * Source info flags (32-bit). Information about the source of the change(s) + * to the file. 
For detailed descriptions of what these mean, see the Linux + * NTFS project NTFS documentation: + * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + */ +enum { + USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), + USN_SOURCE_AUXILIARY_DATA = const_cpu_to_le32(0x00000002), + USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004), +}; + +typedef le32 USN_SOURCE_INFO_FLAGS; + +/* + * $DATA/$J attribute. This is always non-resident, is marked as sparse, and + * is of variable size. It consists of a sequence of variable size + * USN_RECORDS. The minimum allocated_size is allocation_delta as + * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is + * exceeded by more than allocation_delta bytes, allocation_delta bytes are + * allocated and appended to the $DATA/$J attribute and an equal number of + * bytes at the beginning of the attribute are freed and made sparse. Note the + * making sparse only happens at volume checkpoints and hence the actual + * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. + */ +typedef struct { +/*Ofs*/ +/* 0*/le32 length; /* Byte size of this record (8-byte + aligned). */ +/* 4*/le16 major_ver; /* Major version of the transaction log used + for this record. */ +/* 6*/le16 minor_ver; /* Minor version of the transaction log used + for this record. */ +/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or + directory) described by this record. */ +/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent + directory of the file described by this + record. */ +/*0x18*/leUSN usn; /* The usn of this record. Equals the offset + within the $DATA/$J attribute. */ +/*0x20*/sle64 time; /* Time when this record was created. */ +/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ +/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ +/*0x30*/le32 security_id; /* File security_id copied from + $STANDARD_INFORMATION. 
*/ +/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from + $STANDARD_INFORMATION or $FILE_NAME (not + sure which). */ +/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ +/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the + start of this record. */ +/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use + file_name_offset to determine the location + of the name. */ +/* sizeof() = 60 (0x3c) bytes */ +} __attribute__ ((__packed__)) USN_RECORD; + +extern BOOL ntfs_stamp_usnjrnl(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h index 4b97fa8635a..375cd20a9f6 100644 --- a/fs/ntfs/volume.h +++ b/fs/ntfs/volume.h @@ -2,7 +2,7 @@ * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part * of the Linux-NTFS project. * - * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -54,7 +54,7 @@ typedef struct { mode_t dmask; /* The mask for directory permissions. */ u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ - u8 on_errors; /* What to do on file system errors. */ + u8 on_errors; /* What to do on filesystem errors. */ /* NTFS bootsector provided information. */ u16 sector_size; /* in bytes */ u8 sector_size_bits; /* log2(sector_size) */ @@ -125,6 +125,10 @@ typedef struct { /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ struct inode *quota_ino; /* The VFS inode of $Quota. */ struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ + /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ + struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ + struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. 
*/ #endif /* NTFS_RW */ struct nls_table *nls_map; } ntfs_volume; @@ -141,6 +145,8 @@ typedef enum { file names in WIN32 namespace. */ NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ + NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ + NV_SparseEnabled, /* 1: May create sparse files. */ } ntfs_volume_flags; /* @@ -167,5 +173,7 @@ NVOL_FNS(ShowSystemFiles) NVOL_FNS(CaseSensitive) NVOL_FNS(LogFileEmpty) NVOL_FNS(QuotaOutOfDate) +NVOL_FNS(UsnJrnlStamped) +NVOL_FNS(SparseEnabled) #endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/fs/open.c b/fs/open.c index 963bd81a44c..32bf05e2996 100644 --- a/fs/open.c +++ b/fs/open.c @@ -10,7 +10,7 @@ #include <linux/file.h> #include <linux/smp_lock.h> #include <linux/quotaops.h> -#include <linux/dnotify.h> +#include <linux/fsnotify.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/tty.h> @@ -21,6 +21,7 @@ #include <linux/vfs.h> #include <asm/uaccess.h> #include <linux/fs.h> +#include <linux/personality.h> #include <linux/pagemap.h> #include <linux/syscalls.h> @@ -807,7 +808,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) { + if (!f->f_mapping->a_ops || + ((!f->f_mapping->a_ops->direct_IO) && + (!f->f_mapping->a_ops->get_xip_page))) { fput(f); f = ERR_PTR(-EINVAL); } @@ -933,31 +936,28 @@ EXPORT_SYMBOL(fd_install); asmlinkage long sys_open(const char __user * filename, int flags, int mode) { char * tmp; - int fd, error; + int fd; + + if (force_o_largefile()) + flags |= O_LARGEFILE; -#if BITS_PER_LONG != 32 - flags |= O_LARGEFILE; -#endif tmp = getname(filename); fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { fd = get_unused_fd(); if (fd >= 0) { struct file *f = filp_open(tmp, flags, mode); - error = PTR_ERR(f); - if (IS_ERR(f)) - goto out_error; - fd_install(fd, f); + if 
(IS_ERR(f)) { + put_unused_fd(fd); + fd = PTR_ERR(f); + } else { + fsnotify_open(f->f_dentry); + fd_install(fd, f); + } } -out: putname(tmp); } return fd; - -out_error: - put_unused_fd(fd); - fd = error; - goto out; } EXPORT_SYMBOL_GPL(sys_open); @@ -980,23 +980,15 @@ asmlinkage long sys_creat(const char __user * pathname, int mode) */ int filp_close(struct file *filp, fl_owner_t id) { - int retval; - - /* Report and clear outstanding errors */ - retval = filp->f_error; - if (retval) - filp->f_error = 0; + int retval = 0; if (!file_count(filp)) { printk(KERN_ERR "VFS: Close: file count is 0\n"); - return retval; + return 0; } - if (filp->f_op && filp->f_op->flush) { - int err = filp->f_op->flush(filp); - if (!retval) - retval = err; - } + if (filp->f_op && filp->f_op->flush) + retval = filp->f_op->flush(filp); dnotify_flush(filp, id); locks_remove_posix(filp, id); diff --git a/fs/partitions/Makefile b/fs/partitions/Makefile index 4c83c17969e..66d5cc26faf 100644 --- a/fs/partitions/Makefile +++ b/fs/partitions/Makefile @@ -17,4 +17,3 @@ obj-$(CONFIG_SUN_PARTITION) += sun.o obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o obj-$(CONFIG_IBM_PARTITION) += ibm.o obj-$(CONFIG_EFI_PARTITION) += efi.o -obj-$(CONFIG_NEC98_PARTITION) += nec98.o msdos.o diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 2cab98a9a62..77e178f1316 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -79,9 +79,6 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) = #ifdef CONFIG_LDM_PARTITION ldm_partition, /* this must come before msdos */ #endif -#ifdef CONFIG_NEC98_PARTITION - nec98_partition, /* must be come before `msdos_partition' */ -#endif #ifdef CONFIG_MSDOS_PARTITION msdos_partition, #endif diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 43adcc68e47..17ae8ecd9e8 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -30,7 +30,3 @@ put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 
extern int warn_no_part; -extern void parse_bsd(struct parsed_partitions *state, - struct block_device *bdev, u32 offset, u32 size, - int origin, char *flavour, int max_partitions); - diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 584a27b2bbd..9935d254186 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -202,12 +202,12 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, #endif } -#if defined(CONFIG_BSD_DISKLABEL) || defined(CONFIG_NEC98_PARTITION) +#if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ -void +static void parse_bsd(struct parsed_partitions *state, struct block_device *bdev, u32 offset, u32 size, int origin, char *flavour, int max_partitions) diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 738b9b60293..7431d7ba2d0 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -11,4 +11,5 @@ proc-y += inode.o root.o base.o generic.o array.o \ kmsg.o proc_tty.o proc_misc.o proc-$(CONFIG_PROC_KCORE) += kcore.o +proc-$(CONFIG_PROC_VMCORE) += vmcore.o proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o diff --git a/fs/proc/base.c b/fs/proc/base.c index e31903aadd9..491f2d9f89a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -314,7 +314,7 @@ static int may_ptrace_attach(struct task_struct *task) (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) goto out; rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE)) goto out; if (security_ptrace(current, task)) goto out; @@ -890,7 +890,7 @@ static struct file_operations proc_seccomp_operations = { }; #endif /* CONFIG_SECCOMP */ -static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; int error = -EACCES; @@ -907,7 +907,7 @@ 
static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); nd->last_type = LAST_BIND; out: - return error; + return ERR_PTR(error); } static int do_proc_readlink(struct dentry *dentry, struct vfsmount *mnt, @@ -1113,7 +1113,9 @@ static int task_dumpable(struct task_struct *task) if (mm) dumpable = mm->dumpable; task_unlock(task); - return dumpable; + if(dumpable == 1) + return 1; + return 0; } @@ -1690,11 +1692,11 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return vfs_readlink(dentry,buffer,buflen,tmp); } -static int proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { char tmp[30]; sprintf(tmp, "%d", current->tgid); - return vfs_follow_link(nd,tmp); + return ERR_PTR(vfs_follow_link(nd,tmp)); } static struct inode_operations proc_self_inode_operations = { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 6c6315d0402..abe8920313f 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -329,10 +329,10 @@ static void release_inode_number(unsigned int inum) spin_unlock(&proc_inum_lock); } -static int proc_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) { nd_set_link(nd, PDE(dentry->d_inode)->data); - return 0; + return NULL; } static struct inode_operations proc_link_inode_operations = { diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index a60a3b3d8a7..a3453555a94 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -44,6 +44,7 @@ #include <linux/jiffies.h> #include <linux/sysrq.h> #include <linux/vmalloc.h> +#include <linux/crash_dump.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -219,6 +220,19 @@ static struct file_operations fragmentation_file_operations = { .release = seq_release, }; +extern struct 
seq_operations zoneinfo_op; +static int zoneinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &zoneinfo_op); +} + +static struct file_operations proc_zoneinfo_file_operations = { + .open = zoneinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static int version_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -438,7 +452,7 @@ static int devices_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { int len = get_chrdev_list(page); - len += get_blkdev_list(page+len); + len += get_blkdev_list(page+len, len); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -589,6 +603,7 @@ void __init proc_misc_init(void) create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); + create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations); create_seq_entry("diskstats", 0, &proc_diskstats_operations); #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); @@ -604,6 +619,11 @@ void __init proc_misc_init(void) (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; } #endif +#ifdef CONFIG_PROC_VMCORE + proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL); + if (proc_vmcore) + proc_vmcore->proc_fops = &proc_vmcore_operations; +#endif #ifdef CONFIG_MAGIC_SYSRQ entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); if (entry) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c new file mode 100644 index 00000000000..3b2e7b69e63 --- /dev/null +++ b/fs/proc/vmcore.c @@ -0,0 +1,669 @@ +/* + * fs/proc/vmcore.c Interface for accessing the crash + * dump from the system's previous life. + * Heavily borrowed from fs/proc/kcore.c + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. 
All rights reserved + * + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/user.h> +#include <linux/a.out.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/proc_fs.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/crash_dump.h> +#include <linux/list.h> +#include <asm/uaccess.h> +#include <asm/io.h> + +/* List representing chunks of contiguous memory areas and their offsets in + * vmcore file. + */ +static LIST_HEAD(vmcore_list); + +/* Stores the pointer to the buffer containing kernel elf core headers. */ +static char *elfcorebuf; +static size_t elfcorebuf_sz; + +/* Total size of vmcore file. */ +static u64 vmcore_size; + +struct proc_dir_entry *proc_vmcore = NULL; + +/* Reads a page from the oldmem device from given offset. */ +static ssize_t read_from_oldmem(char *buf, size_t count, + loff_t *ppos, int userbuf) +{ + unsigned long pfn, offset; + size_t nr_bytes; + ssize_t read = 0, tmp; + + if (!count) + return 0; + + offset = (unsigned long)(*ppos % PAGE_SIZE); + pfn = (unsigned long)(*ppos / PAGE_SIZE); + if (pfn > saved_max_pfn) + return -EINVAL; + + do { + if (count > (PAGE_SIZE - offset)) + nr_bytes = PAGE_SIZE - offset; + else + nr_bytes = count; + + tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); + if (tmp < 0) + return tmp; + *ppos += nr_bytes; + count -= nr_bytes; + buf += nr_bytes; + read += nr_bytes; + ++pfn; + offset = 0; + } while (count); + + return read; +} + +/* Maps vmcore file offset to respective physical address in memroy. 
*/ +static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list, + struct vmcore **m_ptr) +{ + struct vmcore *m; + u64 paddr; + + list_for_each_entry(m, vc_list, list) { + u64 start, end; + start = m->offset; + end = m->offset + m->size - 1; + if (offset >= start && offset <= end) { + paddr = m->paddr + offset - start; + *m_ptr = m; + return paddr; + } + } + *m_ptr = NULL; + return 0; +} + +/* Read from the ELF header and then the crash dump. On error, negative value is + * returned otherwise number of bytes read are returned. + */ +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + ssize_t acc = 0, tmp; + size_t tsz, nr_bytes; + u64 start; + struct vmcore *curr_m = NULL; + + if (buflen == 0 || *fpos >= vmcore_size) + return 0; + + /* trim buflen to not go beyond EOF */ + if (buflen > vmcore_size - *fpos) + buflen = vmcore_size - *fpos; + + /* Read ELF core header */ + if (*fpos < elfcorebuf_sz) { + tsz = elfcorebuf_sz - *fpos; + if (buflen < tsz) + tsz = buflen; + if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + return -EFAULT; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (buflen == 0) + return acc; + } + + start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m); + if (!curr_m) + return -EINVAL; + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + + /* Calculate left bytes in current memory segment. 
*/ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + + while (buflen) { + tmp = read_from_oldmem(buffer, tsz, &start, 1); + if (tmp < 0) + return tmp; + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + if (start >= (curr_m->paddr + curr_m->size)) { + if (curr_m->list.next == &vmcore_list) + return acc; /*EOF*/ + curr_m = list_entry(curr_m->list.next, + struct vmcore, list); + start = curr_m->paddr; + } + if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) + tsz = buflen; + /* Calculate left bytes in current memory segment. */ + nr_bytes = (curr_m->size - (start - curr_m->paddr)); + if (tsz > nr_bytes) + tsz = nr_bytes; + } + return acc; +} + +static int open_vmcore(struct inode *inode, struct file *filp) +{ + return 0; +} + +struct file_operations proc_vmcore_operations = { + .read = read_vmcore, + .open = open_vmcore, +}; + +static struct vmcore* __init get_new_element(void) +{ + struct vmcore *p; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p) + memset(p, 0, sizeof(*p)); + return p; +} + +static u64 __init get_vmcore_size_elf64(char *elfptr) +{ + int i; + u64 size; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +static u64 __init get_vmcore_size_elf32(char *elfptr) +{ + int i; + u64 size; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + size = sizeof(Elf32_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++) { + size += phdr_ptr->p_memsz; + phdr_ptr++; + } + return size; +} + +/* Merges all the PT_NOTE headers into one. 
*/ +static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr phdr, *phdr_ptr; + Elf64_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf64_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. */ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf64_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. 
*/ + i = (nr_ptnote - 1) * sizeof(Elf64_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Merges all the PT_NOTE headers into one. */ +static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, + struct list_head *vc_list) +{ + int i, nr_ptnote=0, rc=0; + char *tmp; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr phdr, *phdr_ptr; + Elf32_Nhdr *nhdr_ptr; + u64 phdr_sz = 0, note_off; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + int j; + void *notes_section; + struct vmcore *new; + u64 offset, max_sz, sz, real_sz = 0; + if (phdr_ptr->p_type != PT_NOTE) + continue; + nr_ptnote++; + max_sz = phdr_ptr->p_memsz; + offset = phdr_ptr->p_offset; + notes_section = kmalloc(max_sz, GFP_KERNEL); + if (!notes_section) + return -ENOMEM; + rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + if (rc < 0) { + kfree(notes_section); + return rc; + } + nhdr_ptr = notes_section; + for (j = 0; j < max_sz; j += sz) { + if (nhdr_ptr->n_namesz == 0) + break; + sz = sizeof(Elf32_Nhdr) + + ((nhdr_ptr->n_namesz + 3) & ~3) + + ((nhdr_ptr->n_descsz + 3) & ~3); + real_sz += sz; + nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz); + } + + /* Add this contiguous chunk of notes section to vmcore list.*/ + new = get_new_element(); + if (!new) { + kfree(notes_section); + return -ENOMEM; + } + new->paddr = phdr_ptr->p_offset; + new->size = real_sz; + list_add_tail(&new->list, vc_list); + phdr_sz += real_sz; + kfree(notes_section); + } + + /* Prepare merged PT_NOTE program header. 
*/ + phdr.p_type = PT_NOTE; + phdr.p_flags = 0; + note_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf32_Phdr); + phdr.p_offset = note_off; + phdr.p_vaddr = phdr.p_paddr = 0; + phdr.p_filesz = phdr.p_memsz = phdr_sz; + phdr.p_align = 0; + + /* Add merged PT_NOTE program header*/ + tmp = elfptr + sizeof(Elf32_Ehdr); + memcpy(tmp, &phdr, sizeof(phdr)); + tmp += sizeof(phdr); + + /* Remove unwanted PT_NOTE program headers. */ + i = (nr_ptnote - 1) * sizeof(Elf32_Phdr); + *elfsz = *elfsz - i; + memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf32_Ehdr)-sizeof(Elf32_Phdr))); + + /* Modify e_phnum to reflect merged headers. */ + ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + + return 0; +} + +/* Add memory chunks represented by program headers to vmcore list. Also update + * the new offset fields of exported program headers. */ +static int __init process_ptload_program_headers_elf64(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf64_Ehdr *ehdr_ptr; + Elf64_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset. 
*/ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +static int __init process_ptload_program_headers_elf32(char *elfptr, + size_t elfsz, + struct list_head *vc_list) +{ + int i; + Elf32_Ehdr *ehdr_ptr; + Elf32_Phdr *phdr_ptr; + loff_t vmcore_off; + struct vmcore *new; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + phdr_ptr = (Elf32_Phdr*)(elfptr + sizeof(Elf32_Ehdr)); /* PT_NOTE hdr */ + + /* First program header is PT_NOTE header. */ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr) + + phdr_ptr->p_memsz; /* Note sections */ + + for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) { + if (phdr_ptr->p_type != PT_LOAD) + continue; + + /* Add this contiguous chunk of memory to vmcore list.*/ + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = phdr_ptr->p_offset; + new->size = phdr_ptr->p_memsz; + list_add_tail(&new->list, vc_list); + + /* Update the program header offset */ + phdr_ptr->p_offset = vmcore_off; + vmcore_off = vmcore_off + phdr_ptr->p_memsz; + } + return 0; +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf64(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf64_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf64_Ehdr *)elfptr; + + /* Skip Elf header and program headers. */ + vmcore_off = sizeof(Elf64_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +/* Sets offset fields of vmcore elements. */ +static void __init set_vmcore_list_offsets_elf32(char *elfptr, + struct list_head *vc_list) +{ + loff_t vmcore_off; + Elf32_Ehdr *ehdr_ptr; + struct vmcore *m; + + ehdr_ptr = (Elf32_Ehdr *)elfptr; + + /* Skip Elf header and program headers. 
*/ + vmcore_off = sizeof(Elf32_Ehdr) + + (ehdr_ptr->e_phnum) * sizeof(Elf32_Phdr); + + list_for_each_entry(m, vc_list, list) { + m->offset = vmcore_off; + vmcore_off += m->size; + } +} + +static int __init parse_crash_elf64_headers(void) +{ + int rc=0; + Elf64_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. */ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !elf_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS64 || + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf64_Ehdr) || + ehdr.e_phentsize != sizeof(Elf64_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf32_headers(void) +{ + int rc=0; + Elf32_Ehdr ehdr; + u64 addr; + + addr = elfcorehdr_addr; + + /* Read Elf header */ + rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + if (rc < 0) + return rc; + + /* Do some basic Verification. 
*/ + if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || + (ehdr.e_type != ET_CORE) || + !elf_check_arch(&ehdr) || + ehdr.e_ident[EI_CLASS] != ELFCLASS32|| + ehdr.e_ident[EI_VERSION] != EV_CURRENT || + ehdr.e_version != EV_CURRENT || + ehdr.e_ehsize != sizeof(Elf32_Ehdr) || + ehdr.e_phentsize != sizeof(Elf32_Phdr) || + ehdr.e_phnum == 0) { + printk(KERN_WARNING "Warning: Core image elf header is not" + "sane\n"); + return -EINVAL; + } + + /* Read in all elf headers. */ + elfcorebuf_sz = sizeof(Elf32_Ehdr) + ehdr.e_phnum * sizeof(Elf32_Phdr); + elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL); + if (!elfcorebuf) + return -ENOMEM; + addr = elfcorehdr_addr; + rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0); + if (rc < 0) { + kfree(elfcorebuf); + return rc; + } + + /* Merge all PT_NOTE headers into one. */ + rc = merge_note_headers_elf32(elfcorebuf, &elfcorebuf_sz, &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + rc = process_ptload_program_headers_elf32(elfcorebuf, elfcorebuf_sz, + &vmcore_list); + if (rc) { + kfree(elfcorebuf); + return rc; + } + set_vmcore_list_offsets_elf32(elfcorebuf, &vmcore_list); + return 0; +} + +static int __init parse_crash_elf_headers(void) +{ + unsigned char e_ident[EI_NIDENT]; + u64 addr; + int rc=0; + + addr = elfcorehdr_addr; + rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + if (rc < 0) + return rc; + if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { + printk(KERN_WARNING "Warning: Core image elf header" + " not found\n"); + return -EINVAL; + } + + if (e_ident[EI_CLASS] == ELFCLASS64) { + rc = parse_crash_elf64_headers(); + if (rc) + return rc; + + /* Determine vmcore size. */ + vmcore_size = get_vmcore_size_elf64(elfcorebuf); + } else if (e_ident[EI_CLASS] == ELFCLASS32) { + rc = parse_crash_elf32_headers(); + if (rc) + return rc; + + /* Determine vmcore size. 
*/ + vmcore_size = get_vmcore_size_elf32(elfcorebuf); + } else { + printk(KERN_WARNING "Warning: Core image elf header is not" + " sane\n"); + return -EINVAL; + } + return 0; +} + +/* Init function for vmcore module. */ +static int __init vmcore_init(void) +{ + int rc = 0; + + /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX)) + return rc; + rc = parse_crash_elf_headers(); + if (rc) { + printk(KERN_WARNING "Kdump: vmcore not initialized\n"); + return rc; + } + + /* Initialize /proc/vmcore size if proc is already up. */ + if (proc_vmcore) + proc_vmcore->size = vmcore_size; + return 0; +} +module_init(vmcore_init) diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index cd66147cca0..7a8f5595c26 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -61,7 +61,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; else { le = (struct qnx4_link_info*)de; - ino = ( le->dl_inode_blk - 1 ) * + ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * QNX4_INODES_PER_BLOCK + le->dl_inode_ndx; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index aa92d6b76a9..b79162a3547 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -236,7 +236,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock ) struct buffer_head *bh = NULL; struct qnx4_xblk *xblk = NULL; struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode); - qnx4_nxtnt_t nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts); + u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts); if ( iblock < le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size) ) { // iblock is in the first extent. This is easy. 
@@ -372,7 +372,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) printk("qnx4: unable to read the superblock\n"); goto outnobh; } - if ( le32_to_cpu( *(__u32*)bh->b_data ) != QNX4_SUPER_MAGIC ) { + if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { if (!silent) printk("qnx4: wrong fsid in superblock.\n"); goto out; diff --git a/fs/quota.c b/fs/quota.c index 3f0333a51a2..f5d1cff5519 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -149,36 +149,6 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t return error; } -static struct super_block *get_super_to_sync(int type) -{ - struct list_head *head; - int cnt, dirty; - -restart: - spin_lock(&sb_lock); - list_for_each(head, &super_blocks) { - struct super_block *sb = list_entry(head, struct super_block, s_list); - - /* This test just improves performance so it needn't be reliable... */ - for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) - if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_any_dirty(&sb_dqopt(sb)->info[cnt])) - dirty = 1; - if (!dirty) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (!sb->s_root) { - drop_super(sb); - goto restart; - } - return sb; - } - spin_unlock(&sb_lock); - return NULL; -} - static void quota_sync_sb(struct super_block *sb, int type) { int cnt; @@ -219,17 +189,35 @@ static void quota_sync_sb(struct super_block *sb, int type) void sync_dquots(struct super_block *sb, int type) { + int cnt, dirty; + if (sb) { if (sb->s_qcop->quota_sync) quota_sync_sb(sb, type); + return; } - else { - while ((sb = get_super_to_sync(type)) != NULL) { - if (sb->s_qcop->quota_sync) - quota_sync_sb(sb, type); - drop_super(sb); - } + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + /* This test just improves performance so it needn't be reliable... 
*/ + for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++) + if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt) + && info_any_dirty(&sb_dqopt(sb)->info[cnt])) + dirty = 1; + if (!dirty) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root && sb->s_qcop->quota_sync) + quota_sync_sb(sb, type); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } + spin_unlock(&sb_lock); } /* Copy parameters and call proper function */ diff --git a/fs/read_write.c b/fs/read_write.c index c4c2bee373e..563abd09b5c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -10,7 +10,7 @@ #include <linux/file.h> #include <linux/uio.h> #include <linux/smp_lock.h> -#include <linux/dnotify.h> +#include <linux/fsnotify.h> #include <linux/security.h> #include <linux/module.h> #include <linux/syscalls.h> @@ -203,6 +203,16 @@ Einval: return -EINVAL; } +static void wait_on_retry_sync_kiocb(struct kiocb *iocb) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + if (!kiocbIsKicked(iocb)) + schedule(); + else + kiocbClearKicked(iocb); + __set_current_state(TASK_RUNNING); +} + ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; @@ -210,7 +220,10 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos); + while (-EIOCBRETRY == + (ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos))) + wait_on_retry_sync_kiocb(&kiocb); + if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -239,7 +252,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) else ret = do_sync_read(file, buf, count, pos); if (ret > 0) { - dnotify_parent(file->f_dentry, DN_ACCESS); + fsnotify_access(file->f_dentry); current->rchar += ret; } current->syscr++; @@ -258,7 +271,10 @@ 
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos); + while (-EIOCBRETRY == + (ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos))) + wait_on_retry_sync_kiocb(&kiocb); + if (-EIOCBQUEUED == ret) ret = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -287,7 +303,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ else ret = do_sync_write(file, buf, count, pos); if (ret > 0) { - dnotify_parent(file->f_dentry, DN_MODIFY); + fsnotify_modify(file->f_dentry); current->wchar += ret; } current->syscw++; @@ -523,9 +539,12 @@ static ssize_t do_readv_writev(int type, struct file *file, out: if (iov != iovstack) kfree(iov); - if ((ret + (type == READ)) > 0) - dnotify_parent(file->f_dentry, - (type == READ) ? DN_ACCESS : DN_MODIFY); + if ((ret + (type == READ)) > 0) { + if (type == READ) + fsnotify_access(file->f_dentry); + else + fsnotify_modify(file->f_dentry); + } return ret; Efault: ret = -EFAULT; diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 49c479c9454..909f71e9a30 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -46,1125 +46,1221 @@ #define TEST_OPTION(optname, s) \ test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s)) -static inline void get_bit_address (struct super_block * s, - b_blocknr_t block, int * bmap_nr, int * offset) +static inline void get_bit_address(struct super_block *s, + b_blocknr_t block, int *bmap_nr, int *offset) { - /* It is in the bitmap block number equal to the block - * number divided by the number of bits in a block. */ - *bmap_nr = block / (s->s_blocksize << 3); - /* Within that bitmap block it is located at bit offset *offset. */ - *offset = block & ((s->s_blocksize << 3) - 1 ); - return; + /* It is in the bitmap block number equal to the block + * number divided by the number of bits in a block. 
*/ + *bmap_nr = block / (s->s_blocksize << 3); + /* Within that bitmap block it is located at bit offset *offset. */ + *offset = block & ((s->s_blocksize << 3) - 1); + return; } #ifdef CONFIG_REISERFS_CHECK -int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value) +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value) { - int i, j; + int i, j; - if (block == 0 || block >= SB_BLOCK_COUNT (s)) { - reiserfs_warning (s, "vs-4010: is_reusable: block number is out of range %lu (%u)", - block, SB_BLOCK_COUNT (s)); - return 0; - } - - /* it can't be one of the bitmap blocks */ - for (i = 0; i < SB_BMAP_NR (s); i ++) - if (block == SB_AP_BITMAP (s)[i].bh->b_blocknr) { - reiserfs_warning (s, "vs: 4020: is_reusable: " - "bitmap block %lu(%u) can't be freed or reused", - block, SB_BMAP_NR (s)); - return 0; + if (block == 0 || block >= SB_BLOCK_COUNT(s)) { + reiserfs_warning(s, + "vs-4010: is_reusable: block number is out of range %lu (%u)", + block, SB_BLOCK_COUNT(s)); + return 0; } - - get_bit_address (s, block, &i, &j); - if (i >= SB_BMAP_NR (s)) { - reiserfs_warning (s, "vs-4030: is_reusable: there is no so many bitmap blocks: " - "block=%lu, bitmap_nr=%d", block, i); - return 0; - } + /* it can't be one of the bitmap blocks */ + for (i = 0; i < SB_BMAP_NR(s); i++) + if (block == SB_AP_BITMAP(s)[i].bh->b_blocknr) { + reiserfs_warning(s, "vs: 4020: is_reusable: " + "bitmap block %lu(%u) can't be freed or reused", + block, SB_BMAP_NR(s)); + return 0; + } - if ((bit_value == 0 && - reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) || - (bit_value == 1 && - reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i].bh->b_data) == 0)) { - reiserfs_warning (s, "vs-4040: is_reusable: corresponding bit of block %lu does not " - "match required value (i==%d, j==%d) test_bit==%d", - block, i, j, reiserfs_test_le_bit (j, SB_AP_BITMAP (s)[i].bh->b_data)); + get_bit_address(s, block, &i, &j); - return 0; - } + if (i >= SB_BMAP_NR(s)) { + reiserfs_warning(s, + 
"vs-4030: is_reusable: there is no so many bitmap blocks: " + "block=%lu, bitmap_nr=%d", block, i); + return 0; + } - if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) { - reiserfs_warning (s, "vs-4050: is_reusable: this is root block (%u), " - "it must be busy", SB_ROOT_BLOCK (s)); - return 0; - } + if ((bit_value == 0 && + reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) || + (bit_value == 1 && + reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data) == 0)) { + reiserfs_warning(s, + "vs-4040: is_reusable: corresponding bit of block %lu does not " + "match required value (i==%d, j==%d) test_bit==%d", + block, i, j, reiserfs_test_le_bit(j, + SB_AP_BITMAP + (s)[i].bh-> + b_data)); + + return 0; + } - return 1; + if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) { + reiserfs_warning(s, + "vs-4050: is_reusable: this is root block (%u), " + "it must be busy", SB_ROOT_BLOCK(s)); + return 0; + } + + return 1; } -#endif /* CONFIG_REISERFS_CHECK */ +#endif /* CONFIG_REISERFS_CHECK */ /* searches in journal structures for a given block number (bmap, off). If block is found in reiserfs journal it suggests next free block candidate to test. */ -static inline int is_block_in_journal (struct super_block * s, int bmap, int -off, int *next) +static inline int is_block_in_journal(struct super_block *s, int bmap, int + off, int *next) { - b_blocknr_t tmp; - - if (reiserfs_in_journal (s, bmap, off, 1, &tmp)) { - if (tmp) { /* hint supplied */ - *next = tmp; - PROC_INFO_INC( s, scan_bitmap.in_journal_hint ); - } else { - (*next) = off + 1; /* inc offset to avoid looping. */ - PROC_INFO_INC( s, scan_bitmap.in_journal_nohint ); + b_blocknr_t tmp; + + if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) { + if (tmp) { /* hint supplied */ + *next = tmp; + PROC_INFO_INC(s, scan_bitmap.in_journal_hint); + } else { + (*next) = off + 1; /* inc offset to avoid looping. 
*/ + PROC_INFO_INC(s, scan_bitmap.in_journal_nohint); + } + PROC_INFO_INC(s, scan_bitmap.retry); + return 1; } - PROC_INFO_INC( s, scan_bitmap.retry ); - return 1; - } - return 0; + return 0; } /* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap * block; */ -static int scan_bitmap_block (struct reiserfs_transaction_handle *th, - int bmap_n, int *beg, int boundary, int min, int max, int unfm) +static int scan_bitmap_block(struct reiserfs_transaction_handle *th, + int bmap_n, int *beg, int boundary, int min, + int max, int unfm) { - struct super_block *s = th->t_super; - struct reiserfs_bitmap_info *bi=&SB_AP_BITMAP(s)[bmap_n]; - int end, next; - int org = *beg; + struct super_block *s = th->t_super; + struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n]; + int end, next; + int org = *beg; - BUG_ON (!th->t_trans_id); + BUG_ON(!th->t_trans_id); - RFALSE(bmap_n >= SB_BMAP_NR (s), "Bitmap %d is out of range (0..%d)",bmap_n, SB_BMAP_NR (s) - 1); - PROC_INFO_INC( s, scan_bitmap.bmap ); + RFALSE(bmap_n >= SB_BMAP_NR(s), "Bitmap %d is out of range (0..%d)", + bmap_n, SB_BMAP_NR(s) - 1); + PROC_INFO_INC(s, scan_bitmap.bmap); /* this is unclear and lacks comments, explain how journal bitmaps work here for the reader. Convey a sense of the design here. What is a window? */ /* - I mean `a window of zero bits' as in description of this function - Zam. 
*/ - - if ( !bi ) { - reiserfs_warning (s, "NULL bitmap info pointer for bitmap %d", bmap_n); - return 0; - } - if (buffer_locked (bi->bh)) { - PROC_INFO_INC( s, scan_bitmap.wait ); - __wait_on_buffer (bi->bh); - } - - while (1) { - cont: - if (bi->free_count < min) - return 0; // No free blocks in this bitmap - - /* search for a first zero bit -- beggining of a window */ - *beg = reiserfs_find_next_zero_le_bit - ((unsigned long*)(bi->bh->b_data), boundary, *beg); - - if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block - * cannot contain a zero window of minimum size */ - return 0; - } - if (unfm && is_block_in_journal(s,bmap_n, *beg, beg)) - continue; - /* first zero bit found; we check next bits */ - for (end = *beg + 1;; end ++) { - if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit (end, bi->bh->b_data)) { - next = end; - break; - } - /* finding the other end of zero bit window requires looking into journal structures (in - * case of searching for free blocks for unformatted nodes) */ - if (unfm && is_block_in_journal(s, bmap_n, end, &next)) - break; + if (!bi) { + reiserfs_warning(s, "NULL bitmap info pointer for bitmap %d", + bmap_n); + return 0; + } + if (buffer_locked(bi->bh)) { + PROC_INFO_INC(s, scan_bitmap.wait); + __wait_on_buffer(bi->bh); } - /* now (*beg) points to beginning of zero bits window, - * (end) points to one bit after the window end */ - if (end - *beg >= min) { /* it seems we have found window of proper size */ - int i; - reiserfs_prepare_for_journal (s, bi->bh, 1); - /* try to set all blocks used checking are they still free */ - for (i = *beg; i < end; i++) { - /* It seems that we should not check in journal again. 
*/ - if (reiserfs_test_and_set_le_bit (i, bi->bh->b_data)) { - /* bit was set by another process - * while we slept in prepare_for_journal() */ - PROC_INFO_INC( s, scan_bitmap.stolen ); - if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, - * if length of this set is more or equal to `min' */ - end = i; - break; - } - /* otherwise we clear all bit were set ... */ - while (--i >= *beg) - reiserfs_test_and_clear_le_bit (i, bi->bh->b_data); - reiserfs_restore_prepared_buffer (s, bi->bh); - *beg = org; - /* ... and search again in current block from beginning */ - goto cont; + while (1) { + cont: + if (bi->free_count < min) + return 0; // No free blocks in this bitmap + + /* search for a first zero bit -- beggining of a window */ + *beg = reiserfs_find_next_zero_le_bit + ((unsigned long *)(bi->bh->b_data), boundary, *beg); + + if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block + * cannot contain a zero window of minimum size */ + return 0; } - } - bi->free_count -= (end - *beg); - journal_mark_dirty (th, s, bi->bh); - /* free block count calculation */ - reiserfs_prepare_for_journal (s, SB_BUFFER_WITH_SB(s), 1); - PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); - journal_mark_dirty (th, s, SB_BUFFER_WITH_SB(s)); + if (unfm && is_block_in_journal(s, bmap_n, *beg, beg)) + continue; + /* first zero bit found; we check next bits */ + for (end = *beg + 1;; end++) { + if (end >= *beg + max || end >= boundary + || reiserfs_test_le_bit(end, bi->bh->b_data)) { + next = end; + break; + } + /* finding the other end of zero bit window requires looking into journal structures (in + * case of searching for free blocks for unformatted nodes) */ + if (unfm && is_block_in_journal(s, bmap_n, end, &next)) + break; + } - return end - (*beg); - } else { - *beg = next; + /* now (*beg) points to beginning of zero bits window, + * (end) points to one bit after the window end */ + if (end - *beg >= min) { /* it seems we 
have found window of proper size */ + int i; + reiserfs_prepare_for_journal(s, bi->bh, 1); + /* try to set all blocks used checking are they still free */ + for (i = *beg; i < end; i++) { + /* It seems that we should not check in journal again. */ + if (reiserfs_test_and_set_le_bit + (i, bi->bh->b_data)) { + /* bit was set by another process + * while we slept in prepare_for_journal() */ + PROC_INFO_INC(s, scan_bitmap.stolen); + if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks, + * if length of this set is more or equal to `min' */ + end = i; + break; + } + /* otherwise we clear all bit were set ... */ + while (--i >= *beg) + reiserfs_test_and_clear_le_bit + (i, bi->bh->b_data); + reiserfs_restore_prepared_buffer(s, + bi-> + bh); + *beg = org; + /* ... and search again in current block from beginning */ + goto cont; + } + } + bi->free_count -= (end - *beg); + journal_mark_dirty(th, s, bi->bh); + + /* free block count calculation */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg)); + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + + return end - (*beg); + } else { + *beg = next; + } } - } } -static int bmap_hash_id(struct super_block *s, u32 id) { - char * hash_in = NULL; - unsigned long hash; - unsigned bm; - - if (id <= 2) { - bm = 1; - } else { - hash_in = (char *)(&id); - hash = keyed_hash(hash_in, 4); - bm = hash % SB_BMAP_NR(s); - if (!bm) - bm = 1; - } - /* this can only be true when SB_BMAP_NR = 1 */ - if (bm >= SB_BMAP_NR(s)) - bm = 0; - return bm; +static int bmap_hash_id(struct super_block *s, u32 id) +{ + char *hash_in = NULL; + unsigned long hash; + unsigned bm; + + if (id <= 2) { + bm = 1; + } else { + hash_in = (char *)(&id); + hash = keyed_hash(hash_in, 4); + bm = hash % SB_BMAP_NR(s); + if (!bm) + bm = 1; + } + /* this can only be true when SB_BMAP_NR = 1 */ + if (bm >= SB_BMAP_NR(s)) + bm = 0; + return bm; } /* * hashes the id and then returns > 
0 if the block group for the * corresponding hash is full */ -static inline int block_group_used(struct super_block *s, u32 id) { - int bm; - bm = bmap_hash_id(s, id); - if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100) ) { - return 0; - } - return 1; +static inline int block_group_used(struct super_block *s, u32 id) +{ + int bm; + bm = bmap_hash_id(s, id); + if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100)) { + return 0; + } + return 1; } /* * the packing is returned in disk byte order */ -__le32 reiserfs_choose_packing(struct inode *dir) +__le32 reiserfs_choose_packing(struct inode * dir) { - __le32 packing; - if (TEST_OPTION(packing_groups, dir->i_sb)) { - u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); - /* - * some versions of reiserfsck expect packing locality 1 to be - * special - */ - if (parent_dir == 1 || block_group_used(dir->i_sb,parent_dir)) - packing = INODE_PKEY(dir)->k_objectid; - else - packing = INODE_PKEY(dir)->k_dir_id; - } else - packing = INODE_PKEY(dir)->k_objectid; - return packing; + __le32 packing; + if (TEST_OPTION(packing_groups, dir->i_sb)) { + u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id); + /* + * some versions of reiserfsck expect packing locality 1 to be + * special + */ + if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir)) + packing = INODE_PKEY(dir)->k_objectid; + else + packing = INODE_PKEY(dir)->k_dir_id; + } else + packing = INODE_PKEY(dir)->k_objectid; + return packing; } - + /* Tries to find contiguous zero bit window (given size) in given region of * bitmap and place new blocks there. Returns number of allocated blocks. 
*/ -static int scan_bitmap (struct reiserfs_transaction_handle *th, - b_blocknr_t *start, b_blocknr_t finish, - int min, int max, int unfm, unsigned long file_block) +static int scan_bitmap(struct reiserfs_transaction_handle *th, + b_blocknr_t * start, b_blocknr_t finish, + int min, int max, int unfm, unsigned long file_block) { - int nr_allocated=0; - struct super_block * s = th->t_super; - /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr - * - Hans, it is not a block number - Zam. */ - - int bm, off; - int end_bm, end_off; - int off_max = s->s_blocksize << 3; - - BUG_ON (!th->t_trans_id); - - PROC_INFO_INC( s, scan_bitmap.call ); - if ( SB_FREE_BLOCKS(s) <= 0) - return 0; // No point in looking for more free blocks - - get_bit_address (s, *start, &bm, &off); - get_bit_address (s, finish, &end_bm, &end_off); - if (bm > SB_BMAP_NR(s)) - return 0; - if (end_bm > SB_BMAP_NR(s)) - end_bm = SB_BMAP_NR(s); - - /* When the bitmap is more than 10% free, anyone can allocate. - * When it's less than 10% free, only files that already use the - * bitmap are allowed. Once we pass 80% full, this restriction - * is lifted. - * - * We do this so that files that grow later still have space close to - * their original allocation. This improves locality, and presumably - * performance as a result. - * - * This is only an allocation policy and does not make up for getting a - * bad hint. Decent hinting must be implemented for this to work well. 
- */ - if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) { - for (;bm < end_bm; bm++, off = 0) { - if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 ) - nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); - if (nr_allocated) - goto ret; - } - /* we know from above that start is a reasonable number */ - get_bit_address (s, *start, &bm, &off); - } - - for (;bm < end_bm; bm++, off = 0) { - nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); - if (nr_allocated) - goto ret; - } - - nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); - - ret: - *start = bm * off_max + off; - return nr_allocated; + int nr_allocated = 0; + struct super_block *s = th->t_super; + /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr + * - Hans, it is not a block number - Zam. */ + + int bm, off; + int end_bm, end_off; + int off_max = s->s_blocksize << 3; + + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(s, scan_bitmap.call); + if (SB_FREE_BLOCKS(s) <= 0) + return 0; // No point in looking for more free blocks + + get_bit_address(s, *start, &bm, &off); + get_bit_address(s, finish, &end_bm, &end_off); + if (bm > SB_BMAP_NR(s)) + return 0; + if (end_bm > SB_BMAP_NR(s)) + end_bm = SB_BMAP_NR(s); + + /* When the bitmap is more than 10% free, anyone can allocate. + * When it's less than 10% free, only files that already use the + * bitmap are allowed. Once we pass 80% full, this restriction + * is lifted. + * + * We do this so that files that grow later still have space close to + * their original allocation. This improves locality, and presumably + * performance as a result. + * + * This is only an allocation policy and does not make up for getting a + * bad hint. Decent hinting must be implemented for this to work well. 
+ */ + if (TEST_OPTION(skip_busy, s) + && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) { + for (; bm < end_bm; bm++, off = 0) { + if ((off && (!unfm || (file_block != 0))) + || SB_AP_BITMAP(s)[bm].free_count > + (s->s_blocksize << 3) / 10) + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, + min, max, unfm); + if (nr_allocated) + goto ret; + } + /* we know from above that start is a reasonable number */ + get_bit_address(s, *start, &bm, &off); + } + + for (; bm < end_bm; bm++, off = 0) { + nr_allocated = + scan_bitmap_block(th, bm, &off, off_max, min, max, unfm); + if (nr_allocated) + goto ret; + } + + nr_allocated = + scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm); + + ret: + *start = bm * off_max + off; + return nr_allocated; } -static void _reiserfs_free_block (struct reiserfs_transaction_handle *th, - struct inode *inode, b_blocknr_t block, - int for_unformatted) +static void _reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) { - struct super_block * s = th->t_super; - struct reiserfs_super_block * rs; - struct buffer_head * sbh; - struct reiserfs_bitmap_info *apbi; - int nr, offset; + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs; + struct buffer_head *sbh; + struct reiserfs_bitmap_info *apbi; + int nr, offset; - BUG_ON (!th->t_trans_id); + BUG_ON(!th->t_trans_id); - PROC_INFO_INC( s, free_block ); + PROC_INFO_INC(s, free_block); - rs = SB_DISK_SUPER_BLOCK (s); - sbh = SB_BUFFER_WITH_SB (s); - apbi = SB_AP_BITMAP(s); + rs = SB_DISK_SUPER_BLOCK(s); + sbh = SB_BUFFER_WITH_SB(s); + apbi = SB_AP_BITMAP(s); - get_bit_address (s, block, &nr, &offset); + get_bit_address(s, block, &nr, &offset); - if (nr >= sb_bmap_nr (rs)) { - reiserfs_warning (s, "vs-4075: reiserfs_free_block: " - "block %lu is out of range on %s", - block, reiserfs_bdevname (s)); - return; - } - - reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ; - - /* clear bit for the given 
block in bit map */ - if (!reiserfs_test_and_clear_le_bit (offset, apbi[nr].bh->b_data)) { - reiserfs_warning (s, "vs-4080: reiserfs_free_block: " - "free_block (%s:%lu)[dev:blocknr]: bit already cleared", - reiserfs_bdevname (s), block); - } - apbi[nr].free_count ++; - journal_mark_dirty (th, s, apbi[nr].bh); - - reiserfs_prepare_for_journal(s, sbh, 1) ; - /* update super block */ - set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 ); - - journal_mark_dirty (th, s, sbh); - if (for_unformatted) - DQUOT_FREE_BLOCK_NODIRTY(inode, 1); + if (nr >= sb_bmap_nr(rs)) { + reiserfs_warning(s, "vs-4075: reiserfs_free_block: " + "block %lu is out of range on %s", + block, reiserfs_bdevname(s)); + return; + } + + reiserfs_prepare_for_journal(s, apbi[nr].bh, 1); + + /* clear bit for the given block in bit map */ + if (!reiserfs_test_and_clear_le_bit(offset, apbi[nr].bh->b_data)) { + reiserfs_warning(s, "vs-4080: reiserfs_free_block: " + "free_block (%s:%lu)[dev:blocknr]: bit already cleared", + reiserfs_bdevname(s), block); + } + apbi[nr].free_count++; + journal_mark_dirty(th, s, apbi[nr].bh); + + reiserfs_prepare_for_journal(s, sbh, 1); + /* update super block */ + set_sb_free_blocks(rs, sb_free_blocks(rs) + 1); + + journal_mark_dirty(th, s, sbh); + if (for_unformatted) + DQUOT_FREE_BLOCK_NODIRTY(inode, 1); } -void reiserfs_free_block (struct reiserfs_transaction_handle *th, - struct inode *inode, b_blocknr_t block, - int for_unformatted) +void reiserfs_free_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block, + int for_unformatted) { - struct super_block * s = th->t_super; + struct super_block *s = th->t_super; - BUG_ON (!th->t_trans_id); + BUG_ON(!th->t_trans_id); - RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); - RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block"); - /* mark it before we clear it, just in case */ - journal_mark_freed(th, s, block) ; - _reiserfs_free_block(th, inode, block, 
for_unformatted) ; + RFALSE(!s, "vs-4061: trying to free block on nonexistent device"); + RFALSE(is_reusable(s, block, 1) == 0, + "vs-4071: can not free such block"); + /* mark it before we clear it, just in case */ + journal_mark_freed(th, s, block); + _reiserfs_free_block(th, inode, block, for_unformatted); } /* preallocated blocks don't need to be run through journal_mark_freed */ -static void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th, - struct inode *inode, b_blocknr_t block) { - RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device"); - RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block"); - BUG_ON (!th->t_trans_id); - _reiserfs_free_block(th, inode, block, 1) ; +static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th, + struct inode *inode, b_blocknr_t block) +{ + RFALSE(!th->t_super, + "vs-4060: trying to free block on nonexistent device"); + RFALSE(is_reusable(th->t_super, block, 1) == 0, + "vs-4070: can not free such block"); + BUG_ON(!th->t_trans_id); + _reiserfs_free_block(th, inode, block, 1); } -static void __discard_prealloc (struct reiserfs_transaction_handle * th, - struct reiserfs_inode_info *ei) +static void __discard_prealloc(struct reiserfs_transaction_handle *th, + struct reiserfs_inode_info *ei) { - unsigned long save = ei->i_prealloc_block ; - int dirty = 0; - struct inode *inode = &ei->vfs_inode; - BUG_ON (!th->t_trans_id); + unsigned long save = ei->i_prealloc_block; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; + BUG_ON(!th->t_trans_id); #ifdef CONFIG_REISERFS_CHECK - if (ei->i_prealloc_count < 0) - reiserfs_warning (th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.", __FUNCTION__ ); + if (ei->i_prealloc_count < 0) + reiserfs_warning(th->t_super, + "zam-4001:%s: inode has negative prealloc blocks count.", + __FUNCTION__); #endif - while (ei->i_prealloc_count > 0) { - reiserfs_free_prealloc_block(th, inode, 
ei->i_prealloc_block); - ei->i_prealloc_block++; - ei->i_prealloc_count --; - dirty = 1; - } - if (dirty) - reiserfs_update_sd(th, inode); - ei->i_prealloc_block = save; - list_del_init(&(ei->i_prealloc_list)); + while (ei->i_prealloc_count > 0) { + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); + ei->i_prealloc_block++; + ei->i_prealloc_count--; + dirty = 1; + } + if (dirty) + reiserfs_update_sd(th, inode); + ei->i_prealloc_block = save; + list_del_init(&(ei->i_prealloc_list)); } /* FIXME: It should be inline function */ -void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, - struct inode *inode) +void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th, + struct inode *inode) { - struct reiserfs_inode_info *ei = REISERFS_I(inode); - BUG_ON (!th->t_trans_id); - if (ei->i_prealloc_count) - __discard_prealloc(th, ei); + struct reiserfs_inode_info *ei = REISERFS_I(inode); + BUG_ON(!th->t_trans_id); + if (ei->i_prealloc_count) + __discard_prealloc(th, ei); } -void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th) +void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) { - struct list_head * plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; + struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list; - BUG_ON (!th->t_trans_id); + BUG_ON(!th->t_trans_id); - while (!list_empty(plist)) { - struct reiserfs_inode_info *ei; - ei = list_entry(plist->next, struct reiserfs_inode_info, i_prealloc_list); + while (!list_empty(plist)) { + struct reiserfs_inode_info *ei; + ei = list_entry(plist->next, struct reiserfs_inode_info, + i_prealloc_list); #ifdef CONFIG_REISERFS_CHECK - if (!ei->i_prealloc_count) { - reiserfs_warning (th->t_super, "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", __FUNCTION__); - } + if (!ei->i_prealloc_count) { + reiserfs_warning(th->t_super, + "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", + __FUNCTION__); 
+ } #endif - __discard_prealloc(th, ei); - } + __discard_prealloc(th, ei); + } } -void reiserfs_init_alloc_options (struct super_block *s) +void reiserfs_init_alloc_options(struct super_block *s) { - set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); - set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); - set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s)); + set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s)); } /* block allocator related options are parsed here */ -int reiserfs_parse_alloc_options(struct super_block * s, char * options) +int reiserfs_parse_alloc_options(struct super_block *s, char *options) { - char * this_char, * value; - - REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ - - while ( (this_char = strsep (&options, ":")) != NULL ) { - if ((value = strchr (this_char, '=')) != NULL) - *value++ = 0; - - if (!strcmp(this_char, "concentrating_formatted_nodes")) { - int temp; - SET_OPTION(concentrating_formatted_nodes); - temp = (value && *value) ? simple_strtoul (value, &value, 0) : 10; - if (temp <= 0 || temp > 100) { - REISERFS_SB(s)->s_alloc_options.border = 10; - } else { - REISERFS_SB(s)->s_alloc_options.border = 100 / temp; - } - continue; - } - if (!strcmp(this_char, "displacing_large_files")) { - SET_OPTION(displacing_large_files); - REISERFS_SB(s)->s_alloc_options.large_file_size = - (value && *value) ? 
simple_strtoul (value, &value, 0) : 16; - continue; - } - if (!strcmp(this_char, "displacing_new_packing_localities")) { - SET_OPTION(displacing_new_packing_localities); - continue; - }; - - if (!strcmp(this_char, "old_hashed_relocation")) { - SET_OPTION(old_hashed_relocation); - continue; - } + char *this_char, *value; + + REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */ + + while ((this_char = strsep(&options, ":")) != NULL) { + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, "concentrating_formatted_nodes")) { + int temp; + SET_OPTION(concentrating_formatted_nodes); + temp = (value + && *value) ? simple_strtoul(value, &value, + 0) : 10; + if (temp <= 0 || temp > 100) { + REISERFS_SB(s)->s_alloc_options.border = 10; + } else { + REISERFS_SB(s)->s_alloc_options.border = + 100 / temp; + } + continue; + } + if (!strcmp(this_char, "displacing_large_files")) { + SET_OPTION(displacing_large_files); + REISERFS_SB(s)->s_alloc_options.large_file_size = + (value + && *value) ? 
simple_strtoul(value, &value, 0) : 16; + continue; + } + if (!strcmp(this_char, "displacing_new_packing_localities")) { + SET_OPTION(displacing_new_packing_localities); + continue; + }; + + if (!strcmp(this_char, "old_hashed_relocation")) { + SET_OPTION(old_hashed_relocation); + continue; + } - if (!strcmp(this_char, "new_hashed_relocation")) { - SET_OPTION(new_hashed_relocation); - continue; - } + if (!strcmp(this_char, "new_hashed_relocation")) { + SET_OPTION(new_hashed_relocation); + continue; + } - if (!strcmp(this_char, "dirid_groups")) { - SET_OPTION(dirid_groups); - continue; - } - if (!strcmp(this_char, "oid_groups")) { - SET_OPTION(oid_groups); - continue; - } - if (!strcmp(this_char, "packing_groups")) { - SET_OPTION(packing_groups); - continue; - } - if (!strcmp(this_char, "hashed_formatted_nodes")) { - SET_OPTION(hashed_formatted_nodes); - continue; - } + if (!strcmp(this_char, "dirid_groups")) { + SET_OPTION(dirid_groups); + continue; + } + if (!strcmp(this_char, "oid_groups")) { + SET_OPTION(oid_groups); + continue; + } + if (!strcmp(this_char, "packing_groups")) { + SET_OPTION(packing_groups); + continue; + } + if (!strcmp(this_char, "hashed_formatted_nodes")) { + SET_OPTION(hashed_formatted_nodes); + continue; + } - if (!strcmp(this_char, "skip_busy")) { - SET_OPTION(skip_busy); - continue; - } + if (!strcmp(this_char, "skip_busy")) { + SET_OPTION(skip_busy); + continue; + } - if (!strcmp(this_char, "hundredth_slices")) { - SET_OPTION(hundredth_slices); - continue; - } + if (!strcmp(this_char, "hundredth_slices")) { + SET_OPTION(hundredth_slices); + continue; + } - if (!strcmp(this_char, "old_way")) { - SET_OPTION(old_way); - continue; - } + if (!strcmp(this_char, "old_way")) { + SET_OPTION(old_way); + continue; + } - if (!strcmp(this_char, "displace_based_on_dirid")) { - SET_OPTION(displace_based_on_dirid); - continue; - } + if (!strcmp(this_char, "displace_based_on_dirid")) { + SET_OPTION(displace_based_on_dirid); + continue; + } - if 
(!strcmp(this_char, "preallocmin")) { - REISERFS_SB(s)->s_alloc_options.preallocmin = - (value && *value) ? simple_strtoul (value, &value, 0) : 4; - continue; - } + if (!strcmp(this_char, "preallocmin")) { + REISERFS_SB(s)->s_alloc_options.preallocmin = + (value + && *value) ? simple_strtoul(value, &value, 0) : 4; + continue; + } + + if (!strcmp(this_char, "preallocsize")) { + REISERFS_SB(s)->s_alloc_options.preallocsize = + (value + && *value) ? simple_strtoul(value, &value, + 0) : + PREALLOCATION_SIZE; + continue; + } - if (!strcmp(this_char, "preallocsize")) { - REISERFS_SB(s)->s_alloc_options.preallocsize = - (value && *value) ? simple_strtoul (value, &value, 0) : PREALLOCATION_SIZE; - continue; + reiserfs_warning(s, "zam-4001: %s : unknown option - %s", + __FUNCTION__, this_char); + return 1; } - reiserfs_warning (s, "zam-4001: %s : unknown option - %s", - __FUNCTION__ , this_char); - return 1; - } - - reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); - return 0; + reiserfs_warning(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s)); + return 0; } - -static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint) + +static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint) { - char * hash_in; - if (hint->formatted_node) { - hash_in = (char*)&hint->key.k_dir_id; - } else { - if (!hint->inode) { - //hint->search_start = hint->beg; - hash_in = (char*)&hint->key.k_dir_id; - } else - if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) - hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); - else - hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); - } + char *hash_in; + if (hint->formatted_node) { + hash_in = (char *)&hint->key.k_dir_id; + } else { + if (!hint->inode) { + //hint->search_start = hint->beg; + hash_in = (char *)&hint->key.k_dir_id; + } else + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = + 
(char *)(&INODE_PKEY(hint->inode)->k_objectid); + } - hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); } /* * Relocation based on dirid, hashing them into a given bitmap block * files. Formatted nodes are unaffected, a seperate policy covers them */ -static void -dirid_groups (reiserfs_blocknr_hint_t *hint) +static void dirid_groups(reiserfs_blocknr_hint_t * hint) { - unsigned long hash; - __u32 dirid = 0; - int bm = 0; - struct super_block *sb = hint->th->t_super; - if (hint->inode) - dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); - else if (hint->formatted_node) - dirid = hint->key.k_dir_id; - - if (dirid) { - bm = bmap_hash_id(sb, dirid); - hash = bm * (sb->s_blocksize << 3); - /* give a portion of the block group to metadata */ + unsigned long hash; + __u32 dirid = 0; + int bm = 0; + struct super_block *sb = hint->th->t_super; if (hint->inode) - hash += sb->s_blocksize/2; - hint->search_start = hash; - } + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + else if (hint->formatted_node) + dirid = hint->key.k_dir_id; + + if (dirid) { + bm = bmap_hash_id(sb, dirid); + hash = bm * (sb->s_blocksize << 3); + /* give a portion of the block group to metadata */ + if (hint->inode) + hash += sb->s_blocksize / 2; + hint->search_start = hash; + } } /* * Relocation based on oid, hashing them into a given bitmap block * files. 
Formatted nodes are unaffected, a seperate policy covers them */ -static void -oid_groups (reiserfs_blocknr_hint_t *hint) +static void oid_groups(reiserfs_blocknr_hint_t * hint) { - if (hint->inode) { - unsigned long hash; - __u32 oid; - __u32 dirid; - int bm; - - dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); - - /* keep the root dir and it's first set of subdirs close to - * the start of the disk - */ - if (dirid <= 2) - hash = (hint->inode->i_sb->s_blocksize << 3); - else { - oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); - bm = bmap_hash_id(hint->inode->i_sb, oid); - hash = bm * (hint->inode->i_sb->s_blocksize << 3); + if (hint->inode) { + unsigned long hash; + __u32 oid; + __u32 dirid; + int bm; + + dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id); + + /* keep the root dir and it's first set of subdirs close to + * the start of the disk + */ + if (dirid <= 2) + hash = (hint->inode->i_sb->s_blocksize << 3); + else { + oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid); + bm = bmap_hash_id(hint->inode->i_sb, oid); + hash = bm * (hint->inode->i_sb->s_blocksize << 3); + } + hint->search_start = hash; } - hint->search_start = hash; - } } /* returns 1 if it finds an indirect item and gets valid hint info * from it, otherwise 0 */ -static int get_left_neighbor(reiserfs_blocknr_hint_t *hint) +static int get_left_neighbor(reiserfs_blocknr_hint_t * hint) { - struct path * path; - struct buffer_head * bh; - struct item_head * ih; - int pos_in_item; - __le32 * item; - int ret = 0; - - if (!hint->path) /* reiserfs code can call this function w/o pointer to path + struct path *path; + struct buffer_head *bh; + struct item_head *ih; + int pos_in_item; + __le32 *item; + int ret = 0; + + if (!hint->path) /* reiserfs code can call this function w/o pointer to path * structure supplied; then we rely on supplied search_start */ - return 0; - - path = hint->path; - bh = get_last_bh(path); - RFALSE( !bh, "green-4002: Illegal path specified to 
get_left_neighbor"); - ih = get_ih(path); - pos_in_item = path->pos_in_item; - item = get_item (path); - - hint->search_start = bh->b_blocknr; - - if (!hint->formatted_node && is_indirect_le_ih (ih)) { - /* for indirect item: go to left and look for the first non-hole entry - in the indirect item */ - if (pos_in_item == I_UNFM_NUM (ih)) - pos_in_item--; -// pos_in_item = I_UNFM_NUM (ih) - 1; - while (pos_in_item >= 0) { - int t=get_block_num(item,pos_in_item); - if (t) { - hint->search_start = t; - ret = 1; - break; - } - pos_in_item --; + return 0; + + path = hint->path; + bh = get_last_bh(path); + RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor"); + ih = get_ih(path); + pos_in_item = path->pos_in_item; + item = get_item(path); + + hint->search_start = bh->b_blocknr; + + if (!hint->formatted_node && is_indirect_le_ih(ih)) { + /* for indirect item: go to left and look for the first non-hole entry + in the indirect item */ + if (pos_in_item == I_UNFM_NUM(ih)) + pos_in_item--; +// pos_in_item = I_UNFM_NUM (ih) - 1; + while (pos_in_item >= 0) { + int t = get_block_num(item, pos_in_item); + if (t) { + hint->search_start = t; + ret = 1; + break; + } + pos_in_item--; + } } - } - /* does result value fit into specified region? */ - return ret; + /* does result value fit into specified region? */ + return ret; } /* should be, if formatted node, then try to put on first part of the device specified as number of percent with mount option device, else try to put on last of device. This is not to say it is good code to do so, but the effect should be measured. 
*/ -static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t *hint) +static inline void set_border_in_hint(struct super_block *s, + reiserfs_blocknr_hint_t * hint) { - b_blocknr_t border = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; + b_blocknr_t border = + SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border; - if (hint->formatted_node) - hint->end = border - 1; - else - hint->beg = border; + if (hint->formatted_node) + hint->end = border - 1; + else + hint->beg = border; } -static inline void displace_large_file(reiserfs_blocknr_hint_t *hint) +static inline void displace_large_file(reiserfs_blocknr_hint_t * hint) { - if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) - hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), 4) % (hint->end - hint->beg); - else - hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), 4) % (hint->end - hint->beg); + if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), + 4) % (hint->end - hint->beg); + else + hint->search_start = + hint->beg + + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), + 4) % (hint->end - hint->beg); } -static inline void hash_formatted_node(reiserfs_blocknr_hint_t *hint) +static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint) { - char * hash_in; + char *hash_in; - if (!hint->inode) - hash_in = (char*)&hint->key.k_dir_id; - else if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) - hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); - else - hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid); + if (!hint->inode) + hash_in = (char *)&hint->key.k_dir_id; + else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super)) + hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id); + else + hash_in = (char 
*)(&INODE_PKEY(hint->inode)->k_objectid); - hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); + hint->search_start = + hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg); } -static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *hint) +static inline int +this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t * + hint) { - return hint->block == REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; + return hint->block == + REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size; } #ifdef DISPLACE_NEW_PACKING_LOCALITIES -static inline void displace_new_packing_locality (reiserfs_blocknr_hint_t *hint) +static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint) { - struct in_core_key * key = &hint->key; + struct in_core_key *key = &hint->key; - hint->th->displace_new_blocks = 0; - hint->search_start = hint->beg + keyed_hash((char*)(&key->k_objectid),4) % (hint->end - hint->beg); + hint->th->displace_new_blocks = 0; + hint->search_start = + hint->beg + keyed_hash((char *)(&key->k_objectid), + 4) % (hint->end - hint->beg); } - #endif +#endif -static inline int old_hashed_relocation (reiserfs_blocknr_hint_t * hint) +static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint) { - b_blocknr_t border; - u32 hash_in; - - if (hint->formatted_node || hint->inode == NULL) { - return 0; - } + b_blocknr_t border; + u32 hash_in; - hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); - border = hint->beg + (u32) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1); - if (border > hint->search_start) - hint->search_start = border; + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } - return 1; - } - -static inline int old_way (reiserfs_blocknr_hint_t * hint) -{ - b_blocknr_t border; - - if (hint->formatted_node || hint->inode == NULL) { - return 0; - } - - border = hint->beg + 
le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - hint->beg); - if (border > hint->search_start) - hint->search_start = border; + hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id); + border = + hint->beg + (u32) keyed_hash(((char *)(&hash_in)), + 4) % (hint->end - hint->beg - 1); + if (border > hint->search_start) + hint->search_start = border; - return 1; + return 1; } -static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint) +static inline int old_way(reiserfs_blocknr_hint_t * hint) { - struct in_core_key * key = &hint->key; - b_blocknr_t slice_start; + b_blocknr_t border; + + if (hint->formatted_node || hint->inode == NULL) { + return 0; + } + + border = + hint->beg + + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end - + hint->beg); + if (border > hint->search_start) + hint->search_start = border; - slice_start = (keyed_hash((char*)(&key->k_dir_id),4) % 100) * (hint->end / 100); - if ( slice_start > hint->search_start || slice_start + (hint->end / 100) <= hint->search_start) { - hint->search_start = slice_start; - } + return 1; +} + +static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint) +{ + struct in_core_key *key = &hint->key; + b_blocknr_t slice_start; + + slice_start = + (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100); + if (slice_start > hint->search_start + || slice_start + (hint->end / 100) <= hint->search_start) { + hint->search_start = slice_start; + } } - -static void determine_search_start(reiserfs_blocknr_hint_t *hint, - int amount_needed) + +static void determine_search_start(reiserfs_blocknr_hint_t * hint, + int amount_needed) { - struct super_block *s = hint->th->t_super; - int unfm_hint; + struct super_block *s = hint->th->t_super; + int unfm_hint; - hint->beg = 0; - hint->end = SB_BLOCK_COUNT(s) - 1; + hint->beg = 0; + hint->end = SB_BLOCK_COUNT(s) - 1; - /* This is former border algorithm. 
Now with tunable border offset */ - if (concentrating_formatted_nodes(s)) - set_border_in_hint(s, hint); + /* This is former border algorithm. Now with tunable border offset */ + if (concentrating_formatted_nodes(s)) + set_border_in_hint(s, hint); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* whenever we create a new directory, we displace it. At first we will - hash for location, later we might look for a moderately empty place for - it */ - if (displacing_new_packing_localities(s) - && hint->th->displace_new_blocks) { - displace_new_packing_locality(hint); - - /* we do not continue determine_search_start, - * if new packing locality is being displaced */ - return; - } + /* whenever we create a new directory, we displace it. At first we will + hash for location, later we might look for a moderately empty place for + it */ + if (displacing_new_packing_localities(s) + && hint->th->displace_new_blocks) { + displace_new_packing_locality(hint); + + /* we do not continue determine_search_start, + * if new packing locality is being displaced */ + return; + } #endif - - /* all persons should feel encouraged to add more special cases here and - * test them */ - if (displacing_large_files(s) && !hint->formatted_node - && this_blocknr_allocation_would_make_it_a_large_file(hint)) { - displace_large_file(hint); - return; - } - - /* if none of our special cases is relevant, use the left neighbor in the - tree order of the new node we are allocating for */ - if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) { - hash_formatted_node(hint); - return; - } + /* all persons should feel encouraged to add more special cases here and + * test them */ - unfm_hint = get_left_neighbor(hint); + if (displacing_large_files(s) && !hint->formatted_node + && this_blocknr_allocation_would_make_it_a_large_file(hint)) { + displace_large_file(hint); + return; + } - /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, - new blocks are displaced based on 
directory ID. Also, if suggested search_start - is less than last preallocated block, we start searching from it, assuming that - HDD dataflow is faster in forward direction */ - if ( TEST_OPTION(old_way, s)) { - if (!hint->formatted_node) { - if ( !reiserfs_hashed_relocation(s)) - old_way(hint); - else if (!reiserfs_no_unhashed_relocation(s)) - old_hashed_relocation(hint); + /* if none of our special cases is relevant, use the left neighbor in the + tree order of the new node we are allocating for */ + if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) { + hash_formatted_node(hint); + return; + } - if ( hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block) - hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block; + unfm_hint = get_left_neighbor(hint); + + /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation, + new blocks are displaced based on directory ID. Also, if suggested search_start + is less than last preallocated block, we start searching from it, assuming that + HDD dataflow is faster in forward direction */ + if (TEST_OPTION(old_way, s)) { + if (!hint->formatted_node) { + if (!reiserfs_hashed_relocation(s)) + old_way(hint); + else if (!reiserfs_no_unhashed_relocation(s)) + old_hashed_relocation(hint); + + if (hint->inode + && hint->search_start < + REISERFS_I(hint->inode)->i_prealloc_block) + hint->search_start = + REISERFS_I(hint->inode)->i_prealloc_block; + } + return; } - return; - } - /* This is an approach proposed by Hans */ - if ( TEST_OPTION(hundredth_slices, s) && ! 
(displacing_large_files(s) && !hint->formatted_node)) { - hundredth_slices(hint); - return; - } - - /* old_hashed_relocation only works on unformatted */ - if (!unfm_hint && !hint->formatted_node && - TEST_OPTION(old_hashed_relocation, s)) - { - old_hashed_relocation(hint); - } - /* new_hashed_relocation works with both formatted/unformatted nodes */ - if ((!unfm_hint || hint->formatted_node) && - TEST_OPTION(new_hashed_relocation, s)) - { - new_hashed_relocation(hint); - } - /* dirid grouping works only on unformatted nodes */ - if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s)) - { - dirid_groups(hint); - } + /* This is an approach proposed by Hans */ + if (TEST_OPTION(hundredth_slices, s) + && !(displacing_large_files(s) && !hint->formatted_node)) { + hundredth_slices(hint); + return; + } + /* old_hashed_relocation only works on unformatted */ + if (!unfm_hint && !hint->formatted_node && + TEST_OPTION(old_hashed_relocation, s)) { + old_hashed_relocation(hint); + } + /* new_hashed_relocation works with both formatted/unformatted nodes */ + if ((!unfm_hint || hint->formatted_node) && + TEST_OPTION(new_hashed_relocation, s)) { + new_hashed_relocation(hint); + } + /* dirid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } #ifdef DISPLACE_NEW_PACKING_LOCALITIES - if (hint->formatted_node && TEST_OPTION(dirid_groups,s)) - { - dirid_groups(hint); - } + if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) { + dirid_groups(hint); + } #endif - /* oid grouping works only on unformatted nodes */ - if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s)) - { - oid_groups(hint); - } - return; + /* oid grouping works only on unformatted nodes */ + if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) { + oid_groups(hint); + } + return; } static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint) { - /* make minimum 
size a mount option and benchmark both ways */ - /* we preallocate blocks only for regular files, specific size */ - /* benchmark preallocating always and see what happens */ - - hint->prealloc_size = 0; - - if (!hint->formatted_node && hint->preallocate) { - if (S_ISREG(hint->inode->i_mode) - && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocmin * hint->inode->i_sb->s_blocksize) - hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocsize - 1; - } - return CARRY_ON; + /* make minimum size a mount option and benchmark both ways */ + /* we preallocate blocks only for regular files, specific size */ + /* benchmark preallocating always and see what happens */ + + hint->prealloc_size = 0; + + if (!hint->formatted_node && hint->preallocate) { + if (S_ISREG(hint->inode->i_mode) + && hint->inode->i_size >= + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocmin * hint->inode->i_sb->s_blocksize) + hint->prealloc_size = + REISERFS_SB(hint->th->t_super)->s_alloc_options. + preallocsize - 1; + } + return CARRY_ON; } /* XXX I know it could be merged with upper-level function; but may be result function would be too complex. 
*/ -static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint, - b_blocknr_t * new_blocknrs, - b_blocknr_t start, b_blocknr_t finish, - int min, - int amount_needed, int prealloc_size) +static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + b_blocknr_t start, + b_blocknr_t finish, int min, + int amount_needed, + int prealloc_size) { - int rest = amount_needed; - int nr_allocated; - - while (rest > 0 && start <= finish) { - nr_allocated = scan_bitmap (hint->th, &start, finish, min, - rest + prealloc_size, !hint->formatted_node, - hint->block); - - if (nr_allocated == 0) /* no new blocks allocated, return */ - break; - - /* fill free_blocknrs array first */ - while (rest > 0 && nr_allocated > 0) { - * new_blocknrs ++ = start ++; - rest --; nr_allocated --; - } + int rest = amount_needed; + int nr_allocated; + + while (rest > 0 && start <= finish) { + nr_allocated = scan_bitmap(hint->th, &start, finish, min, + rest + prealloc_size, + !hint->formatted_node, hint->block); + + if (nr_allocated == 0) /* no new blocks allocated, return */ + break; + + /* fill free_blocknrs array first */ + while (rest > 0 && nr_allocated > 0) { + *new_blocknrs++ = start++; + rest--; + nr_allocated--; + } - /* do we have something to fill prealloc. array also ? */ - if (nr_allocated > 0) { - /* it means prealloc_size was greater that 0 and we do preallocation */ - list_add(&REISERFS_I(hint->inode)->i_prealloc_list, - &SB_JOURNAL(hint->th->t_super)->j_prealloc_list); - REISERFS_I(hint->inode)->i_prealloc_block = start; - REISERFS_I(hint->inode)->i_prealloc_count = nr_allocated; - break; + /* do we have something to fill prealloc. array also ? 
*/ + if (nr_allocated > 0) { + /* it means prealloc_size was greater that 0 and we do preallocation */ + list_add(&REISERFS_I(hint->inode)->i_prealloc_list, + &SB_JOURNAL(hint->th->t_super)-> + j_prealloc_list); + REISERFS_I(hint->inode)->i_prealloc_block = start; + REISERFS_I(hint->inode)->i_prealloc_count = + nr_allocated; + break; + } } - } - return (amount_needed - rest); + return (amount_needed - rest); } static inline int blocknrs_and_prealloc_arrays_from_search_start - (reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed) -{ - struct super_block *s = hint->th->t_super; - b_blocknr_t start = hint->search_start; - b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; - int passno = 0; - int nr_allocated = 0; - int bigalloc = 0; - - determine_prealloc_size(hint); - if (!hint->formatted_node) { - int quota_ret; + (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, + int amount_needed) { + struct super_block *s = hint->th->t_super; + b_blocknr_t start = hint->search_start; + b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1; + int passno = 0; + int nr_allocated = 0; + int bigalloc = 0; + + determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; #ifdef REISERQUOTA_DEBUG - reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid); + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating %d blocks id=%u", + amount_needed, hint->inode->i_uid); #endif - quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed); - if (quota_ret) /* Quota exceeded? */ - return QUOTA_EXCEEDED; - if (hint->preallocate && hint->prealloc_size ) { + quota_ret = + DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed); + if (quota_ret) /* Quota exceeded? 
*/ + return QUOTA_EXCEEDED; + if (hint->preallocate && hint->prealloc_size) { #ifdef REISERQUOTA_DEBUG - reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid); + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: allocating (prealloc) %d blocks id=%u", + hint->prealloc_size, hint->inode->i_uid); #endif - quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size); - if (quota_ret) - hint->preallocate=hint->prealloc_size=0; + quota_ret = + DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, + hint->prealloc_size); + if (quota_ret) + hint->preallocate = hint->prealloc_size = 0; + } + /* for unformatted nodes, force large allocations */ + bigalloc = amount_needed; } - /* for unformatted nodes, force large allocations */ - bigalloc = amount_needed; - } - do { - /* in bigalloc mode, nr_allocated should stay zero until - * the entire allocation is filled - */ - if (unlikely(bigalloc && nr_allocated)) { - reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n", - bigalloc, nr_allocated); - /* reset things to a sane value */ - bigalloc = amount_needed - nr_allocated; - } - /* - * try pass 0 and pass 1 looking for a nice big - * contiguous allocation. Then reset and look - * for anything you can find. 
- */ - if (passno == 2 && bigalloc) { - passno = 0; - bigalloc = 0; - } - switch (passno++) { - case 0: /* Search from hint->search_start to end of disk */ - start = hint->search_start; - finish = SB_BLOCK_COUNT(s) - 1; - break; - case 1: /* Search from hint->beg to hint->search_start */ - start = hint->beg; - finish = hint->search_start; - break; - case 2: /* Last chance: Search from 0 to hint->beg */ - start = 0; - finish = hint->beg; - break; - default: /* We've tried searching everywhere, not enough space */ - /* Free the blocks */ - if (!hint->formatted_node) { + do { + /* in bigalloc mode, nr_allocated should stay zero until + * the entire allocation is filled + */ + if (unlikely(bigalloc && nr_allocated)) { + reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n", + bigalloc, nr_allocated); + /* reset things to a sane value */ + bigalloc = amount_needed - nr_allocated; + } + /* + * try pass 0 and pass 1 looking for a nice big + * contiguous allocation. Then reset and look + * for anything you can find. 
+ */ + if (passno == 2 && bigalloc) { + passno = 0; + bigalloc = 0; + } + switch (passno++) { + case 0: /* Search from hint->search_start to end of disk */ + start = hint->search_start; + finish = SB_BLOCK_COUNT(s) - 1; + break; + case 1: /* Search from hint->beg to hint->search_start */ + start = hint->beg; + finish = hint->search_start; + break; + case 2: /* Last chance: Search from 0 to hint->beg */ + start = 0; + finish = hint->beg; + break; + default: /* We've tried searching everywhere, not enough space */ + /* Free the blocks */ + if (!hint->formatted_node) { #ifdef REISERQUOTA_DEBUG - reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid); + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (nospace) %d blocks id=%u", + amount_needed + + hint->prealloc_size - + nr_allocated, + hint->inode->i_uid); #endif - DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */ - } - while (nr_allocated --) - reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node); - - return NO_DISK_SPACE; - } - } while ((nr_allocated += allocate_without_wrapping_disk (hint, - new_blocknrs + nr_allocated, start, finish, - bigalloc ? 
bigalloc : 1, - amount_needed - nr_allocated, - hint->prealloc_size)) - < amount_needed); - if ( !hint->formatted_node && - amount_needed + hint->prealloc_size > - nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { - /* Some of preallocation blocks were not allocated */ + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */ + } + while (nr_allocated--) + reiserfs_free_block(hint->th, hint->inode, + new_blocknrs[nr_allocated], + !hint->formatted_node); + + return NO_DISK_SPACE; + } + } while ((nr_allocated += allocate_without_wrapping_disk(hint, + new_blocknrs + + nr_allocated, + start, finish, + bigalloc ? + bigalloc : 1, + amount_needed - + nr_allocated, + hint-> + prealloc_size)) + < amount_needed); + if (!hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ #ifdef REISERQUOTA_DEBUG - reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid); + reiserfs_debug(s, REISERFS_DEBUG_CODE, + "reiserquota: freeing (failed prealloc) %d blocks id=%u", + amount_needed + hint->prealloc_size - + nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count, + hint->inode->i_uid); #endif - DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + - hint->prealloc_size - nr_allocated - - REISERFS_I(hint->inode)->i_prealloc_count); - } + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)-> + i_prealloc_count); + } - return CARRY_ON; + return CARRY_ON; } /* grab new blocknrs from preallocated list */ /* return amount still needed after using them */ -static int use_preallocated_list_if_available (reiserfs_blocknr_hint_t *hint, - b_blocknr_t *new_blocknrs, int 
amount_needed) +static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint, + b_blocknr_t * new_blocknrs, + int amount_needed) { - struct inode * inode = hint->inode; + struct inode *inode = hint->inode; - if (REISERFS_I(inode)->i_prealloc_count > 0) { - while (amount_needed) { + if (REISERFS_I(inode)->i_prealloc_count > 0) { + while (amount_needed) { - *new_blocknrs ++ = REISERFS_I(inode)->i_prealloc_block ++; - REISERFS_I(inode)->i_prealloc_count --; + *new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++; + REISERFS_I(inode)->i_prealloc_count--; - amount_needed --; + amount_needed--; - if (REISERFS_I(inode)->i_prealloc_count <= 0) { - list_del(&REISERFS_I(inode)->i_prealloc_list); - break; - } + if (REISERFS_I(inode)->i_prealloc_count <= 0) { + list_del(&REISERFS_I(inode)->i_prealloc_list); + break; + } + } } - } - /* return amount still needed after using preallocated blocks */ - return amount_needed; + /* return amount still needed after using preallocated blocks */ + return amount_needed; } -int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint, - b_blocknr_t * new_blocknrs, int amount_needed, - int reserved_by_us /* Amount of blocks we have - already reserved */) +int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us /* Amount of blocks we have + already reserved */ ) { - int initial_amount_needed = amount_needed; - int ret; - struct super_block *s = hint->th->t_super; - - /* Check if there is enough space, taking into account reserved space */ - if ( SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < - amount_needed - reserved_by_us) - return NO_DISK_SPACE; - /* should this be if !hint->inode && hint->preallocate? */ - /* do you mean hint->formatted_node can be removed ? 
- Zam */ - /* hint->formatted_node cannot be removed because we try to access - inode information here, and there is often no inode assotiated with - metadata allocations - green */ - - if (!hint->formatted_node && hint->preallocate) { - amount_needed = use_preallocated_list_if_available + int initial_amount_needed = amount_needed; + int ret; + struct super_block *s = hint->th->t_super; + + /* Check if there is enough space, taking into account reserved space */ + if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks < + amount_needed - reserved_by_us) + return NO_DISK_SPACE; + /* should this be if !hint->inode && hint->preallocate? */ + /* do you mean hint->formatted_node can be removed ? - Zam */ + /* hint->formatted_node cannot be removed because we try to access + inode information here, and there is often no inode assotiated with + metadata allocations - green */ + + if (!hint->formatted_node && hint->preallocate) { + amount_needed = use_preallocated_list_if_available + (hint, new_blocknrs, amount_needed); + if (amount_needed == 0) /* all blocknrs we need we got from + prealloc. list */ + return CARRY_ON; + new_blocknrs += (initial_amount_needed - amount_needed); + } + + /* find search start and save it in hint structure */ + determine_search_start(hint, amount_needed); + if (hint->search_start >= SB_BLOCK_COUNT(s)) + hint->search_start = SB_BLOCK_COUNT(s) - 1; + + /* allocation itself; fill new_blocknrs and preallocation arrays */ + ret = blocknrs_and_prealloc_arrays_from_search_start (hint, new_blocknrs, amount_needed); - if (amount_needed == 0) /* all blocknrs we need we got from - prealloc. 
list */ - return CARRY_ON; - new_blocknrs += (initial_amount_needed - amount_needed); - } - - /* find search start and save it in hint structure */ - determine_search_start(hint, amount_needed); - if (hint->search_start >= SB_BLOCK_COUNT(s)) - hint->search_start = SB_BLOCK_COUNT(s) - 1; - - /* allocation itself; fill new_blocknrs and preallocation arrays */ - ret = blocknrs_and_prealloc_arrays_from_search_start - (hint, new_blocknrs, amount_needed); - - /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we - * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second - * variant) */ - - if (ret != CARRY_ON) { - while (amount_needed ++ < initial_amount_needed) { - reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1); + + /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we + * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second + * variant) */ + + if (ret != CARRY_ON) { + while (amount_needed++ < initial_amount_needed) { + reiserfs_free_block(hint->th, hint->inode, + *(--new_blocknrs), 1); + } } - } - return ret; + return ret; } /* These 2 functions are here to provide blocks reservation to the rest of kernel */ /* Reserve @blocks amount of blocks in fs pointed by @sb. Caller must make sure there are actually this much blocks on the FS available */ -void reiserfs_claim_blocks_to_be_allocated( - struct super_block *sb, /* super block of - filesystem where - blocks should be - reserved */ - int blocks /* How much to reserve */ - ) +void reiserfs_claim_blocks_to_be_allocated(struct super_block *sb, /* super block of + filesystem where + blocks should be + reserved */ + int blocks /* How much to reserve */ + ) { - /* Fast case, if reservation is zero - exit immediately. */ - if ( !blocks ) - return; + /* Fast case, if reservation is zero - exit immediately. 
*/ + if (!blocks) + return; - spin_lock(&REISERFS_SB(sb)->bitmap_lock); - REISERFS_SB(sb)->reserved_blocks += blocks; - spin_unlock(&REISERFS_SB(sb)->bitmap_lock); + spin_lock(&REISERFS_SB(sb)->bitmap_lock); + REISERFS_SB(sb)->reserved_blocks += blocks; + spin_unlock(&REISERFS_SB(sb)->bitmap_lock); } /* Unreserve @blocks amount of blocks in fs pointed by @sb */ -void reiserfs_release_claimed_blocks( - struct super_block *sb, /* super block of - filesystem where - blocks should be - reserved */ - int blocks /* How much to unreserve */ - ) +void reiserfs_release_claimed_blocks(struct super_block *sb, /* super block of + filesystem where + blocks should be + reserved */ + int blocks /* How much to unreserve */ + ) { - /* Fast case, if unreservation is zero - exit immediately. */ - if ( !blocks ) - return; + /* Fast case, if unreservation is zero - exit immediately. */ + if (!blocks) + return; - spin_lock(&REISERFS_SB(sb)->bitmap_lock); - REISERFS_SB(sb)->reserved_blocks -= blocks; - spin_unlock(&REISERFS_SB(sb)->bitmap_lock); - RFALSE( REISERFS_SB(sb)->reserved_blocks < 0, "amount of blocks reserved became zero?"); + spin_lock(&REISERFS_SB(sb)->bitmap_lock); + REISERFS_SB(sb)->reserved_blocks -= blocks; + spin_unlock(&REISERFS_SB(sb)->bitmap_lock); + RFALSE(REISERFS_SB(sb)->reserved_blocks < 0, + "amount of blocks reserved became zero?"); } /* This function estimates how much pages we will be able to write to FS used for reiserfs_file_write() purposes for now. 
*/ -int reiserfs_can_fit_pages ( struct super_block *sb /* superblock of filesystem - to estimate space */ ) +int reiserfs_can_fit_pages(struct super_block *sb /* superblock of filesystem + to estimate space */ ) { int space; spin_lock(&REISERFS_SB(sb)->bitmap_lock); - space = (SB_FREE_BLOCKS(sb) - REISERFS_SB(sb)->reserved_blocks) >> ( PAGE_CACHE_SHIFT - sb->s_blocksize_bits); + space = + (SB_FREE_BLOCKS(sb) - + REISERFS_SB(sb)->reserved_blocks) >> (PAGE_CACHE_SHIFT - + sb->s_blocksize_bits); spin_unlock(&REISERFS_SB(sb)->bitmap_lock); - return space>0?space:0; + return space > 0 ? space : 0; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index fbde4b01a32..9dd71e80703 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -12,264 +12,286 @@ #include <linux/buffer_head.h> #include <asm/uaccess.h> -extern struct reiserfs_key MIN_KEY; +extern struct reiserfs_key MIN_KEY; -static int reiserfs_readdir (struct file *, void *, filldir_t); -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) ; +static int reiserfs_readdir(struct file *, void *, filldir_t); +static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, + int datasync); struct file_operations reiserfs_dir_operations = { - .read = generic_read_dir, - .readdir = reiserfs_readdir, - .fsync = reiserfs_dir_fsync, - .ioctl = reiserfs_ioctl, + .read = generic_read_dir, + .readdir = reiserfs_readdir, + .fsync = reiserfs_dir_fsync, + .ioctl = reiserfs_ioctl, }; -static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) { - struct inode *inode = dentry->d_inode; - int err; - reiserfs_write_lock(inode->i_sb); - err = reiserfs_commit_for_inode(inode) ; - reiserfs_write_unlock(inode->i_sb) ; - if (err < 0) - return err; - return 0; +static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, + int datasync) +{ + struct inode *inode = dentry->d_inode; + int err; + reiserfs_write_lock(inode->i_sb); + err = 
reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + if (err < 0) + return err; + return 0; } - #define store_ih(where,what) copy_item_head (where, what) // -static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldir) +static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; - struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ - INITIALIZE_PATH (path_to_entry); - struct buffer_head * bh; - int item_num, entry_num; - const struct reiserfs_key * rkey; - struct item_head * ih, tmp_ih; - int search_res; - char * local_buf; - loff_t next_pos; - char small_buf[32] ; /* avoid kmalloc if we can */ - struct reiserfs_dir_entry de; - int ret = 0; - - reiserfs_write_lock(inode->i_sb); - - reiserfs_check_lock_depth(inode->i_sb, "readdir") ; - - /* form key for search the next directory entry using f_pos field of - file structure */ - make_cpu_key (&pos_key, inode, (filp->f_pos) ? 
(filp->f_pos) : DOT_OFFSET, - TYPE_DIRENTRY, 3); - next_pos = cpu_key_k_offset (&pos_key); - - /* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos);*/ - - path_to_entry.reada = PATH_READA; - while (1) { - research: - /* search the directory item, containing entry with specified key */ - search_res = search_by_entry_key (inode->i_sb, &pos_key, &path_to_entry, &de); - if (search_res == IO_ERROR) { - // FIXME: we could just skip part of directory which could - // not be read - ret = -EIO; - goto out; - } - entry_num = de.de_entry_num; - bh = de.de_bh; - item_num = de.de_item_num; - ih = de.de_ih; - store_ih (&tmp_ih, ih); - - /* we must have found item, that is item of this directory, */ - RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), - "vs-9000: found item %h does not match to dir we readdir %K", - ih, &pos_key); - RFALSE( item_num > B_NR_ITEMS (bh) - 1, - "vs-9005 item_num == %d, item amount == %d", - item_num, B_NR_ITEMS (bh)); - - /* and entry must be not more than number of entries in the item */ - RFALSE( I_ENTRY_COUNT (ih) < entry_num, - "vs-9010: entry number is too big %d (%d)", - entry_num, I_ENTRY_COUNT (ih)); - - if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT (ih)) { - /* go through all entries in the directory item beginning from the entry, that has been found */ - struct reiserfs_de_head * deh = B_I_DEH (bh, ih) + entry_num; - - for (; entry_num < I_ENTRY_COUNT (ih); entry_num ++, deh ++) { - int d_reclen; - char * d_name; - off_t d_off; - ino_t d_ino; - - if (!de_visible (deh)) - /* it is hidden entry */ - continue; - d_reclen = entry_length (bh, ih, entry_num); - d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); - if (!d_name[d_reclen - 1]) - d_reclen = strlen (d_name); - - if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ - /* too big to send back to VFS */ - continue ; - } - - /* Ignore the .reiserfs_priv entry */ - if (reiserfs_xattrs (inode->i_sb) && - !old_format_only(inode->i_sb) && - 
filp->f_dentry == inode->i_sb->s_root && - REISERFS_SB(inode->i_sb)->priv_root && - REISERFS_SB(inode->i_sb)->priv_root->d_inode && - deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) { - continue; - } - - d_off = deh_offset (deh); - filp->f_pos = d_off ; - d_ino = deh_objectid (deh); - if (d_reclen <= 32) { - local_buf = small_buf ; - } else { - local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; - if (!local_buf) { - pathrelse (&path_to_entry); - ret = -ENOMEM ; + struct inode *inode = filp->f_dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH(path_to_entry); + struct buffer_head *bh; + int item_num, entry_num; + const struct reiserfs_key *rkey; + struct item_head *ih, tmp_ih; + int search_res; + char *local_buf; + loff_t next_pos; + char small_buf[32]; /* avoid kmalloc if we can */ + struct reiserfs_dir_entry de; + int ret = 0; + + reiserfs_write_lock(inode->i_sb); + + reiserfs_check_lock_depth(inode->i_sb, "readdir"); + + /* form key for search the next directory entry using f_pos field of + file structure */ + make_cpu_key(&pos_key, inode, + (filp->f_pos) ? 
(filp->f_pos) : DOT_OFFSET, TYPE_DIRENTRY, + 3); + next_pos = cpu_key_k_offset(&pos_key); + + /* reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos); */ + + path_to_entry.reada = PATH_READA; + while (1) { + research: + /* search the directory item, containing entry with specified key */ + search_res = + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, + &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + ret = -EIO; goto out; - } - if (item_moved (&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; - goto research; - } - } - // Note, that we copy name to user space via temporary - // buffer (local_buf) because filldir will block if - // user space buffer is swapped out. At that time - // entry can move to somewhere else - memcpy (local_buf, d_name, d_reclen); - if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, - DT_UNKNOWN) < 0) { - if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; - } - goto end; } - if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + entry_num = de.de_entry_num; + bh = de.de_bh; + item_num = de.de_item_num; + ih = de.de_ih; + store_ih(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + RFALSE(item_num > B_NR_ITEMS(bh) - 1, + "vs-9005 item_num == %d, item amount == %d", + item_num, B_NR_ITEMS(bh)); + + /* and entry must be not more than number of entries in the item */ + RFALSE(I_ENTRY_COUNT(ih) < entry_num, + "vs-9010: entry number is too big %d (%d)", + entry_num, I_ENTRY_COUNT(ih)); + + if (search_res == POSITION_FOUND + || entry_num < I_ENTRY_COUNT(ih)) { + /* go through all entries in the directory item beginning from the entry, that has been found */ + struct reiserfs_de_head *deh 
= + B_I_DEH(bh, ih) + entry_num; + + for (; entry_num < I_ENTRY_COUNT(ih); + entry_num++, deh++) { + int d_reclen; + char *d_name; + off_t d_off; + ino_t d_ino; + + if (!de_visible(deh)) + /* it is hidden entry */ + continue; + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); + if (!d_name[d_reclen - 1]) + d_reclen = strlen(d_name); + + if (d_reclen > + REISERFS_MAX_NAME(inode->i_sb-> + s_blocksize)) { + /* too big to send back to VFS */ + continue; + } + + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs(inode->i_sb) && + !old_format_only(inode->i_sb) && + filp->f_dentry == inode->i_sb->s_root && + REISERFS_SB(inode->i_sb)->priv_root && + REISERFS_SB(inode->i_sb)->priv_root->d_inode + && deh_objectid(deh) == + le32_to_cpu(INODE_PKEY + (REISERFS_SB(inode->i_sb)-> + priv_root->d_inode)-> + k_objectid)) { + continue; + } + + d_off = deh_offset(deh); + filp->f_pos = d_off; + d_ino = deh_objectid(deh); + if (d_reclen <= 32) { + local_buf = small_buf; + } else { + local_buf = + reiserfs_kmalloc(d_reclen, GFP_NOFS, + inode->i_sb); + if (!local_buf) { + pathrelse(&path_to_entry); + ret = -ENOMEM; + goto out; + } + if (item_moved(&tmp_ih, &path_to_entry)) { + reiserfs_kfree(local_buf, + d_reclen, + inode->i_sb); + goto research; + } + } + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. 
At that time + // entry can move to somewhere else + memcpy(local_buf, d_name, d_reclen); + if (filldir + (dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, + d_reclen, + inode->i_sb); + } + goto end; + } + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, + inode->i_sb); + } + // next entry should be looked for with such offset + next_pos = deh_offset(deh) + 1; + + if (item_moved(&tmp_ih, &path_to_entry)) { + goto research; + } + } /* for */ } - // next entry should be looked for with such offset - next_pos = deh_offset (deh) + 1; + if (item_num != B_NR_ITEMS(bh) - 1) + // end of directory has been reached + goto end; + + /* item we went through is last item of node. Using right + delimiting key check is it directory end */ + rkey = get_rkey(&path_to_entry, inode->i_sb); + if (!comp_le_keys(rkey, &MIN_KEY)) { + /* set pos_key to key, that is the smallest and greater + that key of the last entry in the item */ + set_cpu_key_k_offset(&pos_key, next_pos); + continue; + } - if (item_moved (&tmp_ih, &path_to_entry)) { - goto research; + if (COMP_SHORT_KEYS(rkey, &pos_key)) { + // end of directory has been reached + goto end; } - } /* for */ - } - - if (item_num != B_NR_ITEMS (bh) - 1) - // end of directory has been reached - goto end; - - /* item we went through is last item of node. Using right - delimiting key check is it directory end */ - rkey = get_rkey (&path_to_entry, inode->i_sb); - if (! 
comp_le_keys (rkey, &MIN_KEY)) { - /* set pos_key to key, that is the smallest and greater - that key of the last entry in the item */ - set_cpu_key_k_offset (&pos_key, next_pos); - continue; - } - - if ( COMP_SHORT_KEYS (rkey, &pos_key)) { - // end of directory has been reached - goto end; - } - - /* directory continues in the right neighboring block */ - set_cpu_key_k_offset (&pos_key, le_key_k_offset (KEY_FORMAT_3_5, rkey)); - - } /* while */ - - - end: - filp->f_pos = next_pos; - pathrelse (&path_to_entry); - reiserfs_check_path(&path_to_entry) ; - out: - reiserfs_write_unlock(inode->i_sb); - return ret; + + /* directory continues in the right neighboring block */ + set_cpu_key_k_offset(&pos_key, + le_key_k_offset(KEY_FORMAT_3_5, rkey)); + + } /* while */ + + end: + filp->f_pos = next_pos; + pathrelse(&path_to_entry); + reiserfs_check_path(&path_to_entry); + out: + reiserfs_write_unlock(inode->i_sb); + return ret; } /* compose directory item containing "." and ".." entries (entries are not aligned to 4 byte boundary) */ /* the last four params are LE */ -void make_empty_dir_item_v1 (char * body, __le32 dirid, __le32 objid, - __le32 par_dirid, __le32 par_objid) +void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) { - struct reiserfs_de_head * deh; - - memset (body, 0, EMPTY_DIR_SIZE_V1); - deh = (struct reiserfs_de_head *)body; - - /* direntry header of "." */ - put_deh_offset( &(deh[0]), DOT_OFFSET ); - /* these two are from make_le_item_head, and are are LE */ - deh[0].deh_dir_id = dirid; - deh[0].deh_objectid = objid; - deh[0].deh_state = 0; /* Endian safe if 0 */ - put_deh_location( &(deh[0]), EMPTY_DIR_SIZE_V1 - strlen( "." )); - mark_de_visible(&(deh[0])); - - /* direntry header of ".." */ - put_deh_offset( &(deh[1]), DOT_DOT_OFFSET); - /* key of ".." 
for the root directory */ - /* these two are from the inode, and are are LE */ - deh[1].deh_dir_id = par_dirid; - deh[1].deh_objectid = par_objid; - deh[1].deh_state = 0; /* Endian safe if 0 */ - put_deh_location( &(deh[1]), deh_location( &(deh[0]) ) - strlen( ".." ) ); - mark_de_visible(&(deh[1])); - - /* copy ".." and "." */ - memcpy (body + deh_location( &(deh[0]) ), ".", 1); - memcpy (body + deh_location( &(deh[1]) ), "..", 2); + struct reiserfs_de_head *deh; + + memset(body, 0, EMPTY_DIR_SIZE_V1); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset(&(deh[0]), DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen(".")); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen("..")); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy(body + deh_location(&(deh[0])), ".", 1); + memcpy(body + deh_location(&(deh[1])), "..", 2); } /* compose directory item containing "." and ".." entries */ -void make_empty_dir_item (char * body, __le32 dirid, __le32 objid, - __le32 par_dirid, __le32 par_objid) +void make_empty_dir_item(char *body, __le32 dirid, __le32 objid, + __le32 par_dirid, __le32 par_objid) { - struct reiserfs_de_head * deh; - - memset (body, 0, EMPTY_DIR_SIZE); - deh = (struct reiserfs_de_head *)body; - - /* direntry header of "." 
*/ - put_deh_offset( &(deh[0]), DOT_OFFSET ); - /* these two are from make_le_item_head, and are are LE */ - deh[0].deh_dir_id = dirid; - deh[0].deh_objectid = objid; - deh[0].deh_state = 0; /* Endian safe if 0 */ - put_deh_location( &(deh[0]), EMPTY_DIR_SIZE - ROUND_UP( strlen( "." ) ) ); - mark_de_visible(&(deh[0])); - - /* direntry header of ".." */ - put_deh_offset( &(deh[1]), DOT_DOT_OFFSET ); - /* key of ".." for the root directory */ - /* these two are from the inode, and are are LE */ - deh[1].deh_dir_id = par_dirid; - deh[1].deh_objectid = par_objid; - deh[1].deh_state = 0; /* Endian safe if 0 */ - put_deh_location( &(deh[1]), deh_location( &(deh[0])) - ROUND_UP( strlen( ".." ) ) ); - mark_de_visible(&(deh[1])); - - /* copy ".." and "." */ - memcpy (body + deh_location( &(deh[0]) ), ".", 1); - memcpy (body + deh_location( &(deh[1]) ), "..", 2); + struct reiserfs_de_head *deh; + + memset(body, 0, EMPTY_DIR_SIZE); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + put_deh_offset(&(deh[0]), DOT_OFFSET); + /* these two are from make_le_item_head, and are are LE */ + deh[0].deh_dir_id = dirid; + deh[0].deh_objectid = objid; + deh[0].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen("."))); + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + put_deh_offset(&(deh[1]), DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + /* these two are from the inode, and are are LE */ + deh[1].deh_dir_id = par_dirid; + deh[1].deh_objectid = par_objid; + deh[1].deh_state = 0; /* Endian safe if 0 */ + put_deh_location(&(deh[1]), + deh_location(&(deh[0])) - ROUND_UP(strlen(".."))); + mark_de_visible(&(deh[1])); + + /* copy ".." and "." 
*/ + memcpy(body + deh_location(&(deh[0])), ".", 1); + memcpy(body + deh_location(&(deh[1])), "..", 2); } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 2118db2896c..b2264ba3cc5 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -8,7 +8,6 @@ /* balance the tree according to the analysis made before, */ /* and using buffers obtained after all above. */ - /** ** balance_leaf_when_delete ** balance_leaf @@ -24,23 +23,22 @@ #ifdef CONFIG_REISERFS_CHECK -struct tree_balance * cur_tb = NULL; /* detects whether more than one - copy of tb exists as a means - of checking whether schedule - is interrupting do_balance */ +struct tree_balance *cur_tb = NULL; /* detects whether more than one + copy of tb exists as a means + of checking whether schedule + is interrupting do_balance */ #endif -inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, - struct buffer_head * bh, int flag) +inline void do_balance_mark_leaf_dirty(struct tree_balance *tb, + struct buffer_head *bh, int flag) { - journal_mark_dirty(tb->transaction_handle, - tb->transaction_handle->t_super, bh) ; + journal_mark_dirty(tb->transaction_handle, + tb->transaction_handle->t_super, bh); } #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty - /* summary: if deleting something ( tb->insert_size[0] < 0 ) return(balance_leaf_when_delete()); (flag d handled here) @@ -64,8 +62,6 @@ be performed by do_balance. -Hans */ - - /* Balance leaf node in case of delete or cut: insert_size[0] < 0 * * lnum, rnum can have values >= -1 @@ -73,1384 +69,1933 @@ be performed by do_balance. 
* 0 means that nothing should be done with the neighbor * >0 means to shift entirely or partly the specified number of items to the neighbor */ -static int balance_leaf_when_delete (struct tree_balance * tb, int flag) +static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { - struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path); - int item_pos = PATH_LAST_POSITION (tb->tb_path); - int pos_in_item = tb->tb_path->pos_in_item; - struct buffer_info bi; - int n; - struct item_head * ih; + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + int n; + struct item_head *ih; - RFALSE( tb->FR[0] && B_LEVEL (tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, - "vs- 12000: level: wrong FR %z", tb->FR[0]); - RFALSE( tb->blknum[0] > 1, - "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); - RFALSE( ! tb->blknum[0] && ! PATH_H_PPARENT(tb->tb_path, 0), - "PAP-12010: tree can not be empty"); + RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, + "vs- 12000: level: wrong FR %z", tb->FR[0]); + RFALSE(tb->blknum[0] > 1, + "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]); + RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), + "PAP-12010: tree can not be empty"); - ih = B_N_PITEM_HEAD (tbS0, item_pos); + ih = B_N_PITEM_HEAD(tbS0, item_pos); - /* Delete or truncate the item */ + /* Delete or truncate the item */ - switch (flag) { - case M_DELETE: /* delete item in S[0] */ + switch (flag) { + case M_DELETE: /* delete item in S[0] */ + + RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], + "vs-12013: mode Delete, insert size %d, ih to be deleted %h", + -tb->insert_size[0], ih); + + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION(tb->tb_path, 1); + leaf_delete_items(&bi, 0, item_pos, 1, -1); + + if (!item_pos && tb->CFL[0]) { + if 
(B_NR_ITEMS(tbS0)) { + replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, + 0); + } else { + if (!PATH_H_POSITION(tb->tb_path, 1)) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, + 0), 0); + } + } - RFALSE( ih_item_len(ih) + IH_SIZE != -tb->insert_size[0], - "vs-12013: mode Delete, insert size %d, ih to be deleted %h", - -tb->insert_size [0], ih); + RFALSE(!item_pos && !tb->CFL[0], + "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], + tb->L[0]); - bi.tb = tb; - bi.bi_bh = tbS0; - bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); - leaf_delete_items (&bi, 0, item_pos, 1, -1); - - if ( ! item_pos && tb->CFL[0] ) { - if ( B_NR_ITEMS(tbS0) ) { - replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); - } - else { - if ( ! PATH_H_POSITION (tb->tb_path, 1) ) - replace_key(tb, tb->CFL[0],tb->lkey[0],PATH_H_PPARENT(tb->tb_path, 0),0); - } - } - - RFALSE( ! item_pos && !tb->CFL[0], - "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], tb->L[0]); - - break; - - case M_CUT: { /* cut item in S[0] */ - bi.tb = tb; - bi.bi_bh = tbS0; - bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); - if (is_direntry_le_ih (ih)) { - - /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ - /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ - tb->insert_size[0] = -1; - leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); - - RFALSE( ! item_pos && ! pos_in_item && ! tb->CFL[0], - "PAP-12030: can not change delimiting key. CFL[0]=%p", - tb->CFL[0]); - - if ( ! item_pos && ! pos_in_item && tb->CFL[0] ) { - replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); - } - } else { - leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); - - RFALSE( ! 
ih_item_len(ih), - "PAP-12035: cut must leave non-zero dynamic length of item"); - } - break; - } - - default: - print_cur_tb ("12040"); - reiserfs_panic (tb->tb_sb, "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", - (flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? "INSERT" : "UNKNOWN"), flag); - } - - /* the rule is that no shifting occurs unless by shifting a node can be freed */ - n = B_NR_ITEMS(tbS0); - if ( tb->lnum[0] ) /* L[0] takes part in balancing */ - { - if ( tb->lnum[0] == -1 ) /* L[0] must be joined with S[0] */ - { - if ( tb->rnum[0] == -1 ) /* R[0] must be also joined with S[0] */ - { - if ( tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0) ) - { - /* all contents of all the 3 buffers will be in L[0] */ - if ( PATH_H_POSITION (tb->tb_path, 1) == 0 && 1 < B_NR_ITEMS(tb->FR[0]) ) - replace_key(tb, tb->CFL[0],tb->lkey[0],tb->FR[0],1); - - leaf_move_items (LEAF_FROM_S_TO_L, tb, n, -1, NULL); - leaf_move_items (LEAF_FROM_R_TO_L, tb, B_NR_ITEMS(tb->R[0]), -1, NULL); - - reiserfs_invalidate_buffer (tb, tbS0); - reiserfs_invalidate_buffer (tb, tb->R[0]); - - return 0; + break; + + case M_CUT:{ /* cut item in S[0] */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION(tb->tb_path, 1); + if (is_direntry_le_ih(ih)) { + + /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ + /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!item_pos && !pos_in_item && !tb->CFL[0], + "PAP-12030: can not change delimiting key. 
CFL[0]=%p", + tb->CFL[0]); + + if (!item_pos && !pos_in_item && tb->CFL[0]) { + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); + } + } else { + leaf_cut_from_buffer(&bi, item_pos, pos_in_item, + -tb->insert_size[0]); + + RFALSE(!ih_item_len(ih), + "PAP-12035: cut must leave non-zero dynamic length of item"); + } + break; } - /* all contents of all the 3 buffers will be in R[0] */ - leaf_move_items (LEAF_FROM_S_TO_R, tb, n, -1, NULL); - leaf_move_items (LEAF_FROM_L_TO_R, tb, B_NR_ITEMS(tb->L[0]), -1, NULL); - /* right_delimiting_key is correct in R[0] */ - replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + default: + print_cur_tb("12040"); + reiserfs_panic(tb->tb_sb, + "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", + (flag == + M_PASTE) ? "PASTE" : ((flag == + M_INSERT) ? "INSERT" : + "UNKNOWN"), flag); + } - reiserfs_invalidate_buffer (tb, tbS0); - reiserfs_invalidate_buffer (tb, tb->L[0]); + /* the rule is that no shifting occurs unless by shifting a node can be freed */ + n = B_NR_ITEMS(tbS0); + if (tb->lnum[0]) { /* L[0] takes part in balancing */ + if (tb->lnum[0] == -1) { /* L[0] must be joined with S[0] */ + if (tb->rnum[0] == -1) { /* R[0] must be also joined with S[0] */ + if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) { + /* all contents of all the 3 buffers will be in L[0] */ + if (PATH_H_POSITION(tb->tb_path, 1) == 0 + && 1 < B_NR_ITEMS(tb->FR[0])) + replace_key(tb, tb->CFL[0], + tb->lkey[0], + tb->FR[0], 1); + + leaf_move_items(LEAF_FROM_S_TO_L, tb, n, + -1, NULL); + leaf_move_items(LEAF_FROM_R_TO_L, tb, + B_NR_ITEMS(tb->R[0]), + -1, NULL); + + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, + tb->R[0]); + + return 0; + } + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, + NULL); + leaf_move_items(LEAF_FROM_L_TO_R, tb, + B_NR_ITEMS(tb->L[0]), -1, NULL); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], + 
tb->R[0], 0); - return -1; - } + reiserfs_invalidate_buffer(tb, tbS0); + reiserfs_invalidate_buffer(tb, tb->L[0]); - RFALSE( tb->rnum[0] != 0, - "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); - /* all contents of L[0] and S[0] will be in L[0] */ - leaf_shift_left(tb, n, -1); + return -1; + } - reiserfs_invalidate_buffer (tb, tbS0); + RFALSE(tb->rnum[0] != 0, + "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]); + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); - return 0; + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; + } + /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ + + RFALSE((tb->lnum[0] + tb->rnum[0] < n) || + (tb->lnum[0] + tb->rnum[0] > n + 1), + "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + RFALSE((tb->lnum[0] + tb->rnum[0] == n) && + (tb->lbytes != -1 || tb->rbytes != -1), + "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", + tb->rbytes, tb->lbytes); + RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) && + (tb->lbytes < 1 || tb->rbytes != -1), + "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", + tb->rbytes, tb->lbytes); + + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + + reiserfs_invalidate_buffer(tb, tbS0); + + return 0; } - /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ - - RFALSE( ( tb->lnum[0] + tb->rnum[0] < n ) || - ( tb->lnum[0] + tb->rnum[0] > n+1 ), - "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent", - tb->rnum[0], tb->lnum[0], n); - RFALSE( ( tb->lnum[0] + tb->rnum[0] == n ) && - (tb->lbytes != -1 || tb->rbytes != -1), - "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", - tb->rbytes, tb->lbytes); - RFALSE( ( tb->lnum[0] + tb->rnum[0] == n + 1 ) && - (tb->lbytes < 1 || tb->rbytes != -1), - 
"PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", - tb->rbytes, tb->lbytes); - - leaf_shift_left (tb, tb->lnum[0], tb->lbytes); - leaf_shift_right(tb, tb->rnum[0], tb->rbytes); - - reiserfs_invalidate_buffer (tb, tbS0); - return 0; - } + if (tb->rnum[0] == -1) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } - if ( tb->rnum[0] == -1 ) { - /* all contents of R[0] and S[0] will be in R[0] */ - leaf_shift_right(tb, n, -1); - reiserfs_invalidate_buffer (tb, tbS0); + RFALSE(tb->rnum[0], + "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); return 0; - } - - RFALSE( tb->rnum[0], - "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]); - return 0; } - -static int balance_leaf (struct tree_balance * tb, - struct item_head * ih, /* item header of inserted item (this is on little endian) */ - const char * body, /* body of inserted item or bytes to paste */ - int flag, /* i - insert, d - delete, c - cut, p - paste - (see comment to do_balance) */ - struct item_head * insert_key, /* in our processing of one level we sometimes determine what - must be inserted into the next higher level. This insertion - consists of a key or two keys and their corresponding - pointers */ - struct buffer_head ** insert_ptr /* inserted node-ptrs for the next level */ +static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item header of inserted item (this is on little endian) */ + const char *body, /* body of inserted item or bytes to paste */ + int flag, /* i - insert, d - delete, c - cut, p - paste + (see comment to do_balance) */ + struct item_head *insert_key, /* in our processing of one level we sometimes determine what + must be inserted into the next higher level. 
This insertion + consists of a key or two keys and their corresponding + pointers */ + struct buffer_head **insert_ptr /* inserted node-ptrs for the next level */ ) { - struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path); - int item_pos = PATH_LAST_POSITION (tb->tb_path); /* index into the array of item headers in S[0] - of the affected item */ - struct buffer_info bi; - struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ - int snum[2]; /* number of items that will be placed - into S_new (includes partially shifted - items) */ - int sbytes[2]; /* if an item is partially shifted into S_new then - if it is a directory item - it is the number of entries from the item that are shifted into S_new - else - it is the number of bytes from the item that are shifted into S_new - */ - int n, i; - int ret_val; - int pos_in_item; - int zeros_num; - - PROC_INFO_INC( tb -> tb_sb, balance_at[ 0 ] ); - - /* Make balance in case insert_size[0] < 0 */ - if ( tb->insert_size[0] < 0 ) - return balance_leaf_when_delete (tb, flag); - - zeros_num = 0; - if (flag == M_INSERT && body == 0) - zeros_num = ih_item_len( ih ); - - pos_in_item = tb->tb_path->pos_in_item; - /* for indirect item pos_in_item is measured in unformatted node - pointers. 
Recalculate to bytes */ - if (flag != M_INSERT && is_indirect_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) - pos_in_item *= UNFM_P_SIZE; - - if ( tb->lnum[0] > 0 ) { - /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ - if ( item_pos < tb->lnum[0] ) { - /* new item or it part falls to L[0], shift it too */ - n = B_NR_ITEMS(tb->L[0]); - - switch (flag) { - case M_INSERT: /* insert item into L[0] */ - - if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) { - /* part of new item falls into L[0] */ - int new_item_len; - int version; - - ret_val = leaf_shift_left (tb, tb->lnum[0]-1, -1); - - /* Calculate item length to insert to S[0] */ - new_item_len = ih_item_len(ih) - tb->lbytes; - /* Calculate and check item length to insert to L[0] */ - put_ih_item_len(ih, ih_item_len(ih) - new_item_len ); - - RFALSE( ih_item_len(ih) <= 0, - "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", - ih_item_len(ih)); - - /* Insert new item into L[0] */ - bi.tb = tb; - bi.bi_bh = tb->L[0]; - bi.bi_parent = tb->FL[0]; - bi.bi_position = get_left_neighbor_position (tb, 0); - leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, - zeros_num > ih_item_len(ih) ? 
ih_item_len(ih) : zeros_num); - - version = ih_version (ih); - - /* Calculate key component, item length and body to insert into S[0] */ - set_le_ih_k_offset( ih, le_ih_k_offset( ih ) + (tb->lbytes << (is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) ); - - put_ih_item_len( ih, new_item_len ); - if ( tb->lbytes > zeros_num ) { - body += (tb->lbytes - zeros_num); - zeros_num = 0; - } - else - zeros_num -= tb->lbytes; - - RFALSE( ih_item_len(ih) <= 0, - "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", - ih_item_len(ih)); - } else { - /* new item in whole falls into L[0] */ - /* Shift lnum[0]-1 items to L[0] */ - ret_val = leaf_shift_left(tb, tb->lnum[0]-1, tb->lbytes); - /* Insert new item into L[0] */ - bi.tb = tb; - bi.bi_bh = tb->L[0]; - bi.bi_parent = tb->FL[0]; - bi.bi_position = get_left_neighbor_position (tb, 0); - leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, zeros_num); - tb->insert_size[0] = 0; - zeros_num = 0; - } - break; - - case M_PASTE: /* append item in L[0] */ - - if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) { - /* we must shift the part of the appended item */ - if ( is_direntry_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) { - - RFALSE( zeros_num, - "PAP-12090: invalid parameter in case of a directory"); - /* directory item */ - if ( tb->lbytes > pos_in_item ) { - /* new directory entry falls into L[0] */ - struct item_head * pasted; - int l_pos_in_item = pos_in_item; - - /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ - ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); - if ( ret_val && ! 
item_pos ) { - pasted = B_N_PITEM_HEAD(tb->L[0],B_NR_ITEMS(tb->L[0])-1); - l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes-1); - } - - /* Append given directory entry to directory item */ - bi.tb = tb; - bi.bi_bh = tb->L[0]; - bi.bi_parent = tb->FL[0]; - bi.bi_position = get_left_neighbor_position (tb, 0); - leaf_paste_in_buffer (&bi, n + item_pos - ret_val, l_pos_in_item, - tb->insert_size[0], body, zeros_num); - - /* previous string prepared space for pasting new entry, following string pastes this entry */ - - /* when we have merge directory item, pos_in_item has been changed too */ - - /* paste new directory entry. 1 is entry number */ - leaf_paste_entries (bi.bi_bh, n + item_pos - ret_val, l_pos_in_item, 1, - (struct reiserfs_de_head *)body, - body + DEH_SIZE, tb->insert_size[0] - ); - tb->insert_size[0] = 0; - } else { - /* new directory item doesn't fall into L[0] */ - /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ - leaf_shift_left (tb, tb->lnum[0], tb->lbytes); - } - /* Calculate new position to append in item body */ - pos_in_item -= tb->lbytes; - } - else { - /* regular object */ - RFALSE( tb->lbytes <= 0, - "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", - tb->lbytes); - RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)), - "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", - ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)), pos_in_item); - - if ( tb->lbytes >= pos_in_item ) { - /* appended item will be in L[0] in whole */ - int l_n; - - /* this bytes number must be appended to the last item of L[h] */ - l_n = tb->lbytes - pos_in_item; - - /* Calculate new insert_size[0] */ - tb->insert_size[0] -= l_n; - - RFALSE( tb->insert_size[0] <= 0, - "PAP-12105: there is nothing to paste into L[0]. 
insert_size=%d", - tb->insert_size[0]); - ret_val = leaf_shift_left(tb,tb->lnum[0], - ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos))); - /* Append to body of item in L[0] */ - bi.tb = tb; - bi.bi_bh = tb->L[0]; - bi.bi_parent = tb->FL[0]; - bi.bi_position = get_left_neighbor_position (tb, 0); - leaf_paste_in_buffer( - &bi,n + item_pos - ret_val, - ih_item_len( B_N_PITEM_HEAD(tb->L[0],n+item_pos-ret_val)), - l_n,body, zeros_num > l_n ? l_n : zeros_num - ); - /* 0-th item in S0 can be only of DIRECT type when l_n != 0*/ - { - int version; - int temp_l = l_n; - - RFALSE (ih_item_len (B_N_PITEM_HEAD (tbS0, 0)), - "PAP-12106: item length must be 0"); - RFALSE (comp_short_le_keys (B_N_PKEY (tbS0, 0), - B_N_PKEY (tb->L[0], - n + item_pos - ret_val)), - "PAP-12107: items must be of the same file"); - if (is_indirect_le_ih(B_N_PITEM_HEAD (tb->L[0], - n + item_pos - ret_val))) { - temp_l = l_n << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT); + struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); + int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0] + of the affected item */ + struct buffer_info bi; + struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ + int snum[2]; /* number of items that will be placed + into S_new (includes partially shifted + items) */ + int sbytes[2]; /* if an item is partially shifted into S_new then + if it is a directory item + it is the number of entries from the item that are shifted into S_new + else + it is the number of bytes from the item that are shifted into S_new + */ + int n, i; + int ret_val; + int pos_in_item; + int zeros_num; + + PROC_INFO_INC(tb->tb_sb, balance_at[0]); + + /* Make balance in case insert_size[0] < 0 */ + if (tb->insert_size[0] < 0) + return balance_leaf_when_delete(tb, flag); + + zeros_num = 0; + if (flag == M_INSERT && body == 0) + zeros_num = ih_item_len(ih); + + pos_in_item = tb->tb_path->pos_in_item; + /* for indirect item pos_in_item 
is measured in unformatted node + pointers. Recalculate to bytes */ + if (flag != M_INSERT + && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) + pos_in_item *= UNFM_P_SIZE; + + if (tb->lnum[0] > 0) { + /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ + if (item_pos < tb->lnum[0]) { + /* new item or it part falls to L[0], shift it too */ + n = B_NR_ITEMS(tb->L[0]); + + switch (flag) { + case M_INSERT: /* insert item into L[0] */ + + if (item_pos == tb->lnum[0] - 1 + && tb->lbytes != -1) { + /* part of new item falls into L[0] */ + int new_item_len; + int version; + + ret_val = + leaf_shift_left(tb, tb->lnum[0] - 1, + -1); + + /* Calculate item length to insert to S[0] */ + new_item_len = + ih_item_len(ih) - tb->lbytes; + /* Calculate and check item length to insert to L[0] */ + put_ih_item_len(ih, + ih_item_len(ih) - + new_item_len); + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d", + ih_item_len(ih)); + + /* Insert new item into L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = + get_left_neighbor_position(tb, 0); + leaf_insert_into_buf(&bi, + n + item_pos - + ret_val, ih, body, + zeros_num > + ih_item_len(ih) ? + ih_item_len(ih) : + zeros_num); + + version = ih_version(ih); + + /* Calculate key component, item length and body to insert into S[0] */ + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (tb-> + lbytes << + (is_indirect_le_ih + (ih) ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : + 0))); + + put_ih_item_len(ih, new_item_len); + if (tb->lbytes > zeros_num) { + body += + (tb->lbytes - zeros_num); + zeros_num = 0; + } else + zeros_num -= tb->lbytes; + + RFALSE(ih_item_len(ih) <= 0, + "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d", + ih_item_len(ih)); + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret_val = + leaf_shift_left(tb, tb->lnum[0] - 1, + tb->lbytes); + /* Insert new item into L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = + get_left_neighbor_position(tb, 0); + leaf_insert_into_buf(&bi, + n + item_pos - + ret_val, ih, body, + zeros_num); + tb->insert_size[0] = 0; + zeros_num = 0; } - /* update key of first item in S0 */ - version = ih_version (B_N_PITEM_HEAD (tbS0, 0)); - set_le_key_k_offset (version, B_N_PKEY (tbS0, 0), - le_key_k_offset (version, B_N_PKEY (tbS0, 0)) + temp_l); - /* update left delimiting key */ - set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), - le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) + temp_l); - } - - /* Calculate new body, position in item and insert_size[0] */ - if ( l_n > zeros_num ) { - body += (l_n - zeros_num); - zeros_num = 0; - } - else - zeros_num -= l_n; - pos_in_item = 0; - - RFALSE( comp_short_le_keys - (B_N_PKEY(tbS0,0), - B_N_PKEY(tb->L[0],B_NR_ITEMS(tb->L[0])-1)) || - - !op_is_left_mergeable - (B_N_PKEY (tbS0, 0), tbS0->b_size) || - !op_is_left_mergeable - (B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), - tbS0->b_size), - "PAP-12120: item must be merge-able with left neighboring item"); - } - else /* only part of the appended item will be in L[0] */ - { - /* Calculate position in item for append in S[0] */ - pos_in_item -= tb->lbytes; - - RFALSE( pos_in_item <= 0, - "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item); - - /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 byte from item number lnum[0] */ - leaf_shift_left(tb,tb->lnum[0],tb->lbytes); - } - } - } - else /* appended item will be in L[0] in whole */ - { - struct item_head * pasted; - - if ( ! item_pos && op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) ) - { /* if we paste into first item of S[0] and it is left mergable */ - /* then increment pos_in_item by the size of the last item in L[0] */ - pasted = B_N_PITEM_HEAD(tb->L[0],n-1); - if ( is_direntry_le_ih (pasted) ) - pos_in_item += ih_entry_count(pasted); - else - pos_in_item += ih_item_len(pasted); + break; + + case M_PASTE: /* append item in L[0] */ + + if (item_pos == tb->lnum[0] - 1 + && tb->lbytes != -1) { + /* we must shift the part of the appended item */ + if (is_direntry_le_ih + (B_N_PITEM_HEAD(tbS0, item_pos))) { + + RFALSE(zeros_num, + "PAP-12090: invalid parameter in case of a directory"); + /* directory item */ + if (tb->lbytes > pos_in_item) { + /* new directory entry falls into L[0] */ + struct item_head + *pasted; + int l_pos_in_item = + pos_in_item; + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ + ret_val = + leaf_shift_left(tb, + tb-> + lnum + [0], + tb-> + lbytes + - + 1); + if (ret_val + && !item_pos) { + pasted = + B_N_PITEM_HEAD + (tb->L[0], + B_NR_ITEMS + (tb-> + L[0]) - + 1); + l_pos_in_item += + I_ENTRY_COUNT + (pasted) - + (tb-> + lbytes - + 1); + } + + /* Append given directory entry to directory item */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = + tb->FL[0]; + bi.bi_position = + get_left_neighbor_position + (tb, 0); + leaf_paste_in_buffer + (&bi, + n + item_pos - + ret_val, + l_pos_in_item, + tb->insert_size[0], + body, zeros_num); + + /* previous string prepared space for pasting new entry, following string pastes this entry */ + + /* when we have merge directory item, pos_in_item has been changed too */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries(bi. 
+ bi_bh, + n + + item_pos + - + ret_val, + l_pos_in_item, + 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */ + leaf_shift_left(tb, + tb-> + lnum[0], + tb-> + lbytes); + } + /* Calculate new position to append in item body */ + pos_in_item -= tb->lbytes; + } else { + /* regular object */ + RFALSE(tb->lbytes <= 0, + "PAP-12095: there is nothing to shift to L[0]. lbytes=%d", + tb->lbytes); + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d", + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + pos_in_item); + + if (tb->lbytes >= pos_in_item) { + /* appended item will be in L[0] in whole */ + int l_n; + + /* this bytes number must be appended to the last item of L[h] */ + l_n = + tb->lbytes - + pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= + l_n; + + RFALSE(tb-> + insert_size[0] <= + 0, + "PAP-12105: there is nothing to paste into L[0]. insert_size=%d", + tb-> + insert_size[0]); + ret_val = + leaf_shift_left(tb, + tb-> + lnum + [0], + ih_item_len + (B_N_PITEM_HEAD + (tbS0, + item_pos))); + /* Append to body of item in L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = + tb->FL[0]; + bi.bi_position = + get_left_neighbor_position + (tb, 0); + leaf_paste_in_buffer + (&bi, + n + item_pos - + ret_val, + ih_item_len + (B_N_PITEM_HEAD + (tb->L[0], + n + item_pos - + ret_val)), l_n, + body, + zeros_num > + l_n ? 
l_n : + zeros_num); + /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */ + { + int version; + int temp_l = + l_n; + + RFALSE + (ih_item_len + (B_N_PITEM_HEAD + (tbS0, + 0)), + "PAP-12106: item length must be 0"); + RFALSE + (comp_short_le_keys + (B_N_PKEY + (tbS0, 0), + B_N_PKEY + (tb->L[0], + n + + item_pos + - + ret_val)), + "PAP-12107: items must be of the same file"); + if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) { + temp_l = + l_n + << + (tb-> + tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT); + } + /* update key of first item in S0 */ + version = + ih_version + (B_N_PITEM_HEAD + (tbS0, 0)); + set_le_key_k_offset + (version, + B_N_PKEY + (tbS0, 0), + le_key_k_offset + (version, + B_N_PKEY + (tbS0, + 0)) + + temp_l); + /* update left delimiting key */ + set_le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb-> + CFL[0], + tb-> + lkey[0]), + le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb-> + CFL[0], + tb-> + lkey[0])) + + temp_l); + } + + /* Calculate new body, position in item and insert_size[0] */ + if (l_n > zeros_num) { + body += + (l_n - + zeros_num); + zeros_num = 0; + } else + zeros_num -= + l_n; + pos_in_item = 0; + + RFALSE + (comp_short_le_keys + (B_N_PKEY(tbS0, 0), + B_N_PKEY(tb->L[0], + B_NR_ITEMS + (tb-> + L[0]) - + 1)) + || + !op_is_left_mergeable + (B_N_PKEY(tbS0, 0), + tbS0->b_size) + || + !op_is_left_mergeable + (B_N_PDELIM_KEY + (tb->CFL[0], + tb->lkey[0]), + tbS0->b_size), + "PAP-12120: item must be merge-able with left neighboring item"); + } else { /* only part of the appended item will be in L[0] */ + + /* Calculate position in item for append in S[0] */ + pos_in_item -= + tb->lbytes; + + RFALSE(pos_in_item <= 0, + "PAP-12125: no place for paste. pos_in_item=%d", + pos_in_item); + + /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 byte from item number lnum[0] */ + leaf_shift_left(tb, + tb-> + lnum[0], + tb-> + lbytes); + } + } + } else { /* appended item will be in L[0] in whole */ + + struct item_head *pasted; + + if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */ + /* then increment pos_in_item by the size of the last item in L[0] */ + pasted = + B_N_PITEM_HEAD(tb->L[0], + n - 1); + if (is_direntry_le_ih(pasted)) + pos_in_item += + ih_entry_count + (pasted); + else + pos_in_item += + ih_item_len(pasted); + } + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ + ret_val = + leaf_shift_left(tb, tb->lnum[0], + tb->lbytes); + /* Append to body of item in L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = + get_left_neighbor_position(tb, 0); + leaf_paste_in_buffer(&bi, + n + item_pos - + ret_val, + pos_in_item, + tb->insert_size[0], + body, zeros_num); + + /* if appended item is directory, paste entry */ + pasted = + B_N_PITEM_HEAD(tb->L[0], + n + item_pos - + ret_val); + if (is_direntry_le_ih(pasted)) + leaf_paste_entries(bi.bi_bh, + n + + item_pos - + ret_val, + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + /* if appended item is indirect item, put unformatted node into un list */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, + "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) + ? "CUT" + : + "UNKNOWN"), + flag); } - - /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 byte from item number lnum[0] */ - ret_val = leaf_shift_left(tb,tb->lnum[0],tb->lbytes); - /* Append to body of item in L[0] */ - bi.tb = tb; - bi.bi_bh = tb->L[0]; - bi.bi_parent = tb->FL[0]; - bi.bi_position = get_left_neighbor_position (tb, 0); - leaf_paste_in_buffer (&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0], - body, zeros_num); - - /* if appended item is directory, paste entry */ - pasted = B_N_PITEM_HEAD (tb->L[0], n + item_pos - ret_val); - if (is_direntry_le_ih (pasted)) - leaf_paste_entries ( - bi.bi_bh, n + item_pos - ret_val, pos_in_item, 1, - (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] - ); - /* if appended item is indirect item, put unformatted node into un list */ - if (is_indirect_le_ih (pasted)) - set_ih_free_space (pasted, 0); - tb->insert_size[0] = 0; - zeros_num = 0; + } else { + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb, tb->lnum[0], tb->lbytes); } - break; - default: /* cases d and t */ - reiserfs_panic (tb->tb_sb, "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)", - (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); - } - } else { - /* new item doesn't fall into L[0] */ - leaf_shift_left(tb,tb->lnum[0],tb->lbytes); } - } /* tb->lnum[0] > 0 */ - /* Calculate new item position */ - item_pos -= ( tb->lnum[0] - (( tb->lbytes != -1 ) ? 
1 : 0)); - - if ( tb->rnum[0] > 0 ) { - /* shift rnum[0] items from S[0] to the right neighbor R[0] */ - n = B_NR_ITEMS(tbS0); - switch ( flag ) { - - case M_INSERT: /* insert item */ - if ( n - tb->rnum[0] < item_pos ) - { /* new item or its part falls to R[0] */ - if ( item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1 ) - { /* part of new item falls into R[0] */ - loff_t old_key_comp, old_len, r_zeros_number; - const char * r_body; - int version; - loff_t offset; - - leaf_shift_right(tb,tb->rnum[0]-1,-1); - - version = ih_version(ih); - /* Remember key component and item length */ - old_key_comp = le_ih_k_offset( ih ); - old_len = ih_item_len(ih); - - /* Calculate key component and item length to insert into R[0] */ - offset = le_ih_k_offset( ih ) + ((old_len - tb->rbytes )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)); - set_le_ih_k_offset( ih, offset ); - put_ih_item_len( ih, tb->rbytes); - /* Insert part of the item into R[0] */ - bi.tb = tb; - bi.bi_bh = tb->R[0]; - bi.bi_parent = tb->FR[0]; - bi.bi_position = get_right_neighbor_position (tb, 0); - if ( (old_len - tb->rbytes) > zeros_num ) { - r_zeros_number = 0; - r_body = body + (old_len - tb->rbytes) - zeros_num; - } - else { - r_body = body; - r_zeros_number = zeros_num - (old_len - tb->rbytes); - zeros_num -= r_zeros_number; - } - - leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); - - /* Replace right delimiting key by first key in R[0] */ - replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); - - /* Calculate key component and item length to insert into S[0] */ - set_le_ih_k_offset( ih, old_key_comp ); - put_ih_item_len( ih, old_len - tb->rbytes ); - - tb->insert_size[0] -= tb->rbytes; + /* tb->lnum[0] > 0 */ + /* Calculate new item position */ + item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 
1 : 0)); + + if (tb->rnum[0] > 0) { + /* shift rnum[0] items from S[0] to the right neighbor R[0] */ + n = B_NR_ITEMS(tbS0); + switch (flag) { + + case M_INSERT: /* insert item */ + if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */ + if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */ + loff_t old_key_comp, old_len, + r_zeros_number; + const char *r_body; + int version; + loff_t offset; + + leaf_shift_right(tb, tb->rnum[0] - 1, + -1); + + version = ih_version(ih); + /* Remember key component and item length */ + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into R[0] */ + offset = + le_ih_k_offset(ih) + + ((old_len - + tb-> + rbytes) << (is_indirect_le_ih(ih) + ? tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : 0)); + set_le_ih_k_offset(ih, offset); + put_ih_item_len(ih, tb->rbytes); + /* Insert part of the item into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = + get_right_neighbor_position(tb, 0); + if ((old_len - tb->rbytes) > zeros_num) { + r_zeros_number = 0; + r_body = + body + (old_len - + tb->rbytes) - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - (old_len - + tb->rbytes); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, + r_zeros_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0], tb->rkey[0], + tb->R[0], 0); + + /* Calculate key component and item length to insert into S[0] */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, + old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } else { /* whole new item falls into R[0] */ + + /* Shift rnum[0]-1 items to R[0] */ + ret_val = + leaf_shift_right(tb, + tb->rnum[0] - 1, + tb->rbytes); + /* Insert new item into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + 
bi.bi_position = + get_right_neighbor_position(tb, 0); + leaf_insert_into_buf(&bi, + item_pos - n + + tb->rnum[0] - 1, + ih, body, + zeros_num); + + if (item_pos - n + tb->rnum[0] - 1 == 0) { + replace_key(tb, tb->CFR[0], + tb->rkey[0], + tb->R[0], 0); + + } + zeros_num = tb->insert_size[0] = 0; + } + } else { /* new item or part of it doesn't fall into R[0] */ - } - else /* whole new item falls into R[0] */ - { - /* Shift rnum[0]-1 items to R[0] */ - ret_val = leaf_shift_right(tb,tb->rnum[0]-1,tb->rbytes); - /* Insert new item into R[0] */ - bi.tb = tb; - bi.bi_bh = tb->R[0]; - bi.bi_parent = tb->FR[0]; - bi.bi_position = get_right_neighbor_position (tb, 0); - leaf_insert_into_buf (&bi, item_pos - n + tb->rnum[0] - 1, ih, body, zeros_num); - - if ( item_pos - n + tb->rnum[0] - 1 == 0 ) { - replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); - - } - zeros_num = tb->insert_size[0] = 0; - } - } - else /* new item or part of it doesn't fall into R[0] */ - { - leaf_shift_right(tb,tb->rnum[0],tb->rbytes); - } - break; - - case M_PASTE: /* append item */ - - if ( n - tb->rnum[0] <= item_pos ) /* pasted item or part of it falls to R[0] */ - { - if ( item_pos == n - tb->rnum[0] && tb->rbytes != -1 ) - { /* we must shift the part of the appended item */ - if ( is_direntry_le_ih (B_N_PITEM_HEAD(tbS0, item_pos))) - { /* we append to directory item */ - int entry_count; - - RFALSE( zeros_num, - "PAP-12145: invalid parameter in case of a directory"); - entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD(tbS0, item_pos)); - if ( entry_count - tb->rbytes < pos_in_item ) - /* new directory entry falls into R[0] */ - { - int paste_entry_position; - - RFALSE( tb->rbytes - 1 >= entry_count || - ! tb->insert_size[0], - "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", - tb->rbytes, entry_count); - /* Shift rnum[0]-1 items in whole. 
Shift rbytes-1 directory entries from directory item number rnum[0] */ - leaf_shift_right(tb,tb->rnum[0],tb->rbytes - 1); - /* Paste given directory entry to directory item */ - paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1; - bi.tb = tb; - bi.bi_bh = tb->R[0]; - bi.bi_parent = tb->FR[0]; - bi.bi_position = get_right_neighbor_position (tb, 0); - leaf_paste_in_buffer (&bi, 0, paste_entry_position, - tb->insert_size[0],body,zeros_num); - /* paste entry */ - leaf_paste_entries ( - bi.bi_bh, 0, paste_entry_position, 1, (struct reiserfs_de_head *)body, - body + DEH_SIZE, tb->insert_size[0] - ); - - if ( paste_entry_position == 0 ) { - /* change delimiting keys */ - replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); - } - - tb->insert_size[0] = 0; - pos_in_item++; - } - else /* new directory entry doesn't fall into R[0] */ - { - leaf_shift_right(tb,tb->rnum[0],tb->rbytes); - } - } - else /* regular object */ - { - int n_shift, n_rem, r_zeros_number; - const char * r_body; - - /* Calculate number of bytes which must be shifted from appended item */ - if ( (n_shift = tb->rbytes - tb->insert_size[0]) < 0 ) - n_shift = 0; - - RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD (tbS0, item_pos)), - "PAP-12155: invalid position to paste. 
ih_item_len=%d, pos_in_item=%d", - pos_in_item, ih_item_len( B_N_PITEM_HEAD(tbS0,item_pos))); - - leaf_shift_right(tb,tb->rnum[0],n_shift); - /* Calculate number of bytes which must remain in body after appending to R[0] */ - if ( (n_rem = tb->insert_size[0] - tb->rbytes) < 0 ) - n_rem = 0; - - { - int version; - unsigned long temp_rem = n_rem; - - version = ih_version (B_N_PITEM_HEAD (tb->R[0],0)); - if (is_indirect_le_key(version,B_N_PKEY(tb->R[0],0))){ - temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits - - UNFM_P_SHIFT); - } - set_le_key_k_offset (version, B_N_PKEY(tb->R[0],0), - le_key_k_offset (version, B_N_PKEY(tb->R[0],0)) + temp_rem); - set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0]), - le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) + temp_rem); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); } + break; + + case M_PASTE: /* append item */ + + if (n - tb->rnum[0] <= item_pos) { /* pasted item or part of it falls to R[0] */ + if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) { /* we must shift the part of the appended item */ + if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { /* we append to directory item */ + int entry_count; + + RFALSE(zeros_num, + "PAP-12145: invalid parameter in case of a directory"); + entry_count = + I_ENTRY_COUNT(B_N_PITEM_HEAD + (tbS0, + item_pos)); + if (entry_count - tb->rbytes < + pos_in_item) + /* new directory entry falls into R[0] */ + { + int paste_entry_position; + + RFALSE(tb->rbytes - 1 >= + entry_count + || !tb-> + insert_size[0], + "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", + tb->rbytes, + entry_count); + /* Shift rnum[0]-1 items in whole. 
Shift rbytes-1 directory entries from directory item number rnum[0] */ + leaf_shift_right(tb, + tb-> + rnum + [0], + tb-> + rbytes + - 1); + /* Paste given directory entry to directory item */ + paste_entry_position = + pos_in_item - + entry_count + + tb->rbytes - 1; + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = + tb->FR[0]; + bi.bi_position = + get_right_neighbor_position + (tb, 0); + leaf_paste_in_buffer + (&bi, 0, + paste_entry_position, + tb->insert_size[0], + body, zeros_num); + /* paste entry */ + leaf_paste_entries(bi. + bi_bh, + 0, + paste_entry_position, + 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + + if (paste_entry_position + == 0) { + /* change delimiting keys */ + replace_key(tb, + tb-> + CFR + [0], + tb-> + rkey + [0], + tb-> + R + [0], + 0); + } + + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into R[0] */ + + leaf_shift_right(tb, + tb-> + rnum + [0], + tb-> + rbytes); + } + } else { /* regular object */ + + int n_shift, n_rem, + r_zeros_number; + const char *r_body; + + /* Calculate number of bytes which must be shifted from appended item */ + if ((n_shift = + tb->rbytes - + tb->insert_size[0]) < 0) + n_shift = 0; + + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)), + "PAP-12155: invalid position to paste. 
ih_item_len=%d, pos_in_item=%d", + pos_in_item, + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos))); + + leaf_shift_right(tb, + tb->rnum[0], + n_shift); + /* Calculate number of bytes which must remain in body after appending to R[0] */ + if ((n_rem = + tb->insert_size[0] - + tb->rbytes) < 0) + n_rem = 0; + + { + int version; + unsigned long temp_rem = + n_rem; + + version = + ih_version + (B_N_PITEM_HEAD + (tb->R[0], 0)); + if (is_indirect_le_key + (version, + B_N_PKEY(tb->R[0], + 0))) { + temp_rem = + n_rem << + (tb->tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT); + } + set_le_key_k_offset + (version, + B_N_PKEY(tb->R[0], + 0), + le_key_k_offset + (version, + B_N_PKEY(tb->R[0], + 0)) + + temp_rem); + set_le_key_k_offset + (version, + B_N_PDELIM_KEY(tb-> + CFR + [0], + tb-> + rkey + [0]), + le_key_k_offset + (version, + B_N_PDELIM_KEY + (tb->CFR[0], + tb->rkey[0])) + + temp_rem); + } /* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ - do_balance_mark_internal_dirty (tb, tb->CFR[0], 0); - - /* Append part of body into R[0] */ - bi.tb = tb; - bi.bi_bh = tb->R[0]; - bi.bi_parent = tb->FR[0]; - bi.bi_position = get_right_neighbor_position (tb, 0); - if ( n_rem > zeros_num ) { - r_zeros_number = 0; - r_body = body + n_rem - zeros_num; - } - else { - r_body = body; - r_zeros_number = zeros_num - n_rem; - zeros_num -= r_zeros_number; - } - - leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, r_body, r_zeros_number); - - if (is_indirect_le_ih (B_N_PITEM_HEAD(tb->R[0],0))) { + do_balance_mark_internal_dirty + (tb, tb->CFR[0], 0); + + /* Append part of body into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = + get_right_neighbor_position + (tb, 0); + if (n_rem > zeros_num) { + r_zeros_number = 0; + r_body = + body + n_rem - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - n_rem; + zeros_num -= + r_zeros_number; + } + + 
leaf_paste_in_buffer(&bi, 0, + n_shift, + tb-> + insert_size + [0] - + n_rem, + r_body, + r_zeros_number); + + if (is_indirect_le_ih + (B_N_PITEM_HEAD + (tb->R[0], 0))) { #if 0 - RFALSE( n_rem, - "PAP-12160: paste more than one unformatted node pointer"); + RFALSE(n_rem, + "PAP-12160: paste more than one unformatted node pointer"); #endif - set_ih_free_space (B_N_PITEM_HEAD(tb->R[0],0), 0); - } - tb->insert_size[0] = n_rem; - if ( ! n_rem ) - pos_in_item ++; - } - } - else /* pasted item in whole falls into R[0] */ - { - struct item_head * pasted; + set_ih_free_space + (B_N_PITEM_HEAD + (tb->R[0], 0), 0); + } + tb->insert_size[0] = n_rem; + if (!n_rem) + pos_in_item++; + } + } else { /* pasted item in whole falls into R[0] */ + + struct item_head *pasted; + + ret_val = + leaf_shift_right(tb, tb->rnum[0], + tb->rbytes); + /* append item in R[0] */ + if (pos_in_item >= 0) { + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = + get_right_neighbor_position + (tb, 0); + leaf_paste_in_buffer(&bi, + item_pos - + n + + tb-> + rnum[0], + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + } + + /* paste new entry, if item is directory item */ + pasted = + B_N_PITEM_HEAD(tb->R[0], + item_pos - n + + tb->rnum[0]); + if (is_direntry_le_ih(pasted) + && pos_in_item >= 0) { + leaf_paste_entries(bi.bi_bh, + item_pos - + n + + tb->rnum[0], + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + if (!pos_in_item) { + + RFALSE(item_pos - n + + tb->rnum[0], + "PAP-12165: directory item must be first item of node when pasting is in 0th position"); + + /* update delimiting keys */ + replace_key(tb, + tb->CFR[0], + tb->rkey[0], + tb->R[0], + 0); + } + } + + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } + } else { /* new item doesn't fall into R[0] */ - ret_val = leaf_shift_right(tb,tb->rnum[0],tb->rbytes); - /* append item in R[0] 
*/ - if ( pos_in_item >= 0 ) { - bi.tb = tb; - bi.bi_bh = tb->R[0]; - bi.bi_parent = tb->FR[0]; - bi.bi_position = get_right_neighbor_position (tb, 0); - leaf_paste_in_buffer(&bi,item_pos - n + tb->rnum[0], pos_in_item, - tb->insert_size[0],body, zeros_num); - } - - /* paste new entry, if item is directory item */ - pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]); - if (is_direntry_le_ih (pasted) && pos_in_item >= 0 ) { - leaf_paste_entries ( - bi.bi_bh, item_pos - n + tb->rnum[0], pos_in_item, 1, - (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] - ); - if ( ! pos_in_item ) { - - RFALSE( item_pos - n + tb->rnum[0], - "PAP-12165: directory item must be first item of node when pasting is in 0th position"); - - /* update delimiting keys */ - replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); } - } - - if (is_indirect_le_ih (pasted)) - set_ih_free_space (pasted, 0); - zeros_num = tb->insert_size[0] = 0; + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, + "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) ? "CUT" + : "UNKNOWN"), + flag); } - } - else /* new item doesn't fall into R[0] */ - { - leaf_shift_right(tb,tb->rnum[0],tb->rbytes); - } - break; - default: /* cases d and t */ - reiserfs_panic (tb->tb_sb, "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)", - (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); - } - - } /* tb->rnum[0] > 0 */ - - - RFALSE( tb->blknum[0] > 3, - "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]); - RFALSE( tb->blknum[0] < 0, - "PAP-12185: blknum can not be %d. 
It must be >= 0", tb->blknum[0]); - - /* if while adding to a node we discover that it is possible to split - it in two, and merge the left part into the left neighbor and the - right part into the right neighbor, eliminating the node */ - if ( tb->blknum[0] == 0 ) { /* node S[0] is empty now */ - - RFALSE( ! tb->lnum[0] || ! tb->rnum[0], - "PAP-12190: lnum and rnum must not be zero"); - /* if insertion was done before 0-th position in R[0], right - delimiting key of the tb->L[0]'s and left delimiting key are - not set correctly */ - if (tb->CFL[0]) { - if (!tb->CFR[0]) - reiserfs_panic (tb->tb_sb, "vs-12195: balance_leaf: CFR not initialized"); - copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])); - do_balance_mark_internal_dirty (tb, tb->CFL[0], 0); - } - - reiserfs_invalidate_buffer(tb,tbS0); - return 0; - } - - - /* Fill new nodes that appear in place of S[0] */ - /* I am told that this copying is because we need an array to enable - the looping code. -Hans */ - snum[0] = tb->s1num, - snum[1] = tb->s2num; - sbytes[0] = tb->s1bytes; - sbytes[1] = tb->s2bytes; - for( i = tb->blknum[0] - 2; i >= 0; i-- ) { - - RFALSE( !snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, snum[i]); + } - /* here we shift from S to S_new nodes */ + /* tb->rnum[0] > 0 */ + RFALSE(tb->blknum[0] > 3, + "PAP-12180: blknum can not be %d. It must be <= 3", + tb->blknum[0]); + RFALSE(tb->blknum[0] < 0, + "PAP-12185: blknum can not be %d. 
It must be >= 0", + tb->blknum[0]); + + /* if while adding to a node we discover that it is possible to split + it in two, and merge the left part into the left neighbor and the + right part into the right neighbor, eliminating the node */ + if (tb->blknum[0] == 0) { /* node S[0] is empty now */ + + RFALSE(!tb->lnum[0] || !tb->rnum[0], + "PAP-12190: lnum and rnum must not be zero"); + /* if insertion was done before 0-th position in R[0], right + delimiting key of the tb->L[0]'s and left delimiting key are + not set correctly */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic(tb->tb_sb, + "vs-12195: balance_leaf: CFR not initialized"); + copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]), + B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty(tb, tb->CFL[0], 0); + } - S_new[i] = get_FEB(tb); + reiserfs_invalidate_buffer(tb, tbS0); + return 0; + } - /* initialized block type and tree level */ - set_blkh_level( B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL ); + /* Fill new nodes that appear in place of S[0] */ + + /* I am told that this copying is because we need an array to enable + the looping code. -Hans */ + snum[0] = tb->s1num, snum[1] = tb->s2num; + sbytes[0] = tb->s1bytes; + sbytes[1] = tb->s2bytes; + for (i = tb->blknum[0] - 2; i >= 0; i--) { + + RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. 
Must be > 0", i, + snum[i]); + + /* here we shift from S to S_new nodes */ + + S_new[i] = get_FEB(tb); + + /* initialized block type and tree level */ + set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL); + + n = B_NR_ITEMS(tbS0); + + switch (flag) { + case M_INSERT: /* insert item */ + + if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */ + if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */ + int old_key_comp, old_len, + r_zeros_number; + const char *r_body; + int version; + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i] - 1, -1, + S_new[i]); + /* Remember key component and item length */ + version = ih_version(ih); + old_key_comp = le_ih_k_offset(ih); + old_len = ih_item_len(ih); + + /* Calculate key component and item length to insert into S_new[i] */ + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + ((old_len - + sbytes[i]) << + (is_indirect_le_ih + (ih) ? 
tb->tb_sb-> + s_blocksize_bits - + UNFM_P_SHIFT : + 0))); + + put_ih_item_len(ih, sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + + if ((old_len - sbytes[i]) > zeros_num) { + r_zeros_number = 0; + r_body = + body + (old_len - + sbytes[i]) - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - (old_len - + sbytes[i]); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf(&bi, 0, ih, r_body, + r_zeros_number); + + /* Calculate key component and item length to insert into S[i] */ + set_le_ih_k_offset(ih, old_key_comp); + put_ih_item_len(ih, + old_len - sbytes[i]); + tb->insert_size[0] -= sbytes[i]; + } else { /* whole new item falls into S_new[i] */ + + /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i] - 1, sbytes[i], + S_new[i]); + + /* Insert new item into S_new[i] */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + leaf_insert_into_buf(&bi, + item_pos - n + + snum[i] - 1, ih, + body, zeros_num); + + zeros_num = tb->insert_size[0] = 0; + } + } + else { /* new item or it part don't falls into S_new[i] */ - n = B_NR_ITEMS(tbS0); - - switch (flag) { - case M_INSERT: /* insert item */ - - if ( n - snum[i] < item_pos ) - { /* new item or it's part falls to first new node S_new[i]*/ - if ( item_pos == n - snum[i] + 1 && sbytes[i] != -1 ) - { /* part of new item falls into S_new[i] */ - int old_key_comp, old_len, r_zeros_number; - const char * r_body; - int version; - - /* Move snum[i]-1 items from S[0] to S_new[i] */ - leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, -1, S_new[i]); - /* Remember key component and item length */ - version = ih_version (ih); - old_key_comp = le_ih_k_offset( ih ); - old_len = ih_item_len(ih); - - /* Calculate key component and item length to insert into S_new[i] */ - set_le_ih_k_offset( ih, - 
le_ih_k_offset(ih) + ((old_len - sbytes[i] )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) ); - - put_ih_item_len( ih, sbytes[i] ); - - /* Insert part of the item into S_new[i] before 0-th item */ - bi.tb = tb; - bi.bi_bh = S_new[i]; - bi.bi_parent = NULL; - bi.bi_position = 0; - - if ( (old_len - sbytes[i]) > zeros_num ) { - r_zeros_number = 0; - r_body = body + (old_len - sbytes[i]) - zeros_num; - } - else { - r_body = body; - r_zeros_number = zeros_num - (old_len - sbytes[i]); - zeros_num -= r_zeros_number; - } - - leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); - - /* Calculate key component and item length to insert into S[i] */ - set_le_ih_k_offset( ih, old_key_comp ); - put_ih_item_len( ih, old_len - sbytes[i] ); - tb->insert_size[0] -= sbytes[i]; - } - else /* whole new item falls into S_new[i] */ - { - /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ - leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, sbytes[i], S_new[i]); - - /* Insert new item into S_new[i] */ - bi.tb = tb; - bi.bi_bh = S_new[i]; - bi.bi_parent = NULL; - bi.bi_position = 0; - leaf_insert_into_buf (&bi, item_pos - n + snum[i] - 1, ih, body, zeros_num); - - zeros_num = tb->insert_size[0] = 0; - } - } - - else /* new item or it part don't falls into S_new[i] */ - { - leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); - } - break; - - case M_PASTE: /* append item */ - - if ( n - snum[i] <= item_pos ) /* pasted item or part if it falls to S_new[i] */ - { - if ( item_pos == n - snum[i] && sbytes[i] != -1 ) - { /* we must shift part of the appended item */ - struct item_head * aux_ih; - - RFALSE( ih, "PAP-12210: ih must be 0"); - - if ( is_direntry_le_ih (aux_ih = B_N_PITEM_HEAD(tbS0,item_pos))) { - /* we append to directory item */ - - int entry_count; - - entry_count = ih_entry_count(aux_ih); - - if ( entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count ) { - /* new directory entry falls into 
S_new[i] */ - - RFALSE( ! tb->insert_size[0], - "PAP-12215: insert_size is already 0"); - RFALSE( sbytes[i] - 1 >= entry_count, - "PAP-12220: there are no so much entries (%d), only %d", - sbytes[i] - 1, entry_count); - - /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ - leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i]-1, S_new[i]); - /* Paste given directory entry to directory item */ - bi.tb = tb; - bi.bi_bh = S_new[i]; - bi.bi_parent = NULL; - bi.bi_position = 0; - leaf_paste_in_buffer (&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, - tb->insert_size[0], body,zeros_num); - /* paste new directory entry */ - leaf_paste_entries ( - bi.bi_bh, 0, pos_in_item - entry_count + sbytes[i] - 1, - 1, (struct reiserfs_de_head *)body, body + DEH_SIZE, - tb->insert_size[0] - ); - tb->insert_size[0] = 0; - pos_in_item++; - } else { /* new directory entry doesn't fall into S_new[i] */ - leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i], sbytes[i], S_new[i]); } - } - else /* regular object */ - { - int n_shift, n_rem, r_zeros_number; - const char * r_body; - - RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)) || - tb->insert_size[0] <= 0, - "PAP-12225: item too short or insert_size <= 0"); - - /* Calculate number of bytes which must be shifted from appended item */ - n_shift = sbytes[i] - tb->insert_size[0]; - if ( n_shift < 0 ) - n_shift = 0; - leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]); - - /* Calculate number of bytes which must remain in body after append to S_new[i] */ - n_rem = tb->insert_size[0] - sbytes[i]; - if ( n_rem < 0 ) - n_rem = 0; - /* Append part of body into S_new[0] */ - bi.tb = tb; - bi.bi_bh = S_new[i]; - bi.bi_parent = NULL; - bi.bi_position = 0; + break; + + case M_PASTE: /* append item */ + + if (n - snum[i] <= item_pos) { /* pasted item or part if it falls to 
S_new[i] */ + if (item_pos == n - snum[i] && sbytes[i] != -1) { /* we must shift part of the appended item */ + struct item_head *aux_ih; + + RFALSE(ih, "PAP-12210: ih must be 0"); + + if (is_direntry_le_ih + (aux_ih = + B_N_PITEM_HEAD(tbS0, item_pos))) { + /* we append to directory item */ + + int entry_count; + + entry_count = + ih_entry_count(aux_ih); + + if (entry_count - sbytes[i] < + pos_in_item + && pos_in_item <= + entry_count) { + /* new directory entry falls into S_new[i] */ + + RFALSE(!tb-> + insert_size[0], + "PAP-12215: insert_size is already 0"); + RFALSE(sbytes[i] - 1 >= + entry_count, + "PAP-12220: there are no so much entries (%d), only %d", + sbytes[i] - 1, + entry_count); + + /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ + leaf_move_items + (LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i] - 1, + S_new[i]); + /* Paste given directory entry to directory item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + leaf_paste_in_buffer + (&bi, 0, + pos_in_item - + entry_count + + sbytes[i] - 1, + tb->insert_size[0], + body, zeros_num); + /* paste new directory entry */ + leaf_paste_entries(bi. 
+ bi_bh, + 0, + pos_in_item + - + entry_count + + + sbytes + [i] - + 1, 1, + (struct + reiserfs_de_head + *) + body, + body + + + DEH_SIZE, + tb-> + insert_size + [0] + ); + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items + (LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i], + S_new[i]); + } + } else { /* regular object */ + + int n_shift, n_rem, + r_zeros_number; + const char *r_body; + + RFALSE(pos_in_item != + ih_item_len + (B_N_PITEM_HEAD + (tbS0, item_pos)) + || tb->insert_size[0] <= + 0, + "PAP-12225: item too short or insert_size <= 0"); + + /* Calculate number of bytes which must be shifted from appended item */ + n_shift = + sbytes[i] - + tb->insert_size[0]; + if (n_shift < 0) + n_shift = 0; + leaf_move_items + (LEAF_FROM_S_TO_SNEW, tb, + snum[i], n_shift, + S_new[i]); + + /* Calculate number of bytes which must remain in body after append to S_new[i] */ + n_rem = + tb->insert_size[0] - + sbytes[i]; + if (n_rem < 0) + n_rem = 0; + /* Append part of body into S_new[0] */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + + if (n_rem > zeros_num) { + r_zeros_number = 0; + r_body = + body + n_rem - + zeros_num; + } else { + r_body = body; + r_zeros_number = + zeros_num - n_rem; + zeros_num -= + r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, + n_shift, + tb-> + insert_size + [0] - + n_rem, + r_body, + r_zeros_number); + { + struct item_head *tmp; + + tmp = + B_N_PITEM_HEAD(S_new + [i], + 0); + if (is_indirect_le_ih + (tmp)) { + set_ih_free_space + (tmp, 0); + set_le_ih_k_offset + (tmp, + le_ih_k_offset + (tmp) + + (n_rem << + (tb-> + tb_sb-> + s_blocksize_bits + - + UNFM_P_SHIFT))); + } else { + set_le_ih_k_offset + (tmp, + le_ih_k_offset + (tmp) + + n_rem); + } + } + + tb->insert_size[0] = n_rem; + if (!n_rem) + pos_in_item++; + } + } else + /* item falls wholly into S_new[i] */ + { + int ret_val; + struct item_head *pasted; - if ( n_rem > zeros_num 
) { - r_zeros_number = 0; - r_body = body + n_rem - zeros_num; - } - else { - r_body = body; - r_zeros_number = zeros_num - n_rem; - zeros_num -= r_zeros_number; +#ifdef CONFIG_REISERFS_CHECK + struct item_head *ih = + B_N_PITEM_HEAD(tbS0, item_pos); + + if (!is_direntry_le_ih(ih) + && (pos_in_item != ih_item_len(ih) + || tb->insert_size[0] <= 0)) + reiserfs_panic(tb->tb_sb, + "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); +#endif /* CONFIG_REISERFS_CHECK */ + + ret_val = + leaf_move_items(LEAF_FROM_S_TO_SNEW, + tb, snum[i], + sbytes[i], + S_new[i]); + + RFALSE(ret_val, + "PAP-12240: unexpected value returned by leaf_move_items (%d)", + ret_val); + + /* paste into item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + leaf_paste_in_buffer(&bi, + item_pos - n + + snum[i], + pos_in_item, + tb->insert_size[0], + body, zeros_num); + + pasted = + B_N_PITEM_HEAD(S_new[i], + item_pos - n + + snum[i]); + if (is_direntry_le_ih(pasted)) { + leaf_paste_entries(bi.bi_bh, + item_pos - + n + snum[i], + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + } + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih(pasted)) + set_ih_free_space(pasted, 0); + zeros_num = tb->insert_size[0] = 0; + } } - leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0]-n_rem, r_body,r_zeros_number); - { - struct item_head * tmp; - - tmp = B_N_PITEM_HEAD(S_new[i],0); - if (is_indirect_le_ih (tmp)) { - set_ih_free_space (tmp, 0); - set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + - (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT))); - } else { - set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + - n_rem ); - } - } + else { /* pasted item doesn't fall into S_new[i] */ - tb->insert_size[0] = n_rem; - if ( ! 
n_rem ) - pos_in_item++; - } + leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, + snum[i], sbytes[i], S_new[i]); + } + break; + default: /* cases d and t */ + reiserfs_panic(tb->tb_sb, + "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)", + (flag == + M_DELETE) ? "DELETE" : ((flag == + M_CUT) ? "CUT" + : "UNKNOWN"), + flag); } - else - /* item falls wholly into S_new[i] */ - { - int ret_val; - struct item_head * pasted; -#ifdef CONFIG_REISERFS_CHECK - struct item_head * ih = B_N_PITEM_HEAD(tbS0,item_pos); - - if ( ! is_direntry_le_ih(ih) && (pos_in_item != ih_item_len(ih) || - tb->insert_size[0] <= 0) ) - reiserfs_panic (tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); -#endif /* CONFIG_REISERFS_CHECK */ - - ret_val = leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); - - RFALSE( ret_val, - "PAP-12240: unexpected value returned by leaf_move_items (%d)", - ret_val); - - /* paste into item */ - bi.tb = tb; - bi.bi_bh = S_new[i]; - bi.bi_parent = NULL; - bi.bi_position = 0; - leaf_paste_in_buffer(&bi, item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num); - - pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]); - if (is_direntry_le_ih (pasted)) - { - leaf_paste_entries ( - bi.bi_bh, item_pos - n + snum[i], pos_in_item, 1, - (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] - ); - } - - /* if we paste to indirect item update ih_free_space */ - if (is_indirect_le_ih (pasted)) - set_ih_free_space (pasted, 0); - zeros_num = tb->insert_size[0] = 0; - } - } - - else /* pasted item doesn't fall into S_new[i] */ - { - leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); - } - break; - default: /* cases d and t */ - reiserfs_panic (tb->tb_sb, "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)", - (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? 
"CUT" : "UNKNOWN"), flag); + memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE); + insert_ptr[i] = S_new[i]; + + RFALSE(!buffer_journaled(S_new[i]) + || buffer_journal_dirty(S_new[i]) + || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)", + i, S_new[i]); } - memcpy (insert_key + i,B_N_PKEY(S_new[i],0),KEY_SIZE); - insert_ptr[i] = S_new[i]; - - RFALSE (!buffer_journaled (S_new [i]) || buffer_journal_dirty (S_new [i]) || - buffer_dirty (S_new [i]), - "PAP-12247: S_new[%d] : (%b)", i, S_new[i]); - } - - /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the - affected item which remains in S */ - if ( 0 <= item_pos && item_pos < tb->s0num ) - { /* if we must insert or append into buffer S[0] */ - - switch (flag) - { - case M_INSERT: /* insert item into S[0] */ - bi.tb = tb; - bi.bi_bh = tbS0; - bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); - leaf_insert_into_buf (&bi, item_pos, ih, body, zeros_num); - - /* If we insert the first key change the delimiting key */ - if( item_pos == 0 ) { - if (tb->CFL[0]) /* can be 0 in reiserfsck */ - replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0); - - } - break; - - case M_PASTE: { /* append item in S[0] */ - struct item_head * pasted; - - pasted = B_N_PITEM_HEAD (tbS0, item_pos); - /* when directory, may be new entry already pasted */ - if (is_direntry_le_ih (pasted)) { - if ( pos_in_item >= 0 && - pos_in_item <= ih_entry_count(pasted) ) { - - RFALSE( ! 
tb->insert_size[0], - "PAP-12260: insert_size is 0 already"); - - /* prepare space */ - bi.tb = tb; - bi.bi_bh = tbS0; - bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); - leaf_paste_in_buffer(&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num); - - /* paste entry */ - leaf_paste_entries ( - bi.bi_bh, item_pos, pos_in_item, 1, (struct reiserfs_de_head *)body, - body + DEH_SIZE, tb->insert_size[0] - ); - if ( ! item_pos && ! pos_in_item ) { - RFALSE( !tb->CFL[0] || !tb->L[0], - "PAP-12270: CFL[0]/L[0] must be specified"); - if (tb->CFL[0]) { - replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0); + /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the + affected item which remains in S */ + if (0 <= item_pos && item_pos < tb->s0num) { /* if we must insert or append into buffer S[0] */ + + switch (flag) { + case M_INSERT: /* insert item into S[0] */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION(tb->tb_path, 1); + leaf_insert_into_buf(&bi, item_pos, ih, body, + zeros_num); + + /* If we insert the first key change the delimiting key */ + if (item_pos == 0) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0], + tbS0, 0); } - } - tb->insert_size[0] = 0; - } - } else { /* regular object */ - if ( pos_in_item == ih_item_len(pasted) ) { - - RFALSE( tb->insert_size[0] <= 0, - "PAP-12275: insert size must not be %d", - tb->insert_size[0]); - bi.tb = tb; - bi.bi_bh = tbS0; - bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); - leaf_paste_in_buffer (&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num); - - if (is_indirect_le_ih (pasted)) { + break; + + case M_PASTE:{ /* append item in S[0] */ + struct item_head *pasted; + + pasted = B_N_PITEM_HEAD(tbS0, item_pos); + /* when directory, may 
be new entry already pasted */ + if (is_direntry_le_ih(pasted)) { + if (pos_in_item >= 0 && + pos_in_item <= + ih_entry_count(pasted)) { + + RFALSE(!tb->insert_size[0], + "PAP-12260: insert_size is 0 already"); + + /* prepare space */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = + PATH_H_PPARENT(tb->tb_path, + 0); + bi.bi_position = + PATH_H_POSITION(tb->tb_path, + 1); + leaf_paste_in_buffer(&bi, + item_pos, + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + + /* paste entry */ + leaf_paste_entries(bi.bi_bh, + item_pos, + pos_in_item, + 1, + (struct + reiserfs_de_head + *)body, + body + + DEH_SIZE, + tb-> + insert_size + [0] + ); + if (!item_pos && !pos_in_item) { + RFALSE(!tb->CFL[0] + || !tb->L[0], + "PAP-12270: CFL[0]/L[0] must be specified"); + if (tb->CFL[0]) { + replace_key(tb, + tb-> + CFL + [0], + tb-> + lkey + [0], + tbS0, + 0); + + } + } + tb->insert_size[0] = 0; + } + } else { /* regular object */ + if (pos_in_item == ih_item_len(pasted)) { + + RFALSE(tb->insert_size[0] <= 0, + "PAP-12275: insert size must not be %d", + tb->insert_size[0]); + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = + PATH_H_PPARENT(tb->tb_path, + 0); + bi.bi_position = + PATH_H_POSITION(tb->tb_path, + 1); + leaf_paste_in_buffer(&bi, + item_pos, + pos_in_item, + tb-> + insert_size + [0], body, + zeros_num); + + if (is_indirect_le_ih(pasted)) { #if 0 - RFALSE( tb->insert_size[0] != UNFM_P_SIZE, - "PAP-12280: insert_size for indirect item must be %d, not %d", - UNFM_P_SIZE, tb->insert_size[0]); + RFALSE(tb-> + insert_size[0] != + UNFM_P_SIZE, + "PAP-12280: insert_size for indirect item must be %d, not %d", + UNFM_P_SIZE, + tb-> + insert_size[0]); #endif - set_ih_free_space (pasted, 0); - } - tb->insert_size[0] = 0; - } - + set_ih_free_space + (pasted, 0); + } + tb->insert_size[0] = 0; + } #ifdef CONFIG_REISERFS_CHECK - else { - if ( tb->insert_size[0] ) { - print_cur_tb ("12285"); - reiserfs_panic (tb->tb_sb, "PAP-12285: balance_leaf: insert_size must be 0 (%d)", 
tb->insert_size[0]); - } + else { + if (tb->insert_size[0]) { + print_cur_tb("12285"); + reiserfs_panic(tb-> + tb_sb, + "PAP-12285: balance_leaf: insert_size must be 0 (%d)", + tb-> + insert_size + [0]); + } + } +#endif /* CONFIG_REISERFS_CHECK */ + + } + } /* case M_PASTE: */ } -#endif /* CONFIG_REISERFS_CHECK */ - - } - } /* case M_PASTE: */ } - } - #ifdef CONFIG_REISERFS_CHECK - if ( flag == M_PASTE && tb->insert_size[0] ) { - print_cur_tb ("12290"); - reiserfs_panic (tb->tb_sb, "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", tb->insert_size[0]); - } -#endif /* CONFIG_REISERFS_CHECK */ - - return 0; -} /* Leaf level of the tree is balanced (end of balance_leaf) */ - + if (flag == M_PASTE && tb->insert_size[0]) { + print_cur_tb("12290"); + reiserfs_panic(tb->tb_sb, + "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", + tb->insert_size[0]); + } +#endif /* CONFIG_REISERFS_CHECK */ + return 0; +} /* Leaf level of the tree is balanced (end of balance_leaf) */ /* Make empty node */ -void make_empty_node (struct buffer_info * bi) +void make_empty_node(struct buffer_info *bi) { - struct block_head * blkh; + struct block_head *blkh; - RFALSE( bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); + RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL"); - blkh = B_BLK_HEAD(bi->bi_bh); - set_blkh_nr_item( blkh, 0 ); - set_blkh_free_space( blkh, MAX_CHILD_SIZE(bi->bi_bh) ); + blkh = B_BLK_HEAD(bi->bi_bh); + set_blkh_nr_item(blkh, 0); + set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh)); - if (bi->bi_parent) - B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ + if (bi->bi_parent) + B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */ } - /* Get first empty buffer */ -struct buffer_head * get_FEB (struct tree_balance * tb) +struct buffer_head *get_FEB(struct tree_balance *tb) { - int i; - struct buffer_head * first_b; - struct buffer_info bi; - - for (i = 0; i < 
MAX_FEB_SIZE; i ++) - if (tb->FEB[i] != 0) - break; - - if (i == MAX_FEB_SIZE) - reiserfs_panic(tb->tb_sb, "vs-12300: get_FEB: FEB list is empty"); - - bi.tb = tb; - bi.bi_bh = first_b = tb->FEB[i]; - bi.bi_parent = NULL; - bi.bi_position = 0; - make_empty_node (&bi); - set_buffer_uptodate(first_b); - tb->FEB[i] = NULL; - tb->used[i] = first_b; - - return(first_b); -} + int i; + struct buffer_head *first_b; + struct buffer_info bi; + for (i = 0; i < MAX_FEB_SIZE; i++) + if (tb->FEB[i] != 0) + break; + + if (i == MAX_FEB_SIZE) + reiserfs_panic(tb->tb_sb, + "vs-12300: get_FEB: FEB list is empty"); + + bi.tb = tb; + bi.bi_bh = first_b = tb->FEB[i]; + bi.bi_parent = NULL; + bi.bi_position = 0; + make_empty_node(&bi); + set_buffer_uptodate(first_b); + tb->FEB[i] = NULL; + tb->used[i] = first_b; + + return (first_b); +} /* This is now used because reiserfs_free_block has to be able to ** schedule. */ -static void store_thrown (struct tree_balance * tb, struct buffer_head * bh) +static void store_thrown(struct tree_balance *tb, struct buffer_head *bh) { - int i; - - if (buffer_dirty (bh)) - reiserfs_warning (tb->tb_sb, "store_thrown deals with dirty buffer"); - for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i ++) - if (!tb->thrown[i]) { - tb->thrown[i] = bh; - get_bh(bh) ; /* free_thrown puts this */ - return; - } - reiserfs_warning (tb->tb_sb, "store_thrown: too many thrown buffers"); + int i; + + if (buffer_dirty(bh)) + reiserfs_warning(tb->tb_sb, + "store_thrown deals with dirty buffer"); + for (i = 0; i < sizeof(tb->thrown) / sizeof(tb->thrown[0]); i++) + if (!tb->thrown[i]) { + tb->thrown[i] = bh; + get_bh(bh); /* free_thrown puts this */ + return; + } + reiserfs_warning(tb->tb_sb, "store_thrown: too many thrown buffers"); } -static void free_thrown(struct tree_balance *tb) { - int i ; - b_blocknr_t blocknr ; - for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i++) { - if (tb->thrown[i]) { - blocknr = tb->thrown[i]->b_blocknr ; - if 
(buffer_dirty (tb->thrown[i])) - reiserfs_warning (tb->tb_sb, - "free_thrown deals with dirty buffer %d", - blocknr); - brelse(tb->thrown[i]) ; /* incremented in store_thrown */ - reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); +static void free_thrown(struct tree_balance *tb) +{ + int i; + b_blocknr_t blocknr; + for (i = 0; i < sizeof(tb->thrown) / sizeof(tb->thrown[0]); i++) { + if (tb->thrown[i]) { + blocknr = tb->thrown[i]->b_blocknr; + if (buffer_dirty(tb->thrown[i])) + reiserfs_warning(tb->tb_sb, + "free_thrown deals with dirty buffer %d", + blocknr); + brelse(tb->thrown[i]); /* incremented in store_thrown */ + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } } - } } -void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh) +void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh) { - struct block_head *blkh; - blkh = B_BLK_HEAD(bh); - set_blkh_level( blkh, FREE_LEVEL ); - set_blkh_nr_item( blkh, 0 ); - - clear_buffer_dirty(bh); - store_thrown (tb, bh); + struct block_head *blkh; + blkh = B_BLK_HEAD(bh); + set_blkh_level(blkh, FREE_LEVEL); + set_blkh_nr_item(blkh, 0); + + clear_buffer_dirty(bh); + store_thrown(tb, bh); } /* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/ -void replace_key (struct tree_balance * tb, struct buffer_head * dest, int n_dest, - struct buffer_head * src, int n_src) +void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest, + struct buffer_head *src, int n_src) { - RFALSE( dest == NULL || src == NULL, - "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", - src, dest); - RFALSE( ! B_IS_KEYS_LEVEL (dest), - "vs-12310: invalid level (%z) for destination buffer. 
dest must be leaf", - dest); - RFALSE( n_dest < 0 || n_src < 0, - "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); - RFALSE( n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), - "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", - n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); - - if (B_IS_ITEMS_LEVEL (src)) - /* source buffer contains leaf node */ - memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PITEM_HEAD(src,n_src), KEY_SIZE); - else - memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PDELIM_KEY(src,n_src), KEY_SIZE); - - do_balance_mark_internal_dirty (tb, dest, 0); + RFALSE(dest == NULL || src == NULL, + "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)", + src, dest); + RFALSE(!B_IS_KEYS_LEVEL(dest), + "vs-12310: invalid level (%z) for destination buffer. dest must be leaf", + dest); + RFALSE(n_dest < 0 || n_src < 0, + "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest); + RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src), + "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big", + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); + + if (B_IS_ITEMS_LEVEL(src)) + /* source buffer contains leaf node */ + memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src), + KEY_SIZE); + else + memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src), + KEY_SIZE); + + do_balance_mark_internal_dirty(tb, dest, 0); } - -int get_left_neighbor_position ( - struct tree_balance * tb, - int h - ) +int get_left_neighbor_position(struct tree_balance *tb, int h) { - int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1); + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); - RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FL[h] == 0, - "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", - h, tb->FL[h], h, PATH_H_PPARENT (tb->tb_path, h)); + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == 0 || tb->FL[h] == 0, + "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", + h, tb->FL[h], h, 
PATH_H_PPARENT(tb->tb_path, h)); - if (Sh_position == 0) - return B_NR_ITEMS (tb->FL[h]); - else - return Sh_position - 1; + if (Sh_position == 0) + return B_NR_ITEMS(tb->FL[h]); + else + return Sh_position - 1; } - -int get_right_neighbor_position (struct tree_balance * tb, int h) +int get_right_neighbor_position(struct tree_balance *tb, int h) { - int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1); + int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1); - RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FR[h] == 0, - "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", - h, PATH_H_PPARENT (tb->tb_path, h), h, tb->FR[h]); + RFALSE(PATH_H_PPARENT(tb->tb_path, h) == 0 || tb->FR[h] == 0, + "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", + h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]); - if (Sh_position == B_NR_ITEMS (PATH_H_PPARENT (tb->tb_path, h))) - return 0; - else - return Sh_position + 1; + if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h))) + return 0; + else + return Sh_position + 1; } - #ifdef CONFIG_REISERFS_CHECK -int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value); -static void check_internal_node (struct super_block * s, struct buffer_head * bh, char * mes) +int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value); +static void check_internal_node(struct super_block *s, struct buffer_head *bh, + char *mes) { - struct disk_child * dc; - int i; - - RFALSE( !bh, "PAP-12336: bh == 0"); - - if (!bh || !B_IS_IN_TREE (bh)) - return; - - RFALSE( !buffer_dirty (bh) && - !(buffer_journaled(bh) || buffer_journal_dirty(bh)), - "PAP-12337: buffer (%b) must be dirty", bh); - dc = B_N_CHILD (bh, 0); - - for (i = 0; i <= B_NR_ITEMS (bh); i ++, dc ++) { - if (!is_reusable (s, dc_block_number(dc), 1) ) { - print_cur_tb (mes); - reiserfs_panic (s, "PAP-12338: check_internal_node: invalid child pointer %y in %b", dc, bh); - } - } -} + struct disk_child *dc; + int i; + RFALSE(!bh, "PAP-12336: bh == 0"); -static 
int locked_or_not_in_tree (struct buffer_head * bh, char * which) -{ - if ( (!buffer_journal_prepared (bh) && buffer_locked (bh)) || - !B_IS_IN_TREE (bh) ) { - reiserfs_warning (NULL, "vs-12339: locked_or_not_in_tree: %s (%b)", - which, bh); - return 1; - } - return 0; -} + if (!bh || !B_IS_IN_TREE(bh)) + return; + RFALSE(!buffer_dirty(bh) && + !(buffer_journaled(bh) || buffer_journal_dirty(bh)), + "PAP-12337: buffer (%b) must be dirty", bh); + dc = B_N_CHILD(bh, 0); -static int check_before_balancing (struct tree_balance * tb) -{ - int retval = 0; - - if ( cur_tb ) { - reiserfs_panic (tb->tb_sb, "vs-12335: check_before_balancing: " - "suspect that schedule occurred based on cur_tb not being null at this point in code. " - "do_balance cannot properly handle schedule occurring while it runs."); - } - - /* double check that buffers that we will modify are unlocked. (fix_nodes should already have - prepped all of these for us). */ - if ( tb->lnum[0] ) { - retval |= locked_or_not_in_tree (tb->L[0], "L[0]"); - retval |= locked_or_not_in_tree (tb->FL[0], "FL[0]"); - retval |= locked_or_not_in_tree (tb->CFL[0], "CFL[0]"); - check_leaf (tb->L[0]); - } - if ( tb->rnum[0] ) { - retval |= locked_or_not_in_tree (tb->R[0], "R[0]"); - retval |= locked_or_not_in_tree (tb->FR[0], "FR[0]"); - retval |= locked_or_not_in_tree (tb->CFR[0], "CFR[0]"); - check_leaf (tb->R[0]); - } - retval |= locked_or_not_in_tree (PATH_PLAST_BUFFER (tb->tb_path), "S[0]"); - check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); - - return retval; + for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) { + if (!is_reusable(s, dc_block_number(dc), 1)) { + print_cur_tb(mes); + reiserfs_panic(s, + "PAP-12338: check_internal_node: invalid child pointer %y in %b", + dc, bh); + } + } } +static int locked_or_not_in_tree(struct buffer_head *bh, char *which) +{ + if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) || + !B_IS_IN_TREE(bh)) { + reiserfs_warning(NULL, + "vs-12339: locked_or_not_in_tree: %s (%b)", + which, bh); + 
return 1; + } + return 0; +} -static void check_after_balance_leaf (struct tree_balance * tb) +static int check_before_balancing(struct tree_balance *tb) { - if (tb->lnum[0]) { - if (B_FREE_SPACE (tb->L[0]) != - MAX_CHILD_SIZE (tb->L[0]) - dc_size(B_N_CHILD (tb->FL[0], get_left_neighbor_position (tb, 0)))) { - print_cur_tb ("12221"); - reiserfs_panic (tb->tb_sb, "PAP-12355: check_after_balance_leaf: shift to left was incorrect"); + int retval = 0; + + if (cur_tb) { + reiserfs_panic(tb->tb_sb, "vs-12335: check_before_balancing: " + "suspect that schedule occurred based on cur_tb not being null at this point in code. " + "do_balance cannot properly handle schedule occurring while it runs."); } - } - if (tb->rnum[0]) { - if (B_FREE_SPACE (tb->R[0]) != - MAX_CHILD_SIZE (tb->R[0]) - dc_size(B_N_CHILD (tb->FR[0], get_right_neighbor_position (tb, 0)))) { - print_cur_tb ("12222"); - reiserfs_panic (tb->tb_sb, "PAP-12360: check_after_balance_leaf: shift to right was incorrect"); + + /* double check that buffers that we will modify are unlocked. (fix_nodes should already have + prepped all of these for us). 
*/ + if (tb->lnum[0]) { + retval |= locked_or_not_in_tree(tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree(tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree(tb->CFL[0], "CFL[0]"); + check_leaf(tb->L[0]); } - } - if (PATH_H_PBUFFER(tb->tb_path,1) && - (B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) != - (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) - - dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), - PATH_H_POSITION (tb->tb_path, 1)))) )) { - int left = B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)); - int right = (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) - - dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), - PATH_H_POSITION (tb->tb_path, 1)))); - print_cur_tb ("12223"); - reiserfs_warning (tb->tb_sb, - "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " - "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", - left, - MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)), - PATH_H_PBUFFER(tb->tb_path,1), - PATH_H_POSITION (tb->tb_path, 1), - dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), PATH_H_POSITION (tb->tb_path, 1 )) ), - right ); - reiserfs_panic (tb->tb_sb, "PAP-12365: check_after_balance_leaf: S is incorrect"); - } -} + if (tb->rnum[0]) { + retval |= locked_or_not_in_tree(tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree(tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree(tb->CFR[0], "CFR[0]"); + check_leaf(tb->R[0]); + } + retval |= locked_or_not_in_tree(PATH_PLAST_BUFFER(tb->tb_path), "S[0]"); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); + return retval; +} -static void check_leaf_level (struct tree_balance * tb) +static void check_after_balance_leaf(struct tree_balance *tb) { - check_leaf (tb->L[0]); - check_leaf (tb->R[0]); - check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); + if (tb->lnum[0]) { + if (B_FREE_SPACE(tb->L[0]) != + MAX_CHILD_SIZE(tb->L[0]) - + dc_size(B_N_CHILD + (tb->FL[0], get_left_neighbor_position(tb, 0)))) { + print_cur_tb("12221"); + reiserfs_panic(tb->tb_sb, + "PAP-12355: check_after_balance_leaf: shift to left was 
incorrect"); + } + } + if (tb->rnum[0]) { + if (B_FREE_SPACE(tb->R[0]) != + MAX_CHILD_SIZE(tb->R[0]) - + dc_size(B_N_CHILD + (tb->FR[0], get_right_neighbor_position(tb, 0)))) { + print_cur_tb("12222"); + reiserfs_panic(tb->tb_sb, + "PAP-12360: check_after_balance_leaf: shift to right was incorrect"); + } + } + if (PATH_H_PBUFFER(tb->tb_path, 1) && + (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) != + (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1)))))) { + int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)); + int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) - + dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, + 1)))); + print_cur_tb("12223"); + reiserfs_warning(tb->tb_sb, + "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; " + "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d", + left, + MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)), + PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1), + dc_size(B_N_CHILD + (PATH_H_PBUFFER(tb->tb_path, 1), + PATH_H_POSITION(tb->tb_path, 1))), + right); + reiserfs_panic(tb->tb_sb, + "PAP-12365: check_after_balance_leaf: S is incorrect"); + } } -static void check_internal_levels (struct tree_balance * tb) +static void check_leaf_level(struct tree_balance *tb) { - int h; + check_leaf(tb->L[0]); + check_leaf(tb->R[0]); + check_leaf(PATH_PLAST_BUFFER(tb->tb_path)); +} - /* check all internal nodes */ - for (h = 1; tb->insert_size[h]; h ++) { - check_internal_node (tb->tb_sb, PATH_H_PBUFFER (tb->tb_path, h), "BAD BUFFER ON PATH"); - if (tb->lnum[h]) - check_internal_node (tb->tb_sb, tb->L[h], "BAD L"); - if (tb->rnum[h]) - check_internal_node (tb->tb_sb, tb->R[h], "BAD R"); - } +static void check_internal_levels(struct tree_balance *tb) +{ + int h; + + /* check all internal nodes */ + for (h = 1; tb->insert_size[h]; h++) { + check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, 
h), + "BAD BUFFER ON PATH"); + if (tb->lnum[h]) + check_internal_node(tb->tb_sb, tb->L[h], "BAD L"); + if (tb->rnum[h]) + check_internal_node(tb->tb_sb, tb->R[h], "BAD R"); + } } #endif - - - - - /* Now we have all of the buffers that must be used in balancing of the tree. We rely on the assumption that schedule() will not occur while do_balance works. ( Only interrupt handlers are acceptable.) @@ -1484,114 +2029,109 @@ static void check_internal_levels (struct tree_balance * tb) */ -static inline void do_balance_starts (struct tree_balance *tb) +static inline void do_balance_starts(struct tree_balance *tb) { - /* use print_cur_tb() to see initial state of struct - tree_balance */ + /* use print_cur_tb() to see initial state of struct + tree_balance */ - /* store_print_tb (tb); */ + /* store_print_tb (tb); */ - /* do not delete, just comment it out */ + /* do not delete, just comment it out */ /* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, "check");*/ - RFALSE( check_before_balancing (tb), "PAP-12340: locked buffers in TB"); + RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); #ifdef CONFIG_REISERFS_CHECK - cur_tb = tb; + cur_tb = tb; #endif } - -static inline void do_balance_completed (struct tree_balance * tb) +static inline void do_balance_completed(struct tree_balance *tb) { - + #ifdef CONFIG_REISERFS_CHECK - check_leaf_level (tb); - check_internal_levels (tb); - cur_tb = NULL; + check_leaf_level(tb); + check_internal_levels(tb); + cur_tb = NULL; #endif - /* reiserfs_free_block is no longer schedule safe. So, we need to - ** put the buffers we want freed on the thrown list during do_balance, - ** and then free them now - */ - - REISERFS_SB(tb->tb_sb)->s_do_balance ++; + /* reiserfs_free_block is no longer schedule safe. 
So, we need to + ** put the buffers we want freed on the thrown list during do_balance, + ** and then free them now + */ + REISERFS_SB(tb->tb_sb)->s_do_balance++; - /* release all nodes hold to perform the balancing */ - unfix_nodes(tb); + /* release all nodes hold to perform the balancing */ + unfix_nodes(tb); - free_thrown(tb) ; + free_thrown(tb); } +void do_balance(struct tree_balance *tb, /* tree_balance structure */ + struct item_head *ih, /* item header of inserted item */ + const char *body, /* body of inserted item or bytes to paste */ + int flag) +{ /* i - insert, d - delete + c - cut, p - paste + + Cut means delete part of an item + (includes removing an entry from a + directory). + + Delete means delete whole item. + + Insert means add a new item into the + tree. + + Paste means to append to the end of an + existing file or to insert a directory + entry. */ + int child_pos, /* position of a child node in its parent */ + h; /* level of the tree being processed */ + struct item_head insert_key[2]; /* in our processing of one level + we sometimes determine what + must be inserted into the next + higher level. 
This insertion + consists of a key or two keys + and their corresponding + pointers */ + struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next + level */ + + tb->tb_mode = flag; + tb->need_balance_dirty = 0; + + if (FILESYSTEM_CHANGED_TB(tb)) { + reiserfs_panic(tb->tb_sb, + "clm-6000: do_balance, fs generation has changed\n"); + } + /* if we have no real work to do */ + if (!tb->insert_size[0]) { + reiserfs_warning(tb->tb_sb, + "PAP-12350: do_balance: insert_size == 0, mode == %c", + flag); + unfix_nodes(tb); + return; + } + atomic_inc(&(fs_generation(tb->tb_sb))); + do_balance_starts(tb); - - -void do_balance (struct tree_balance * tb, /* tree_balance structure */ - struct item_head * ih, /* item header of inserted item */ - const char * body, /* body of inserted item or bytes to paste */ - int flag) /* i - insert, d - delete - c - cut, p - paste - - Cut means delete part of an item - (includes removing an entry from a - directory). - - Delete means delete whole item. - - Insert means add a new item into the - tree. - - Paste means to append to the end of an - existing file or to insert a directory - entry. */ -{ - int child_pos, /* position of a child node in its parent */ - h; /* level of the tree being processed */ - struct item_head insert_key[2]; /* in our processing of one level - we sometimes determine what - must be inserted into the next - higher level. This insertion - consists of a key or two keys - and their corresponding - pointers */ - struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next - level */ - - tb->tb_mode = flag; - tb->need_balance_dirty = 0; - - if (FILESYSTEM_CHANGED_TB(tb)) { - reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ; - } - /* if we have no real work to do */ - if ( ! 
tb->insert_size[0] ) { - reiserfs_warning (tb->tb_sb, - "PAP-12350: do_balance: insert_size == 0, mode == %c", - flag); - unfix_nodes(tb); - return; - } - - atomic_inc (&(fs_generation (tb->tb_sb))); - do_balance_starts (tb); - /* balance leaf returns 0 except if combining L R and S into one node. see balance_internal() for explanation of this - line of code.*/ - child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0) + - balance_leaf (tb, ih, body, flag, insert_key, insert_ptr); + line of code. */ + child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) + + balance_leaf(tb, ih, body, flag, insert_key, insert_ptr); #ifdef CONFIG_REISERFS_CHECK - check_after_balance_leaf (tb); + check_after_balance_leaf(tb); #endif - /* Balance internal level of the tree. */ - for ( h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++ ) - child_pos = balance_internal (tb, h, child_pos, insert_key, insert_ptr); - + /* Balance internal level of the tree. */ + for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++) + child_pos = + balance_internal(tb, h, child_pos, insert_key, insert_ptr); - do_balance_completed (tb); + do_balance_completed(tb); } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 2230afff187..c9f178fb494 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -2,7 +2,6 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ - #include <linux/time.h> #include <linux/reiserfs_fs.h> #include <linux/reiserfs_acl.h> @@ -31,82 +30,84 @@ ** We use reiserfs_truncate_file to pack the tail, since it already has ** all the conditions coded. 
*/ -static int reiserfs_file_release (struct inode * inode, struct file * filp) +static int reiserfs_file_release(struct inode *inode, struct file *filp) { - struct reiserfs_transaction_handle th ; - int err; - int jbegin_failure = 0; + struct reiserfs_transaction_handle th; + int err; + int jbegin_failure = 0; - if (!S_ISREG (inode->i_mode)) - BUG (); + if (!S_ISREG(inode->i_mode)) + BUG(); - /* fast out for when nothing needs to be done */ - if ((atomic_read(&inode->i_count) > 1 || - !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || - !tail_has_to_be_packed(inode)) && - REISERFS_I(inode)->i_prealloc_count <= 0) { - return 0; - } - - reiserfs_write_lock(inode->i_sb); - down (&inode->i_sem); - /* freeing preallocation only involves relogging blocks that - * are already in the current transaction. preallocation gets - * freed at the end of each transaction, so it is impossible for - * us to log any additional blocks (including quota blocks) - */ - err = journal_begin(&th, inode->i_sb, 1); - if (err) { - /* uh oh, we can't allow the inode to go away while there - * is still preallocation blocks pending. Try to join the - * aborted transaction - */ - jbegin_failure = err; - err = journal_join_abort(&th, inode->i_sb, 1); + /* fast out for when nothing needs to be done */ + if ((atomic_read(&inode->i_count) > 1 || + !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || + !tail_has_to_be_packed(inode)) && + REISERFS_I(inode)->i_prealloc_count <= 0) { + return 0; + } + reiserfs_write_lock(inode->i_sb); + down(&inode->i_sem); + /* freeing preallocation only involves relogging blocks that + * are already in the current transaction. preallocation gets + * freed at the end of each transaction, so it is impossible for + * us to log any additional blocks (including quota blocks) + */ + err = journal_begin(&th, inode->i_sb, 1); if (err) { - /* hmpf, our choices here aren't good. 
We can pin the inode - * which will disallow unmount from every happening, we can - * do nothing, which will corrupt random memory on unmount, - * or we can forcibly remove the file from the preallocation - * list, which will leak blocks on disk. Lets pin the inode - * and let the admin know what is going on. - */ - igrab(inode); - reiserfs_warning(inode->i_sb, "pinning inode %lu because the " - "preallocation can't be freed"); - goto out; + /* uh oh, we can't allow the inode to go away while there + * is still preallocation blocks pending. Try to join the + * aborted transaction + */ + jbegin_failure = err; + err = journal_join_abort(&th, inode->i_sb, 1); + + if (err) { + /* hmpf, our choices here aren't good. We can pin the inode + * which will disallow unmount from every happening, we can + * do nothing, which will corrupt random memory on unmount, + * or we can forcibly remove the file from the preallocation + * list, which will leak blocks on disk. Lets pin the inode + * and let the admin know what is going on. 
+ */ + igrab(inode); + reiserfs_warning(inode->i_sb, + "pinning inode %lu because the " + "preallocation can't be freed"); + goto out; + } } - } - reiserfs_update_inode_transaction(inode) ; + reiserfs_update_inode_transaction(inode); #ifdef REISERFS_PREALLOCATE - reiserfs_discard_prealloc (&th, inode); + reiserfs_discard_prealloc(&th, inode); #endif - err = journal_end(&th, inode->i_sb, 1); - - /* copy back the error code from journal_begin */ - if (!err) - err = jbegin_failure; - - if (!err && atomic_read(&inode->i_count) <= 1 && - (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && - tail_has_to_be_packed (inode)) { - /* if regular file is released by last holder and it has been - appended (we append by unformatted node only) or its direct - item(s) had to be converted, then it may have to be - indirect2direct converted */ - err = reiserfs_truncate_file(inode, 0) ; - } -out: - up (&inode->i_sem); - reiserfs_write_unlock(inode->i_sb); - return err; + err = journal_end(&th, inode->i_sb, 1); + + /* copy back the error code from journal_begin */ + if (!err) + err = jbegin_failure; + + if (!err && atomic_read(&inode->i_count) <= 1 && + (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) && + tail_has_to_be_packed(inode)) { + /* if regular file is released by last holder and it has been + appended (we append by unformatted node only) or its direct + item(s) had to be converted, then it may have to be + indirect2direct converted */ + err = reiserfs_truncate_file(inode, 0); + } + out: + up(&inode->i_sem); + reiserfs_write_unlock(inode->i_sb); + return err; } -static void reiserfs_vfs_truncate_file(struct inode *inode) { - reiserfs_truncate_file(inode, 1) ; +static void reiserfs_vfs_truncate_file(struct inode *inode) +{ + reiserfs_truncate_file(inode, 1); } /* Sync a reiserfs file. */ @@ -116,26 +117,24 @@ static void reiserfs_vfs_truncate_file(struct inode *inode) { * be removed... 
*/ -static int reiserfs_sync_file( - struct file * p_s_filp, - struct dentry * p_s_dentry, - int datasync - ) { - struct inode * p_s_inode = p_s_dentry->d_inode; - int n_err; - int barrier_done; - - if (!S_ISREG(p_s_inode->i_mode)) - BUG (); - n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; - reiserfs_write_lock(p_s_inode->i_sb); - barrier_done = reiserfs_commit_for_inode(p_s_inode); - reiserfs_write_unlock(p_s_inode->i_sb); - if (barrier_done != 1) - blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); - if (barrier_done < 0) - return barrier_done; - return ( n_err < 0 ) ? -EIO : 0; +static int reiserfs_sync_file(struct file *p_s_filp, + struct dentry *p_s_dentry, int datasync) +{ + struct inode *p_s_inode = p_s_dentry->d_inode; + int n_err; + int barrier_done; + + if (!S_ISREG(p_s_inode->i_mode)) + BUG(); + n_err = sync_mapping_buffers(p_s_inode->i_mapping); + reiserfs_write_lock(p_s_inode->i_sb); + barrier_done = reiserfs_commit_for_inode(p_s_inode); + reiserfs_write_unlock(p_s_inode->i_sb); + if (barrier_done != 1) + blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL); + if (barrier_done < 0) + return barrier_done; + return (n_err < 0) ? -EIO : 0; } /* I really do not want to play with memory shortage right now, so @@ -147,700 +146,797 @@ static int reiserfs_sync_file( /* Allocates blocks for a file to fulfil write request. Maps all unmapped but prepared pages from the list. 
Updates metadata with newly allocated blocknumbers as needed */ -static int reiserfs_allocate_blocks_for_region( - struct reiserfs_transaction_handle *th, - struct inode *inode, /* Inode we work with */ - loff_t pos, /* Writing position */ - int num_pages, /* number of pages write going - to touch */ - int write_bytes, /* amount of bytes to write */ - struct page **prepared_pages, /* array of - prepared pages - */ - int blocks_to_allocate /* Amount of blocks we - need to allocate to - fit the data into file - */ - ) +static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode we work with */ + loff_t pos, /* Writing position */ + int num_pages, /* number of pages write going + to touch */ + int write_bytes, /* amount of bytes to write */ + struct page **prepared_pages, /* array of + prepared pages + */ + int blocks_to_allocate /* Amount of blocks we + need to allocate to + fit the data into file + */ + ) { - struct cpu_key key; // cpu key of item that we are going to deal with - struct item_head *ih; // pointer to item head that we are going to deal with - struct buffer_head *bh; // Buffer head that contains items that we are going to deal with - __le32 * item; // pointer to item we are going to deal with - INITIALIZE_PATH(path); // path to item, that we are going to deal with. - b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored. - reiserfs_blocknr_hint_t hint; // hint structure for block allocator. - size_t res; // return value of various functions that we call. - int curr_block; // current block used to keep track of unmapped blocks. 
- int i; // loop counter - int itempos; // position in item - unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in - // first page - unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */ - __u64 hole_size ; // amount of blocks for a file hole, if it needed to be created. - int modifying_this_item = 0; // Flag for items traversal code to keep track - // of the fact that we already prepared - // current block for journal - int will_prealloc = 0; - RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?"); - - /* only preallocate if this is a small write */ - if (REISERFS_I(inode)->i_prealloc_count || - (!(write_bytes & (inode->i_sb->s_blocksize -1)) && - blocks_to_allocate < - REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize)) - will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize; - - allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * - sizeof(b_blocknr_t), GFP_NOFS); - - /* First we compose a key to point at the writing position, we want to do - that outside of any locking region. */ - make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/); - - /* If we came here, it means we absolutely need to open a transaction, - since we need to allocate some blocks */ - reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that. - res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I know if this number enough - if (res) - goto error_exit; - reiserfs_update_inode_transaction(inode) ; - - /* Look for the in-tree position of our write, need path for block allocator */ - res = search_for_position_by_key(inode->i_sb, &key, &path); - if ( res == IO_ERROR ) { - res = -EIO; - goto error_exit; - } - - /* Allocate blocks */ - /* First fill in "hint" structure for block allocator */ - hint.th = th; // transaction handle. 
- hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine. - hint.inode = inode; // Inode is needed by block allocator too. - hint.search_start = 0; // We have no hint on where to search free blocks for block allocator. - hint.key = key.on_disk_key; // on disk key of file. - hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already. - hint.formatted_node = 0; // We are allocating blocks for unformatted node. - hint.preallocate = will_prealloc; - - /* Call block allocator to allocate blocks */ - res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); - if ( res != CARRY_ON ) { - if ( res == NO_DISK_SPACE ) { - /* We flush the transaction in case of no space. This way some - blocks might become free */ - SB_JOURNAL(inode->i_sb)->j_must_wait = 1; - res = restart_transaction(th, inode, &path); - if (res) - goto error_exit; - - /* We might have scheduled, so search again */ - res = search_for_position_by_key(inode->i_sb, &key, &path); - if ( res == IO_ERROR ) { - res = -EIO; + struct cpu_key key; // cpu key of item that we are going to deal with + struct item_head *ih; // pointer to item head that we are going to deal with + struct buffer_head *bh; // Buffer head that contains items that we are going to deal with + __le32 *item; // pointer to item we are going to deal with + INITIALIZE_PATH(path); // path to item, that we are going to deal with. + b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored. + reiserfs_blocknr_hint_t hint; // hint structure for block allocator. + size_t res; // return value of various functions that we call. + int curr_block; // current block used to keep track of unmapped blocks. 
+ int i; // loop counter + int itempos; // position in item + unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in + // first page + unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */ + __u64 hole_size; // amount of blocks for a file hole, if it needed to be created. + int modifying_this_item = 0; // Flag for items traversal code to keep track + // of the fact that we already prepared + // current block for journal + int will_prealloc = 0; + RFALSE(!blocks_to_allocate, + "green-9004: tried to allocate zero blocks?"); + + /* only preallocate if this is a small write */ + if (REISERFS_I(inode)->i_prealloc_count || + (!(write_bytes & (inode->i_sb->s_blocksize - 1)) && + blocks_to_allocate < + REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize)) + will_prealloc = + REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize; + + allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * + sizeof(b_blocknr_t), GFP_NOFS); + + /* First we compose a key to point at the writing position, we want to do + that outside of any locking region. */ + make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ ); + + /* If we came here, it means we absolutely need to open a transaction, + since we need to allocate some blocks */ + reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that. + res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb)); // Wish I know if this number enough + if (res) goto error_exit; - } + reiserfs_update_inode_transaction(inode); - /* update changed info for hint structure. 
*/ - res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); - if ( res != CARRY_ON ) { - res = -ENOSPC; - pathrelse(&path); + /* Look for the in-tree position of our write, need path for block allocator */ + res = search_for_position_by_key(inode->i_sb, &key, &path); + if (res == IO_ERROR) { + res = -EIO; goto error_exit; - } - } else { - res = -ENOSPC; - pathrelse(&path); - goto error_exit; } - } -#ifdef __BIG_ENDIAN - // Too bad, I have not found any way to convert a given region from - // cpu format to little endian format - { - int i; - for ( i = 0; i < blocks_to_allocate ; i++) - allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]); - } -#endif - - /* Blocks allocating well might have scheduled and tree might have changed, - let's search the tree again */ - /* find where in the tree our write should go */ - res = search_for_position_by_key(inode->i_sb, &key, &path); - if ( res == IO_ERROR ) { - res = -EIO; - goto error_exit_free_blocks; - } - - bh = get_last_bh( &path ); // Get a bufferhead for last element in path. - ih = get_ih( &path ); // Get a pointer to last item head in path. - item = get_item( &path ); // Get a pointer to last item in path - - /* Let's see what we have found */ - if ( res != POSITION_FOUND ) { /* position not found, this means that we - might need to append file with holes - first */ - // Since we are writing past the file's end, we need to find out if - // there is a hole that needs to be inserted before our writing - // position, and how many blocks it is going to cover (we need to - // populate pointers to file blocks representing the hole with zeros) + /* Allocate blocks */ + /* First fill in "hint" structure for block allocator */ + hint.th = th; // transaction handle. + hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine. + hint.inode = inode; // Inode is needed by block allocator too. 
+ hint.search_start = 0; // We have no hint on where to search free blocks for block allocator. + hint.key = key.on_disk_key; // on disk key of file. + hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); // Number of disk blocks this file occupies already. + hint.formatted_node = 0; // We are allocating blocks for unformatted node. + hint.preallocate = will_prealloc; + + /* Call block allocator to allocate blocks */ + res = + reiserfs_allocate_blocknrs(&hint, allocated_blocks, + blocks_to_allocate, blocks_to_allocate); + if (res != CARRY_ON) { + if (res == NO_DISK_SPACE) { + /* We flush the transaction in case of no space. This way some + blocks might become free */ + SB_JOURNAL(inode->i_sb)->j_must_wait = 1; + res = restart_transaction(th, inode, &path); + if (res) + goto error_exit; + + /* We might have scheduled, so search again */ + res = + search_for_position_by_key(inode->i_sb, &key, + &path); + if (res == IO_ERROR) { + res = -EIO; + goto error_exit; + } + /* update changed info for hint structure. 
*/ + res = + reiserfs_allocate_blocknrs(&hint, allocated_blocks, + blocks_to_allocate, + blocks_to_allocate); + if (res != CARRY_ON) { + res = -ENOSPC; + pathrelse(&path); + goto error_exit; + } + } else { + res = -ENOSPC; + pathrelse(&path); + goto error_exit; + } + } +#ifdef __BIG_ENDIAN + // Too bad, I have not found any way to convert a given region from + // cpu format to little endian format { - int item_offset = 1; - /* - * if ih is stat data, its offset is 0 and we don't want to - * add 1 to pos in the hole_size calculation - */ - if (is_statdata_le_ih(ih)) - item_offset = 0; - hole_size = (pos + item_offset - - (le_key_k_offset( get_inode_item_key_version(inode), - &(ih->ih_key)) + - op_bytes_number(ih, inode->i_sb->s_blocksize))) >> - inode->i_sb->s_blocksize_bits; + int i; + for (i = 0; i < blocks_to_allocate; i++) + allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]); } +#endif - if ( hole_size > 0 ) { - int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time. - /* area filled with zeroes, to supply as list of zero blocknumbers - We allocate it outside of loop just in case loop would spin for - several iterations. */ - char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway. - if ( !zeros ) { - res = -ENOMEM; + /* Blocks allocating well might have scheduled and tree might have changed, + let's search the tree again */ + /* find where in the tree our write should go */ + res = search_for_position_by_key(inode->i_sb, &key, &path); + if (res == IO_ERROR) { + res = -EIO; goto error_exit_free_blocks; - } - memset ( zeros, 0, to_paste*UNFM_P_SIZE); - do { - to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); - if ( is_indirect_le_ih(ih) ) { - /* Ok, there is existing indirect item already. 
Need to append it */ - /* Calculate position past inserted item */ - make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); - res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste); - if ( res ) { - kfree(zeros); - goto error_exit_free_blocks; - } - } else if ( is_statdata_le_ih(ih) ) { - /* No existing item, create it */ - /* item head for new item */ - struct item_head ins_ih; - - /* create a key for our new item */ - make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); - - /* Create new item head for our new item */ - make_le_item_head (&ins_ih, &key, key.version, 1, - TYPE_INDIRECT, to_paste*UNFM_P_SIZE, - 0 /* free space */); - - /* Find where such item should live in the tree */ - res = search_item (inode->i_sb, &key, &path); - if ( res != ITEM_NOT_FOUND ) { - /* item should not exist, otherwise we have error */ - if ( res != -ENOSPC ) { - reiserfs_warning (inode->i_sb, - "green-9008: search_by_key (%K) returned %d", - &key, res); + } + + bh = get_last_bh(&path); // Get a bufferhead for last element in path. + ih = get_ih(&path); // Get a pointer to last item head in path. 
+ item = get_item(&path); // Get a pointer to last item in path + + /* Let's see what we have found */ + if (res != POSITION_FOUND) { /* position not found, this means that we + might need to append file with holes + first */ + // Since we are writing past the file's end, we need to find out if + // there is a hole that needs to be inserted before our writing + // position, and how many blocks it is going to cover (we need to + // populate pointers to file blocks representing the hole with zeros) + + { + int item_offset = 1; + /* + * if ih is stat data, its offset is 0 and we don't want to + * add 1 to pos in the hole_size calculation + */ + if (is_statdata_le_ih(ih)) + item_offset = 0; + hole_size = (pos + item_offset - + (le_key_k_offset + (get_inode_item_key_version(inode), + &(ih->ih_key)) + op_bytes_number(ih, + inode-> + i_sb-> + s_blocksize))) + >> inode->i_sb->s_blocksize_bits; + } + + if (hole_size > 0) { + int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE); // How much data to insert first time. + /* area filled with zeroes, to supply as list of zero blocknumbers + We allocate it outside of loop just in case loop would spin for + several iterations. */ + char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway. + if (!zeros) { + res = -ENOMEM; + goto error_exit_free_blocks; } - res = -EIO; - kfree(zeros); - goto error_exit_free_blocks; - } - res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros); - } else { - reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key); + memset(zeros, 0, to_paste * UNFM_P_SIZE); + do { + to_paste = + min_t(__u64, hole_size, + MAX_ITEM_LEN(inode->i_sb-> + s_blocksize) / + UNFM_P_SIZE); + if (is_indirect_le_ih(ih)) { + /* Ok, there is existing indirect item already. 
Need to append it */ + /* Calculate position past inserted item */ + make_cpu_key(&key, inode, + le_key_k_offset + (get_inode_item_key_version + (inode), + &(ih->ih_key)) + + op_bytes_number(ih, + inode-> + i_sb-> + s_blocksize), + TYPE_INDIRECT, 3); + res = + reiserfs_paste_into_item(th, &path, + &key, + inode, + (char *) + zeros, + UNFM_P_SIZE + * + to_paste); + if (res) { + kfree(zeros); + goto error_exit_free_blocks; + } + } else if (is_statdata_le_ih(ih)) { + /* No existing item, create it */ + /* item head for new item */ + struct item_head ins_ih; + + /* create a key for our new item */ + make_cpu_key(&key, inode, 1, + TYPE_INDIRECT, 3); + + /* Create new item head for our new item */ + make_le_item_head(&ins_ih, &key, + key.version, 1, + TYPE_INDIRECT, + to_paste * + UNFM_P_SIZE, + 0 /* free space */ ); + + /* Find where such item should live in the tree */ + res = + search_item(inode->i_sb, &key, + &path); + if (res != ITEM_NOT_FOUND) { + /* item should not exist, otherwise we have error */ + if (res != -ENOSPC) { + reiserfs_warning(inode-> + i_sb, + "green-9008: search_by_key (%K) returned %d", + &key, + res); + } + res = -EIO; + kfree(zeros); + goto error_exit_free_blocks; + } + res = + reiserfs_insert_item(th, &path, + &key, &ins_ih, + inode, + (char *)zeros); + } else { + reiserfs_panic(inode->i_sb, + "green-9011: Unexpected key type %K\n", + &key); + } + if (res) { + kfree(zeros); + goto error_exit_free_blocks; + } + /* Now we want to check if transaction is too full, and if it is + we restart it. This will also free the path. 
*/ + if (journal_transaction_should_end + (th, th->t_blocks_allocated)) { + res = + restart_transaction(th, inode, + &path); + if (res) { + pathrelse(&path); + kfree(zeros); + goto error_exit; + } + } + + /* Well, need to recalculate path and stuff */ + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + (to_paste << inode-> + i_blkbits)); + res = + search_for_position_by_key(inode->i_sb, + &key, &path); + if (res == IO_ERROR) { + res = -EIO; + kfree(zeros); + goto error_exit_free_blocks; + } + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + hole_size -= to_paste; + } while (hole_size); + kfree(zeros); } - if ( res ) { - kfree(zeros); - goto error_exit_free_blocks; + } + // Go through existing indirect items first + // replace all zeroes with blocknumbers from list + // Note that if no corresponding item was found, by previous search, + // it means there are no existing in-tree representation for file area + // we are going to overwrite, so there is nothing to scan through for holes. + for (curr_block = 0, itempos = path.pos_in_item; + curr_block < blocks_to_allocate && res == POSITION_FOUND;) { + retry: + + if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) { + /* We run out of data in this indirect item, let's look for another + one. */ + /* First if we are already modifying current item, log it */ + if (modifying_this_item) { + journal_mark_dirty(th, inode->i_sb, bh); + modifying_this_item = 0; + } + /* Then set the key to look for a new indirect item (offset of old + item is added to old item length */ + set_cpu_key_k_offset(&key, + le_key_k_offset + (get_inode_item_key_version(inode), + &(ih->ih_key)) + + op_bytes_number(ih, + inode->i_sb-> + s_blocksize)); + /* Search ofor position of new key in the tree. 
*/ + res = + search_for_position_by_key(inode->i_sb, &key, + &path); + if (res == IO_ERROR) { + res = -EIO; + goto error_exit_free_blocks; + } + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + itempos = path.pos_in_item; + continue; // loop to check all kinds of conditions and so on. } - /* Now we want to check if transaction is too full, and if it is - we restart it. This will also free the path. */ - if (journal_transaction_should_end(th, th->t_blocks_allocated)) { - res = restart_transaction(th, inode, &path); - if (res) { - pathrelse (&path); - kfree(zeros); - goto error_exit; - } - } - - /* Well, need to recalculate path and stuff */ - set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits)); - res = search_for_position_by_key(inode->i_sb, &key, &path); - if ( res == IO_ERROR ) { - res = -EIO; - kfree(zeros); - goto error_exit_free_blocks; + /* Ok, we have correct position in item now, so let's see if it is + representing file hole (blocknumber is zero) and fill it if needed */ + if (!item[itempos]) { + /* Ok, a hole. Now we need to check if we already prepared this + block to be journaled */ + while (!modifying_this_item) { // loop until succeed + /* Well, this item is not journaled yet, so we must prepare + it for journal first, before we can change it */ + struct item_head tmp_ih; // We copy item head of found item, + // here to detect if fs changed under + // us while we were preparing for + // journal. + int fs_gen; // We store fs generation here to find if someone + // changes fs under our feet + + copy_item_head(&tmp_ih, ih); // Remember itemhead + fs_gen = get_generation(inode->i_sb); // remember fs generation + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing. 
+ if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + // Sigh, fs was changed under us, we need to look for new + // location of item we are working with + + /* unmark prepaerd area as journaled and search for it's + new position */ + reiserfs_restore_prepared_buffer(inode-> + i_sb, + bh); + res = + search_for_position_by_key(inode-> + i_sb, + &key, + &path); + if (res == IO_ERROR) { + res = -EIO; + goto error_exit_free_blocks; + } + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + itempos = path.pos_in_item; + goto retry; + } + modifying_this_item = 1; + } + item[itempos] = allocated_blocks[curr_block]; // Assign new block + curr_block++; } - bh=get_last_bh(&path); - ih=get_ih(&path); - item = get_item(&path); - hole_size -= to_paste; - } while ( hole_size ); - kfree(zeros); + itempos++; } - } - - // Go through existing indirect items first - // replace all zeroes with blocknumbers from list - // Note that if no corresponding item was found, by previous search, - // it means there are no existing in-tree representation for file area - // we are going to overwrite, so there is nothing to scan through for holes. - for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) { -retry: - - if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) { - /* We run out of data in this indirect item, let's look for another - one. */ - /* First if we are already modifying current item, log it */ - if ( modifying_this_item ) { - journal_mark_dirty (th, inode->i_sb, bh); - modifying_this_item = 0; - } - /* Then set the key to look for a new indirect item (offset of old - item is added to old item length */ - set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize)); - /* Search ofor position of new key in the tree. 
*/ - res = search_for_position_by_key(inode->i_sb, &key, &path); - if ( res == IO_ERROR) { - res = -EIO; - goto error_exit_free_blocks; - } - bh=get_last_bh(&path); - ih=get_ih(&path); - item = get_item(&path); - itempos = path.pos_in_item; - continue; // loop to check all kinds of conditions and so on. + + if (modifying_this_item) { // We need to log last-accessed block, if it + // was modified, but not logged yet. + journal_mark_dirty(th, inode->i_sb, bh); } - /* Ok, we have correct position in item now, so let's see if it is - representing file hole (blocknumber is zero) and fill it if needed */ - if ( !item[itempos] ) { - /* Ok, a hole. Now we need to check if we already prepared this - block to be journaled */ - while ( !modifying_this_item ) { // loop until succeed - /* Well, this item is not journaled yet, so we must prepare - it for journal first, before we can change it */ - struct item_head tmp_ih; // We copy item head of found item, - // here to detect if fs changed under - // us while we were preparing for - // journal. - int fs_gen; // We store fs generation here to find if someone - // changes fs under our feet - - copy_item_head (&tmp_ih, ih); // Remember itemhead - fs_gen = get_generation (inode->i_sb); // remember fs generation - reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing. 
- if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { - // Sigh, fs was changed under us, we need to look for new - // location of item we are working with - - /* unmark prepaerd area as journaled and search for it's - new position */ - reiserfs_restore_prepared_buffer(inode->i_sb, bh); - res = search_for_position_by_key(inode->i_sb, &key, &path); - if ( res == IO_ERROR) { - res = -EIO; - goto error_exit_free_blocks; - } - bh=get_last_bh(&path); - ih=get_ih(&path); - item = get_item(&path); - itempos = path.pos_in_item; - goto retry; + + if (curr_block < blocks_to_allocate) { + // Oh, well need to append to indirect item, or to create indirect item + // if there weren't any + if (is_indirect_le_ih(ih)) { + // Existing indirect item - append. First calculate key for append + // position. We do not need to recalculate path as it should + // already point to correct place. + make_cpu_key(&key, inode, + le_key_k_offset(get_inode_item_key_version + (inode), + &(ih->ih_key)) + + op_bytes_number(ih, + inode->i_sb->s_blocksize), + TYPE_INDIRECT, 3); + res = + reiserfs_paste_into_item(th, &path, &key, inode, + (char *)(allocated_blocks + + curr_block), + UNFM_P_SIZE * + (blocks_to_allocate - + curr_block)); + if (res) { + goto error_exit_free_blocks; + } + } else if (is_statdata_le_ih(ih)) { + // Last found item was statdata. That means we need to create indirect item. 
+ struct item_head ins_ih; /* itemhead for new item */ + + /* create a key for our new item */ + make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3); // Position one, + // because that's + // where first + // indirect item + // begins + /* Create new item head for our new item */ + make_le_item_head(&ins_ih, &key, key.version, 1, + TYPE_INDIRECT, + (blocks_to_allocate - + curr_block) * UNFM_P_SIZE, + 0 /* free space */ ); + /* Find where such item should live in the tree */ + res = search_item(inode->i_sb, &key, &path); + if (res != ITEM_NOT_FOUND) { + /* Well, if we have found such item already, or some error + occured, we need to warn user and return error */ + if (res != -ENOSPC) { + reiserfs_warning(inode->i_sb, + "green-9009: search_by_key (%K) " + "returned %d", &key, + res); + } + res = -EIO; + goto error_exit_free_blocks; + } + /* Insert item into the tree with the data as its body */ + res = + reiserfs_insert_item(th, &path, &key, &ins_ih, + inode, + (char *)(allocated_blocks + + curr_block)); + } else { + reiserfs_panic(inode->i_sb, + "green-9010: unexpected item type for key %K\n", + &key); } - modifying_this_item = 1; - } - item[itempos] = allocated_blocks[curr_block]; // Assign new block - curr_block++; } - itempos++; - } - - if ( modifying_this_item ) { // We need to log last-accessed block, if it - // was modified, but not logged yet. - journal_mark_dirty (th, inode->i_sb, bh); - } - - if ( curr_block < blocks_to_allocate ) { - // Oh, well need to append to indirect item, or to create indirect item - // if there weren't any - if ( is_indirect_le_ih(ih) ) { - // Existing indirect item - append. First calculate key for append - // position. We do not need to recalculate path as it should - // already point to correct place. 
- make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); - res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block)); - if ( res ) { - goto error_exit_free_blocks; - } - } else if (is_statdata_le_ih(ih) ) { - // Last found item was statdata. That means we need to create indirect item. - struct item_head ins_ih; /* itemhead for new item */ - - /* create a key for our new item */ - make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one, - // because that's - // where first - // indirect item - // begins - /* Create new item head for our new item */ - make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT, - (blocks_to_allocate-curr_block)*UNFM_P_SIZE, - 0 /* free space */); - /* Find where such item should live in the tree */ - res = search_item (inode->i_sb, &key, &path); - if ( res != ITEM_NOT_FOUND ) { - /* Well, if we have found such item already, or some error - occured, we need to warn user and return error */ - if ( res != -ENOSPC ) { - reiserfs_warning (inode->i_sb, - "green-9009: search_by_key (%K) " - "returned %d", &key, res); + // the caller is responsible for closing the transaction + // unless we return an error, they are also responsible for logging + // the inode. + // + pathrelse(&path); + /* + * cleanup prellocation from previous writes + * if this is a partial block write + */ + if (write_bytes & (inode->i_sb->s_blocksize - 1)) + reiserfs_discard_prealloc(th, inode); + reiserfs_write_unlock(inode->i_sb); + + // go through all the pages/buffers and map the buffers to newly allocated + // blocks (so that system knows where to write these pages later). 
+ curr_block = 0; + for (i = 0; i < num_pages; i++) { + struct page *page = prepared_pages[i]; //current page + struct buffer_head *head = page_buffers(page); // first buffer for a page + int block_start, block_end; // in-page offsets for buffers. + + if (!page_buffers(page)) + reiserfs_panic(inode->i_sb, + "green-9005: No buffers for prepared page???"); + + /* For each buffer in page */ + for (bh = head, block_start = 0; bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + if (!bh) + reiserfs_panic(inode->i_sb, + "green-9006: Allocated but absent buffer for a page?"); + block_end = block_start + inode->i_sb->s_blocksize; + if (i == 0 && block_end <= from) + /* if this buffer is before requested data to map, skip it */ + continue; + if (i == num_pages - 1 && block_start >= to) + /* If this buffer is after requested data to map, abort + processing of current page */ + break; + + if (!buffer_mapped(bh)) { // Ok, unmapped buffer, need to map it + map_bh(bh, inode->i_sb, + le32_to_cpu(allocated_blocks + [curr_block])); + curr_block++; + set_buffer_new(bh); + } } - res = -EIO; - goto error_exit_free_blocks; - } - /* Insert item into the tree with the data as its body */ - res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block)); - } else { - reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key); - } - } - - // the caller is responsible for closing the transaction - // unless we return an error, they are also responsible for logging - // the inode. - // - pathrelse(&path); - /* - * cleanup prellocation from previous writes - * if this is a partial block write - */ - if (write_bytes & (inode->i_sb->s_blocksize -1)) - reiserfs_discard_prealloc(th, inode); - reiserfs_write_unlock(inode->i_sb); - - // go through all the pages/buffers and map the buffers to newly allocated - // blocks (so that system knows where to write these pages later). 
- curr_block = 0; - for ( i = 0; i < num_pages ; i++ ) { - struct page *page=prepared_pages[i]; //current page - struct buffer_head *head = page_buffers(page);// first buffer for a page - int block_start, block_end; // in-page offsets for buffers. - - if (!page_buffers(page)) - reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???"); - - /* For each buffer in page */ - for(bh = head, block_start = 0; bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { - if (!bh) - reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?"); - block_end = block_start+inode->i_sb->s_blocksize; - if (i == 0 && block_end <= from ) - /* if this buffer is before requested data to map, skip it */ - continue; - if (i == num_pages - 1 && block_start >= to) - /* If this buffer is after requested data to map, abort - processing of current page */ - break; - - if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it - map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block])); - curr_block++; - set_buffer_new(bh); - } } - } - RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird"); + RFALSE(curr_block > blocks_to_allocate, + "green-9007: Used too many blocks? weird"); - kfree(allocated_blocks); - return 0; + kfree(allocated_blocks); + return 0; // Need to deal with transaction here. 
-error_exit_free_blocks: - pathrelse(&path); - // free blocks - for( i = 0; i < blocks_to_allocate; i++ ) - reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1); - -error_exit: - if (th->t_trans_id) { - int err; - // update any changes we made to blk count - reiserfs_update_sd(th, inode); - err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); - if (err) - res = err; - } - reiserfs_write_unlock(inode->i_sb); - kfree(allocated_blocks); - - return res; + error_exit_free_blocks: + pathrelse(&path); + // free blocks + for (i = 0; i < blocks_to_allocate; i++) + reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), + 1); + + error_exit: + if (th->t_trans_id) { + int err; + // update any changes we made to blk count + reiserfs_update_sd(th, inode); + err = + journal_end(th, inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 3 + 1 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb)); + if (err) + res = err; + } + reiserfs_write_unlock(inode->i_sb); + kfree(allocated_blocks); + + return res; } /* Unlock pages prepared by reiserfs_prepare_file_region_for_write */ -static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */ - size_t num_pages /* amount of pages */) { - int i; // loop counter +static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */ + size_t num_pages /* amount of pages */ ) +{ + int i; // loop counter - for (i=0; i < num_pages ; i++) { - struct page *page = prepared_pages[i]; + for (i = 0; i < num_pages; i++) { + struct page *page = prepared_pages[i]; - try_to_free_buffers(page); - unlock_page(page); - page_cache_release(page); - } + try_to_free_buffers(page); + unlock_page(page); + page_cache_release(page); + } } /* This function will copy data from userspace to specified pages within supplied byte range */ -static int reiserfs_copy_from_user_to_file_region( - loff_t pos, /* In-file position */ - int num_pages, /* Number of 
pages affected */ - int write_bytes, /* Amount of bytes to write */ - struct page **prepared_pages, /* pointer to - array to - prepared pages - */ - const char __user *buf /* Pointer to user-supplied - data*/ - ) +static int reiserfs_copy_from_user_to_file_region(loff_t pos, /* In-file position */ + int num_pages, /* Number of pages affected */ + int write_bytes, /* Amount of bytes to write */ + struct page **prepared_pages, /* pointer to + array to + prepared pages + */ + const char __user * buf /* Pointer to user-supplied + data */ + ) { - long page_fault=0; // status of copy_from_user. - int i; // loop counter. - int offset; // offset in page - - for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { - size_t count = min_t(size_t,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page - struct page *page=prepared_pages[i]; // Current page we process. - - fault_in_pages_readable( buf, count); - - /* Copy data from userspace to the current page */ - kmap(page); - page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data. - /* Flush processor's dcache for this page */ - flush_dcache_page(page); - kunmap(page); - buf+=count; - write_bytes-=count; - - if (page_fault) - break; // Was there a fault? abort. - } - - return page_fault?-EFAULT:0; + long page_fault = 0; // status of copy_from_user. + int i; // loop counter. + int offset; // offset in page + + for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages; + i++, offset = 0) { + size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); // How much of bytes to write to this page + struct page *page = prepared_pages[i]; // Current page we process. + + fault_in_pages_readable(buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = __copy_from_user(page_address(page) + offset, buf, count); // Copy the data. 
+ /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf += count; + write_bytes -= count; + + if (page_fault) + break; // Was there a fault? abort. + } + + return page_fault ? -EFAULT : 0; } /* taken fs/buffer.c:__block_commit_write */ int reiserfs_commit_page(struct inode *inode, struct page *page, - unsigned from, unsigned to) + unsigned from, unsigned to) { - unsigned block_start, block_end; - int partial = 0; - unsigned blocksize; - struct buffer_head *bh, *head; - unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; - int new; - int logit = reiserfs_file_data_log(inode); - struct super_block *s = inode->i_sb; - int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; - struct reiserfs_transaction_handle th; - int ret = 0; - - th.t_trans_id = 0; - blocksize = 1 << inode->i_blkbits; - - if (logit) { - reiserfs_write_lock(s); - ret = journal_begin(&th, s, bh_per_page + 1); - if (ret) - goto drop_write_lock; - reiserfs_update_inode_transaction(inode); - } - for(bh = head = page_buffers(page), block_start = 0; - bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) - { - - new = buffer_new(bh); - clear_buffer_new(bh); - block_end = block_start + blocksize; - if (block_end <= from || block_start >= to) { - if (!buffer_uptodate(bh)) - partial = 1; - } else { - set_buffer_uptodate(bh); - if (logit) { - reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, s, bh); - } else if (!buffer_dirty(bh)) { - mark_buffer_dirty(bh); - /* do data=ordered on any page past the end - * of file and any buffer marked BH_New. 
- */ - if (reiserfs_data_ordered(inode->i_sb) && - (new || page->index >= i_size_index)) { - reiserfs_add_ordered_list(inode, bh); - } - } + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + int logit = reiserfs_file_data_log(inode); + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + struct reiserfs_transaction_handle th; + int ret = 0; + + th.t_trans_id = 0; + blocksize = 1 << inode->i_blkbits; + + if (logit) { + reiserfs_write_lock(s); + ret = journal_begin(&th, s, bh_per_page + 1); + if (ret) + goto drop_write_lock; + reiserfs_update_inode_transaction(inode); + } + for (bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (logit) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + } else if (!buffer_dirty(bh)) { + mark_buffer_dirty(bh); + /* do data=ordered on any page past the end + * of file and any buffer marked BH_New. + */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } } - } - if (logit) { - ret = journal_end(&th, s, bh_per_page + 1); -drop_write_lock: - reiserfs_write_unlock(s); - } - /* - * If this is a partial write which happened to make all buffers - * uptodate then we can optimize away a bogus readpage() for - * the next read(). Here we 'discover' whether the page went - * uptodate as a result of this (potentially partial) write. 
- */ - if (!partial) - SetPageUptodate(page); - return ret; + if (logit) { + ret = journal_end(&th, s, bh_per_page + 1); + drop_write_lock: + reiserfs_write_unlock(s); + } + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return ret; } - /* Submit pages for write. This was separated from actual file copying because we might want to allocate block numbers in-between. This function assumes that caller will adjust file size to correct value. */ -static int reiserfs_submit_file_region_for_write( - struct reiserfs_transaction_handle *th, - struct inode *inode, - loff_t pos, /* Writing position offset */ - size_t num_pages, /* Number of pages to write */ - size_t write_bytes, /* number of bytes to write */ - struct page **prepared_pages /* list of pages */ - ) +static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos, /* Writing position offset */ + size_t num_pages, /* Number of pages to write */ + size_t write_bytes, /* number of bytes to write */ + struct page **prepared_pages /* list of pages */ + ) { - int status; // return status of block_commit_write. - int retval = 0; // Return value we are going to return. - int i; // loop counter - int offset; // Writing offset in page. - int orig_write_bytes = write_bytes; - int sd_update = 0; - - for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { - int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page - struct page *page=prepared_pages[i]; // Current page we process. 
- - status = reiserfs_commit_page(inode, page, offset, offset+count); - if ( status ) - retval = status; // To not overcomplicate matters We are going to - // submit all the pages even if there was error. - // we only remember error status to report it on - // exit. - write_bytes-=count; - } - /* now that we've gotten all the ordered buffers marked dirty, - * we can safely update i_size and close any running transaction - */ - if ( pos + orig_write_bytes > inode->i_size) { - inode->i_size = pos + orig_write_bytes; // Set new size - /* If the file have grown so much that tail packing is no - * longer possible, reset "need to pack" flag */ - if ( (have_large_tails (inode->i_sb) && - inode->i_size > i_block_size (inode)*4) || - (have_small_tails (inode->i_sb) && - inode->i_size > i_block_size(inode)) ) - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; - else if ( (have_large_tails (inode->i_sb) && - inode->i_size < i_block_size (inode)*4) || - (have_small_tails (inode->i_sb) && - inode->i_size < i_block_size(inode)) ) - REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; - + int status; // return status of block_commit_write. + int retval = 0; // Return value we are going to return. + int i; // loop counter + int offset; // Writing offset in page. + int orig_write_bytes = write_bytes; + int sd_update = 0; + + for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages; + i++, offset = 0) { + int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes); // How much of bytes to write to this page + struct page *page = prepared_pages[i]; // Current page we process. + + status = + reiserfs_commit_page(inode, page, offset, offset + count); + if (status) + retval = status; // To not overcomplicate matters We are going to + // submit all the pages even if there was error. + // we only remember error status to report it on + // exit. 
+ write_bytes -= count; + } + /* now that we've gotten all the ordered buffers marked dirty, + * we can safely update i_size and close any running transaction + */ + if (pos + orig_write_bytes > inode->i_size) { + inode->i_size = pos + orig_write_bytes; // Set new size + /* If the file have grown so much that tail packing is no + * longer possible, reset "need to pack" flag */ + if ((have_large_tails(inode->i_sb) && + inode->i_size > i_block_size(inode) * 4) || + (have_small_tails(inode->i_sb) && + inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + else if ((have_large_tails(inode->i_sb) && + inode->i_size < i_block_size(inode) * 4) || + (have_small_tails(inode->i_sb) && + inode->i_size < i_block_size(inode))) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; + + if (th->t_trans_id) { + reiserfs_write_lock(inode->i_sb); + reiserfs_update_sd(th, inode); // And update on-disk metadata + reiserfs_write_unlock(inode->i_sb); + } else + inode->i_sb->s_op->dirty_inode(inode); + + sd_update = 1; + } if (th->t_trans_id) { - reiserfs_write_lock(inode->i_sb); - reiserfs_update_sd(th, inode); // And update on-disk metadata - reiserfs_write_unlock(inode->i_sb); - } else - inode->i_sb->s_op->dirty_inode(inode); + reiserfs_write_lock(inode->i_sb); + if (!sd_update) + reiserfs_update_sd(th, inode); + status = journal_end(th, th->t_super, th->t_blocks_allocated); + if (status) + retval = status; + reiserfs_write_unlock(inode->i_sb); + } + th->t_trans_id = 0; - sd_update = 1; - } - if (th->t_trans_id) { - reiserfs_write_lock(inode->i_sb); - if (!sd_update) - reiserfs_update_sd(th, inode); - status = journal_end(th, th->t_super, th->t_blocks_allocated); - if (status) - retval = status; - reiserfs_write_unlock(inode->i_sb); - } - th->t_trans_id = 0; - - /* - * we have to unlock the pages after updating i_size, otherwise - * we race with writepage - */ - for ( i = 0; i < num_pages ; i++) { - struct page *page=prepared_pages[i]; - 
unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - } - return retval; + /* + * we have to unlock the pages after updating i_size, otherwise + * we race with writepage + */ + for (i = 0; i < num_pages; i++) { + struct page *page = prepared_pages[i]; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + } + return retval; } /* Look if passed writing region is going to touch file's tail (if it is present). And if it is, convert the tail to unformatted node */ -static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */ - loff_t pos, /* Writing position */ - int write_bytes /* amount of bytes to write */ - ) +static int reiserfs_check_for_tail_and_convert(struct inode *inode, /* inode to deal with */ + loff_t pos, /* Writing position */ + int write_bytes /* amount of bytes to write */ + ) { - INITIALIZE_PATH(path); // needed for search_for_position - struct cpu_key key; // Key that would represent last touched writing byte. - struct item_head *ih; // item header of found block; - int res; // Return value of various functions we call. - int cont_expand_offset; // We will put offset for generic_cont_expand here - // This can be int just because tails are created - // only for small files. - + INITIALIZE_PATH(path); // needed for search_for_position + struct cpu_key key; // Key that would represent last touched writing byte. + struct item_head *ih; // item header of found block; + int res; // Return value of various functions we call. + int cont_expand_offset; // We will put offset for generic_cont_expand here + // This can be int just because tails are created + // only for small files. 
+ /* this embodies a dependency on a particular tail policy */ - if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) { - /* such a big files do not have tails, so we won't bother ourselves - to look for tails, simply return */ - return 0; - } - - reiserfs_write_lock(inode->i_sb); - /* find the item containing the last byte to be written, or if - * writing past the end of the file then the last item of the - * file (and then we check its type). */ - make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/); - res = search_for_position_by_key(inode->i_sb, &key, &path); - if ( res == IO_ERROR ) { - reiserfs_write_unlock(inode->i_sb); - return -EIO; - } - ih = get_ih(&path); - res = 0; - if ( is_direct_le_ih(ih) ) { - /* Ok, closest item is file tail (tails are stored in "direct" - * items), so we need to unpack it. */ - /* To not overcomplicate matters, we just call generic_cont_expand - which will in turn call other stuff and finally will boil down to - reiserfs_get_block() that would do necessary conversion. */ - cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key)); - pathrelse(&path); - res = generic_cont_expand( inode, cont_expand_offset); - } else - pathrelse(&path); + if (inode->i_size >= inode->i_sb->s_blocksize * 4) { + /* such a big files do not have tails, so we won't bother ourselves + to look for tails, simply return */ + return 0; + } - reiserfs_write_unlock(inode->i_sb); - return res; + reiserfs_write_lock(inode->i_sb); + /* find the item containing the last byte to be written, or if + * writing past the end of the file then the last item of the + * file (and then we check its type). 
*/ + make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY, + 3 /*key length */ ); + res = search_for_position_by_key(inode->i_sb, &key, &path); + if (res == IO_ERROR) { + reiserfs_write_unlock(inode->i_sb); + return -EIO; + } + ih = get_ih(&path); + res = 0; + if (is_direct_le_ih(ih)) { + /* Ok, closest item is file tail (tails are stored in "direct" + * items), so we need to unpack it. */ + /* To not overcomplicate matters, we just call generic_cont_expand + which will in turn call other stuff and finally will boil down to + reiserfs_get_block() that would do necessary conversion. */ + cont_expand_offset = + le_key_k_offset(get_inode_item_key_version(inode), + &(ih->ih_key)); + pathrelse(&path); + res = generic_cont_expand(inode, cont_expand_offset); + } else + pathrelse(&path); + + reiserfs_write_unlock(inode->i_sb); + return res; } /* This function locks pages starting from @pos for @inode. @@ -851,275 +947,296 @@ static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to append), it is zeroed, then. Returns number of unallocated blocks that should be allocated to cover new file data.*/ -static int reiserfs_prepare_file_region_for_write( - struct inode *inode /* Inode of the file */, - loff_t pos, /* position in the file */ - size_t num_pages, /* number of pages to - prepare */ - size_t write_bytes, /* Amount of bytes to be - overwritten from - @pos */ - struct page **prepared_pages /* pointer to array - where to store - prepared pages */ - ) +static int reiserfs_prepare_file_region_for_write(struct inode *inode + /* Inode of the file */ , + loff_t pos, /* position in the file */ + size_t num_pages, /* number of pages to + prepare */ + size_t write_bytes, /* Amount of bytes to be + overwritten from + @pos */ + struct page **prepared_pages /* pointer to array + where to store + prepared pages */ + ) { - int res=0; // Return values of different functions we call. 
- unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages. - int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page - int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; - /* offset of last modified byte in last - page */ - struct address_space *mapping = inode->i_mapping; // Pages are mapped here. - int i; // Simple counter - int blocks = 0; /* Return value (blocks that should be allocated) */ - struct buffer_head *bh, *head; // Current bufferhead and first bufferhead - // of a page. - unsigned block_start, block_end; // Starting and ending offsets of current - // buffer in the page. - struct buffer_head *wait[2], **wait_bh=wait; // Buffers for page, if - // Page appeared to be not up - // to date. Note how we have - // at most 2 buffers, this is - // because we at most may - // partially overwrite two - // buffers for one page. One at // the beginning of write area - // and one at the end. - // Everything inthe middle gets // overwritten totally. - - struct cpu_key key; // cpu key of item that we are going to deal with - struct item_head *ih = NULL; // pointer to item head that we are going to deal with - struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with - INITIALIZE_PATH(path); // path to item, that we are going to deal with. - __le32 * item=NULL; // pointer to item we are going to deal with - int item_pos=-1; /* Position in indirect item */ - - - if ( num_pages < 1 ) { - reiserfs_warning (inode->i_sb, - "green-9001: reiserfs_prepare_file_region_for_write " - "called with zero number of pages to process"); - return -EFAULT; - } - - /* We have 2 loops for pages. In first loop we grab and lock the pages, so - that nobody would touch these until we release the pages. Then - we'd start to deal with mapping buffers to blocks. 
*/ - for ( i = 0; i < num_pages; i++) { - prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page - if ( !prepared_pages[i]) { - res = -ENOMEM; - goto failed_page_grabbing; - } - if (!page_has_buffers(prepared_pages[i])) - create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0); - } - - /* Let's count amount of blocks for a case where all the blocks - overwritten are new (we will substract already allocated blocks later)*/ - if ( num_pages > 2 ) - /* These are full-overwritten pages so we count all the blocks in - these pages are counted as needed to be allocated */ - blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - /* count blocks needed for first page (possibly partially written) */ - blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + - !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */ - - /* Now we account for last page. If last page == first page (we - overwrite only one page), we substract all the blocks past the - last writing position in a page out of already calculated number - of blocks */ - blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) - - ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits); - /* Note how we do not roundup here since partial blocks still - should be allocated */ - - /* Now if all the write area lies past the file end, no point in - maping blocks, since there is none, so we just zero out remaining - parts of first and last pages in write area (if needed) */ - if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) { - if ( from != 0 ) {/* First page needs to be partially zeroed */ - char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); - memset(kaddr, 0, from); - kunmap_atomic( kaddr, KM_USER0); - } - if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */ - char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0); - memset(kaddr+to, 0, PAGE_CACHE_SIZE - to); - kunmap_atomic( kaddr, KM_USER0); + int res = 0; // Return values 
of different functions we call. + unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages. + int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page + int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; + /* offset of last modified byte in last + page */ + struct address_space *mapping = inode->i_mapping; // Pages are mapped here. + int i; // Simple counter + int blocks = 0; /* Return value (blocks that should be allocated) */ + struct buffer_head *bh, *head; // Current bufferhead and first bufferhead + // of a page. + unsigned block_start, block_end; // Starting and ending offsets of current + // buffer in the page. + struct buffer_head *wait[2], **wait_bh = wait; // Buffers for page, if + // Page appeared to be not up + // to date. Note how we have + // at most 2 buffers, this is + // because we at most may + // partially overwrite two + // buffers for one page. One at // the beginning of write area + // and one at the end. + // Everything inthe middle gets // overwritten totally. + + struct cpu_key key; // cpu key of item that we are going to deal with + struct item_head *ih = NULL; // pointer to item head that we are going to deal with + struct buffer_head *itembuf = NULL; // Buffer head that contains items that we are going to deal with + INITIALIZE_PATH(path); // path to item, that we are going to deal with. 
+ __le32 *item = NULL; // pointer to item we are going to deal with + int item_pos = -1; /* Position in indirect item */ + + if (num_pages < 1) { + reiserfs_warning(inode->i_sb, + "green-9001: reiserfs_prepare_file_region_for_write " + "called with zero number of pages to process"); + return -EFAULT; } - /* Since all blocks are new - use already calculated value */ - return blocks; - } - - /* Well, since we write somewhere into the middle of a file, there is - possibility we are writing over some already allocated blocks, so - let's map these blocks and substract number of such blocks out of blocks - we need to allocate (calculated above) */ - /* Mask write position to start on blocksize, we do it out of the - loop for performance reasons */ - pos &= ~((loff_t) inode->i_sb->s_blocksize - 1); - /* Set cpu key to the starting position in a file (on left block boundary)*/ - make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/); - - reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key() - for ( i = 0; i < num_pages ; i++ ) { - - head = page_buffers(prepared_pages[i]); - /* For each buffer in the page */ - for(bh = head, block_start = 0; bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { - if (!bh) - reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); - /* Find where this buffer ends */ - block_end = block_start+inode->i_sb->s_blocksize; - if (i == 0 && block_end <= from ) - /* if this buffer is before requested data to map, skip it*/ - continue; - - if (i == num_pages - 1 && block_start >= to) { - /* If this buffer is after requested data to map, abort - processing of current page */ - break; + /* We have 2 loops for pages. In first loop we grab and lock the pages, so + that nobody would touch these until we release the pages. Then + we'd start to deal with mapping buffers to blocks. 
*/ + for (i = 0; i < num_pages; i++) { + prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page + if (!prepared_pages[i]) { + res = -ENOMEM; + goto failed_page_grabbing; } + if (!page_has_buffers(prepared_pages[i])) + create_empty_buffers(prepared_pages[i], + inode->i_sb->s_blocksize, 0); + } - if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) { - /* This is optimisation for a case where buffer is mapped - and have blocknumber assigned. In case significant amount - of such buffers are present, we may avoid some amount - of search_by_key calls. - Probably it would be possible to move parts of this code - out of BKL, but I afraid that would overcomplicate code - without any noticeable benefit. - */ - item_pos++; - /* Update the key */ - set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize); - blocks--; // Decrease the amount of blocks that need to be - // allocated - continue; // Go to the next buffer + /* Let's count amount of blocks for a case where all the blocks + overwritten are new (we will substract already allocated blocks later) */ + if (num_pages > 2) + /* These are full-overwritten pages so we count all the blocks in + these pages are counted as needed to be allocated */ + blocks = + (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + /* count blocks needed for first page (possibly partially written) */ + blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1)); /* roundup */ + + /* Now we account for last page. 
If last page == first page (we + overwrite only one page), we substract all the blocks past the + last writing position in a page out of already calculated number + of blocks */ + blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - + ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits); + /* Note how we do not roundup here since partial blocks still + should be allocated */ + + /* Now if all the write area lies past the file end, no point in + maping blocks, since there is none, so we just zero out remaining + parts of first and last pages in write area (if needed) */ + if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) { + if (from != 0) { /* First page needs to be partially zeroed */ + char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); + memset(kaddr, 0, from); + kunmap_atomic(kaddr, KM_USER0); + } + if (to != PAGE_CACHE_SIZE) { /* Last page needs to be partially zeroed */ + char *kaddr = + kmap_atomic(prepared_pages[num_pages - 1], + KM_USER0); + memset(kaddr + to, 0, PAGE_CACHE_SIZE - to); + kunmap_atomic(kaddr, KM_USER0); } - if ( !itembuf || /* if first iteration */ - item_pos >= ih_item_len(ih)/UNFM_P_SIZE) - { /* or if we progressed past the - current unformatted_item */ - /* Try to find next item */ - res = search_for_position_by_key(inode->i_sb, &key, &path); - /* Abort if no more items */ - if ( res != POSITION_FOUND ) { - /* make sure later loops don't use this item */ - itembuf = NULL; - item = NULL; - break; + /* Since all blocks are new - use already calculated value */ + return blocks; + } + + /* Well, since we write somewhere into the middle of a file, there is + possibility we are writing over some already allocated blocks, so + let's map these blocks and substract number of such blocks out of blocks + we need to allocate (calculated above) */ + /* Mask write position to start on blocksize, we do it out of the + loop for performance reasons */ + pos &= ~((loff_t) inode->i_sb->s_blocksize - 1); + /* Set cpu key to the 
starting position in a file (on left block boundary) */ + make_cpu_key(&key, inode, + 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), + TYPE_ANY, 3 /*key length */ ); + + reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key() + for (i = 0; i < num_pages; i++) { + + head = page_buffers(prepared_pages[i]); + /* For each buffer in the page */ + for (bh = head, block_start = 0; bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + if (!bh) + reiserfs_panic(inode->i_sb, + "green-9002: Allocated but absent buffer for a page?"); + /* Find where this buffer ends */ + block_end = block_start + inode->i_sb->s_blocksize; + if (i == 0 && block_end <= from) + /* if this buffer is before requested data to map, skip it */ + continue; + + if (i == num_pages - 1 && block_start >= to) { + /* If this buffer is after requested data to map, abort + processing of current page */ + break; } - /* Update information about current indirect item */ - itembuf = get_last_bh( &path ); - ih = get_ih( &path ); - item = get_item( &path ); - item_pos = path.pos_in_item; + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + /* This is optimisation for a case where buffer is mapped + and have blocknumber assigned. In case significant amount + of such buffers are present, we may avoid some amount + of search_by_key calls. + Probably it would be possible to move parts of this code + out of BKL, but I afraid that would overcomplicate code + without any noticeable benefit. 
+ */ + item_pos++; + /* Update the key */ + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + inode->i_sb->s_blocksize); + blocks--; // Decrease the amount of blocks that need to be + // allocated + continue; // Go to the next buffer + } - RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected"); - } + if (!itembuf || /* if first iteration */ + item_pos >= ih_item_len(ih) / UNFM_P_SIZE) { /* or if we progressed past the + current unformatted_item */ + /* Try to find next item */ + res = + search_for_position_by_key(inode->i_sb, + &key, &path); + /* Abort if no more items */ + if (res != POSITION_FOUND) { + /* make sure later loops don't use this item */ + itembuf = NULL; + item = NULL; + break; + } + + /* Update information about current indirect item */ + itembuf = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + item_pos = path.pos_in_item; + + RFALSE(!is_indirect_le_ih(ih), + "green-9003: indirect item expected"); + } - /* See if there is some block associated with the file - at that position, map the buffer to this block */ - if ( get_block_num(item,item_pos) ) { - map_bh(bh, inode->i_sb, get_block_num(item,item_pos)); - blocks--; // Decrease the amount of blocks that need to be - // allocated + /* See if there is some block associated with the file + at that position, map the buffer to this block */ + if (get_block_num(item, item_pos)) { + map_bh(bh, inode->i_sb, + get_block_num(item, item_pos)); + blocks--; // Decrease the amount of blocks that need to be + // allocated + } + item_pos++; + /* Update the key */ + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + inode->i_sb->s_blocksize); } - item_pos++; - /* Update the key */ - set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize); } - } - pathrelse(&path); // Free the path - reiserfs_write_unlock(inode->i_sb); + pathrelse(&path); // Free the path + reiserfs_write_unlock(inode->i_sb); /* Now zero out unmappend buffers for the first and 
last pages of write area or issue read requests if page is mapped. */ /* First page, see if it is not uptodate */ - if ( !PageUptodate(prepared_pages[0]) ) { - head = page_buffers(prepared_pages[0]); - - /* For each buffer in page */ - for(bh = head, block_start = 0; bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { - - if (!bh) - reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); - /* Find where this buffer ends */ - block_end = block_start+inode->i_sb->s_blocksize; - if ( block_end <= from ) - /* if this buffer is before requested data to map, skip it*/ - continue; - if ( block_start < from ) { /* Aha, our partial buffer */ - if ( buffer_mapped(bh) ) { /* If it is mapped, we need to - issue READ request for it to - not loose data */ - ll_rw_block(READ, 1, &bh); - *wait_bh++=bh; - } else { /* Not mapped, zero it */ - char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0); - memset(kaddr+block_start, 0, from-block_start); - kunmap_atomic( kaddr, KM_USER0); - set_buffer_uptodate(bh); - } + if (!PageUptodate(prepared_pages[0])) { + head = page_buffers(prepared_pages[0]); + + /* For each buffer in page */ + for (bh = head, block_start = 0; bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + if (!bh) + reiserfs_panic(inode->i_sb, + "green-9002: Allocated but absent buffer for a page?"); + /* Find where this buffer ends */ + block_end = block_start + inode->i_sb->s_blocksize; + if (block_end <= from) + /* if this buffer is before requested data to map, skip it */ + continue; + if (block_start < from) { /* Aha, our partial buffer */ + if (buffer_mapped(bh)) { /* If it is mapped, we need to + issue READ request for it to + not loose data */ + ll_rw_block(READ, 1, &bh); + *wait_bh++ = bh; + } else { /* Not mapped, zero it */ + char *kaddr = + kmap_atomic(prepared_pages[0], + KM_USER0); + memset(kaddr + block_start, 0, + from - block_start); + kunmap_atomic(kaddr, KM_USER0); + 
set_buffer_uptodate(bh); + } + } } - } } /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */ - if ( !PageUptodate(prepared_pages[num_pages-1]) || - ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) { - head = page_buffers(prepared_pages[num_pages-1]); - - /* for each buffer in page */ - for(bh = head, block_start = 0; bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { - - if (!bh) - reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?"); - /* Find where this buffer ends */ - block_end = block_start+inode->i_sb->s_blocksize; - if ( block_start >= to ) - /* if this buffer is after requested data to map, skip it*/ - break; - if ( block_end > to ) { /* Aha, our partial buffer */ - if ( buffer_mapped(bh) ) { /* If it is mapped, we need to - issue READ request for it to - not loose data */ - ll_rw_block(READ, 1, &bh); - *wait_bh++=bh; - } else { /* Not mapped, zero it */ - char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0); - memset(kaddr+to, 0, block_end-to); - kunmap_atomic( kaddr, KM_USER0); - set_buffer_uptodate(bh); - } + if (!PageUptodate(prepared_pages[num_pages - 1]) || + ((pos + write_bytes) >> PAGE_CACHE_SHIFT) > + (inode->i_size >> PAGE_CACHE_SHIFT)) { + head = page_buffers(prepared_pages[num_pages - 1]); + + /* for each buffer in page */ + for (bh = head, block_start = 0; bh != head || !block_start; + block_start = block_end, bh = bh->b_this_page) { + + if (!bh) + reiserfs_panic(inode->i_sb, + "green-9002: Allocated but absent buffer for a page?"); + /* Find where this buffer ends */ + block_end = block_start + inode->i_sb->s_blocksize; + if (block_start >= to) + /* if this buffer is after requested data to map, skip it */ + break; + if (block_end > to) { /* Aha, our partial buffer */ + if (buffer_mapped(bh)) { /* If it is mapped, we need to + issue READ request for it to + not loose data */ + ll_rw_block(READ, 1, &bh); + 
*wait_bh++ = bh; + } else { /* Not mapped, zero it */ + char *kaddr = + kmap_atomic(prepared_pages + [num_pages - 1], + KM_USER0); + memset(kaddr + to, 0, block_end - to); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + } + } } - } } - /* Wait for read requests we made to happen, if necessary */ - while(wait_bh > wait) { - wait_on_buffer(*--wait_bh); - if (!buffer_uptodate(*wait_bh)) { - res = -EIO; - goto failed_read; + /* Wait for read requests we made to happen, if necessary */ + while (wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) { + res = -EIO; + goto failed_read; + } } - } - - return blocks; -failed_page_grabbing: - num_pages = i; -failed_read: - reiserfs_unprepare_pages(prepared_pages, num_pages); - return res; + + return blocks; + failed_page_grabbing: + num_pages = i; + failed_read: + reiserfs_unprepare_pages(prepared_pages, num_pages); + return res; } /* Write @count bytes at position @ppos in a file indicated by @file @@ -1148,262 +1265,305 @@ failed_read: Future Features: providing search_by_key with hints. */ -static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */ - const char __user *buf, /* pointer to user supplied data -(in userspace) */ - size_t count, /* amount of bytes to write */ - loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to - * new current position before returning. */ ) +static ssize_t reiserfs_file_write(struct file *file, /* the file we are going to write into */ + const char __user * buf, /* pointer to user supplied data + (in userspace) */ + size_t count, /* amount of bytes to write */ + loff_t * ppos /* pointer to position in file that we start writing at. Should be updated to + * new current position before returning. */ + ) { - size_t already_written = 0; // Number of bytes already written to the file. - loff_t pos; // Current position in the file. 
- ssize_t res; // return value of various functions that we call. - int err = 0; - struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to. - /* To simplify coding at this time, we store - locked pages in array for now */ - struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; - struct reiserfs_transaction_handle th; - th.t_trans_id = 0; - - if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment - ssize_t result, after_file_end = 0; - if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) { - /* If we are appending a file, we need to put this savelink in here. - If we will crash while doing direct io, finish_unfinished will - cut the garbage from the file end. */ - reiserfs_write_lock(inode->i_sb); - err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); - if (err) { - reiserfs_write_unlock (inode->i_sb); - return err; - } - reiserfs_update_inode_transaction(inode); - add_save_link (&th, inode, 1 /* Truncate */); - after_file_end = 1; - err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); - reiserfs_write_unlock(inode->i_sb); - if (err) - return err; - } - result = generic_file_write(file, buf, count, ppos); - - if ( after_file_end ) { /* Now update i_size and remove the savelink */ - struct reiserfs_transaction_handle th; - reiserfs_write_lock(inode->i_sb); - err = journal_begin(&th, inode->i_sb, 1); - if (err) { - reiserfs_write_unlock (inode->i_sb); - return err; - } - reiserfs_update_inode_transaction(inode); - reiserfs_update_sd(&th, inode); - err = journal_end(&th, inode->i_sb, 1); - if (err) { - reiserfs_write_unlock (inode->i_sb); - return err; - } - err = remove_save_link (inode, 1/* truncate */); - reiserfs_write_unlock(inode->i_sb); - if (err) - return err; - } - - return result; - } - - if ( unlikely((ssize_t) count < 0 )) - return -EINVAL; - - if (unlikely(!access_ok(VERIFY_READ, buf, count))) - return -EFAULT; - - down(&inode->i_sem); // locks the entire file for just 
us - - pos = *ppos; - - /* Check if we can write to specified region of file, file - is not overly big and this kind of stuff. Adjust pos and - count, if needed */ - res = generic_write_checks(file, &pos, &count, 0); - if (res) - goto out; - - if ( count == 0 ) - goto out; - - res = remove_suid(file->f_dentry); - if (res) - goto out; - - inode_update_time(inode, 1); /* Both mtime and ctime */ - - // Ok, we are done with all the checks. + size_t already_written = 0; // Number of bytes already written to the file. + loff_t pos; // Current position in the file. + ssize_t res; // return value of various functions that we call. + int err = 0; + struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to. + /* To simplify coding at this time, we store + locked pages in array for now */ + struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + if (file->f_flags & O_DIRECT) { // Direct IO needs treatment + ssize_t result, after_file_end = 0; + if ((*ppos + count >= inode->i_size) + || (file->f_flags & O_APPEND)) { + /* If we are appending a file, we need to put this savelink in here. + If we will crash while doing direct io, finish_unfinished will + cut the garbage from the file end. 
*/ + reiserfs_write_lock(inode->i_sb); + err = + journal_begin(&th, inode->i_sb, + JOURNAL_PER_BALANCE_CNT); + if (err) { + reiserfs_write_unlock(inode->i_sb); + return err; + } + reiserfs_update_inode_transaction(inode); + add_save_link(&th, inode, 1 /* Truncate */ ); + after_file_end = 1; + err = + journal_end(&th, inode->i_sb, + JOURNAL_PER_BALANCE_CNT); + reiserfs_write_unlock(inode->i_sb); + if (err) + return err; + } + result = generic_file_write(file, buf, count, ppos); + + if (after_file_end) { /* Now update i_size and remove the savelink */ + struct reiserfs_transaction_handle th; + reiserfs_write_lock(inode->i_sb); + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + reiserfs_write_unlock(inode->i_sb); + return err; + } + reiserfs_update_inode_transaction(inode); + reiserfs_update_sd(&th, inode); + err = journal_end(&th, inode->i_sb, 1); + if (err) { + reiserfs_write_unlock(inode->i_sb); + return err; + } + err = remove_save_link(inode, 1 /* truncate */ ); + reiserfs_write_unlock(inode->i_sb); + if (err) + return err; + } - // Now we should start real work + return result; + } - /* If we are going to write past the file's packed tail or if we are going - to overwrite part of the tail, we need that tail to be converted into - unformatted node */ - res = reiserfs_check_for_tail_and_convert( inode, pos, count); - if (res) - goto out; + if (unlikely((ssize_t) count < 0)) + return -EINVAL; + + if (unlikely(!access_ok(VERIFY_READ, buf, count))) + return -EFAULT; + + down(&inode->i_sem); // locks the entire file for just us + + pos = *ppos; + + /* Check if we can write to specified region of file, file + is not overly big and this kind of stuff. Adjust pos and + count, if needed */ + res = generic_write_checks(file, &pos, &count, 0); + if (res) + goto out; + + if (count == 0) + goto out; + + res = remove_suid(file->f_dentry); + if (res) + goto out; + + inode_update_time(inode, 1); /* Both mtime and ctime */ + + // Ok, we are done with all the checks. 
+ + // Now we should start real work + + /* If we are going to write past the file's packed tail or if we are going + to overwrite part of the tail, we need that tail to be converted into + unformatted node */ + res = reiserfs_check_for_tail_and_convert(inode, pos, count); + if (res) + goto out; + + while (count > 0) { + /* This is the main loop in which we running until some error occures + or until we write all of the data. */ + size_t num_pages; /* amount of pages we are going to write this iteration */ + size_t write_bytes; /* amount of bytes to write during this iteration */ + size_t blocks_to_allocate; /* how much blocks we need to allocate for this iteration */ + + /* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos */ + num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial + pages */ + ((count + + (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT); + /* convert size to amount of + pages */ + reiserfs_write_lock(inode->i_sb); + if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME + || num_pages > reiserfs_can_fit_pages(inode->i_sb)) { + /* If we were asked to write more data than we want to or if there + is not that much space, then we shorten amount of data to write + for this iteration. */ + num_pages = + min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, + reiserfs_can_fit_pages(inode->i_sb)); + /* Also we should not forget to set size in bytes accordingly */ + write_bytes = (num_pages << PAGE_CACHE_SHIFT) - + (pos & (PAGE_CACHE_SIZE - 1)); + /* If position is not on the + start of the page, we need + to substract the offset + within page */ + } else + write_bytes = count; + + /* reserve the blocks to be allocated later, so that later on + we still have the space to write the blocks to */ + reiserfs_claim_blocks_to_be_allocated(inode->i_sb, + num_pages << + (PAGE_CACHE_SHIFT - + inode->i_blkbits)); + reiserfs_write_unlock(inode->i_sb); + + if (!num_pages) { /* If we do not have enough space even for a single page... 
*/ + if (pos > + inode->i_size + inode->i_sb->s_blocksize - + (pos & (inode->i_sb->s_blocksize - 1))) { + res = -ENOSPC; + break; // In case we are writing past the end of the last file block, break. + } + // Otherwise we are possibly overwriting the file, so + // let's set write size to be equal or less than blocksize. + // This way we get it correctly for file holes. + // But overwriting files on absolutelly full volumes would not + // be very efficient. Well, people are not supposed to fill + // 100% of disk space anyway. + write_bytes = + min_t(size_t, count, + inode->i_sb->s_blocksize - + (pos & (inode->i_sb->s_blocksize - 1))); + num_pages = 1; + // No blocks were claimed before, so do it now. + reiserfs_claim_blocks_to_be_allocated(inode->i_sb, + 1 << + (PAGE_CACHE_SHIFT + - + inode-> + i_blkbits)); + } - while ( count > 0) { - /* This is the main loop in which we running until some error occures - or until we write all of the data. */ - size_t num_pages;/* amount of pages we are going to write this iteration */ - size_t write_bytes; /* amount of bytes to write during this iteration */ - size_t blocks_to_allocate; /* how much blocks we need to allocate for this iteration */ - - /* (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/ - num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial - pages */ - ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT); - /* convert size to amount of - pages */ - reiserfs_write_lock(inode->i_sb); - if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME - || num_pages > reiserfs_can_fit_pages(inode->i_sb) ) { - /* If we were asked to write more data than we want to or if there - is not that much space, then we shorten amount of data to write - for this iteration. 
*/ - num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb)); - /* Also we should not forget to set size in bytes accordingly */ - write_bytes = (num_pages << PAGE_CACHE_SHIFT) - - (pos & (PAGE_CACHE_SIZE-1)); - /* If position is not on the - start of the page, we need - to substract the offset - within page */ - } else - write_bytes = count; + /* Prepare for writing into the region, read in all the + partially overwritten pages, if needed. And lock the pages, + so that nobody else can access these until we are done. + We get number of actual blocks needed as a result. */ + blocks_to_allocate = + reiserfs_prepare_file_region_for_write(inode, pos, + num_pages, + write_bytes, + prepared_pages); + if (blocks_to_allocate < 0) { + res = blocks_to_allocate; + reiserfs_release_claimed_blocks(inode->i_sb, + num_pages << + (PAGE_CACHE_SHIFT - + inode->i_blkbits)); + break; + } - /* reserve the blocks to be allocated later, so that later on - we still have the space to write the blocks to */ - reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits)); - reiserfs_write_unlock(inode->i_sb); + /* First we correct our estimate of how many blocks we need */ + reiserfs_release_claimed_blocks(inode->i_sb, + (num_pages << + (PAGE_CACHE_SHIFT - + inode->i_sb-> + s_blocksize_bits)) - + blocks_to_allocate); + + if (blocks_to_allocate > 0) { /*We only allocate blocks if we need to */ + /* Fill in all the possible holes and append the file if needed */ + res = + reiserfs_allocate_blocks_for_region(&th, inode, pos, + num_pages, + write_bytes, + prepared_pages, + blocks_to_allocate); + } - if ( !num_pages ) { /* If we do not have enough space even for a single page... */ - if ( pos > inode->i_size+inode->i_sb->s_blocksize-(pos & (inode->i_sb->s_blocksize-1))) { - res = -ENOSPC; - break; // In case we are writing past the end of the last file block, break. 
- } - // Otherwise we are possibly overwriting the file, so - // let's set write size to be equal or less than blocksize. - // This way we get it correctly for file holes. - // But overwriting files on absolutelly full volumes would not - // be very efficient. Well, people are not supposed to fill - // 100% of disk space anyway. - write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1))); - num_pages = 1; - // No blocks were claimed before, so do it now. - reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)); - } + /* well, we have allocated the blocks, so it is time to free + the reservation we made earlier. */ + reiserfs_release_claimed_blocks(inode->i_sb, + blocks_to_allocate); + if (res) { + reiserfs_unprepare_pages(prepared_pages, num_pages); + break; + } - /* Prepare for writing into the region, read in all the - partially overwritten pages, if needed. And lock the pages, - so that nobody else can access these until we are done. 
- We get number of actual blocks needed as a result.*/ - blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages); - if ( blocks_to_allocate < 0 ) { - res = blocks_to_allocate; - reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits)); - break; - } +/* NOTE that allocating blocks and filling blocks can be done in reverse order + and probably we would do that just to get rid of garbage in files after a + crash */ - /* First we correct our estimate of how many blocks we need */ - reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate ); + /* Copy data from user-supplied buffer to file's pages */ + res = + reiserfs_copy_from_user_to_file_region(pos, num_pages, + write_bytes, + prepared_pages, buf); + if (res) { + reiserfs_unprepare_pages(prepared_pages, num_pages); + break; + } - if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/ - /* Fill in all the possible holes and append the file if needed */ - res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate); + /* Send the pages to disk and unlock them. */ + res = + reiserfs_submit_file_region_for_write(&th, inode, pos, + num_pages, + write_bytes, + prepared_pages); + if (res) + break; + + already_written += write_bytes; + buf += write_bytes; + *ppos = pos += write_bytes; + count -= write_bytes; + balance_dirty_pages_ratelimited(inode->i_mapping); } - /* well, we have allocated the blocks, so it is time to free - the reservation we made earlier. 
*/ - reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate); - if ( res ) { - reiserfs_unprepare_pages(prepared_pages, num_pages); - break; + /* this is only true on error */ + if (th.t_trans_id) { + reiserfs_write_lock(inode->i_sb); + err = journal_end(&th, th.t_super, th.t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); + if (err) { + res = err; + goto out; + } } -/* NOTE that allocating blocks and filling blocks can be done in reverse order - and probably we would do that just to get rid of garbage in files after a - crash */ + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + res = + generic_osync_inode(inode, file->f_mapping, + OSYNC_METADATA | OSYNC_DATA); - /* Copy data from user-supplied buffer to file's pages */ - res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf); - if ( res ) { - reiserfs_unprepare_pages(prepared_pages, num_pages); - break; - } + up(&inode->i_sem); + reiserfs_async_progress_wait(inode->i_sb); + return (already_written != 0) ? already_written : res; - /* Send the pages to disk and unlock them. */ - res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages, - write_bytes,prepared_pages); - if ( res ) - break; - - already_written += write_bytes; - buf += write_bytes; - *ppos = pos += write_bytes; - count -= write_bytes; - balance_dirty_pages_ratelimited(inode->i_mapping); - } - - /* this is only true on error */ - if (th.t_trans_id) { - reiserfs_write_lock(inode->i_sb); - err = journal_end(&th, th.t_super, th.t_blocks_allocated); - reiserfs_write_unlock(inode->i_sb); - if (err) { - res = err; - goto out; - } - } - - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) - res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA); - - up(&inode->i_sem); - reiserfs_async_progress_wait(inode->i_sb); - return (already_written != 0)?already_written:res; - -out: - up(&inode->i_sem); // unlock the file on exit. 
- return res; + out: + up(&inode->i_sem); // unlock the file on exit. + return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf, - size_t count, loff_t pos) +static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, + size_t count, loff_t pos) { - return generic_file_aio_write(iocb, buf, count, pos); + return generic_file_aio_write(iocb, buf, count, pos); } - - struct file_operations reiserfs_file_operations = { - .read = generic_file_read, - .write = reiserfs_file_write, - .ioctl = reiserfs_ioctl, - .mmap = generic_file_mmap, - .release = reiserfs_file_release, - .fsync = reiserfs_sync_file, - .sendfile = generic_file_sendfile, - .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .read = generic_file_read, + .write = reiserfs_file_write, + .ioctl = reiserfs_ioctl, + .mmap = generic_file_mmap, + .release = reiserfs_file_release, + .fsync = reiserfs_sync_file, + .sendfile = generic_file_sendfile, + .aio_read = generic_file_aio_read, + .aio_write = reiserfs_aio_write, }; - -struct inode_operations reiserfs_file_inode_operations = { - .truncate = reiserfs_vfs_truncate_file, - .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, - .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, - .permission = reiserfs_permission, +struct inode_operations reiserfs_file_inode_operations = { + .truncate = reiserfs_vfs_truncate_file, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; - - diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index e4f64be9e15..2706e2adffa 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -34,14 +34,12 @@ ** **/ - #include <linux/config.h> #include <linux/time.h> #include <linux/string.h> #include <linux/reiserfs_fs.h> #include 
<linux/buffer_head.h> - /* To make any changes in the tree we find a node, that contains item to be changed/deleted or position in the node we insert a new item to. We call this node S. To do balancing we need to decide what we @@ -56,490 +54,522 @@ have to have if we do not any shiftings, if we shift to left/right neighbor or to both. */ - /* taking item number in virtual node, returns number of item, that it has in source buffer */ -static inline int old_item_num (int new_num, int affected_item_num, int mode) +static inline int old_item_num(int new_num, int affected_item_num, int mode) { - if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) - return new_num; + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) + return new_num; - if (mode == M_INSERT) { + if (mode == M_INSERT) { - RFALSE( new_num == 0, - "vs-8005: for INSERT mode and item number of inserted item"); + RFALSE(new_num == 0, + "vs-8005: for INSERT mode and item number of inserted item"); - return new_num - 1; - } + return new_num - 1; + } - RFALSE( mode != M_DELETE, - "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode); - /* delete mode */ - return new_num + 1; + RFALSE(mode != M_DELETE, + "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", + mode); + /* delete mode */ + return new_num + 1; } -static void create_virtual_node (struct tree_balance * tb, int h) +static void create_virtual_node(struct tree_balance *tb, int h) { - struct item_head * ih; - struct virtual_node * vn = tb->tb_vn; - int new_num; - struct buffer_head * Sh; /* this comes from tb->S[h] */ + struct item_head *ih; + struct virtual_node *vn = tb->tb_vn; + int new_num; + struct buffer_head *Sh; /* this comes from tb->S[h] */ - Sh = PATH_H_PBUFFER (tb->tb_path, h); + Sh = PATH_H_PBUFFER(tb->tb_path, h); - /* size of changed node */ - vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h]; + /* size of changed node */ + vn->vn_size = + MAX_CHILD_SIZE(Sh) 
- B_FREE_SPACE(Sh) + tb->insert_size[h]; - /* for internal nodes array if virtual items is not created */ - if (h) { - vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); - return; - } - - /* number of items in virtual node */ - vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0); - - /* first virtual item */ - vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); - memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item)); - vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item); - - - /* first item in the node */ - ih = B_N_PITEM_HEAD (Sh, 0); - - /* define the mergeability for 0-th item (if it is not being deleted) */ - if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) - vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; - - /* go through all items those remain in the virtual node (except for the new (inserted) one) */ - for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) { - int j; - struct virtual_item * vi = vn->vn_vi + new_num; - int is_affected = ((new_num != vn->vn_affected_item_num) ? 
0 : 1); - - - if (is_affected && vn->vn_mode == M_INSERT) - continue; - - /* get item number in source node */ - j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode); - - vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; - vi->vi_ih = ih + j; - vi->vi_item = B_I_PITEM (Sh, ih + j); - vi->vi_uarea = vn->vn_free_ptr; - - // FIXME: there is no check, that item operation did not - // consume too much memory - vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]); - if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) - reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: " - "virtual node space consumed"); - - if (!is_affected) - /* this is not being changed */ - continue; - - if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { - vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; - vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + /* for internal nodes array if virtual items is not created */ + if (h) { + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); + return; } - } - - - /* virtual inserted item is not defined yet */ - if (vn->vn_mode == M_INSERT) { - struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num; - - RFALSE( vn->vn_ins_ih == 0, - "vs-8040: item header of inserted item is not specified"); - vi->vi_item_len = tb->insert_size[0]; - vi->vi_ih = vn->vn_ins_ih; - vi->vi_item = vn->vn_data; - vi->vi_uarea = vn->vn_free_ptr; - - op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]); - } - - /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ - if (tb->CFR[0]) { - struct reiserfs_key * key; - - key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]); - if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE || - vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1)) - vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE; -#ifdef CONFIG_REISERFS_CHECK - if (op_is_left_mergeable (key, Sh->b_size) && - 
!(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) { - /* we delete last item and it could be merged with right neighbor's first item */ - if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) && - I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) { - /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ - print_block (Sh, 0, -1, -1); - reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", - key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE); - } else - /* we can delete directory item, that has only one directory entry in it */ - ; + /* number of items in virtual node */ + vn->vn_nr_item = + B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) - + ((vn->vn_mode == M_DELETE) ? 1 : 0); + + /* first virtual item */ + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); + memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item)); + vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item); + + /* first item in the node */ + ih = B_N_PITEM_HEAD(Sh, 0); + + /* define the mergeability for 0-th item (if it is not being deleted) */ + if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size) + && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; + + /* go through all items those remain in the virtual node (except for the new (inserted) one) */ + for (new_num = 0; new_num < vn->vn_nr_item; new_num++) { + int j; + struct virtual_item *vi = vn->vn_vi + new_num; + int is_affected = + ((new_num != vn->vn_affected_item_num) ? 
0 : 1); + + if (is_affected && vn->vn_mode == M_INSERT) + continue; + + /* get item number in source node */ + j = old_item_num(new_num, vn->vn_affected_item_num, + vn->vn_mode); + + vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE; + vi->vi_ih = ih + j; + vi->vi_item = B_I_PITEM(Sh, ih + j); + vi->vi_uarea = vn->vn_free_ptr; + + // FIXME: there is no check, that item operation did not + // consume too much memory + vn->vn_free_ptr += + op_create_vi(vn, vi, is_affected, tb->insert_size[0]); + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) + reiserfs_panic(tb->tb_sb, + "vs-8030: create_virtual_node: " + "virtual node space consumed"); + + if (!is_affected) + /* this is not being changed */ + continue; + + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; + vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + } } + + /* virtual inserted item is not defined yet */ + if (vn->vn_mode == M_INSERT) { + struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num; + + RFALSE(vn->vn_ins_ih == 0, + "vs-8040: item header of inserted item is not specified"); + vi->vi_item_len = tb->insert_size[0]; + vi->vi_ih = vn->vn_ins_ih; + vi->vi_item = vn->vn_data; + vi->vi_uarea = vn->vn_free_ptr; + + op_create_vi(vn, vi, 0 /*not pasted or cut */ , + tb->insert_size[0]); + } + + /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ + if (tb->CFR[0]) { + struct reiserfs_key *key; + + key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]); + if (op_is_left_mergeable(key, Sh->b_size) + && (vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) + vn->vn_vi[vn->vn_nr_item - 1].vi_type |= + VI_TYPE_RIGHT_MERGEABLE; + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable(key, Sh->b_size) && + !(vn->vn_mode != M_DELETE + || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) { + /* we delete last item and it could be merged with right neighbor's 
first item */ + if (! + (B_NR_ITEMS(Sh) == 1 + && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0)) + && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) { + /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ + print_block(Sh, 0, -1, -1); + reiserfs_panic(tb->tb_sb, + "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", + key, vn->vn_affected_item_num, + vn->vn_mode, M_DELETE); + } else + /* we can delete directory item, that has only one directory entry in it */ + ; + } #endif - - } -} + } +} /* using virtual node check, how many items can be shifted to left neighbor */ -static void check_left (struct tree_balance * tb, int h, int cur_free) +static void check_left(struct tree_balance *tb, int h, int cur_free) { - int i; - struct virtual_node * vn = tb->tb_vn; - struct virtual_item * vi; - int d_size, ih_size; + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; - RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); + RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free); - /* internal level */ - if (h > 0) { - tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); - return; - } + /* internal level */ + if (h > 0) { + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } - /* leaf level */ + /* leaf level */ - if (!cur_free || !vn->vn_nr_item) { - /* no free space or nothing to move */ - tb->lnum[h] = 0; - tb->lbytes = -1; - return; - } + if (!cur_free || !vn->vn_nr_item) { + /* no free space or nothing to move */ + tb->lnum[h] = 0; + tb->lbytes = -1; + return; + } - RFALSE( !PATH_H_PPARENT (tb->tb_path, 0), - "vs-8055: parent does not exist or invalid"); + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8055: parent does not exist or invalid"); - vi = vn->vn_vi; - if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? 
IH_SIZE : 0))) { - /* all contents of S[0] fits into L[0] */ + vi = vn->vn_vi; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) { + /* all contents of S[0] fits into L[0] */ - RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, - "vs-8055: invalid mode or balance condition failed"); + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8055: invalid mode or balance condition failed"); - tb->lnum[0] = vn->vn_nr_item; - tb->lbytes = -1; - return; - } - - - d_size = 0, ih_size = IH_SIZE; - - /* first item may be merge with last item in left neighbor */ - if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) - d_size = -((int)IH_SIZE), ih_size = 0; - - tb->lnum[0] = 0; - for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) { - d_size += vi->vi_item_len; - if (cur_free >= d_size) { - /* the item can be shifted entirely */ - cur_free -= d_size; - tb->lnum[0] ++; - continue; + tb->lnum[0] = vn->vn_nr_item; + tb->lbytes = -1; + return; } - - /* the item cannot be shifted entirely, try to split it */ - /* check whether L[0] can hold ih and at least one byte of the item body */ - if (cur_free <= ih_size) { - /* cannot shift even a part of the current item */ - tb->lbytes = -1; - return; + + d_size = 0, ih_size = IH_SIZE; + + /* first item may be merge with last item in left neighbor */ + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) + d_size = -((int)IH_SIZE), ih_size = 0; + + tb->lnum[0] = 0; + for (i = 0; i < vn->vn_nr_item; + i++, ih_size = IH_SIZE, d_size = 0, vi++) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->lnum[0]++; + continue; + } + + /* the item cannot be shifted entirely, try to split it */ + /* check whether L[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { + /* cannot shift even a part of the current item */ + tb->lbytes = -1; + return; + } + cur_free -= 
ih_size; + + tb->lbytes = op_check_left(vi, cur_free, 0, 0); + if (tb->lbytes != -1) + /* count partially shifted item */ + tb->lnum[0]++; + + break; } - cur_free -= ih_size; - - tb->lbytes = op_check_left (vi, cur_free, 0, 0); - if (tb->lbytes != -1) - /* count partially shifted item */ - tb->lnum[0] ++; - - break; - } - - return; -} + return; +} /* using virtual node check, how many items can be shifted to right neighbor */ -static void check_right (struct tree_balance * tb, int h, int cur_free) +static void check_right(struct tree_balance *tb, int h, int cur_free) { - int i; - struct virtual_node * vn = tb->tb_vn; - struct virtual_item * vi; - int d_size, ih_size; - - RFALSE( cur_free < 0, "vs-8070: cur_free < 0"); - - /* internal level */ - if (h > 0) { - tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); - return; - } - - /* leaf level */ - - if (!cur_free || !vn->vn_nr_item) { - /* no free space */ - tb->rnum[h] = 0; - tb->rbytes = -1; - return; - } - - RFALSE( !PATH_H_PPARENT (tb->tb_path, 0), - "vs-8075: parent does not exist or invalid"); - - vi = vn->vn_vi + vn->vn_nr_item - 1; - if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? 
IH_SIZE : 0))) { - /* all contents of S[0] fits into R[0] */ - - RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, - "vs-8080: invalid mode or balance condition failed"); - - tb->rnum[h] = vn->vn_nr_item; - tb->rbytes = -1; - return; - } - - d_size = 0, ih_size = IH_SIZE; - - /* last item may be merge with first item in right neighbor */ - if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) - d_size = -(int)IH_SIZE, ih_size = 0; - - tb->rnum[0] = 0; - for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) { - d_size += vi->vi_item_len; - if (cur_free >= d_size) { - /* the item can be shifted entirely */ - cur_free -= d_size; - tb->rnum[0] ++; - continue; + int i; + struct virtual_node *vn = tb->tb_vn; + struct virtual_item *vi; + int d_size, ih_size; + + RFALSE(cur_free < 0, "vs-8070: cur_free < 0"); + + /* internal level */ + if (h > 0) { + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; } - - /* check whether R[0] can hold ih and at least one byte of the item body */ - if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */ - tb->rbytes = -1; - return; + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space */ + tb->rnum[h] = 0; + tb->rbytes = -1; + return; } - - /* R[0] can hold the header of the item and at least one byte of its body */ - cur_free -= ih_size; /* cur_free is still > 0 */ - - tb->rbytes = op_check_right (vi, cur_free); - if (tb->rbytes != -1) - /* count partially shifted item */ - tb->rnum[0] ++; - - break; - } - - return; -} + RFALSE(!PATH_H_PPARENT(tb->tb_path, 0), + "vs-8075: parent does not exist or invalid"); + + vi = vn->vn_vi + vn->vn_nr_item - 1; + if ((unsigned int)cur_free >= + (vn->vn_size - + ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? 
IH_SIZE : 0))) { + /* all contents of S[0] fits into R[0] */ + + RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE, + "vs-8080: invalid mode or balance condition failed"); + + tb->rnum[h] = vn->vn_nr_item; + tb->rbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* last item may be merge with first item in right neighbor */ + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) + d_size = -(int)IH_SIZE, ih_size = 0; + + tb->rnum[0] = 0; + for (i = vn->vn_nr_item - 1; i >= 0; + i--, d_size = 0, ih_size = IH_SIZE, vi--) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->rnum[0]++; + continue; + } + + /* check whether R[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { /* cannot shift even a part of the current item */ + tb->rbytes = -1; + return; + } + + /* R[0] can hold the header of the item and at least one byte of its body */ + cur_free -= ih_size; /* cur_free is still > 0 */ + + tb->rbytes = op_check_right(vi, cur_free); + if (tb->rbytes != -1) + /* count partially shifted item */ + tb->rnum[0]++; + + break; + } + + return; +} /* * from - number of items, which are shifted to left neighbor entirely * to - number of item, which are shifted to right neighbor entirely * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ -static int get_num_ver (int mode, struct tree_balance * tb, int h, - int from, int from_bytes, - int to, int to_bytes, - short * snum012, int flow - ) +static int get_num_ver(int mode, struct tree_balance *tb, int h, + int from, int from_bytes, + int to, int to_bytes, short *snum012, int flow) { - int i; - int cur_free; - // int bytes; - int units; - struct virtual_node * vn = tb->tb_vn; - // struct virtual_item * vi; - - int total_node_size, max_node_size, 
current_item_size; - int needed_nodes; - int start_item, /* position of item we start filling node from */ - end_item, /* position of item we finish filling node by */ - start_bytes,/* number of first bytes (entries for directory) of start_item-th item - we do not include into node that is being filled */ - end_bytes; /* number of last bytes (entries for directory) of end_item-th item - we do node include into node that is being filled */ - int split_item_positions[2]; /* these are positions in virtual item of - items, that are split between S[0] and - S1new and S1new and S2new */ - - split_item_positions[0] = -1; - split_item_positions[1] = -1; - - /* We only create additional nodes if we are in insert or paste mode - or we are in replace mode at the internal level. If h is 0 and - the mode is M_REPLACE then in fix_nodes we change the mode to - paste or insert before we get here in the code. */ - RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), - "vs-8100: insert_size < 0 in overflow"); - - max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h)); - - /* snum012 [0-2] - number of items, that lay - to S[0], first new node and second new node */ - snum012[3] = -1; /* s1bytes */ - snum012[4] = -1; /* s2bytes */ - - /* internal level */ - if (h > 0) { - i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); - if (i == max_node_size) - return 1; - return (i / max_node_size + 1); - } - - /* leaf level */ - needed_nodes = 1; - total_node_size = 0; - cur_free = max_node_size; - - // start from 'from'-th item - start_item = from; - // skip its first 'start_bytes' units - start_bytes = ((from_bytes != -1) ? from_bytes : 0); - - // last included item is the 'end_item'-th one - end_item = vn->vn_nr_item - to - 1; - // do not count last 'end_bytes' units of 'end_item'-th item - end_bytes = (to_bytes != -1) ? to_bytes : 0; - - /* go through all item beginning from the start_item-th item and ending by - the end_item-th item. 
Do not count first 'start_bytes' units of - 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ - - for (i = start_item; i <= end_item; i ++) { - struct virtual_item * vi = vn->vn_vi + i; - int skip_from_end = ((i == end_item) ? end_bytes : 0); - - RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed"); - - /* get size of current item */ - current_item_size = vi->vi_item_len; - - /* do not take in calculation head part (from_bytes) of from-th item */ - current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes); - - /* do not take in calculation tail part of last item */ - current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end); - - /* if item fits into current node entierly */ - if (total_node_size + current_item_size <= max_node_size) { - snum012[needed_nodes - 1] ++; - total_node_size += current_item_size; - start_bytes = 0; - continue; + int i; + int cur_free; + // int bytes; + int units; + struct virtual_node *vn = tb->tb_vn; + // struct virtual_item * vi; + + int total_node_size, max_node_size, current_item_size; + int needed_nodes; + int start_item, /* position of item we start filling node from */ + end_item, /* position of item we finish filling node by */ + start_bytes, /* number of first bytes (entries for directory) of start_item-th item + we do not include into node that is being filled */ + end_bytes; /* number of last bytes (entries for directory) of end_item-th item + we do node include into node that is being filled */ + int split_item_positions[2]; /* these are positions in virtual item of + items, that are split between S[0] and + S1new and S1new and S2new */ + + split_item_positions[0] = -1; + split_item_positions[1] = -1; + + /* We only create additional nodes if we are in insert or paste mode + or we are in replace mode at the internal level. If h is 0 and + the mode is M_REPLACE then in fix_nodes we change the mode to + paste or insert before we get here in the code. 
*/ + RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE), + "vs-8100: insert_size < 0 in overflow"); + + max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h)); + + /* snum012 [0-2] - number of items, that lay + to S[0], first new node and second new node */ + snum012[3] = -1; /* s1bytes */ + snum012[4] = -1; /* s2bytes */ + + /* internal level */ + if (h > 0) { + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); + if (i == max_node_size) + return 1; + return (i / max_node_size + 1); } - if (current_item_size > max_node_size) { - /* virtual item length is longer, than max size of item in - a node. It is impossible for direct item */ - RFALSE( is_direct_le_ih (vi->vi_ih), - "vs-8110: " - "direct item length is %d. It can not be longer than %d", - current_item_size, max_node_size); - /* we will try to split it */ - flow = 1; + /* leaf level */ + needed_nodes = 1; + total_node_size = 0; + cur_free = max_node_size; + + // start from 'from'-th item + start_item = from; + // skip its first 'start_bytes' units + start_bytes = ((from_bytes != -1) ? from_bytes : 0); + + // last included item is the 'end_item'-th one + end_item = vn->vn_nr_item - to - 1; + // do not count last 'end_bytes' units of 'end_item'-th item + end_bytes = (to_bytes != -1) ? to_bytes : 0; + + /* go through all item beginning from the start_item-th item and ending by + the end_item-th item. Do not count first 'start_bytes' units of + 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ + + for (i = start_item; i <= end_item; i++) { + struct virtual_item *vi = vn->vn_vi + i; + int skip_from_end = ((i == end_item) ? 
end_bytes : 0); + + RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed"); + + /* get size of current item */ + current_item_size = vi->vi_item_len; + + /* do not take in calculation head part (from_bytes) of from-th item */ + current_item_size -= + op_part_size(vi, 0 /*from start */ , start_bytes); + + /* do not take in calculation tail part of last item */ + current_item_size -= + op_part_size(vi, 1 /*from end */ , skip_from_end); + + /* if item fits into current node entierly */ + if (total_node_size + current_item_size <= max_node_size) { + snum012[needed_nodes - 1]++; + total_node_size += current_item_size; + start_bytes = 0; + continue; + } + + if (current_item_size > max_node_size) { + /* virtual item length is longer, than max size of item in + a node. It is impossible for direct item */ + RFALSE(is_direct_le_ih(vi->vi_ih), + "vs-8110: " + "direct item length is %d. It can not be longer than %d", + current_item_size, max_node_size); + /* we will try to split it */ + flow = 1; + } + + if (!flow) { + /* as we do not split items, take new node and continue */ + needed_nodes++; + i--; + total_node_size = 0; + continue; + } + // calculate number of item units which fit into node being + // filled + { + int free_space; + + free_space = max_node_size - total_node_size - IH_SIZE; + units = + op_check_left(vi, free_space, start_bytes, + skip_from_end); + if (units == -1) { + /* nothing fits into current node, take new node and continue */ + needed_nodes++, i--, total_node_size = 0; + continue; + } + } + + /* something fits into the current node */ + //if (snum012[3] != -1 || needed_nodes != 1) + // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); + //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; + start_bytes += units; + snum012[needed_nodes - 1 + 3] = units; + + if (needed_nodes > 2) + reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: " + "split_item_position is out of boundary"); + 
snum012[needed_nodes - 1]++; + split_item_positions[needed_nodes - 1] = i; + needed_nodes++; + /* continue from the same item with start_bytes != -1 */ + start_item = i; + i--; + total_node_size = 0; } - if (!flow) { - /* as we do not split items, take new node and continue */ - needed_nodes ++; i --; total_node_size = 0; - continue; + // sum012[4] (if it is not -1) contains number of units of which + // are to be in S1new, snum012[3] - to be in S0. They are supposed + // to be S1bytes and S2bytes correspondingly, so recalculate + if (snum012[4] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S1new; + + split_item_num = split_item_positions[1]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S1new = + ((split_item_positions[0] == + split_item_positions[1]) ? snum012[3] : 0); + + // s2bytes + snum012[4] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] - + bytes_to_r - bytes_to_l - bytes_to_S1new; + + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && + vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) + reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not " + "directory or indirect item"); } - // calculate number of item units which fit into node being - // filled - { - int free_space; - - free_space = max_node_size - total_node_size - IH_SIZE; - units = op_check_left (vi, free_space, start_bytes, skip_from_end); - if (units == -1) { - /* nothing fits into current node, take new node and continue */ - needed_nodes ++, i--, total_node_size = 0; - continue; - } + /* now we know S2bytes, calculate S1bytes */ + if (snum012[3] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S2new; + + split_item_num = split_item_positions[0]; + bytes_to_l = + ((from == split_item_num + && from_bytes != -1) ? 
from_bytes : 0); + bytes_to_r = + ((end_item == split_item_num + && end_bytes != -1) ? end_bytes : 0); + bytes_to_S2new = + ((split_item_positions[0] == split_item_positions[1] + && snum012[4] != -1) ? snum012[4] : 0); + + // s1bytes + snum012[3] = + op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] - + bytes_to_r - bytes_to_l - bytes_to_S2new; } - /* something fits into the current node */ - //if (snum012[3] != -1 || needed_nodes != 1) - // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); - //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; - start_bytes += units; - snum012[needed_nodes - 1 + 3] = units; - - if (needed_nodes > 2) - reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: " - "split_item_position is out of boundary"); - snum012[needed_nodes - 1] ++; - split_item_positions[needed_nodes - 1] = i; - needed_nodes ++; - /* continue from the same item with start_bytes != -1 */ - start_item = i; - i --; - total_node_size = 0; - } - - // sum012[4] (if it is not -1) contains number of units of which - // are to be in S1new, snum012[3] - to be in S0. They are supposed - // to be S1bytes and S2bytes correspondingly, so recalculate - if (snum012[4] > 0) { - int split_item_num; - int bytes_to_r, bytes_to_l; - int bytes_to_S1new; - - split_item_num = split_item_positions[1]; - bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); - bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); - bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? 
snum012[3] : 0); - - // s2bytes - snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new; - - if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY && - vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT) - reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not " - "directory or indirect item"); - } - - /* now we know S2bytes, calculate S1bytes */ - if (snum012[3] > 0) { - int split_item_num; - int bytes_to_r, bytes_to_l; - int bytes_to_S2new; - - split_item_num = split_item_positions[0]; - bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); - bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); - bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0); - - // s1bytes - snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new; - } - - return needed_nodes; + return needed_nodes; } - #ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance * cur_tb; +extern struct tree_balance *cur_tb; #endif - /* Set parameters for balancing. * Performs write of results of analysis of balancing into structure tb, * where it will later be used by the functions that actually do the balancing. 
@@ -557,131 +587,130 @@ extern struct tree_balance * cur_tb; * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) */ -static void set_parameters (struct tree_balance * tb, int h, int lnum, - int rnum, int blk_num, short * s012, int lb, int rb) +static void set_parameters(struct tree_balance *tb, int h, int lnum, + int rnum, int blk_num, short *s012, int lb, int rb) { - tb->lnum[h] = lnum; - tb->rnum[h] = rnum; - tb->blknum[h] = blk_num; + tb->lnum[h] = lnum; + tb->rnum[h] = rnum; + tb->blknum[h] = blk_num; - if (h == 0) - { /* only for leaf level */ - if (s012 != NULL) - { - tb->s0num = * s012 ++, - tb->s1num = * s012 ++, - tb->s2num = * s012 ++; - tb->s1bytes = * s012 ++; - tb->s2bytes = * s012; + if (h == 0) { /* only for leaf level */ + if (s012 != NULL) { + tb->s0num = *s012++, + tb->s1num = *s012++, tb->s2num = *s012++; + tb->s1bytes = *s012++; + tb->s2bytes = *s012; + } + tb->lbytes = lb; + tb->rbytes = rb; } - tb->lbytes = lb; - tb->rbytes = rb; - } - PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum ); - PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum ); - - PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb ); - PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb ); -} - + PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum); + PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum); + PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb); + PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb); +} /* check, does node disappear if we shift tb->lnum[0] items to left neighbor and tb->rnum[0] to the right one. */ -static int is_leaf_removable (struct tree_balance * tb) +static int is_leaf_removable(struct tree_balance *tb) { - struct virtual_node * vn = tb->tb_vn; - int to_left, to_right; - int size; - int remain_items; - - /* number of items, that will be shifted to left (right) neighbor - entirely */ - to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); - to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 
1 : 0); - remain_items = vn->vn_nr_item; - - /* how many items remain in S[0] after shiftings to neighbors */ - remain_items -= (to_left + to_right); - - if (remain_items < 1) { - /* all content of node can be shifted to neighbors */ - set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1); - return 1; - } - - if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) - /* S[0] is not removable */ - return 0; - - /* check, whether we can divide 1 remaining item between neighbors */ - - /* get size of remaining item (in item units) */ - size = op_unit_num (&(vn->vn_vi[to_left])); - - if (tb->lbytes + tb->rbytes >= size) { - set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1); - return 1; - } - - return 0; -} + struct virtual_node *vn = tb->tb_vn; + int to_left, to_right; + int size; + int remain_items; + + /* number of items, that will be shifted to left (right) neighbor + entirely */ + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 
1 : 0); + remain_items = vn->vn_nr_item; + + /* how many items remain in S[0] after shiftings to neighbors */ + remain_items -= (to_left + to_right); + + if (remain_items < 1) { + /* all content of node can be shifted to neighbors */ + set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0, + NULL, -1, -1); + return 1; + } + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) + /* S[0] is not removable */ + return 0; + + /* check, whether we can divide 1 remaining item between neighbors */ + + /* get size of remaining item (in item units) */ + size = op_unit_num(&(vn->vn_vi[to_left])); + + if (tb->lbytes + tb->rbytes >= size) { + set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL, + tb->lbytes, -1); + return 1; + } + + return 0; +} /* check whether L, S, R can be joined in one node */ -static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree) +static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree) { - struct virtual_node * vn = tb->tb_vn; - int ih_size; - struct buffer_head *S0; - - S0 = PATH_H_PBUFFER (tb->tb_path, 0); - - ih_size = 0; - if (vn->vn_nr_item) { - if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) - ih_size += IH_SIZE; - - if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE) - ih_size += IH_SIZE; - } else { - /* there was only one item and it will be deleted */ - struct item_head * ih; - - RFALSE( B_NR_ITEMS (S0) != 1, - "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0)); - - ih = B_N_PITEM_HEAD (S0, 0); - if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]))) - if (is_direntry_le_ih (ih)) { - /* Directory must be in correct state here: that is - somewhere at the left side should exist first directory - item. But the item being deleted can not be that first - one because its right neighbor is item of the same - directory. (But first item always gets deleted in last - turn). 
So, neighbors of deleted item can be merged, so - we can save ih_size */ - ih_size = IH_SIZE; - - /* we might check that left neighbor exists and is of the - same directory */ - RFALSE(le_ih_k_offset (ih) == DOT_OFFSET, - "vs-8130: first directory item can not be removed until directory is not empty"); - } - - } - - if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) { - set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1); - PROC_INFO_INC( tb -> tb_sb, leaves_removable ); - return 1; - } - return 0; - -} + struct virtual_node *vn = tb->tb_vn; + int ih_size; + struct buffer_head *S0; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + + ih_size = 0; + if (vn->vn_nr_item) { + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) + ih_size += IH_SIZE; + + if (vn->vn_vi[vn->vn_nr_item - 1]. + vi_type & VI_TYPE_RIGHT_MERGEABLE) + ih_size += IH_SIZE; + } else { + /* there was only one item and it will be deleted */ + struct item_head *ih; + + RFALSE(B_NR_ITEMS(S0) != 1, + "vs-8125: item number must be 1: it is %d", + B_NR_ITEMS(S0)); + + ih = B_N_PITEM_HEAD(S0, 0); + if (tb->CFR[0] + && !comp_short_le_keys(&(ih->ih_key), + B_N_PDELIM_KEY(tb->CFR[0], + tb->rkey[0]))) + if (is_direntry_le_ih(ih)) { + /* Directory must be in correct state here: that is + somewhere at the left side should exist first directory + item. But the item being deleted can not be that first + one because its right neighbor is item of the same + directory. (But first item always gets deleted in last + turn). 
So, neighbors of deleted item can be merged, so + we can save ih_size */ + ih_size = IH_SIZE; + + /* we might check that left neighbor exists and is of the + same directory */ + RFALSE(le_ih_k_offset(ih) == DOT_OFFSET, + "vs-8130: first directory item can not be removed until directory is not empty"); + } + } + + if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) { + set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1); + PROC_INFO_INC(tb->tb_sb, leaves_removable); + return 1; + } + return 0; +} /* when we do not split item, lnum and rnum are numbers of entire items */ #define SET_PAR_SHIFT_LEFT \ @@ -704,7 +733,6 @@ else \ -1, -1);\ } - #define SET_PAR_SHIFT_RIGHT \ if (h)\ {\ @@ -724,214 +752,199 @@ else \ -1, -1);\ } - -static void free_buffers_in_tb ( - struct tree_balance * p_s_tb - ) { - int n_counter; - - decrement_counters_in_path(p_s_tb->tb_path); - - for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) { - decrement_bcount(p_s_tb->L[n_counter]); - p_s_tb->L[n_counter] = NULL; - decrement_bcount(p_s_tb->R[n_counter]); - p_s_tb->R[n_counter] = NULL; - decrement_bcount(p_s_tb->FL[n_counter]); - p_s_tb->FL[n_counter] = NULL; - decrement_bcount(p_s_tb->FR[n_counter]); - p_s_tb->FR[n_counter] = NULL; - decrement_bcount(p_s_tb->CFL[n_counter]); - p_s_tb->CFL[n_counter] = NULL; - decrement_bcount(p_s_tb->CFR[n_counter]); - p_s_tb->CFR[n_counter] = NULL; - } +static void free_buffers_in_tb(struct tree_balance *p_s_tb) +{ + int n_counter; + + decrement_counters_in_path(p_s_tb->tb_path); + + for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) { + decrement_bcount(p_s_tb->L[n_counter]); + p_s_tb->L[n_counter] = NULL; + decrement_bcount(p_s_tb->R[n_counter]); + p_s_tb->R[n_counter] = NULL; + decrement_bcount(p_s_tb->FL[n_counter]); + p_s_tb->FL[n_counter] = NULL; + decrement_bcount(p_s_tb->FR[n_counter]); + p_s_tb->FR[n_counter] = NULL; + decrement_bcount(p_s_tb->CFL[n_counter]); + p_s_tb->CFL[n_counter] = NULL; + 
decrement_bcount(p_s_tb->CFR[n_counter]); + p_s_tb->CFR[n_counter] = NULL; + } } - /* Get new buffers for storing new nodes that are created while balancing. * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; * NO_DISK_SPACE - no disk space. */ /* The function is NOT SCHEDULE-SAFE! */ -static int get_empty_nodes( - struct tree_balance * p_s_tb, - int n_h - ) { - struct buffer_head * p_s_new_bh, - * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h); - b_blocknr_t * p_n_blocknr, - a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, }; - int n_counter, - n_number_of_freeblk, - n_amount_needed,/* number of needed empty blocks */ - n_retval = CARRY_ON; - struct super_block * p_s_sb = p_s_tb->tb_sb; - - - /* number_of_freeblk is the number of empty blocks which have been - acquired for use by the balancing algorithm minus the number of - empty blocks used in the previous levels of the analysis, - number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs - after empty blocks are acquired, and the balancing analysis is - then restarted, amount_needed is the number needed by this level - (n_h) of the balancing analysis. - - Note that for systems with many processes writing, it would be - more layout optimal to calculate the total number needed by all - levels and then to run reiserfs_new_blocks to get all of them at once. */ - - /* Initiate number_of_freeblk to the amount acquired prior to the restart of - the analysis or 0 if not restarted, then subtract the amount needed - by all of the levels of the tree below n_h. */ - /* blknum includes S[n_h], so we subtract 1 in this calculation */ - for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ ) - n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0; - - /* Allocate missing empty blocks. 
*/ - /* if p_s_Sh == 0 then we are getting a new root */ - n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1; - /* Amount_needed = the amount that we need more than the amount that we have. */ - if ( n_amount_needed > n_number_of_freeblk ) - n_amount_needed -= n_number_of_freeblk; - else /* If we have enough already then there is nothing to do. */ - return CARRY_ON; - - /* No need to check quota - is not allocated for blocks used for formatted nodes */ - if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs, - n_amount_needed) == NO_DISK_SPACE) - return NO_DISK_SPACE; - - /* for each blocknumber we just got, get a buffer and stick it on FEB */ - for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed; - p_n_blocknr++, n_counter++ ) { - - RFALSE( ! *p_n_blocknr, - "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); - - p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr); - RFALSE (buffer_dirty (p_s_new_bh) || - buffer_journaled (p_s_new_bh) || - buffer_journal_dirty (p_s_new_bh), - "PAP-8140: journlaled or dirty buffer %b for the new block", - p_s_new_bh); - - /* Put empty buffers into the array. 
*/ - RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum], - "PAP-8141: busy slot for new buffer"); - - set_buffer_journal_new (p_s_new_bh); - p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh; - } - - if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) ) - n_retval = REPEAT_SEARCH ; - - return n_retval; -} +static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h) +{ + struct buffer_head *p_s_new_bh, + *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h); + b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, }; + int n_counter, n_number_of_freeblk, n_amount_needed, /* number of needed empty blocks */ + n_retval = CARRY_ON; + struct super_block *p_s_sb = p_s_tb->tb_sb; + + /* number_of_freeblk is the number of empty blocks which have been + acquired for use by the balancing algorithm minus the number of + empty blocks used in the previous levels of the analysis, + number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs + after empty blocks are acquired, and the balancing analysis is + then restarted, amount_needed is the number needed by this level + (n_h) of the balancing analysis. + + Note that for systems with many processes writing, it would be + more layout optimal to calculate the total number needed by all + levels and then to run reiserfs_new_blocks to get all of them at once. */ + + /* Initiate number_of_freeblk to the amount acquired prior to the restart of + the analysis or 0 if not restarted, then subtract the amount needed + by all of the levels of the tree below n_h. */ + /* blknum includes S[n_h], so we subtract 1 in this calculation */ + for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; + n_counter < n_h; n_counter++) + n_number_of_freeblk -= + (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] - + 1) : 0; + + /* Allocate missing empty blocks. */ + /* if p_s_Sh == 0 then we are getting a new root */ + n_amount_needed = (p_s_Sh) ? 
(p_s_tb->blknum[n_h] - 1) : 1; + /* Amount_needed = the amount that we need more than the amount that we have. */ + if (n_amount_needed > n_number_of_freeblk) + n_amount_needed -= n_number_of_freeblk; + else /* If we have enough already then there is nothing to do. */ + return CARRY_ON; + + /* No need to check quota - is not allocated for blocks used for formatted nodes */ + if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs, + n_amount_needed) == NO_DISK_SPACE) + return NO_DISK_SPACE; + + /* for each blocknumber we just got, get a buffer and stick it on FEB */ + for (p_n_blocknr = a_n_blocknrs, n_counter = 0; + n_counter < n_amount_needed; p_n_blocknr++, n_counter++) { + + RFALSE(!*p_n_blocknr, + "PAP-8135: reiserfs_new_blocknrs failed when got new blocks"); + + p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr); + RFALSE(buffer_dirty(p_s_new_bh) || + buffer_journaled(p_s_new_bh) || + buffer_journal_dirty(p_s_new_bh), + "PAP-8140: journlaled or dirty buffer %b for the new block", + p_s_new_bh); + + /* Put empty buffers into the array. */ + RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum], + "PAP-8141: busy slot for new buffer"); + + set_buffer_journal_new(p_s_new_bh); + p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh; + } + + if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb)) + n_retval = REPEAT_SEARCH; + return n_retval; +} /* Get free space of the left neighbor, which is stored in the parent * node of the left neighbor. 
*/ -static int get_lfree (struct tree_balance * tb, int h) +static int get_lfree(struct tree_balance *tb, int h) { - struct buffer_head * l, * f; - int order; + struct buffer_head *l, *f; + int order; - if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0) - return 0; + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0) + return 0; - if (f == l) - order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1; - else { - order = B_NR_ITEMS (l); - f = l; - } + if (f == l) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1; + else { + order = B_NR_ITEMS(l); + f = l; + } - return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order))); + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); } - /* Get free space of the right neighbor, * which is stored in the parent node of the right neighbor. */ -static int get_rfree (struct tree_balance * tb, int h) +static int get_rfree(struct tree_balance *tb, int h) { - struct buffer_head * r, * f; - int order; + struct buffer_head *r, *f; + int order; - if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0) - return 0; + if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0) + return 0; - if (f == r) - order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1; - else { - order = 0; - f = r; - } + if (f == r) + order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1; + else { + order = 0; + f = r; + } - return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order))); + return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order))); } - /* Check whether left neighbor is in memory. */ -static int is_left_neighbor_in_cache( - struct tree_balance * p_s_tb, - int n_h - ) { - struct buffer_head * p_s_father, * left; - struct super_block * p_s_sb = p_s_tb->tb_sb; - b_blocknr_t n_left_neighbor_blocknr; - int n_left_neighbor_position; - - if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */ - return 0; - - /* Calculate father of the node to be balanced. 
*/ - p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1); - - RFALSE( ! p_s_father || - ! B_IS_IN_TREE (p_s_father) || - ! B_IS_IN_TREE (p_s_tb->FL[n_h]) || - ! buffer_uptodate (p_s_father) || - ! buffer_uptodate (p_s_tb->FL[n_h]), - "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", - p_s_father, p_s_tb->FL[n_h]); - - - /* Get position of the pointer to the left neighbor into the left father. */ - n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ? - p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); - /* Get left neighbor block number. */ - n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); - /* Look for the left neighbor in the cache. */ - if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) { - - RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left), - "vs-8170: left neighbor (%b %z) is not in the tree", left, left); - put_bh(left) ; - return 1; - } - - return 0; -} +static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h) +{ + struct buffer_head *p_s_father, *left; + struct super_block *p_s_sb = p_s_tb->tb_sb; + b_blocknr_t n_left_neighbor_blocknr; + int n_left_neighbor_position; + + if (!p_s_tb->FL[n_h]) /* Father of the left neighbor does not exist. */ + return 0; + + /* Calculate father of the node to be balanced. */ + p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1); + + RFALSE(!p_s_father || + !B_IS_IN_TREE(p_s_father) || + !B_IS_IN_TREE(p_s_tb->FL[n_h]) || + !buffer_uptodate(p_s_father) || + !buffer_uptodate(p_s_tb->FL[n_h]), + "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", + p_s_father, p_s_tb->FL[n_h]); + + /* Get position of the pointer to the left neighbor into the left father. */ + n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ? + p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]); + /* Get left neighbor block number. */ + n_left_neighbor_blocknr = + B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); + /* Look for the left neighbor in the cache. 
*/ + if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) { + + RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left), + "vs-8170: left neighbor (%b %z) is not in the tree", + left, left); + put_bh(left); + return 1; + } + return 0; +} #define LEFT_PARENTS 'l' #define RIGHT_PARENTS 'r' - -static void decrement_key (struct cpu_key * p_s_key) +static void decrement_key(struct cpu_key *p_s_key) { - // call item specific function for this key - item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key); + // call item specific function for this key + item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key); } - - - /* Calculate far left/right parent of the left/right neighbor of the current node, that * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. * Calculate left/right common parent of the current node and L[h]/R[h]. @@ -940,111 +953,121 @@ static void decrement_key (struct cpu_key * p_s_key) SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; */ -static int get_far_parent (struct tree_balance * p_s_tb, - int n_h, - struct buffer_head ** pp_s_father, - struct buffer_head ** pp_s_com_father, - char c_lr_par) +static int get_far_parent(struct tree_balance *p_s_tb, + int n_h, + struct buffer_head **pp_s_father, + struct buffer_head **pp_s_com_father, char c_lr_par) { - struct buffer_head * p_s_parent; - INITIALIZE_PATH (s_path_to_neighbor_father); - struct path * p_s_path = p_s_tb->tb_path; - struct cpu_key s_lr_father_key; - int n_counter, - n_position = INT_MAX, - n_first_last_position = 0, - n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h); - - /* Starting from F[n_h] go upwards in the tree, and look for the common - ancestor of F[n_h], and its neighbor l/r, that should be obtained. 
*/ - - n_counter = n_path_offset; - - RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET, - "PAP-8180: invalid path length"); - - - for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) { - /* Check whether parent of the current buffer in the path is really parent in the tree. */ - if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) ) - return REPEAT_SEARCH; - /* Check whether position in the parent is correct. */ - if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) ) - return REPEAT_SEARCH; - /* Check whether parent at the path really points to the child. */ - if ( B_N_CHILD_NUM(p_s_parent, n_position) != - PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr ) - return REPEAT_SEARCH; - /* Return delimiting key if position in the parent is not equal to first/last one. */ - if ( c_lr_par == RIGHT_PARENTS ) - n_first_last_position = B_NR_ITEMS (p_s_parent); - if ( n_position != n_first_last_position ) { - *pp_s_com_father = p_s_parent; - get_bh(*pp_s_com_father) ; - /*(*pp_s_com_father = p_s_parent)->b_count++;*/ - break; + struct buffer_head *p_s_parent; + INITIALIZE_PATH(s_path_to_neighbor_father); + struct path *p_s_path = p_s_tb->tb_path; + struct cpu_key s_lr_father_key; + int n_counter, + n_position = INT_MAX, + n_first_last_position = 0, + n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h); + + /* Starting from F[n_h] go upwards in the tree, and look for the common + ancestor of F[n_h], and its neighbor l/r, that should be obtained. */ + + n_counter = n_path_offset; + + RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET, + "PAP-8180: invalid path length"); + + for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) { + /* Check whether parent of the current buffer in the path is really parent in the tree. */ + if (!B_IS_IN_TREE + (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1))) + return REPEAT_SEARCH; + /* Check whether position in the parent is correct. 
*/ + if ((n_position = + PATH_OFFSET_POSITION(p_s_path, + n_counter - 1)) > + B_NR_ITEMS(p_s_parent)) + return REPEAT_SEARCH; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr) + return REPEAT_SEARCH; + /* Return delimiting key if position in the parent is not equal to first/last one. */ + if (c_lr_par == RIGHT_PARENTS) + n_first_last_position = B_NR_ITEMS(p_s_parent); + if (n_position != n_first_last_position) { + *pp_s_com_father = p_s_parent; + get_bh(*pp_s_com_father); + /*(*pp_s_com_father = p_s_parent)->b_count++; */ + break; + } } - } - - /* if we are in the root of the tree, then there is no common father */ - if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) { - /* Check whether first buffer in the path is the root of the tree. */ - if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == - SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { - *pp_s_father = *pp_s_com_father = NULL; - return CARRY_ON; + + /* if we are in the root of the tree, then there is no common father */ + if (n_counter == FIRST_PATH_ELEMENT_OFFSET) { + /* Check whether first buffer in the path is the root of the tree. */ + if (PATH_OFFSET_PBUFFER + (p_s_tb->tb_path, + FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK(p_s_tb->tb_sb)) { + *pp_s_father = *pp_s_com_father = NULL; + return CARRY_ON; + } + return REPEAT_SEARCH; } - return REPEAT_SEARCH; - } - RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL, - "PAP-8185: (%b %z) level too small", - *pp_s_com_father, *pp_s_com_father); + RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL, + "PAP-8185: (%b %z) level too small", + *pp_s_com_father, *pp_s_com_father); - /* Check whether the common parent is locked. */ + /* Check whether the common parent is locked. 
*/ - if ( buffer_locked (*pp_s_com_father) ) { - __wait_on_buffer(*pp_s_com_father); - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { - decrement_bcount(*pp_s_com_father); - return REPEAT_SEARCH; + if (buffer_locked(*pp_s_com_father)) { + __wait_on_buffer(*pp_s_com_father); + if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + decrement_bcount(*pp_s_com_father); + return REPEAT_SEARCH; + } } - } - - /* So, we got common parent of the current node and its left/right neighbor. - Now we are geting the parent of the left/right neighbor. */ - /* Form key to get parent of the left/right neighbor. */ - le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ? - (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position))); + /* So, we got common parent of the current node and its left/right neighbor. + Now we are geting the parent of the left/right neighbor. */ + /* Form key to get parent of the left/right neighbor. */ + le_key2cpu_key(&s_lr_father_key, + B_N_PDELIM_KEY(*pp_s_com_father, + (c_lr_par == + LEFT_PARENTS) ? 
(p_s_tb->lkey[n_h - 1] = + n_position - + 1) : (p_s_tb->rkey[n_h - + 1] = + n_position))); - if ( c_lr_par == LEFT_PARENTS ) - decrement_key(&s_lr_father_key); + if (c_lr_par == LEFT_PARENTS) + decrement_key(&s_lr_father_key); - if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR) - // path is released - return IO_ERROR; + if (search_by_key + (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, + n_h + 1) == IO_ERROR) + // path is released + return IO_ERROR; - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { - decrement_counters_in_path(&s_path_to_neighbor_father); - decrement_bcount(*pp_s_com_father); - return REPEAT_SEARCH; - } + if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + decrement_counters_in_path(&s_path_to_neighbor_father); + decrement_bcount(*pp_s_com_father); + return REPEAT_SEARCH; + } - *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); - RFALSE( B_LEVEL (*pp_s_father) != n_h + 1, - "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father); - RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET, - "PAP-8192: path length is too small"); + RFALSE(B_LEVEL(*pp_s_father) != n_h + 1, + "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father); + RFALSE(s_path_to_neighbor_father.path_length < + FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small"); - s_path_to_neighbor_father.path_length--; - decrement_counters_in_path(&s_path_to_neighbor_father); - return CARRY_ON; + s_path_to_neighbor_father.path_length--; + decrement_counters_in_path(&s_path_to_neighbor_father); + return CARRY_ON; } - /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset], * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset]. 
@@ -1052,122 +1075,127 @@ static int get_far_parent (struct tree_balance * p_s_tb, * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; */ -static int get_parents (struct tree_balance * p_s_tb, int n_h) +static int get_parents(struct tree_balance *p_s_tb, int n_h) { - struct path * p_s_path = p_s_tb->tb_path; - int n_position, - n_ret_value, - n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); - struct buffer_head * p_s_curf, - * p_s_curcf; - - /* Current node is the root of the tree or will be root of the tree */ - if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { - /* The root can not have parents. - Release nodes which previously were obtained as parents of the current node neighbors. */ + struct path *p_s_path = p_s_tb->tb_path; + int n_position, + n_ret_value, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + struct buffer_head *p_s_curf, *p_s_curcf; + + /* Current node is the root of the tree or will be root of the tree */ + if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + /* The root can not have parents. + Release nodes which previously were obtained as parents of the current node neighbors. */ + decrement_bcount(p_s_tb->FL[n_h]); + decrement_bcount(p_s_tb->CFL[n_h]); + decrement_bcount(p_s_tb->FR[n_h]); + decrement_bcount(p_s_tb->CFR[n_h]); + p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = + p_s_tb->CFR[n_h] = NULL; + return CARRY_ON; + } + + /* Get parent FL[n_path_offset] of L[n_path_offset]. */ + if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) { + /* Current node is not the first child of its parent. 
*/ + /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */ + p_s_curf = p_s_curcf = + PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); + get_bh(p_s_curf); + get_bh(p_s_curf); + p_s_tb->lkey[n_h] = n_position - 1; + } else { + /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. + Calculate current common parent of L[n_path_offset] and the current node. Note that + CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. + Calculate lkey[n_path_offset]. */ + if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, + &p_s_curcf, + LEFT_PARENTS)) != CARRY_ON) + return n_ret_value; + } + decrement_bcount(p_s_tb->FL[n_h]); + p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ decrement_bcount(p_s_tb->CFL[n_h]); - decrement_bcount(p_s_tb->FR[n_h]); - decrement_bcount(p_s_tb->CFR[n_h]); - p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL; - return CARRY_ON; - } - - /* Get parent FL[n_path_offset] of L[n_path_offset]. */ - if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) { - /* Current node is not the first child of its parent. */ - /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ - p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); - get_bh(p_s_curf) ; - get_bh(p_s_curf) ; - p_s_tb->lkey[n_h] = n_position - 1; - } - else { - /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. - Calculate current common parent of L[n_path_offset] and the current node. Note that - CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. - Calculate lkey[n_path_offset]. 
*/ - if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, - &p_s_curcf, LEFT_PARENTS)) != CARRY_ON ) - return n_ret_value; - } - - decrement_bcount(p_s_tb->FL[n_h]); - p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ - decrement_bcount(p_s_tb->CFL[n_h]); - p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ - - RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) || - (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)), - "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf); + p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ + + RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || + (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), + "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf); /* Get parent FR[n_h] of R[n_h]. */ /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */ - if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) { + if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) { /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h]. Calculate current common parent of R[n_h] and current node. Note that CFR[n_h] not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */ - if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON ) - return n_ret_value; - } - else { + if ((n_ret_value = + get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, + RIGHT_PARENTS)) != CARRY_ON) + return n_ret_value; + } else { /* Current node is not the last child of its parent F[n_h]. */ - /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ - p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); - get_bh(p_s_curf) ; - get_bh(p_s_curf) ; - p_s_tb->rkey[n_h] = n_position; - } - - decrement_bcount(p_s_tb->FR[n_h]); - p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. 
*/ - - decrement_bcount(p_s_tb->CFR[n_h]); - p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */ - - RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) || - (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)), - "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf); - - return CARRY_ON; -} + /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */ + p_s_curf = p_s_curcf = + PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); + get_bh(p_s_curf); + get_bh(p_s_curf); + p_s_tb->rkey[n_h] = n_position; + } + decrement_bcount(p_s_tb->FR[n_h]); + p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */ + + decrement_bcount(p_s_tb->CFR[n_h]); + p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */ + + RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) || + (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)), + "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf); + + return CARRY_ON; +} /* it is possible to remove node as result of shiftings to neighbors even when we insert or paste item. */ -static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h) +static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree, + struct tree_balance *tb, int h) { - struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h); - int levbytes = tb->insert_size[h]; - struct item_head * ih; - struct reiserfs_key * r_key = NULL; - - ih = B_N_PITEM_HEAD (Sh, 0); - if ( tb->CFR[h] ) - r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]); - - if ( - lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes - /* shifting may merge items which might save space */ - - (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0) - - (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0) - + (( h ) ? 
KEY_SIZE : 0)) - { - /* node can not be removed */ - if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */ - if ( ! h ) - tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0); - set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; + struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h); + int levbytes = tb->insert_size[h]; + struct item_head *ih; + struct reiserfs_key *r_key = NULL; + + ih = B_N_PITEM_HEAD(Sh, 0); + if (tb->CFR[h]) + r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]); + + if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes + /* shifting may merge items which might save space */ + - + ((!h + && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0) + - + ((!h && r_key + && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0) + + ((h) ? KEY_SIZE : 0)) { + /* node can not be removed */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = + B_NR_ITEMS(Sh) + + ((mode == M_INSERT) ? 1 : 0); + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } } - } - PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] ); - return !NO_BALANCING_NEEDED; + PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]); + return !NO_BALANCING_NEEDED; } - - /* Check whether current node S[h] is balanced when increasing its size by * Inserting or Pasting. * Calculate parameters for balancing for current level h. @@ -1182,154 +1210,157 @@ static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree * -2 - no disk space. */ /* ip means Inserting or Pasting */ -static int ip_check_balance (struct tree_balance * tb, int h) +static int ip_check_balance(struct tree_balance *tb, int h) { - struct virtual_node * vn = tb->tb_vn; - int levbytes, /* Number of bytes that must be inserted into (value - is negative if bytes are deleted) buffer which - contains node being balanced. 
The mnemonic is - that the attempted change in node space used level - is levbytes bytes. */ - n_ret_value; - - int lfree, sfree, rfree /* free space in L, S and R */; - - /* nver is short for number of vertixes, and lnver is the number if - we shift to the left, rnver is the number if we shift to the - right, and lrnver is the number if we shift in both directions. - The goal is to minimize first the number of vertixes, and second, - the number of vertixes whose contents are changed by shifting, - and third the number of uncached vertixes whose contents are - changed by shifting and must be read from disk. */ - int nver, lnver, rnver, lrnver; - - /* used at leaf level only, S0 = S[0] is the node being balanced, - sInum [ I = 0,1,2 ] is the number of items that will - remain in node SI after balancing. S1 and S2 are new - nodes that might be created. */ - - /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. - where 4th parameter is s1bytes and 5th - s2bytes - */ - short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases - 0,1 - do not shift and do not shift but bottle - 2 - shift only whole item to left - 3 - shift to left and bottle as much as possible - 4,5 - shift to right (whole items and as much as possible - 6,7 - shift to both directions (whole items and as much as possible) - */ - - /* Sh is the node whose balance is currently being checked */ - struct buffer_head * Sh; - - Sh = PATH_H_PBUFFER (tb->tb_path, h); - levbytes = tb->insert_size[h]; - - /* Calculate balance parameters for creating new root. */ - if ( ! Sh ) { - if ( ! 
h ) - reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0"); - switch ( n_ret_value = get_empty_nodes (tb, h) ) { - case CARRY_ON: - set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ - - case NO_DISK_SPACE: - case REPEAT_SEARCH: - return n_ret_value; - default: - reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes"); + struct virtual_node *vn = tb->tb_vn; + int levbytes, /* Number of bytes that must be inserted into (value + is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is + that the attempted change in node space used level + is levbytes bytes. */ + n_ret_value; + + int lfree, sfree, rfree /* free space in L, S and R */ ; + + /* nver is short for number of vertixes, and lnver is the number if + we shift to the left, rnver is the number if we shift to the + right, and lrnver is the number if we shift in both directions. + The goal is to minimize first the number of vertixes, and second, + the number of vertixes whose contents are changed by shifting, + and third the number of uncached vertixes whose contents are + changed by shifting and must be read from disk. */ + int nver, lnver, rnver, lrnver; + + /* used at leaf level only, S0 = S[0] is the node being balanced, + sInum [ I = 0,1,2 ] is the number of items that will + remain in node SI after balancing. S1 and S2 are new + nodes that might be created. */ + + /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. 
+ where 4th parameter is s1bytes and 5th - s2bytes + */ + short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases + 0,1 - do not shift and do not shift but bottle + 2 - shift only whole item to left + 3 - shift to left and bottle as much as possible + 4,5 - shift to right (whole items and as much as possible + 6,7 - shift to both directions (whole items and as much as possible) + */ + + /* Sh is the node whose balance is currently being checked */ + struct buffer_head *Sh; + + Sh = PATH_H_PBUFFER(tb->tb_path, h); + levbytes = tb->insert_size[h]; + + /* Calculate balance parameters for creating new root. */ + if (!Sh) { + if (!h) + reiserfs_panic(tb->tb_sb, + "vs-8210: ip_check_balance: S[0] can not be 0"); + switch (n_ret_value = get_empty_nodes(tb, h)) { + case CARRY_ON: + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + + case NO_DISK_SPACE: + case REPEAT_SEARCH: + return n_ret_value; + default: + reiserfs_panic(tb->tb_sb, + "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes"); + } } - } - - if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. 
*/ - return n_ret_value; - - sfree = B_FREE_SPACE (Sh); - - /* get free space of neighbors */ - rfree = get_rfree (tb, h); - lfree = get_lfree (tb, h); - - if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED) - /* and new item fits into node S[h] without any shifting */ - return NO_BALANCING_NEEDED; - - create_virtual_node (tb, h); - - /* - determine maximal number of items we can shift to the left neighbor (in tb structure) - and the maximal number of bytes that can flow to the left neighbor - from the left most liquid item that cannot be shifted from S[0] entirely (returned value) - */ - check_left (tb, h, lfree); - - /* - determine maximal number of items we can shift to the right neighbor (in tb structure) - and the maximal number of bytes that can flow to the right neighbor - from the right most liquid item that cannot be shifted from S[0] entirely (returned value) - */ - check_right (tb, h, rfree); - - - /* all contents of internal node S[h] can be moved into its - neighbors, S[h] will be removed after balancing */ - if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { - int to_r; - - /* Since we are working on internal nodes, and our internal - nodes have fixed size entries, then we can balance by the - number of items rather than the space they consume. In this - routine we set the left node equal to the right node, - allowing a difference of less than or equal to 1 child - pointer. */ - to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - - (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); - set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* this checks balance condition, that any two neighboring nodes can not fit in one node */ - RFALSE( h && - ( tb->lnum[h] >= vn->vn_nr_item + 1 || - tb->rnum[h] >= vn->vn_nr_item + 1), - "vs-8220: tree is not balanced on internal level"); - RFALSE( ! 
h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || - (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ), - "vs-8225: tree is not balanced on leaf level"); - - /* all contents of S[0] can be moved into its neighbors - S[0] will be removed after balancing. */ - if (!h && is_leaf_removable (tb)) - return CARRY_ON; + if ((n_ret_value = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */ + return n_ret_value; - /* why do we perform this check here rather than earlier?? - Answer: we can win 1 node in some cases above. Moreover we - checked it above, when we checked, that S[0] is not removable - in principle */ - if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ - if ( ! h ) - tb->s0num = vn->vn_nr_item; - set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } + sfree = B_FREE_SPACE(Sh); + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) == + NO_BALANCING_NEEDED) + /* and new item fits into node S[h] without any shifting */ + return NO_BALANCING_NEEDED; + create_virtual_node(tb, h); - { - int lpar, rpar, nset, lset, rset, lrset; - /* - * regular overflowing of the node + /* + determine maximal number of items we can shift to the left neighbor (in tb structure) + and the maximal number of bytes that can flow to the left neighbor + from the left most liquid item that cannot be shifted from S[0] entirely (returned value) */ + check_left(tb, h, lfree); - /* get_num_ver works in 2 modes (FLOW & NO_FLOW) - lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) - nset, lset, rset, lrset - shows, whether flowing items give better packing - */ + /* + determine maximal number of items we can shift to the right neighbor (in tb structure) + and the maximal number of bytes that can flow to the right neighbor + from the right most liquid item that 
cannot be shifted from S[0] entirely (returned value) + */ + check_right(tb, h, rfree); + + /* all contents of internal node S[h] can be moved into its + neighbors, S[h] will be removed after balancing */ + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { + int to_r; + + /* Since we are working on internal nodes, and our internal + nodes have fixed size entries, then we can balance by the + number of items rather than the space they consume. In this + routine we set the left node equal to the right node, + allowing a difference of less than or equal to 1 child + pointer. */ + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } + + /* this checks balance condition, that any two neighboring nodes can not fit in one node */ + RFALSE(h && + (tb->lnum[h] >= vn->vn_nr_item + 1 || + tb->rnum[h] >= vn->vn_nr_item + 1), + "vs-8220: tree is not balanced on internal level"); + RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))), + "vs-8225: tree is not balanced on leaf level"); + + /* all contents of S[0] can be moved into its neighbors + S[0] will be removed after balancing. */ + if (!h && is_leaf_removable(tb)) + return CARRY_ON; + + /* why do we perform this check here rather than earlier?? + Answer: we can win 1 node in some cases above. 
Moreover we + checked it above, when we checked, that S[0] is not removable + in principle */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if (!h) + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + { + int lpar, rpar, nset, lset, rset, lrset; + /* + * regular overflowing of the node + */ + + /* get_num_ver works in 2 modes (FLOW & NO_FLOW) + lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) + nset, lset, rset, lrset - shows, whether flowing items give better packing + */ #define FLOW 1 -#define NO_FLOW 0 /* do not any splitting */ +#define NO_FLOW 0 /* do not any splitting */ - /* we choose one the following */ + /* we choose one the following */ #define NOTHING_SHIFT_NO_FLOW 0 #define NOTHING_SHIFT_FLOW 5 #define LEFT_SHIFT_NO_FLOW 10 @@ -1339,164 +1370,173 @@ static int ip_check_balance (struct tree_balance * tb, int h) #define LR_SHIFT_NO_FLOW 30 #define LR_SHIFT_FLOW 35 + lpar = tb->lnum[h]; + rpar = tb->rnum[h]; + + /* calculate number of blocks S[h] must be split into when + nothing is shifted to the neighbors, + as well as number of items in each part of the split node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ + nset = NOTHING_SHIFT_NO_FLOW; + nver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, h ? 
vn->vn_nr_item : 0, -1, + snum012, NO_FLOW); + + if (!h) { + int nver1; + + /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ + nver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, 0, -1, + snum012 + NOTHING_SHIFT_FLOW, FLOW); + if (nver > nver1) + nset = NOTHING_SHIFT_FLOW, nver = nver1; + } - lpar = tb->lnum[h]; - rpar = tb->rnum[h]; - - - /* calculate number of blocks S[h] must be split into when - nothing is shifted to the neighbors, - as well as number of items in each part of the split node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ - nset = NOTHING_SHIFT_NO_FLOW; - nver = get_num_ver (vn->vn_mode, tb, h, - 0, -1, h?vn->vn_nr_item:0, -1, - snum012, NO_FLOW); - - if (!h) - { - int nver1; - - /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ - nver1 = get_num_ver (vn->vn_mode, tb, h, - 0, -1, 0, -1, - snum012 + NOTHING_SHIFT_FLOW, FLOW); - if (nver > nver1) - nset = NOTHING_SHIFT_FLOW, nver = nver1; - } - - - /* calculate number of blocks S[h] must be split into when - l_shift_num first items and l_shift_bytes of the right most - liquid item to be shifted are shifted to the left neighbor, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any - */ - lset = LEFT_SHIFT_NO_FLOW; - lnver = get_num_ver (vn->vn_mode, tb, h, - lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1, - snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); - if (!h) - { - int lnver1; - - lnver1 = get_num_ver (vn->vn_mode, tb, h, - lpar - ((tb->lbytes != -1) ? 
1 : 0), tb->lbytes, 0, -1, - snum012 + LEFT_SHIFT_FLOW, FLOW); - if (lnver > lnver1) - lset = LEFT_SHIFT_FLOW, lnver = lnver1; - } - - - /* calculate number of blocks S[h] must be split into when - r_shift_num first items and r_shift_bytes of the left most - liquid item to be shifted are shifted to the right neighbor, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any - */ - rset = RIGHT_SHIFT_NO_FLOW; - rnver = get_num_ver (vn->vn_mode, tb, h, - 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1, - snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); - if (!h) - { - int rnver1; - - rnver1 = get_num_ver (vn->vn_mode, tb, h, - 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes, - snum012 + RIGHT_SHIFT_FLOW, FLOW); - - if (rnver > rnver1) - rset = RIGHT_SHIFT_FLOW, rnver = rnver1; - } - - - /* calculate number of blocks S[h] must be split into when - items are shifted in both directions, - as well as number of items in each part of the splitted node (s012 numbers), - and number of bytes (s1bytes) of the shared drop which flow to S1 if any - */ - lrset = LR_SHIFT_NO_FLOW; - lrnver = get_num_ver (vn->vn_mode, tb, h, - lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1, - snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); - if (!h) - { - int lrnver1; - - lrnver1 = get_num_ver (vn->vn_mode, tb, h, - lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 
1 : 0)), tb->rbytes, - snum012 + LR_SHIFT_FLOW, FLOW); - if (lrnver > lrnver1) - lrset = LR_SHIFT_FLOW, lrnver = lrnver1; - } - - + /* calculate number of blocks S[h] must be split into when + l_shift_num first items and l_shift_bytes of the right most + liquid item to be shifted are shifted to the left neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lset = LEFT_SHIFT_NO_FLOW; + lnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, h ? vn->vn_nr_item : 0, -1, + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lnver1; + + lnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, 0, -1, + snum012 + LEFT_SHIFT_FLOW, FLOW); + if (lnver > lnver1) + lset = LEFT_SHIFT_FLOW, lnver = lnver1; + } - /* Our general shifting strategy is: - 1) to minimized number of new nodes; - 2) to minimized number of neighbors involved in shifting; - 3) to minimized number of disk reads; */ + /* calculate number of blocks S[h] must be split into when + r_shift_num first items and r_shift_bytes of the left most + liquid item to be shifted are shifted to the right neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + rset = RIGHT_SHIFT_NO_FLOW; + rnver = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int rnver1; + + rnver1 = get_num_ver(vn->vn_mode, tb, h, + 0, -1, + (rpar - + ((tb->rbytes != -1) ? 
1 : 0)), + tb->rbytes, + snum012 + RIGHT_SHIFT_FLOW, FLOW); + + if (rnver > rnver1) + rset = RIGHT_SHIFT_FLOW, rnver = rnver1; + } - /* we can win TWO or ONE nodes by shifting in both directions */ - if (lrnver < lnver && lrnver < rnver) - { - RFALSE( h && - (tb->lnum[h] != 1 || - tb->rnum[h] != 1 || - lrnver != 1 || rnver != 2 || lnver != 2 || h != 1), - "vs-8230: bad h"); - if (lrset == LR_SHIFT_FLOW) - set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset, - tb->lbytes, tb->rbytes); - else - set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1), - tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1); - - return CARRY_ON; - } + /* calculate number of blocks S[h] must be split into when + items are shifted in both directions, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lrset = LR_SHIFT_NO_FLOW; + lrnver = get_num_ver(vn->vn_mode, tb, h, + lpar - ((h || tb->lbytes == -1) ? 0 : 1), + -1, + h ? (vn->vn_nr_item - rpar) : (rpar - + ((tb-> + rbytes != + -1) ? 1 : + 0)), -1, + snum012 + LR_SHIFT_NO_FLOW, NO_FLOW); + if (!h) { + int lrnver1; + + lrnver1 = get_num_ver(vn->vn_mode, tb, h, + lpar - + ((tb->lbytes != -1) ? 1 : 0), + tb->lbytes, + (rpar - + ((tb->rbytes != -1) ? 
1 : 0)), + tb->rbytes, + snum012 + LR_SHIFT_FLOW, FLOW); + if (lrnver > lrnver1) + lrset = LR_SHIFT_FLOW, lrnver = lrnver1; + } - /* if shifting doesn't lead to better packing then don't shift */ - if (nver == lrnver) - { - set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1); - return CARRY_ON; - } + /* Our general shifting strategy is: + 1) to minimized number of new nodes; + 2) to minimized number of neighbors involved in shifting; + 3) to minimized number of disk reads; */ + + /* we can win TWO or ONE nodes by shifting in both directions */ + if (lrnver < lnver && lrnver < rnver) { + RFALSE(h && + (tb->lnum[h] != 1 || + tb->rnum[h] != 1 || + lrnver != 1 || rnver != 2 || lnver != 2 + || h != 1), "vs-8230: bad h"); + if (lrset == LR_SHIFT_FLOW) + set_parameters(tb, h, tb->lnum[h], tb->rnum[h], + lrnver, snum012 + lrset, + tb->lbytes, tb->rbytes); + else + set_parameters(tb, h, + tb->lnum[h] - + ((tb->lbytes == -1) ? 0 : 1), + tb->rnum[h] - + ((tb->rbytes == -1) ? 0 : 1), + lrnver, snum012 + lrset, -1, -1); + + return CARRY_ON; + } + /* if shifting doesn't lead to better packing then don't shift */ + if (nver == lrnver) { + set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1, + -1); + return CARRY_ON; + } - /* now we know that for better packing shifting in only one - direction either to the left or to the right is required */ + /* now we know that for better packing shifting in only one + direction either to the left or to the right is required */ - /* if shifting to the left is better than shifting to the right */ - if (lnver < rnver) - { - SET_PAR_SHIFT_LEFT; - return CARRY_ON; - } + /* if shifting to the left is better than shifting to the right */ + if (lnver < rnver) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } - /* if shifting to the right is better than shifting to the left */ - if (lnver > rnver) - { - SET_PAR_SHIFT_RIGHT; - return CARRY_ON; - } + /* if shifting to the right is better than shifting to the left */ + if (lnver > rnver) { + 
SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } + /* now shifting in either direction gives the same number + of nodes and we can make use of the cached neighbors */ + if (is_left_neighbor_in_cache(tb, h)) { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } - /* now shifting in either direction gives the same number - of nodes and we can make use of the cached neighbors */ - if (is_left_neighbor_in_cache (tb,h)) - { - SET_PAR_SHIFT_LEFT; - return CARRY_ON; + /* shift to the right independently on whether the right neighbor in cache or not */ + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; } - - /* shift to the right independently on whether the right neighbor in cache or not */ - SET_PAR_SHIFT_RIGHT; - return CARRY_ON; - } } - /* Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting for INTERNAL node of S+tree. * Calculate parameters for balancing for current level h. @@ -1513,157 +1553,173 @@ static int ip_check_balance (struct tree_balance * tb, int h) * Note: Items of internal nodes have fixed size, so the balance condition for * the internal part of S+tree is as for the B-trees. */ -static int dc_check_balance_internal (struct tree_balance * tb, int h) +static int dc_check_balance_internal(struct tree_balance *tb, int h) { - struct virtual_node * vn = tb->tb_vn; + struct virtual_node *vn = tb->tb_vn; - /* Sh is the node whose balance is currently being checked, - and Fh is its father. */ - struct buffer_head * Sh, * Fh; - int maxsize, - n_ret_value; - int lfree, rfree /* free space in L and R */; + /* Sh is the node whose balance is currently being checked, + and Fh is its father. 
*/ + struct buffer_head *Sh, *Fh; + int maxsize, n_ret_value; + int lfree, rfree /* free space in L and R */ ; - Sh = PATH_H_PBUFFER (tb->tb_path, h); - Fh = PATH_H_PPARENT (tb->tb_path, h); + Sh = PATH_H_PBUFFER(tb->tb_path, h); + Fh = PATH_H_PPARENT(tb->tb_path, h); - maxsize = MAX_CHILD_SIZE(Sh); + maxsize = MAX_CHILD_SIZE(Sh); /* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ /* new_nr_item = number of items node would have if operation is */ /* performed without balancing (new_nr_item); */ - create_virtual_node (tb, h); + create_virtual_node(tb, h); - if ( ! Fh ) - { /* S[h] is the root. */ - if ( vn->vn_nr_item > 0 ) - { - set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + if (!Fh) { /* S[h] is the root. */ + if (vn->vn_nr_item > 0) { + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + } + /* new_nr_item == 0. + * Current root will be deleted resulting in + * decrementing the tree height. */ + set_parameters(tb, h, 0, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ((n_ret_value = get_parents(tb, h)) != CARRY_ON) + return n_ret_value; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + /* determine maximal number of items we can fit into neighbors */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. */ + if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to L[h]. */ + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? 
B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, + -1); + return CARRY_ON; + } + + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to R[h]. */ + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + B_NR_ITEMS(Fh)) ? 0 : n + 1; + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / + (DC_SIZE + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, + -1); + return CARRY_ON; + } + } + + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - + tb->rnum[h] + vn->vn_nr_item + 1) / 2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, + 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Balancing does not lead to better packing. */ + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; } - /* new_nr_item == 0. - * Current root will be deleted resulting in - * decrementing the tree height. */ - set_parameters (tb, h, 0, 0, 0, NULL, -1, -1); - return CARRY_ON; - } - - if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) - return n_ret_value; - - - /* get free space of neighbors */ - rfree = get_rfree (tb, h); - lfree = get_lfree (tb, h); - - /* determine maximal number of items we can fit into neighbors */ - check_left (tb, h, lfree); - check_right (tb, h, rfree); - - - if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) ) - { /* Balance condition for the internal node is valid. - * In this case we balance only if it leads to better packing. */ - if ( vn->vn_nr_item == MIN_NR_KEY(Sh) ) - { /* Here we join S[h] with one of its neighbors, - * which is impossible with greater values of new_nr_item. */ - if ( tb->lnum[h] >= vn->vn_nr_item + 1 ) - { - /* All contents of S[h] can be moved to L[h]. 
*/ - int n; - int order_L; - - order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; - n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE); - set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); - return CARRY_ON; - } - - if ( tb->rnum[h] >= vn->vn_nr_item + 1 ) - { - /* All contents of S[h] can be moved to R[h]. */ - int n; - int order_R; - - order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1; - n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE); - set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); - return CARRY_ON; - } + + /* Current node contain insufficient number of items. Balancing is required. */ + /* Check whether we can merge S[h] with left neighbor. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) + if (is_left_neighbor_in_cache(tb, h) + || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) { + int n; + int order_L; + + order_L = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == + 0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Check whether we can merge S[h] with right neighbor. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) { + int n; + int order_R; + + order_R = + ((n = + PATH_H_B_ITEM_ORDER(tb->tb_path, + h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1); + n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE + + KEY_SIZE); + set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1); + return CARRY_ON; } - if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) - { - /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ - int to_r; + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). 
*/ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) { + int to_r; + + to_r = + ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] + + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - + tb->rnum[h]); + set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, + -1, -1); + return CARRY_ON; + } - to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - - (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); - set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); - return CARRY_ON; + /* For internal nodes try to borrow item from a neighbor */ + RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); + + /* Borrow one or two items from caching neighbor */ + if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) { + int from_l; + + from_l = + (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1); + set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1); + return CARRY_ON; } - /* Balancing does not lead to better packing. */ - set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } - - /* Current node contain insufficient number of items. Balancing is required. */ - /* Check whether we can merge S[h] with left neighbor. */ - if (tb->lnum[h] >= vn->vn_nr_item + 1) - if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) - { - int n; - int order_L; - - order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; - n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE); - set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); + set_parameters(tb, h, 0, + -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item + + 1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1); return CARRY_ON; - } - - /* Check whether we can merge S[h] with right neighbor. */ - if (tb->rnum[h] >= vn->vn_nr_item + 1) - { - int n; - int order_R; - - order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 
0 : (n + 1); - n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE); - set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ - if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) - { - int to_r; - - to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - - (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); - set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* For internal nodes try to borrow item from a neighbor */ - RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root"); - - /* Borrow one or two items from caching neighbor */ - if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h]) - { - int from_l; - - from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1); - set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1); - return CARRY_ON; - } - - set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1, - NULL, -1, -1); - return CARRY_ON; } - /* Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Truncating for LEAF node of S+tree. * Calculate parameters for balancing for current level h. @@ -1677,90 +1733,86 @@ static int dc_check_balance_internal (struct tree_balance * tb, int h) * -1 - no balancing for higher levels needed; * -2 - no disk space. */ -static int dc_check_balance_leaf (struct tree_balance * tb, int h) +static int dc_check_balance_leaf(struct tree_balance *tb, int h) { - struct virtual_node * vn = tb->tb_vn; - - /* Number of bytes that must be deleted from - (value is negative if bytes are deleted) buffer which - contains node being balanced. The mnemonic is that the - attempted change in node space used level is levbytes bytes. 
*/ - int levbytes; - /* the maximal item size */ - int maxsize, - n_ret_value; - /* S0 is the node whose balance is currently being checked, - and F0 is its father. */ - struct buffer_head * S0, * F0; - int lfree, rfree /* free space in L and R */; - - S0 = PATH_H_PBUFFER (tb->tb_path, 0); - F0 = PATH_H_PPARENT (tb->tb_path, 0); - - levbytes = tb->insert_size[h]; - - maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ - - if ( ! F0 ) - { /* S[0] is the root now. */ - - RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0), - "vs-8240: attempt to create empty buffer tree"); - - set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; - } - - if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) - return n_ret_value; - - /* get free space of neighbors */ - rfree = get_rfree (tb, h); - lfree = get_lfree (tb, h); - - create_virtual_node (tb, h); - - /* if 3 leaves can be merge to one, set parameters and return */ - if (are_leaves_removable (tb, lfree, rfree)) - return CARRY_ON; - - /* determine maximal number of items we can shift to the left/right neighbor - and the maximal number of bytes that can flow to the left/right neighbor - from the left/right most liquid item that cannot be shifted from S[0] entirely - */ - check_left (tb, h, lfree); - check_right (tb, h, rfree); - - /* check whether we can merge S with left neighbor. */ - if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) - if (is_left_neighbor_in_cache (tb,h) || - ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ - !tb->FR[h]) { - - RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist"); - - /* set parameter to merge S[0] with its left neighbor */ - set_parameters (tb, h, -1, 0, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* check whether we can merge S[0] with right neighbor. 
*/ - if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { - set_parameters (tb, h, 0, -1, 0, NULL, -1, -1); - return CARRY_ON; - } - - /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ - if (is_leaf_removable (tb)) - return CARRY_ON; - - /* Balancing is not required. */ - tb->s0num = vn->vn_nr_item; - set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); - return NO_BALANCING_NEEDED; -} + struct virtual_node *vn = tb->tb_vn; + + /* Number of bytes that must be deleted from + (value is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is that the + attempted change in node space used level is levbytes bytes. */ + int levbytes; + /* the maximal item size */ + int maxsize, n_ret_value; + /* S0 is the node whose balance is currently being checked, + and F0 is its father. */ + struct buffer_head *S0, *F0; + int lfree, rfree /* free space in L and R */ ; + + S0 = PATH_H_PBUFFER(tb->tb_path, 0); + F0 = PATH_H_PPARENT(tb->tb_path, 0); + levbytes = tb->insert_size[h]; + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ + + if (!F0) { /* S[0] is the root now. 
*/ + + RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0), + "vs-8240: attempt to create empty buffer tree"); + + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + if ((n_ret_value = get_parents(tb, h)) != CARRY_ON) + return n_ret_value; + + /* get free space of neighbors */ + rfree = get_rfree(tb, h); + lfree = get_lfree(tb, h); + + create_virtual_node(tb, h); + + /* if 3 leaves can be merge to one, set parameters and return */ + if (are_leaves_removable(tb, lfree, rfree)) + return CARRY_ON; + + /* determine maximal number of items we can shift to the left/right neighbor + and the maximal number of bytes that can flow to the left/right neighbor + from the left/right most liquid item that cannot be shifted from S[0] entirely + */ + check_left(tb, h, lfree); + check_right(tb, h, rfree); + + /* check whether we can merge S with left neighbor. */ + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) + if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ + !tb->FR[h]) { + + RFALSE(!tb->FL[h], + "vs-8245: dc_check_balance_leaf: FL[h] must exist"); + + /* set parameter to merge S[0] with its left neighbor */ + set_parameters(tb, h, -1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* check whether we can merge S[0] with right neighbor. */ + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { + set_parameters(tb, h, 0, -1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ + if (is_leaf_removable(tb)) + return CARRY_ON; + + /* Balancing is not required. */ + tb->s0num = vn->vn_nr_item; + set_parameters(tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; +} /* Check whether current node S[h] is balanced when Decreasing its size by * Deleting or Cutting. 
@@ -1775,18 +1827,17 @@ static int dc_check_balance_leaf (struct tree_balance * tb, int h) * -1 - no balancing for higher levels needed; * -2 - no disk space. */ -static int dc_check_balance (struct tree_balance * tb, int h) +static int dc_check_balance(struct tree_balance *tb, int h) { - RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized"); + RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)), + "vs-8250: S is not initialized"); - if ( h ) - return dc_check_balance_internal (tb, h); - else - return dc_check_balance_leaf (tb, h); + if (h) + return dc_check_balance_internal(tb, h); + else + return dc_check_balance_leaf(tb, h); } - - /* Check whether current node S[h] is balanced. * Calculate parameters for balancing for current level h. * Parameters: @@ -1805,83 +1856,80 @@ static int dc_check_balance (struct tree_balance * tb, int h) * -1 - no balancing for higher levels needed; * -2 - no disk space. */ -static int check_balance (int mode, - struct tree_balance * tb, - int h, - int inum, - int pos_in_item, - struct item_head * ins_ih, - const void * data - ) +static int check_balance(int mode, + struct tree_balance *tb, + int h, + int inum, + int pos_in_item, + struct item_head *ins_ih, const void *data) { - struct virtual_node * vn; + struct virtual_node *vn; - vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); - vn->vn_free_ptr = (char *)(tb->tb_vn + 1); - vn->vn_mode = mode; - vn->vn_affected_item_num = inum; - vn->vn_pos_in_item = pos_in_item; - vn->vn_ins_ih = ins_ih; - vn->vn_data = data; + vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); + vn->vn_free_ptr = (char *)(tb->tb_vn + 1); + vn->vn_mode = mode; + vn->vn_affected_item_num = inum; + vn->vn_pos_in_item = pos_in_item; + vn->vn_ins_ih = ins_ih; + vn->vn_data = data; - RFALSE( mode == M_INSERT && !vn->vn_ins_ih, - "vs-8255: ins_ih can not be 0 in insert mode"); + RFALSE(mode == M_INSERT && !vn->vn_ins_ih, + "vs-8255: ins_ih can not be 0 in insert mode"); - if ( tb->insert_size[h] > 
0 ) - /* Calculate balance parameters when size of node is increasing. */ - return ip_check_balance (tb, h); + if (tb->insert_size[h] > 0) + /* Calculate balance parameters when size of node is increasing. */ + return ip_check_balance(tb, h); - /* Calculate balance parameters when size of node is decreasing. */ - return dc_check_balance (tb, h); + /* Calculate balance parameters when size of node is decreasing. */ + return dc_check_balance(tb, h); } +/* Check whether parent at the path is the really parent of the current node.*/ +static int get_direct_parent(struct tree_balance *p_s_tb, int n_h) +{ + struct buffer_head *p_s_bh; + struct path *p_s_path = p_s_tb->tb_path; + int n_position, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + + /* We are in the root or in the new root. */ + if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, + "PAP-8260: invalid offset in the path"); + + if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) { + /* Root is not changed. */ + PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; + PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; + return CARRY_ON; + } + return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ + } + + if (!B_IS_IN_TREE + (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))) + return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ + if ((n_position = + PATH_OFFSET_POSITION(p_s_path, + n_path_offset - 1)) > B_NR_ITEMS(p_s_bh)) + return REPEAT_SEARCH; -/* Check whether parent at the path is the really parent of the current node.*/ -static int get_direct_parent( - struct tree_balance * p_s_tb, - int n_h - ) { - struct buffer_head * p_s_bh; - struct path * p_s_path = p_s_tb->tb_path; - int n_position, - n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); - - /* We are in the root or in the new root. 
*/ - if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { - - RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1, - "PAP-8260: invalid offset in the path"); - - if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == - SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { - /* Root is not changed. */ - PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; - PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; - return CARRY_ON; + if (B_N_CHILD_NUM(p_s_bh, n_position) != + PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr) + /* Parent in the path is not parent of the current node in the tree. */ + return REPEAT_SEARCH; + + if (buffer_locked(p_s_bh)) { + __wait_on_buffer(p_s_bh); + if (FILESYSTEM_CHANGED_TB(p_s_tb)) + return REPEAT_SEARCH; } - return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ - } - - if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) ) - return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ - - if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) ) - return REPEAT_SEARCH; - - if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr ) - /* Parent in the path is not parent of the current node in the tree. */ - return REPEAT_SEARCH; - - if ( buffer_locked(p_s_bh) ) { - __wait_on_buffer(p_s_bh); - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) - return REPEAT_SEARCH; - } - - return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ -} + return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. 
*/ +} /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors * of S[n_h] we @@ -1889,356 +1937,401 @@ static int get_direct_parent( * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked; * CARRY_ON - schedule didn't occur while the function worked; */ -static int get_neighbors( - struct tree_balance * p_s_tb, - int n_h - ) { - int n_child_position, - n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); - unsigned long n_son_number; - struct super_block * p_s_sb = p_s_tb->tb_sb; - struct buffer_head * p_s_bh; - - - PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] ); - - if ( p_s_tb->lnum[n_h] ) { - /* We need left neighbor to balance S[n_h]. */ - PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] ); - p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); - - RFALSE( p_s_bh == p_s_tb->FL[n_h] && - ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset), - "PAP-8270: invalid position in the parent"); - - n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); - n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); - p_s_bh = sb_bread(p_s_sb, n_son_number); - if (!p_s_bh) - return IO_ERROR; - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { - decrement_bcount(p_s_bh); - PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] ); - return REPEAT_SEARCH; +static int get_neighbors(struct tree_balance *p_s_tb, int n_h) +{ + int n_child_position, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); + unsigned long n_son_number; + struct super_block *p_s_sb = p_s_tb->tb_sb; + struct buffer_head *p_s_bh; + + PROC_INFO_INC(p_s_sb, get_neighbors[n_h]); + + if (p_s_tb->lnum[n_h]) { + /* We need left neighbor to balance S[n_h]. 
*/ + PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]); + p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + + RFALSE(p_s_bh == p_s_tb->FL[n_h] && + !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset), + "PAP-8270: invalid position in the parent"); + + n_child_position = + (p_s_bh == + p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb-> + FL[n_h]); + n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); + p_s_bh = sb_bread(p_s_sb, n_son_number); + if (!p_s_bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + decrement_bcount(p_s_bh); + PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]); + return REPEAT_SEARCH; + } + + RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) || + n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || + B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != + p_s_bh->b_blocknr, "PAP-8275: invalid parent"); + RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child"); + RFALSE(!n_h && + B_FREE_SPACE(p_s_bh) != + MAX_CHILD_SIZE(p_s_bh) - + dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)), + "PAP-8290: invalid child size of left neighbor"); + + decrement_bcount(p_s_tb->L[n_h]); + p_s_tb->L[n_h] = p_s_bh; } - - RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) || - n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || - B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != - p_s_bh->b_blocknr, "PAP-8275: invalid parent"); - RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child"); - RFALSE( ! n_h && - B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)), - "PAP-8290: invalid child size of left neighbor"); - - decrement_bcount(p_s_tb->L[n_h]); - p_s_tb->L[n_h] = p_s_bh; - } - - - if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. 
*/ - PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] ); - p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); - - RFALSE( p_s_bh == p_s_tb->FR[n_h] && - PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh), - "PAP-8295: invalid position in the parent"); - - n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0; - n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); - p_s_bh = sb_bread(p_s_sb, n_son_number); - if (!p_s_bh) - return IO_ERROR; - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { - decrement_bcount(p_s_bh); - PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] ); - return REPEAT_SEARCH; + + if (p_s_tb->rnum[n_h]) { /* We need right neighbor to balance S[n_path_offset]. */ + PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]); + p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + + RFALSE(p_s_bh == p_s_tb->FR[n_h] && + PATH_OFFSET_POSITION(p_s_tb->tb_path, + n_path_offset) >= + B_NR_ITEMS(p_s_bh), + "PAP-8295: invalid position in the parent"); + + n_child_position = + (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0; + n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); + p_s_bh = sb_bread(p_s_sb, n_son_number); + if (!p_s_bh) + return IO_ERROR; + if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + decrement_bcount(p_s_bh); + PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]); + return REPEAT_SEARCH; + } + decrement_bcount(p_s_tb->R[n_h]); + p_s_tb->R[n_h] = p_s_bh; + + RFALSE(!n_h + && B_FREE_SPACE(p_s_bh) != + MAX_CHILD_SIZE(p_s_bh) - + dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)), + "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", + B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh), + dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position))); + } - decrement_bcount(p_s_tb->R[n_h]); - p_s_tb->R[n_h] = p_s_bh; - - RFALSE( ! 
n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)), - "PAP-8300: invalid child size of right neighbor (%d != %d - %d)", - B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh), - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position))); - - } - return CARRY_ON; + return CARRY_ON; } #ifdef CONFIG_REISERFS_CHECK -void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s) +void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s) { - void * vp; - static size_t malloced; - - - vp = kmalloc (size, flags); - if (vp) { - REISERFS_SB(s)->s_kmallocs += size; - if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) { - reiserfs_warning (s, - "vs-8301: reiserfs_kmalloc: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); - malloced = REISERFS_SB(s)->s_kmallocs; + void *vp; + static size_t malloced; + + vp = kmalloc(size, flags); + if (vp) { + REISERFS_SB(s)->s_kmallocs += size; + if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) { + reiserfs_warning(s, + "vs-8301: reiserfs_kmalloc: allocated memory %d", + REISERFS_SB(s)->s_kmallocs); + malloced = REISERFS_SB(s)->s_kmallocs; + } } - } - return vp; + return vp; } -void reiserfs_kfree (const void * vp, size_t size, struct super_block * s) +void reiserfs_kfree(const void *vp, size_t size, struct super_block *s) { - kfree (vp); - - REISERFS_SB(s)->s_kmallocs -= size; - if (REISERFS_SB(s)->s_kmallocs < 0) - reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); + kfree(vp); + + REISERFS_SB(s)->s_kmallocs -= size; + if (REISERFS_SB(s)->s_kmallocs < 0) + reiserfs_warning(s, + "vs-8302: reiserfs_kfree: allocated memory %d", + REISERFS_SB(s)->s_kmallocs); } #endif - -static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh) +static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) { - int max_num_of_items; - int max_num_of_entries; - unsigned long blocksize = 
sb->s_blocksize; + int max_num_of_items; + int max_num_of_entries; + unsigned long blocksize = sb->s_blocksize; #define MIN_NAME_LEN 1 - max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); - max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / - (DEH_SIZE + MIN_NAME_LEN); + max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN); + max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / + (DEH_SIZE + MIN_NAME_LEN); - return sizeof(struct virtual_node) + - max(max_num_of_items * sizeof (struct virtual_item), - sizeof (struct virtual_item) + sizeof(struct direntry_uarea) + - (max_num_of_entries - 1) * sizeof (__u16)); + return sizeof(struct virtual_node) + + max(max_num_of_items * sizeof(struct virtual_item), + sizeof(struct virtual_item) + sizeof(struct direntry_uarea) + + (max_num_of_entries - 1) * sizeof(__u16)); } - - /* maybe we should fail balancing we are going to perform when kmalloc fails several times. But now it will loop until kmalloc gets required memory */ -static int get_mem_for_virtual_node (struct tree_balance * tb) +static int get_mem_for_virtual_node(struct tree_balance *tb) { - int check_fs = 0; - int size; - char * buf; - - size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path)); - - if (size > tb->vn_buf_size) { - /* we have to allocate more memory for virtual node */ - if (tb->vn_buf) { - /* free memory allocated before */ - reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb); - /* this is not needed if kfree is atomic */ - check_fs = 1; - } + int check_fs = 0; + int size; + char *buf; + + size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path)); + + if (size > tb->vn_buf_size) { + /* we have to allocate more memory for virtual node */ + if (tb->vn_buf) { + /* free memory allocated before */ + reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + /* this is not needed if kfree is atomic */ + check_fs = 1; + } - /* virtual node requires now more memory */ - 
tb->vn_buf_size = size; - - /* get memory for virtual item */ - buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb); - if ( ! buf ) { - /* getting memory with GFP_KERNEL priority may involve - balancing now (due to indirect_to_direct conversion on - dcache shrinking). So, release path and collected - resources here */ - free_buffers_in_tb (tb); - buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); - if ( !buf ) { + /* virtual node requires now more memory */ + tb->vn_buf_size = size; + + /* get memory for virtual item */ + buf = + reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, + tb->tb_sb); + if (!buf) { + /* getting memory with GFP_KERNEL priority may involve + balancing now (due to indirect_to_direct conversion on + dcache shrinking). So, release path and collected + resources here */ + free_buffers_in_tb(tb); + buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); + if (!buf) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning (tb->tb_sb, - "vs-8345: get_mem_for_virtual_node: " - "kmalloc failed. reiserfs kmalloced %d bytes", - REISERFS_SB(tb->tb_sb)->s_kmallocs); + reiserfs_warning(tb->tb_sb, + "vs-8345: get_mem_for_virtual_node: " + "kmalloc failed. 
reiserfs kmalloced %d bytes", + REISERFS_SB(tb->tb_sb)-> + s_kmallocs); #endif - tb->vn_buf_size = 0; - } - tb->vn_buf = buf; - schedule() ; - return REPEAT_SEARCH; - } + tb->vn_buf_size = 0; + } + tb->vn_buf = buf; + schedule(); + return REPEAT_SEARCH; + } - tb->vn_buf = buf; - } + tb->vn_buf = buf; + } - if ( check_fs && FILESYSTEM_CHANGED_TB (tb) ) - return REPEAT_SEARCH; + if (check_fs && FILESYSTEM_CHANGED_TB(tb)) + return REPEAT_SEARCH; - return CARRY_ON; + return CARRY_ON; } - #ifdef CONFIG_REISERFS_CHECK -static void tb_buffer_sanity_check (struct super_block * p_s_sb, - struct buffer_head * p_s_bh, - const char *descr, int level) { - if (p_s_bh) { - if (atomic_read (&(p_s_bh->b_count)) <= 0) { - - reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh); - } - - if ( ! buffer_uptodate (p_s_bh) ) { - reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh); - } - - if ( ! 
B_IS_IN_TREE (p_s_bh) ) { - reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh); - } - - if (p_s_bh->b_bdev != p_s_sb->s_bdev) { - reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh); - } - - if (p_s_bh->b_size != p_s_sb->s_blocksize) { - reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh); - } - - if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { - reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh); - } - } -} -#else -static void tb_buffer_sanity_check (struct super_block * p_s_sb, - struct buffer_head * p_s_bh, - const char *descr, int level) -{;} -#endif - -static int clear_all_dirty_bits(struct super_block *s, - struct buffer_head *bh) { - return reiserfs_prepare_for_journal(s, bh, 0) ; -} - -static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb) +static void tb_buffer_sanity_check(struct super_block *p_s_sb, + struct buffer_head *p_s_bh, + const char *descr, int level) { - struct buffer_head * locked; -#ifdef CONFIG_REISERFS_CHECK - int repeat_counter = 0; -#endif - int i; + if (p_s_bh) { + if (atomic_read(&(p_s_bh->b_count)) <= 0) { - do { - - locked = NULL; - - for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) { - if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) { - /* if I understand correctly, we can only be sure the last buffer - ** in the path is in the tree --clm - */ -#ifdef CONFIG_REISERFS_CHECK - if (PATH_PLAST_BUFFER(p_s_tb->tb_path) == - PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { - tb_buffer_sanity_check (p_s_tb->tb_sb, - PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i), - "S", - p_s_tb->tb_path->path_length - i); + reiserfs_panic(p_s_sb, + "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for 
buffer %s[%d] (%b)\n", + descr, level, p_s_bh); } -#endif - if (!clear_all_dirty_bits(p_s_tb->tb_sb, - PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i))) - { - locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i); - } - } - } - for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) { + if (!buffer_uptodate(p_s_bh)) { + reiserfs_panic(p_s_sb, + "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", + descr, level, p_s_bh); + } - if (p_s_tb->lnum[i] ) { + if (!B_IS_IN_TREE(p_s_bh)) { + reiserfs_panic(p_s_sb, + "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", + descr, level, p_s_bh); + } - if ( p_s_tb->L[i] ) { - tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i); - if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i])) - locked = p_s_tb->L[i]; + if (p_s_bh->b_bdev != p_s_sb->s_bdev) { + reiserfs_panic(p_s_sb, + "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", + descr, level, p_s_bh); } - if ( !locked && p_s_tb->FL[i] ) { - tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i); - if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i])) - locked = p_s_tb->FL[i]; + if (p_s_bh->b_size != p_s_sb->s_blocksize) { + reiserfs_panic(p_s_sb, + "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", + descr, level, p_s_bh); } - if ( !locked && p_s_tb->CFL[i] ) { - tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i); - if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i])) - locked = p_s_tb->CFL[i]; + if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { + reiserfs_panic(p_s_sb, + "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", + descr, level, p_s_bh); } + } +} +#else +static void tb_buffer_sanity_check(struct super_block *p_s_sb, + struct buffer_head *p_s_bh, + const char *descr, int level) +{; +} +#endif - } +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) +{ + return 
reiserfs_prepare_for_journal(s, bh, 0); +} - if ( !locked && (p_s_tb->rnum[i]) ) { +static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb) +{ + struct buffer_head *locked; +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + int i; - if ( p_s_tb->R[i] ) { - tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i); - if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i])) - locked = p_s_tb->R[i]; - } + do { - - if ( !locked && p_s_tb->FR[i] ) { - tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i); - if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i])) - locked = p_s_tb->FR[i]; + locked = NULL; + + for (i = p_s_tb->tb_path->path_length; + !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) { + if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { + /* if I understand correctly, we can only be sure the last buffer + ** in the path is in the tree --clm + */ +#ifdef CONFIG_REISERFS_CHECK + if (PATH_PLAST_BUFFER(p_s_tb->tb_path) == + PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { + tb_buffer_sanity_check(p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER + (p_s_tb->tb_path, + i), "S", + p_s_tb->tb_path-> + path_length - i); + } +#endif + if (!clear_all_dirty_bits(p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER + (p_s_tb->tb_path, + i))) { + locked = + PATH_OFFSET_PBUFFER(p_s_tb->tb_path, + i); + } + } } - if ( !locked && p_s_tb->CFR[i] ) { - tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i); - if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i])) - locked = p_s_tb->CFR[i]; + for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; + i++) { + + if (p_s_tb->lnum[i]) { + + if (p_s_tb->L[i]) { + tb_buffer_sanity_check(p_s_tb->tb_sb, + p_s_tb->L[i], + "L", i); + if (!clear_all_dirty_bits + (p_s_tb->tb_sb, p_s_tb->L[i])) + locked = p_s_tb->L[i]; + } + + if (!locked && p_s_tb->FL[i]) { + tb_buffer_sanity_check(p_s_tb->tb_sb, + p_s_tb->FL[i], + "FL", i); + if (!clear_all_dirty_bits + (p_s_tb->tb_sb, p_s_tb->FL[i])) + locked = p_s_tb->FL[i]; + 
} + + if (!locked && p_s_tb->CFL[i]) { + tb_buffer_sanity_check(p_s_tb->tb_sb, + p_s_tb->CFL[i], + "CFL", i); + if (!clear_all_dirty_bits + (p_s_tb->tb_sb, p_s_tb->CFL[i])) + locked = p_s_tb->CFL[i]; + } + + } + + if (!locked && (p_s_tb->rnum[i])) { + + if (p_s_tb->R[i]) { + tb_buffer_sanity_check(p_s_tb->tb_sb, + p_s_tb->R[i], + "R", i); + if (!clear_all_dirty_bits + (p_s_tb->tb_sb, p_s_tb->R[i])) + locked = p_s_tb->R[i]; + } + + if (!locked && p_s_tb->FR[i]) { + tb_buffer_sanity_check(p_s_tb->tb_sb, + p_s_tb->FR[i], + "FR", i); + if (!clear_all_dirty_bits + (p_s_tb->tb_sb, p_s_tb->FR[i])) + locked = p_s_tb->FR[i]; + } + + if (!locked && p_s_tb->CFR[i]) { + tb_buffer_sanity_check(p_s_tb->tb_sb, + p_s_tb->CFR[i], + "CFR", i); + if (!clear_all_dirty_bits + (p_s_tb->tb_sb, p_s_tb->CFR[i])) + locked = p_s_tb->CFR[i]; + } + } + } + /* as far as I can tell, this is not required. The FEB list seems + ** to be full of newly allocated nodes, which will never be locked, + ** dirty, or anything else. + ** To be safe, I'm putting in the checks and waits in. For the moment, + ** they are needed to keep the code in journal.c from complaining + ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. + ** --clm + */ + for (i = 0; !locked && i < MAX_FEB_SIZE; i++) { + if (p_s_tb->FEB[i]) { + if (!clear_all_dirty_bits + (p_s_tb->tb_sb, p_s_tb->FEB[i])) + locked = p_s_tb->FEB[i]; + } } - } - } - /* as far as I can tell, this is not required. The FEB list seems - ** to be full of newly allocated nodes, which will never be locked, - ** dirty, or anything else. - ** To be safe, I'm putting in the checks and waits in. For the moment, - ** they are needed to keep the code in journal.c from complaining - ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. 
- ** --clm - */ - for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { - if ( p_s_tb->FEB[i] ) { - if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i])) - locked = p_s_tb->FEB[i] ; - } - } - if (locked) { + if (locked) { #ifdef CONFIG_REISERFS_CHECK - repeat_counter++; - if ( (repeat_counter % 10000) == 0) { - reiserfs_warning (p_s_tb->tb_sb, - "wait_tb_buffers_until_released(): too many " - "iterations waiting for buffer to unlock " - "(%b)", locked); - - /* Don't loop forever. Try to recover from possible error. */ - - return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON; - } + repeat_counter++; + if ((repeat_counter % 10000) == 0) { + reiserfs_warning(p_s_tb->tb_sb, + "wait_tb_buffers_until_released(): too many " + "iterations waiting for buffer to unlock " + "(%b)", locked); + + /* Don't loop forever. Try to recover from possible error. */ + + return (FILESYSTEM_CHANGED_TB(p_s_tb)) ? + REPEAT_SEARCH : CARRY_ON; + } #endif - __wait_on_buffer (locked); - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { - return REPEAT_SEARCH; - } - } + __wait_on_buffer(locked); + if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + return REPEAT_SEARCH; + } + } - } while (locked); + } while (locked); - return CARRY_ON; + return CARRY_ON; } - /* Prepare for balancing, that is * get all necessary parents, and neighbors; * analyze what and where should be moved; @@ -2267,252 +2360,266 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb) * -1 - if no_disk_space */ +int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih, // item head of item being inserted + const void *data // inserted item or data to be pasted + ) +{ + int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path); + int n_pos_in_item; -int fix_nodes (int n_op_mode, - struct tree_balance * p_s_tb, - struct item_head * p_s_ins_ih, // item head of item being inserted - const void * data // inserted item or data to be pasted - ) { - int n_ret_value, - n_h, - 
n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path); - int n_pos_in_item; - - /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared - ** during wait_tb_buffers_run - */ - int wait_tb_buffers_run = 0 ; - struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path); - - ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes; - - n_pos_in_item = p_s_tb->tb_path->pos_in_item; - - - p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb); - - /* we prepare and log the super here so it will already be in the - ** transaction when do_balance needs to change it. - ** This way do_balance won't have to schedule when trying to prepare - ** the super for logging - */ - reiserfs_prepare_for_journal(p_s_tb->tb_sb, - SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ; - journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb, - SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ; - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) - return REPEAT_SEARCH; - - /* if it possible in indirect_to_direct conversion */ - if (buffer_locked (p_s_tbS0)) { - __wait_on_buffer (p_s_tbS0); - if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) - return REPEAT_SEARCH; - } + /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared + ** during wait_tb_buffers_run + */ + int wait_tb_buffers_run = 0; + struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path); -#ifdef CONFIG_REISERFS_CHECK - if ( cur_tb ) { - print_cur_tb ("fix_nodes"); - reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance"); - } - - if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) { - reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate " - "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode); - } - - /* Check parameters. 
*/ - switch (n_op_mode) { - case M_INSERT: - if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) ) - reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert", - n_item_num, B_NR_ITEMS(p_s_tbS0)); - break; - case M_PASTE: - case M_DELETE: - case M_CUT: - if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) { - print_block (p_s_tbS0, 0, -1, -1); - reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", n_item_num, n_op_mode, p_s_tb->insert_size[0]); - } - break; - default: - reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation"); - } -#endif + ++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes; + + n_pos_in_item = p_s_tb->tb_path->pos_in_item; + + p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb); - if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH) - // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat - return REPEAT_SEARCH; + /* we prepare and log the super here so it will already be in the + ** transaction when do_balance needs to change it. + ** This way do_balance won't have to schedule when trying to prepare + ** the super for logging + */ + reiserfs_prepare_for_journal(p_s_tb->tb_sb, + SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1); + journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb, + SB_BUFFER_WITH_SB(p_s_tb->tb_sb)); + if (FILESYSTEM_CHANGED_TB(p_s_tb)) + return REPEAT_SEARCH; + /* if it possible in indirect_to_direct conversion */ + if (buffer_locked(p_s_tbS0)) { + __wait_on_buffer(p_s_tbS0); + if (FILESYSTEM_CHANGED_TB(p_s_tb)) + return REPEAT_SEARCH; + } +#ifdef CONFIG_REISERFS_CHECK + if (cur_tb) { + print_cur_tb("fix_nodes"); + reiserfs_panic(p_s_tb->tb_sb, + "PAP-8305: fix_nodes: there is pending do_balance"); + } - /* Starting from the leaf level; for all levels n_h of the tree. 
*/ - for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) { - if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) { - goto repeat; + if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) { + reiserfs_panic(p_s_tb->tb_sb, + "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate " + "at the beginning of fix_nodes or not in tree (mode %c)", + p_s_tbS0, p_s_tbS0, n_op_mode); } - if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num, - n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) { - if ( n_ret_value == NO_BALANCING_NEEDED ) { - /* No balancing for higher levels needed. */ - if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) { - goto repeat; + /* Check parameters. */ + switch (n_op_mode) { + case M_INSERT: + if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0)) + reiserfs_panic(p_s_tb->tb_sb, + "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert", + n_item_num, B_NR_ITEMS(p_s_tbS0)); + break; + case M_PASTE: + case M_DELETE: + case M_CUT: + if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) { + print_block(p_s_tbS0, 0, -1, -1); + reiserfs_panic(p_s_tb->tb_sb, + "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", + n_item_num, n_op_mode, + p_s_tb->insert_size[0]); } - if ( n_h != MAX_HEIGHT - 1 ) - p_s_tb->insert_size[n_h + 1] = 0; - /* ok, analysis and resource gathering are complete */ break; - } - goto repeat; + default: + reiserfs_panic(p_s_tb->tb_sb, + "PAP-8340: fix_nodes: Incorrect mode of operation"); } +#endif - if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) { - goto repeat; - } + if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH) + // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat + return REPEAT_SEARCH; - if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) { - goto repeat; /* No disk space, or schedule occurred and - analysis may be invalid and needs to be redone. */ - } - - if ( ! 
PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) { - /* We have a positive insert size but no nodes exist on this - level, this means that we are creating a new root. */ + /* Starting from the leaf level; for all levels n_h of the tree. */ + for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) { + if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) { + goto repeat; + } - RFALSE( p_s_tb->blknum[n_h] != 1, - "PAP-8350: creating new empty root"); + if ((n_ret_value = + check_balance(n_op_mode, p_s_tb, n_h, n_item_num, + n_pos_in_item, p_s_ins_ih, + data)) != CARRY_ON) { + if (n_ret_value == NO_BALANCING_NEEDED) { + /* No balancing for higher levels needed. */ + if ((n_ret_value = + get_neighbors(p_s_tb, n_h)) != CARRY_ON) { + goto repeat; + } + if (n_h != MAX_HEIGHT - 1) + p_s_tb->insert_size[n_h + 1] = 0; + /* ok, analysis and resource gathering are complete */ + break; + } + goto repeat; + } - if ( n_h < MAX_HEIGHT - 1 ) - p_s_tb->insert_size[n_h + 1] = 0; - } - else - if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) { - if ( p_s_tb->blknum[n_h] > 1 ) { - /* The tree needs to be grown, so this node S[n_h] - which is the root node is split into two nodes, - and a new node (S[n_h+1]) will be created to - become the root node. 
*/ - - RFALSE( n_h == MAX_HEIGHT - 1, - "PAP-8355: attempt to create too high of a tree"); - - p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE; + if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) { + goto repeat; } - else - if ( n_h < MAX_HEIGHT - 1 ) - p_s_tb->insert_size[n_h + 1] = 0; - } - else - p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1); - } - - if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) { - if (FILESYSTEM_CHANGED_TB(p_s_tb)) { - wait_tb_buffers_run = 1 ; - n_ret_value = REPEAT_SEARCH ; - goto repeat; - } else { - return CARRY_ON; + + if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) { + goto repeat; /* No disk space, or schedule occurred and + analysis may be invalid and needs to be redone. */ + } + + if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) { + /* We have a positive insert size but no nodes exist on this + level, this means that we are creating a new root. */ + + RFALSE(p_s_tb->blknum[n_h] != 1, + "PAP-8350: creating new empty root"); + + if (n_h < MAX_HEIGHT - 1) + p_s_tb->insert_size[n_h + 1] = 0; + } else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) { + if (p_s_tb->blknum[n_h] > 1) { + /* The tree needs to be grown, so this node S[n_h] + which is the root node is split into two nodes, + and a new node (S[n_h+1]) will be created to + become the root node. */ + + RFALSE(n_h == MAX_HEIGHT - 1, + "PAP-8355: attempt to create too high of a tree"); + + p_s_tb->insert_size[n_h + 1] = + (DC_SIZE + + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + + DC_SIZE; + } else if (n_h < MAX_HEIGHT - 1) + p_s_tb->insert_size[n_h + 1] = 0; + } else + p_s_tb->insert_size[n_h + 1] = + (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1); } - } else { - wait_tb_buffers_run = 1 ; - goto repeat; - } - - repeat: - // fix_nodes was unable to perform its calculation due to - // filesystem got changed under us, lack of free disk space or i/o - // failure. 
If the first is the case - the search will be - // repeated. For now - free all resources acquired so far except - // for the new allocated nodes - { - int i; - /* Release path buffers. */ - if (wait_tb_buffers_run) { - pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ; + if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) { + if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + wait_tb_buffers_run = 1; + n_ret_value = REPEAT_SEARCH; + goto repeat; + } else { + return CARRY_ON; + } } else { - pathrelse (p_s_tb->tb_path); - } - /* brelse all resources collected for balancing */ - for ( i = 0; i < MAX_HEIGHT; i++ ) { - if (wait_tb_buffers_run) { - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]); - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]); - } - - brelse (p_s_tb->L[i]);p_s_tb->L[i] = NULL; - brelse (p_s_tb->R[i]);p_s_tb->R[i] = NULL; - brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = NULL; - brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = NULL; - brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = NULL; - brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = NULL; + wait_tb_buffers_run = 1; + goto repeat; } - if (wait_tb_buffers_run) { - for ( i = 0; i < MAX_FEB_SIZE; i++ ) { - if ( p_s_tb->FEB[i] ) { - reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, - p_s_tb->FEB[i]) ; + repeat: + // fix_nodes was unable to perform its calculation due to + // filesystem got changed under us, lack of free disk space or i/o + // failure. If the first is the case - the search will be + // repeated. For now - free all resources acquired so far except + // for the new allocated nodes + { + int i; + + /* Release path buffers. 
*/ + if (wait_tb_buffers_run) { + pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path); + } else { + pathrelse(p_s_tb->tb_path); + } + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + if (wait_tb_buffers_run) { + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, + p_s_tb->L[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, + p_s_tb->R[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, + p_s_tb->FL[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, + p_s_tb->FR[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, + p_s_tb-> + CFL[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, + p_s_tb-> + CFR[i]); + } + + brelse(p_s_tb->L[i]); + p_s_tb->L[i] = NULL; + brelse(p_s_tb->R[i]); + p_s_tb->R[i] = NULL; + brelse(p_s_tb->FL[i]); + p_s_tb->FL[i] = NULL; + brelse(p_s_tb->FR[i]); + p_s_tb->FR[i] = NULL; + brelse(p_s_tb->CFL[i]); + p_s_tb->CFL[i] = NULL; + brelse(p_s_tb->CFR[i]); + p_s_tb->CFR[i] = NULL; + } + + if (wait_tb_buffers_run) { + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (p_s_tb->FEB[i]) { + reiserfs_restore_prepared_buffer + (p_s_tb->tb_sb, p_s_tb->FEB[i]); + } + } } - } + return n_ret_value; } - return n_ret_value; - } } - /* Anatoly will probably forgive me renaming p_s_tb to tb. I just wanted to make lines shorter */ -void unfix_nodes (struct tree_balance * tb) +void unfix_nodes(struct tree_balance *tb) { - int i; - - /* Release path buffers. 
*/ - pathrelse_and_restore (tb->tb_sb, tb->tb_path); - - /* brelse all resources collected for balancing */ - for ( i = 0; i < MAX_HEIGHT; i++ ) { - reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]); - reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]); - reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]); - reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]); - reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]); - reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]); - - brelse (tb->L[i]); - brelse (tb->R[i]); - brelse (tb->FL[i]); - brelse (tb->FR[i]); - brelse (tb->CFL[i]); - brelse (tb->CFR[i]); - } - - /* deal with list of allocated (used and unused) nodes */ - for ( i = 0; i < MAX_FEB_SIZE; i++ ) { - if ( tb->FEB[i] ) { - b_blocknr_t blocknr = tb->FEB[i]->b_blocknr ; - /* de-allocated block which was not used by balancing and - bforget about buffer for it */ - brelse (tb->FEB[i]); - reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); - } - if (tb->used[i]) { - /* release used as new nodes including a new root */ - brelse (tb->used[i]); - } - } + int i; - if (tb->vn_buf) - reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + /* Release path buffers. 
*/ + pathrelse_and_restore(tb->tb_sb, tb->tb_path); -} + /* brelse all resources collected for balancing */ + for (i = 0; i < MAX_HEIGHT; i++) { + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]); + reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]); + + brelse(tb->L[i]); + brelse(tb->R[i]); + brelse(tb->FL[i]); + brelse(tb->FR[i]); + brelse(tb->CFL[i]); + brelse(tb->CFR[i]); + } + /* deal with list of allocated (used and unused) nodes */ + for (i = 0; i < MAX_FEB_SIZE; i++) { + if (tb->FEB[i]) { + b_blocknr_t blocknr = tb->FEB[i]->b_blocknr; + /* de-allocated block which was not used by balancing and + bforget about buffer for it */ + brelse(tb->FEB[i]); + reiserfs_free_block(tb->transaction_handle, NULL, + blocknr, 0); + } + if (tb->used[i]) { + /* release used as new nodes including a new root */ + brelse(tb->used[i]); + } + } + if (tb->vn_buf) + reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); +} diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index 08d0508c2d3..37c1306eb9b 100644 --- a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -22,7 +22,6 @@ #include <asm/types.h> #include <asm/bug.h> - #define DELTA 0x9E3779B9 #define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ #define PARTROUNDS 6 /* 6 gets complete mixing */ @@ -48,105 +47,75 @@ h1 += b1; \ } while(0) - u32 keyed_hash(const signed char *msg, int len) { - u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3}; + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 }; u32 h0 = k[0], h1 = k[1]; u32 a, b, c, d; u32 pad; int i; - - // assert(len >= 0 && len < 256); - pad = (u32)len | ((u32)len << 8); + // assert(len >= 0 && len < 256); + + pad = (u32) len | ((u32) len << 8); pad |= pad << 16; - while(len >= 16) - { - a = 
(u32)msg[ 0] | - (u32)msg[ 1] << 8 | - (u32)msg[ 2] << 16| - (u32)msg[ 3] << 24; - b = (u32)msg[ 4] | - (u32)msg[ 5] << 8 | - (u32)msg[ 6] << 16| - (u32)msg[ 7] << 24; - c = (u32)msg[ 8] | - (u32)msg[ 9] << 8 | - (u32)msg[10] << 16| - (u32)msg[11] << 24; - d = (u32)msg[12] | - (u32)msg[13] << 8 | - (u32)msg[14] << 16| - (u32)msg[15] << 24; - + while (len >= 16) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; + d = (u32) msg[12] | + (u32) msg[13] << 8 | + (u32) msg[14] << 16 | (u32) msg[15] << 24; + TEACORE(PARTROUNDS); len -= 16; msg += 16; } - if (len >= 12) - { - a = (u32)msg[ 0] | - (u32)msg[ 1] << 8 | - (u32)msg[ 2] << 16| - (u32)msg[ 3] << 24; - b = (u32)msg[ 4] | - (u32)msg[ 5] << 8 | - (u32)msg[ 6] << 16| - (u32)msg[ 7] << 24; - c = (u32)msg[ 8] | - (u32)msg[ 9] << 8 | - (u32)msg[10] << 16| - (u32)msg[11] << 24; + if (len >= 12) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; + c = (u32) msg[8] | + (u32) msg[9] << 8 | + (u32) msg[10] << 16 | (u32) msg[11] << 24; d = pad; - for(i = 12; i < len; i++) - { + for (i = 12; i < len; i++) { d <<= 8; d |= msg[i]; } - } - else if (len >= 8) - { - a = (u32)msg[ 0] | - (u32)msg[ 1] << 8 | - (u32)msg[ 2] << 16| - (u32)msg[ 3] << 24; - b = (u32)msg[ 4] | - (u32)msg[ 5] << 8 | - (u32)msg[ 6] << 16| - (u32)msg[ 7] << 24; + } else if (len >= 8) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; + b = (u32) msg[4] | + (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24; c = d = pad; - for(i = 8; i < len; i++) - { + for (i = 8; i < len; i++) { c <<= 8; c |= msg[i]; } - } - else if (len >= 4) - { - a = (u32)msg[ 0] | - (u32)msg[ 1] << 8 | - (u32)msg[ 2] << 16| 
- (u32)msg[ 3] << 24; + } else if (len >= 4) { + a = (u32) msg[0] | + (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24; b = c = d = pad; - for(i = 4; i < len; i++) - { + for (i = 4; i < len; i++) { b <<= 8; b |= msg[i]; } - } - else - { + } else { a = b = c = d = pad; - for(i = 0; i < len; i++) - { + for (i = 0; i < len; i++) { a <<= 8; a |= msg[i]; } @@ -155,55 +124,59 @@ u32 keyed_hash(const signed char *msg, int len) TEACORE(FULLROUNDS); /* return 0;*/ - return h0^h1; + return h0 ^ h1; } /* What follows in this file is copyright 2000 by Hans Reiser, and the * licensing of what follows is governed by reiserfs/README */ -u32 yura_hash (const signed char *msg, int len) +u32 yura_hash(const signed char *msg, int len) { - int j, pow; - u32 a, c; - int i; - - for (pow=1,i=1; i < len; i++) pow = pow * 10; - - if (len == 1) - a = msg[0]-48; - else - a = (msg[0] - 48) * pow; - - for (i=1; i < len; i++) { - c = msg[i] - 48; - for (pow=1,j=i; j < len-1; j++) pow = pow * 10; - a = a + c * pow; - } - - for (; i < 40; i++) { - c = '0' - 48; - for (pow=1,j=i; j < len-1; j++) pow = pow * 10; - a = a + c * pow; - } - - for (; i < 256; i++) { - c = i; - for (pow=1,j=i; j < len-1; j++) pow = pow * 10; - a = a + c * pow; - } - - a = a << 7; - return a; + int j, pow; + u32 a, c; + int i; + + for (pow = 1, i = 1; i < len; i++) + pow = pow * 10; + + if (len == 1) + a = msg[0] - 48; + else + a = (msg[0] - 48) * pow; + + for (i = 1; i < len; i++) { + c = msg[i] - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 40; i++) { + c = '0' - 48; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + for (; i < 256; i++) { + c = i; + for (pow = 1, j = i; j < len - 1; j++) + pow = pow * 10; + a = a + c * pow; + } + + a = a << 7; + return a; } -u32 r5_hash (const signed char *msg, int len) +u32 r5_hash(const signed char *msg, int len) { - u32 a=0; - while(*msg) { - a += *msg << 4; - a += *msg >> 4; - a *= 
11; - msg++; - } - return a; + u32 a = 0; + while (*msg) { + a += *msg << 4; + a += *msg >> 4; + a *= 11; + msg++; + } + return a; } diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index a362125da0d..6c5a726fd34 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -10,13 +10,8 @@ #include <linux/buffer_head.h> /* this is one and only function that is used outside (do_balance.c) */ -int balance_internal ( - struct tree_balance * , - int, - int, - struct item_head * , - struct buffer_head ** - ); +int balance_internal(struct tree_balance *, + int, int, struct item_head *, struct buffer_head **); /* modes of internal_shift_left, internal_shift_right and internal_insert_childs */ #define INTERNAL_SHIFT_FROM_S_TO_L 0 @@ -27,464 +22,474 @@ int balance_internal ( #define INTERNAL_INSERT_TO_L 5 #define INTERNAL_INSERT_TO_R 6 -static void internal_define_dest_src_infos ( - int shift_mode, - struct tree_balance * tb, - int h, - struct buffer_info * dest_bi, - struct buffer_info * src_bi, - int * d_key, - struct buffer_head ** cf - ) +static void internal_define_dest_src_infos(int shift_mode, + struct tree_balance *tb, + int h, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *d_key, struct buffer_head **cf) { - memset (dest_bi, 0, sizeof (struct buffer_info)); - memset (src_bi, 0, sizeof (struct buffer_info)); - /* define dest, src, dest parent, dest position */ - switch (shift_mode) { - case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ - src_bi->tb = tb; - src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); - src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); - src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[h]; - dest_bi->bi_parent = tb->FL[h]; - dest_bi->bi_position = get_left_neighbor_position (tb, h); - *d_key = tb->lkey[h]; - *cf = tb->CFL[h]; - break; - case INTERNAL_SHIFT_FROM_L_TO_S: - src_bi->tb = tb; - src_bi->bi_bh = tb->L[h]; - src_bi->bi_parent 
= tb->FL[h]; - src_bi->bi_position = get_left_neighbor_position (tb, h); - dest_bi->tb = tb; - dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); - dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); - dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ - *d_key = tb->lkey[h]; - *cf = tb->CFL[h]; - break; - - case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ - src_bi->tb = tb; - src_bi->bi_bh = tb->R[h]; - src_bi->bi_parent = tb->FR[h]; - src_bi->bi_position = get_right_neighbor_position (tb, h); - dest_bi->tb = tb; - dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); - dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); - dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); - *d_key = tb->rkey[h]; - *cf = tb->CFR[h]; - break; - - case INTERNAL_SHIFT_FROM_S_TO_R: - src_bi->tb = tb; - src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); - src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); - src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[h]; - dest_bi->bi_parent = tb->FR[h]; - dest_bi->bi_position = get_right_neighbor_position (tb, h); - *d_key = tb->rkey[h]; - *cf = tb->CFR[h]; - break; - - case INTERNAL_INSERT_TO_L: - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[h]; - dest_bi->bi_parent = tb->FL[h]; - dest_bi->bi_position = get_left_neighbor_position (tb, h); - break; - - case INTERNAL_INSERT_TO_S: - dest_bi->tb = tb; - dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); - dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); - dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); - break; - - case INTERNAL_INSERT_TO_R: - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[h]; - dest_bi->bi_parent = tb->FR[h]; - dest_bi->bi_position = get_right_neighbor_position (tb, h); - break; - - default: - reiserfs_panic (tb->tb_sb, "internal_define_dest_src_infos: shift type is unknown (%d)", shift_mode); - } + memset(dest_bi, 0, 
sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + case INTERNAL_SHIFT_FROM_L_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->L[h]; + src_bi->bi_parent = tb->FL[h]; + src_bi->bi_position = get_left_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + + case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[h]; + src_bi->bi_parent = tb->FR[h]; + src_bi->bi_position = get_right_neighbor_position(tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_SHIFT_FROM_S_TO_R: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_INSERT_TO_L: + dest_bi->tb = tb; + 
dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position(tb, h); + break; + + case INTERNAL_INSERT_TO_S: + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + break; + + case INTERNAL_INSERT_TO_R: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position(tb, h); + break; + + default: + reiserfs_panic(tb->tb_sb, + "internal_define_dest_src_infos: shift type is unknown (%d)", + shift_mode); + } } - - /* Insert count node pointers into buffer cur before position to + 1. * Insert count items into buffer cur before position to. * Items and node pointers are specified by inserted and bh respectively. - */ -static void internal_insert_childs (struct buffer_info * cur_bi, - int to, int count, - struct item_head * inserted, - struct buffer_head ** bh - ) + */ +static void internal_insert_childs(struct buffer_info *cur_bi, + int to, int count, + struct item_head *inserted, + struct buffer_head **bh) { - struct buffer_head * cur = cur_bi->bi_bh; - struct block_head * blkh; - int nr; - struct reiserfs_key * ih; - struct disk_child new_dc[2]; - struct disk_child * dc; - int i; - - if (count <= 0) - return; - - blkh = B_BLK_HEAD(cur); - nr = blkh_nr_item(blkh); - - RFALSE( count > 2, - "too many children (%d) are to be inserted", count); - RFALSE( B_FREE_SPACE (cur) < count * (KEY_SIZE + DC_SIZE), - "no enough free space (%d), needed %d bytes", - B_FREE_SPACE (cur), count * (KEY_SIZE + DC_SIZE)); - - /* prepare space for count disk_child */ - dc = B_N_CHILD(cur,to+1); - - memmove (dc + count, dc, (nr+1-(to+1)) * DC_SIZE); - - /* copy to_be_insert disk children */ - for (i = 0; i < count; i ++) { - put_dc_size( &(new_dc[i]), MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); - put_dc_block_number( &(new_dc[i]), 
bh[i]->b_blocknr ); - } - memcpy (dc, new_dc, DC_SIZE * count); - - - /* prepare space for count items */ - ih = B_N_PDELIM_KEY (cur, ((to == -1) ? 0 : to)); - - memmove (ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); - - /* copy item headers (keys) */ - memcpy (ih, inserted, KEY_SIZE); - if ( count > 1 ) - memcpy (ih + 1, inserted + 1, KEY_SIZE); - - /* sizes, item number */ - set_blkh_nr_item( blkh, blkh_nr_item(blkh) + count ); - set_blkh_free_space( blkh, - blkh_free_space(blkh) - count * (DC_SIZE + KEY_SIZE ) ); - - do_balance_mark_internal_dirty (cur_bi->tb, cur,0); - - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - check_internal (cur); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - - if (cur_bi->bi_parent) { - struct disk_child *t_dc = B_N_CHILD (cur_bi->bi_parent,cur_bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); - do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, 0); - - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - check_internal (cur_bi->bi_parent); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - } + struct buffer_head *cur = cur_bi->bi_bh; + struct block_head *blkh; + int nr; + struct reiserfs_key *ih; + struct disk_child new_dc[2]; + struct disk_child *dc; + int i; + + if (count <= 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + RFALSE(count > 2, "too many children (%d) are to be inserted", count); + RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE), + "no enough free space (%d), needed %d bytes", + B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE)); + + /* prepare space for count disk_child */ + dc = B_N_CHILD(cur, to + 1); + + memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE); + + /* copy to_be_insert disk children */ + for (i = 0; i < count; i++) { + put_dc_size(&(new_dc[i]), + MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i])); + put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr); + } + memcpy(dc, new_dc, DC_SIZE * count); + + /* prepare space for count items */ + ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 
0 : to)); + + memmove(ih + count, ih, + (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); + + /* copy item headers (keys) */ + memcpy(ih, inserted, KEY_SIZE); + if (count > 1) + memcpy(ih + 1, inserted + 1, KEY_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - count * (DC_SIZE + + KEY_SIZE)); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE))); + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } } - /* Delete del_num items and node pointers from buffer cur starting from * * the first_i'th item and first_p'th pointers respectively. 
*/ -static void internal_delete_pointers_items ( - struct buffer_info * cur_bi, - int first_p, - int first_i, - int del_num - ) +static void internal_delete_pointers_items(struct buffer_info *cur_bi, + int first_p, + int first_i, int del_num) { - struct buffer_head * cur = cur_bi->bi_bh; - int nr; - struct block_head * blkh; - struct reiserfs_key * key; - struct disk_child * dc; - - RFALSE( cur == NULL, "buffer is 0"); - RFALSE( del_num < 0, - "negative number of items (%d) can not be deleted", del_num); - RFALSE( first_p < 0 || first_p + del_num > B_NR_ITEMS (cur) + 1 || first_i < 0, - "first pointer order (%d) < 0 or " - "no so many pointers (%d), only (%d) or " - "first key order %d < 0", first_p, - first_p + del_num, B_NR_ITEMS (cur) + 1, first_i); - if ( del_num == 0 ) - return; - - blkh = B_BLK_HEAD(cur); - nr = blkh_nr_item(blkh); - - if ( first_p == 0 && del_num == nr + 1 ) { - RFALSE( first_i != 0, "1st deleted key must have order 0, not %d", first_i); - make_empty_node (cur_bi); - return; - } - - RFALSE( first_i + del_num > B_NR_ITEMS (cur), - "first_i = %d del_num = %d " - "no so many keys (%d) in the node (%b)(%z)", - first_i, del_num, first_i + del_num, cur, cur); - - - /* deleting */ - dc = B_N_CHILD (cur, first_p); - - memmove (dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); - key = B_N_PDELIM_KEY (cur, first_i); - memmove (key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE); - - - /* sizes, item number */ - set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num ); - set_blkh_free_space( blkh, - blkh_free_space(blkh) + (del_num * (KEY_SIZE + DC_SIZE) ) ); - - do_balance_mark_internal_dirty (cur_bi->tb, cur, 0); - /*&&&&&&&&&&&&&&&&&&&&&&&*/ - check_internal (cur); - /*&&&&&&&&&&&&&&&&&&&&&&&*/ - - if (cur_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD (cur_bi->bi_parent, cur_bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE) ) ); - - 
do_balance_mark_internal_dirty (cur_bi->tb, cur_bi->bi_parent,0); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - check_internal (cur_bi->bi_parent); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - } -} + struct buffer_head *cur = cur_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + RFALSE(cur == NULL, "buffer is 0"); + RFALSE(del_num < 0, + "negative number of items (%d) can not be deleted", del_num); + RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1 + || first_i < 0, + "first pointer order (%d) < 0 or " + "no so many pointers (%d), only (%d) or " + "first key order %d < 0", first_p, first_p + del_num, + B_NR_ITEMS(cur) + 1, first_i); + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(cur); + nr = blkh_nr_item(blkh); + + if (first_p == 0 && del_num == nr + 1) { + RFALSE(first_i != 0, + "1st deleted key must have order 0, not %d", first_i); + make_empty_node(cur_bi); + return; + } + RFALSE(first_i + del_num > B_NR_ITEMS(cur), + "first_i = %d del_num = %d " + "no so many keys (%d) in the node (%b)(%z)", + first_i, del_num, first_i + del_num, cur, cur); + + /* deleting */ + dc = B_N_CHILD(cur, first_p); + + memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); + key = B_N_PDELIM_KEY(cur, first_i); + memmove(key, key + del_num, + (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - + del_num) * DC_SIZE); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + + (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur, 0); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(cur); + /*&&&&&&&&&&&&&&&&&&&&&&& */ + + if (cur_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE))); + + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + 
check_internal(cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } +} /* delete n node pointers and items starting from given position */ -static void internal_delete_childs (struct buffer_info * cur_bi, - int from, int n) +static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n) { - int i_from; + int i_from; - i_from = (from == 0) ? from : from - 1; + i_from = (from == 0) ? from : from - 1; - /* delete n pointers starting from `from' position in CUR; - delete n keys starting from 'i_from' position in CUR; - */ - internal_delete_pointers_items (cur_bi, from, i_from, n); + /* delete n pointers starting from `from' position in CUR; + delete n keys starting from 'i_from' position in CUR; + */ + internal_delete_pointers_items(cur_bi, from, i_from, n); } - /* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest */ -static void internal_copy_pointers_items ( - struct buffer_info * dest_bi, - struct buffer_head * src, - int last_first, int cpy_num - ) +static void internal_copy_pointers_items(struct buffer_info *dest_bi, + struct buffer_head *src, + int last_first, int cpy_num) { - /* ATTENTION! 
Number of node pointers in DEST is equal to number of items in DEST * - * as delimiting key have already inserted to buffer dest.*/ - struct buffer_head * dest = dest_bi->bi_bh; - int nr_dest, nr_src; - int dest_order, src_order; - struct block_head * blkh; - struct reiserfs_key * key; - struct disk_child * dc; - - nr_src = B_NR_ITEMS (src); - - RFALSE( dest == NULL || src == NULL, - "src (%p) or dest (%p) buffer is 0", src, dest); - RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, - "invalid last_first parameter (%d)", last_first); - RFALSE( nr_src < cpy_num - 1, - "no so many items (%d) in src (%d)", cpy_num, nr_src); - RFALSE( cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); - RFALSE( cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), - "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", - cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); - - if ( cpy_num == 0 ) - return; + /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST * + * as delimiting key have already inserted to buffer dest.*/ + struct buffer_head *dest = dest_bi->bi_bh; + int nr_dest, nr_src; + int dest_order, src_order; + struct block_head *blkh; + struct reiserfs_key *key; + struct disk_child *dc; + + nr_src = B_NR_ITEMS(src); + + RFALSE(dest == NULL || src == NULL, + "src (%p) or dest (%p) buffer is 0", src, dest); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "invalid last_first parameter (%d)", last_first); + RFALSE(nr_src < cpy_num - 1, + "no so many items (%d) in src (%d)", cpy_num, nr_src); + RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num); + RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest), + "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)", + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); + + if (cpy_num == 0) + return; /* coping */ - blkh = B_BLK_HEAD(dest); - nr_dest = blkh_nr_item(blkh); + blkh = B_BLK_HEAD(dest); + nr_dest = blkh_nr_item(blkh); - 
/*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest;*/ - /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0;*/ - (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = nr_src - cpy_num + 1) : - (dest_order = nr_dest, src_order = 0); + /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */ + /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */ + (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = + nr_src - cpy_num + 1) : (dest_order = + nr_dest, + src_order = + 0); - /* prepare space for cpy_num pointers */ - dc = B_N_CHILD (dest, dest_order); + /* prepare space for cpy_num pointers */ + dc = B_N_CHILD(dest, dest_order); - memmove (dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); + memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); /* insert pointers */ - memcpy (dc, B_N_CHILD (src, src_order), DC_SIZE * cpy_num); - - - /* prepare space for cpy_num - 1 item headers */ - key = B_N_PDELIM_KEY(dest, dest_order); - memmove (key + cpy_num - 1, key, - KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num)); - - - /* insert headers */ - memcpy (key, B_N_PDELIM_KEY (src, src_order), KEY_SIZE * (cpy_num - 1)); - - /* sizes, item number */ - set_blkh_nr_item( blkh, blkh_nr_item(blkh) + (cpy_num - 1 ) ); - set_blkh_free_space( blkh, - blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num ) ); - - do_balance_mark_internal_dirty (dest_bi->tb, dest, 0); - - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - check_internal (dest); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - - if (dest_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num) ); - - do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - check_internal (dest_bi->bi_parent); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - } + memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * 
cpy_num); + + /* prepare space for cpy_num - 1 item headers */ + key = B_N_PDELIM_KEY(dest, dest_order); + memmove(key + cpy_num - 1, key, + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + + cpy_num)); + + /* insert headers */ + memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1)); + set_blkh_free_space(blkh, + blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(dest_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + } } - /* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest. * Delete cpy_num - del_par items and node pointers from buffer src. * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. 
*/ -static void internal_move_pointers_items (struct buffer_info * dest_bi, - struct buffer_info * src_bi, - int last_first, int cpy_num, int del_par) +static void internal_move_pointers_items(struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int last_first, int cpy_num, + int del_par) { - int first_pointer; - int first_item; - - internal_copy_pointers_items (dest_bi, src_bi->bi_bh, last_first, cpy_num); - - if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ - first_pointer = 0; - first_item = 0; - /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, - for key - with first_item */ - internal_delete_pointers_items (src_bi, first_pointer, first_item, cpy_num - del_par); - } else { /* shift_right occurs */ - int i, j; - - i = ( cpy_num - del_par == ( j = B_NR_ITEMS(src_bi->bi_bh)) + 1 ) ? 0 : j - cpy_num + del_par; - - internal_delete_pointers_items (src_bi, j + 1 - cpy_num + del_par, i, cpy_num - del_par); - } + int first_pointer; + int first_item; + + internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first, + cpy_num); + + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ + first_pointer = 0; + first_item = 0; + /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, + for key - with first_item */ + internal_delete_pointers_items(src_bi, first_pointer, + first_item, cpy_num - del_par); + } else { /* shift_right occurs */ + int i, j; + + i = (cpy_num - del_par == + (j = + B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num + + del_par; + + internal_delete_pointers_items(src_bi, + j + 1 - cpy_num + del_par, i, + cpy_num - del_par); + } } /* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. 
*/ -static void internal_insert_key (struct buffer_info * dest_bi, - int dest_position_before, /* insert key before key with n_dest number */ - struct buffer_head * src, - int src_position) +static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before, /* insert key before key with n_dest number */ + struct buffer_head *src, int src_position) { - struct buffer_head * dest = dest_bi->bi_bh; - int nr; - struct block_head * blkh; - struct reiserfs_key * key; - - RFALSE( dest == NULL || src == NULL, - "source(%p) or dest(%p) buffer is 0", src, dest); - RFALSE( dest_position_before < 0 || src_position < 0, - "source(%d) or dest(%d) key number less than 0", - src_position, dest_position_before); - RFALSE( dest_position_before > B_NR_ITEMS (dest) || - src_position >= B_NR_ITEMS(src), - "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", - dest_position_before, B_NR_ITEMS (dest), - src_position, B_NR_ITEMS(src)); - RFALSE( B_FREE_SPACE (dest) < KEY_SIZE, - "no enough free space (%d) in dest buffer", B_FREE_SPACE (dest)); - - blkh = B_BLK_HEAD(dest); - nr = blkh_nr_item(blkh); - - /* prepare space for inserting key */ - key = B_N_PDELIM_KEY (dest, dest_position_before); - memmove (key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); - - /* insert key */ - memcpy (key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); - - /* Change dirt, free space, item number fields. 
*/ - - set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 ); - set_blkh_free_space( blkh, blkh_free_space(blkh) - KEY_SIZE ); - - do_balance_mark_internal_dirty (dest_bi->tb, dest, 0); - - if (dest_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) + KEY_SIZE ); - - do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0); - } + struct buffer_head *dest = dest_bi->bi_bh; + int nr; + struct block_head *blkh; + struct reiserfs_key *key; + + RFALSE(dest == NULL || src == NULL, + "source(%p) or dest(%p) buffer is 0", src, dest); + RFALSE(dest_position_before < 0 || src_position < 0, + "source(%d) or dest(%d) key number less than 0", + src_position, dest_position_before); + RFALSE(dest_position_before > B_NR_ITEMS(dest) || + src_position >= B_NR_ITEMS(src), + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", + dest_position_before, B_NR_ITEMS(dest), + src_position, B_NR_ITEMS(src)); + RFALSE(B_FREE_SPACE(dest) < KEY_SIZE, + "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest)); + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + + /* prepare space for inserting key */ + key = B_N_PDELIM_KEY(dest, dest_position_before); + memmove(key + 1, key, + (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); + + /* insert key */ + memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); + + /* Change dirt, free space, item number fields. */ + + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } } - - /* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. 
* Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. * Replace d_key'th key in buffer cfl. * Delete pointer_amount items and node pointers from buffer src. */ /* this can be invoked both to shift from S to L and from R to S */ -static void internal_shift_left ( - int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ - struct tree_balance * tb, - int h, - int pointer_amount - ) +static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ + struct tree_balance *tb, + int h, int pointer_amount) { - struct buffer_info dest_bi, src_bi; - struct buffer_head * cf; - int d_key_position; - - internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); - - /*printk("pointer_amount = %d\n",pointer_amount);*/ - - if (pointer_amount) { - /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ - internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); - - if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { - if (src_bi.bi_position/*src->b_item_order*/ == 0) - replace_key (tb, cf, d_key_position, src_bi.bi_parent/*src->b_parent*/, 0); - } else - replace_key (tb, cf, d_key_position, src_bi.bi_bh, pointer_amount - 1); - } - /* last parameter is del_parameter */ - internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 0); + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + /*printk("pointer_amount = %d\n",pointer_amount); */ + + if (pointer_amount) { + /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { + if (src_bi.bi_position /*src->b_item_order */ == 0) + 
replace_key(tb, cf, d_key_position, + src_bi. + bi_parent /*src->b_parent */ , 0); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + pointer_amount - 1); + } + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 0); } @@ -493,67 +498,66 @@ static void internal_shift_left ( * Delete n - 1 items and node pointers from buffer S[h]. */ /* it always shifts from S[h] to L[h] */ -static void internal_shift1_left ( - struct tree_balance * tb, - int h, - int pointer_amount - ) +static void internal_shift1_left(struct tree_balance *tb, + int h, int pointer_amount) { - struct buffer_info dest_bi, src_bi; - struct buffer_head * cf; - int d_key_position; + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; - internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); - if ( pointer_amount > 0 ) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ - internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); - /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]);*/ + if (pointer_amount > 0) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, + d_key_position); + /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */ - /* last parameter is del_parameter */ - internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1); - /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1);*/ + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST, + pointer_amount, 1); + /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */ } - /* 
Insert d_key'th (delimiting) key from buffer cfr to head of dest. * Copy n node pointers and n - 1 items from buffer src to buffer dest. * Replace d_key'th key in buffer cfr. * Delete n items and node pointers from buffer src. */ -static void internal_shift_right ( - int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ - struct tree_balance * tb, - int h, - int pointer_amount - ) +static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ + struct tree_balance *tb, + int h, int pointer_amount) { - struct buffer_info dest_bi, src_bi; - struct buffer_head * cf; - int d_key_position; - int nr; - - - internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); - - nr = B_NR_ITEMS (src_bi.bi_bh); - - if (pointer_amount > 0) { - /* insert delimiting key from common father of dest and src to dest node into position 0 */ - internal_insert_key (&dest_bi, 0, cf, d_key_position); - if (nr == pointer_amount - 1) { - RFALSE( src_bi.bi_bh != PATH_H_PBUFFER (tb->tb_path, h)/*tb->S[h]*/ || - dest_bi.bi_bh != tb->R[h], - "src (%p) must be == tb->S[h](%p) when it disappears", - src_bi.bi_bh, PATH_H_PBUFFER (tb->tb_path, h)); - /* when S[h] disappers replace left delemiting key as well */ - if (tb->CFL[h]) - replace_key (tb, cf, d_key_position, tb->CFL[h], tb->lkey[h]); - } else - replace_key (tb, cf, d_key_position, src_bi.bi_bh, nr - pointer_amount); - } - - /* last parameter is del_parameter */ - internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 0); + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + int nr; + + internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi, + &d_key_position, &cf); + + nr = B_NR_ITEMS(src_bi.bi_bh); + + if (pointer_amount > 0) { + /* insert delimiting key from common father of dest and src to dest node into position 0 */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + if (nr == pointer_amount - 1) { + 
RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ || + dest_bi.bi_bh != tb->R[h], + "src (%p) must be == tb->S[h](%p) when it disappears", + src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h)); + /* when S[h] disappers replace left delemiting key as well */ + if (tb->CFL[h]) + replace_key(tb, cf, d_key_position, tb->CFL[h], + tb->lkey[h]); + } else + replace_key(tb, cf, d_key_position, src_bi.bi_bh, + nr - pointer_amount); + } + + /* last parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 0); } /* Insert delimiting key to R[h]. @@ -561,498 +565,526 @@ static void internal_shift_right ( * Delete n - 1 items and node pointers from buffer S[h]. */ /* it always shift from S[h] to R[h] */ -static void internal_shift1_right ( - struct tree_balance * tb, - int h, - int pointer_amount - ) +static void internal_shift1_right(struct tree_balance *tb, + int h, int pointer_amount) { - struct buffer_info dest_bi, src_bi; - struct buffer_head * cf; - int d_key_position; - - internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); - - if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ - internal_insert_key (&dest_bi, 0, cf, d_key_position); - /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]);*/ - - /* last parameter is del_parameter */ - internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1); - /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1);*/ -} + struct buffer_info dest_bi, src_bi; + struct buffer_head *cf; + int d_key_position; + + internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + &dest_bi, &src_bi, &d_key_position, &cf); + + if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ + internal_insert_key(&dest_bi, 0, cf, d_key_position); + /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */ + /* last 
parameter is del_parameter */ + internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST, + pointer_amount, 1); + /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */ +} /* Delete insert_num node pointers together with their left items * and balance current node.*/ -static void balance_internal_when_delete (struct tree_balance * tb, - int h, int child_pos) +static void balance_internal_when_delete(struct tree_balance *tb, + int h, int child_pos) { - int insert_num; - int n; - struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); - struct buffer_info bi; - - insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); - - /* delete child-node-pointer(s) together with their left item(s) */ - bi.tb = tb; - bi.bi_bh = tbSh; - bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); - bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); - - internal_delete_childs (&bi, child_pos, -insert_num); - - RFALSE( tb->blknum[h] > 1, - "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); - - n = B_NR_ITEMS(tbSh); - - if ( tb->lnum[h] == 0 && tb->rnum[h] == 0 ) { - if ( tb->blknum[h] == 0 ) { - /* node S[h] (root of the tree) is empty now */ - struct buffer_head *new_root; - - RFALSE( n || B_FREE_SPACE (tbSh) != MAX_CHILD_SIZE(tbSh) - DC_SIZE, - "buffer must have only 0 keys (%d)", n); - RFALSE( bi.bi_parent, "root has parent (%p)", bi.bi_parent); - - /* choose a new root */ - if ( ! tb->L[h-1] || ! 
B_NR_ITEMS(tb->L[h-1]) ) - new_root = tb->R[h-1]; - else - new_root = tb->L[h-1]; - /* switch super block's tree root block number to the new value */ - PUT_SB_ROOT_BLOCK( tb->tb_sb, new_root->b_blocknr ); - //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; - PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1 ); - - do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); - /*&&&&&&&&&&&&&&&&&&&&&&*/ - if (h > 1) - /* use check_internal if new root is an internal node */ - check_internal (new_root); - /*&&&&&&&&&&&&&&&&&&&&&&*/ - - /* do what is needed for buffer thrown from tree */ - reiserfs_invalidate_buffer(tb, tbSh); - return; + int insert_num; + int n; + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); + + /* delete child-node-pointer(s) together with their left item(s) */ + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + internal_delete_childs(&bi, child_pos, -insert_num); + + RFALSE(tb->blknum[h] > 1, + "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]); + + n = B_NR_ITEMS(tbSh); + + if (tb->lnum[h] == 0 && tb->rnum[h] == 0) { + if (tb->blknum[h] == 0) { + /* node S[h] (root of the tree) is empty now */ + struct buffer_head *new_root; + + RFALSE(n + || B_FREE_SPACE(tbSh) != + MAX_CHILD_SIZE(tbSh) - DC_SIZE, + "buffer must have only 0 keys (%d)", n); + RFALSE(bi.bi_parent, "root has parent (%p)", + bi.bi_parent); + + /* choose a new root */ + if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1])) + new_root = tb->R[h - 1]; + else + new_root = tb->L[h - 1]; + /* switch super block's tree root block number to the new value */ + PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr); + //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; + PUT_SB_TREE_HEIGHT(tb->tb_sb, + SB_TREE_HEIGHT(tb->tb_sb) - 1); + + do_balance_mark_sb_dirty(tb, + 
REISERFS_SB(tb->tb_sb)->s_sbh, + 1); + /*&&&&&&&&&&&&&&&&&&&&&& */ + if (h > 1) + /* use check_internal if new root is an internal node */ + check_internal(new_root); + /*&&&&&&&&&&&&&&&&&&&&&& */ + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + return; + } + + if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { /* join S[h] with L[h] */ + + RFALSE(tb->rnum[h] != 0, + "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", + h, tb->rnum[h]); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + + if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { /* join S[h] with R[h] */ + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", + h, tb->lnum[h]); + + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); + + reiserfs_invalidate_buffer(tb, tbSh); + return; } - return; - } - - if ( tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1 ) { /* join S[h] with L[h] */ - - RFALSE( tb->rnum[h] != 0, - "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", - h, tb->rnum[h]); - - internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); - reiserfs_invalidate_buffer(tb, tbSh); - - return; - } - - if ( tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1 ) { /* join S[h] with R[h] */ - RFALSE( tb->lnum[h] != 0, - "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", - h, tb->lnum[h]); - - internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); - - reiserfs_invalidate_buffer(tb,tbSh); - return; - } - - if ( tb->lnum[h] < 0 ) { /* borrow from left neighbor L[h] */ - RFALSE( tb->rnum[h] != 0, - "wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]); - /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]);*/ - internal_shift_right (INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]); - return; - } - - if ( tb->rnum[h] < 0 ) { /* borrow 
from right neighbor R[h] */ - RFALSE( tb->lnum[h] != 0, - "invalid tb->lnum[%d]==%d when borrow from R[h]", - h, tb->lnum[h]); - internal_shift_left (INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]);*/ - return; - } - - if ( tb->lnum[h] > 0 ) { /* split S[h] into two parts and put them into neighbors */ - RFALSE( tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, - "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", - h, tb->lnum[h], h, tb->rnum[h], n); - - internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]);*/ - internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); - - reiserfs_invalidate_buffer (tb, tbSh); - - return; - } - reiserfs_panic (tb->tb_sb, "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", - h, tb->lnum[h], h, tb->rnum[h]); -} + if (tb->lnum[h] < 0) { /* borrow from left neighbor L[h] */ + RFALSE(tb->rnum[h] != 0, + "wrong tb->rnum[%d]==%d when borrow from L[h]", h, + tb->rnum[h]); + /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h, + -tb->lnum[h]); + return; + } + + if (tb->rnum[h] < 0) { /* borrow from right neighbor R[h] */ + RFALSE(tb->lnum[h] != 0, + "invalid tb->lnum[%d]==%d when borrow from R[h]", + h, tb->lnum[h]); + internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]); /*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */ + return; + } + + if (tb->lnum[h] > 0) { /* split S[h] into two parts and put them into neighbors */ + RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1, + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", + h, tb->lnum[h], h, tb->rnum[h], n); + + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, 
tb->lnum[h]); /*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + reiserfs_panic(tb->tb_sb, + "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + h, tb->lnum[h], h, tb->rnum[h]); +} /* Replace delimiting key of buffers L[h] and S[h] by the given key.*/ -static void replace_lkey ( - struct tree_balance * tb, - int h, - struct item_head * key - ) +static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key) { - RFALSE( tb->L[h] == NULL || tb->CFL[h] == NULL, - "L[h](%p) and CFL[h](%p) must exist in replace_lkey", - tb->L[h], tb->CFL[h]); + RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL, + "L[h](%p) and CFL[h](%p) must exist in replace_lkey", + tb->L[h], tb->CFL[h]); - if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) - return; + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) + return; - memcpy (B_N_PDELIM_KEY(tb->CFL[h],tb->lkey[h]), key, KEY_SIZE); + memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE); - do_balance_mark_internal_dirty (tb, tb->CFL[h],0); + do_balance_mark_internal_dirty(tb, tb->CFL[h], 0); } - /* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ -static void replace_rkey ( - struct tree_balance * tb, - int h, - struct item_head * key - ) +static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key) { - RFALSE( tb->R[h] == NULL || tb->CFR[h] == NULL, - "R[h](%p) and CFR[h](%p) must exist in replace_rkey", - tb->R[h], tb->CFR[h]); - RFALSE( B_NR_ITEMS(tb->R[h]) == 0, - "R[h] can not be empty if it exists (item number=%d)", - B_NR_ITEMS(tb->R[h])); + RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL, + "R[h](%p) and CFR[h](%p) must exist in replace_rkey", + tb->R[h], tb->CFR[h]); + RFALSE(B_NR_ITEMS(tb->R[h]) == 0, + "R[h] can not be empty if it exists (item number=%d)", + B_NR_ITEMS(tb->R[h])); - memcpy 
(B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]), key, KEY_SIZE); + memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE); - do_balance_mark_internal_dirty (tb, tb->CFR[h], 0); + do_balance_mark_internal_dirty(tb, tb->CFR[h], 0); } - -int balance_internal (struct tree_balance * tb, /* tree_balance structure */ - int h, /* level of the tree */ - int child_pos, - struct item_head * insert_key, /* key for insertion on higher level */ - struct buffer_head ** insert_ptr /* node for insertion on higher level*/ +int balance_internal(struct tree_balance *tb, /* tree_balance structure */ + int h, /* level of the tree */ + int child_pos, struct item_head *insert_key, /* key for insertion on higher level */ + struct buffer_head **insert_ptr /* node for insertion on higher level */ ) /* if inserting/pasting { - child_pos is the position of the node-pointer in S[h] that * - pointed to S[h-1] before balancing of the h-1 level; * + child_pos is the position of the node-pointer in S[h] that * + pointed to S[h-1] before balancing of the h-1 level; * this means that new pointers and items must be inserted AFTER * child_pos } else { - it is the position of the leftmost pointer that must be deleted (together with - its corresponding key to the left of the pointer) - as a result of the previous level's balancing. - } -*/ + it is the position of the leftmost pointer that must be deleted (together with + its corresponding key to the left of the pointer) + as a result of the previous level's balancing. 
+ } + */ { - struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); - struct buffer_info bi; - int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ - int insert_num, n, k; - struct buffer_head * S_new; - struct item_head new_insert_key; - struct buffer_head * new_insert_ptr = NULL; - struct item_head * new_insert_key_addr = insert_key; - - RFALSE( h < 1, "h (%d) can not be < 1 on internal level", h); - - PROC_INFO_INC( tb -> tb_sb, balance_at[ h ] ); - - order = ( tbSh ) ? PATH_H_POSITION (tb->tb_path, h + 1)/*tb->S[h]->b_item_order*/ : 0; - - /* Using insert_size[h] calculate the number insert_num of items - that must be inserted to or deleted from S[h]. */ - insert_num = tb->insert_size[h]/((int)(KEY_SIZE + DC_SIZE)); - - /* Check whether insert_num is proper **/ - RFALSE( insert_num < -2 || insert_num > 2, - "incorrect number of items inserted to the internal node (%d)", - insert_num); - RFALSE( h > 1 && (insert_num > 1 || insert_num < -1), - "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", - insert_num, h); - - /* Make balance in case insert_num < 0 */ - if ( insert_num < 0 ) { - balance_internal_when_delete (tb, h, child_pos); - return order; - } - - k = 0; - if ( tb->lnum[h] > 0 ) { - /* shift lnum[h] items from S[h] to the left neighbor L[h]. 
- check how many of new items fall into L[h] or CFL[h] after - shifting */ - n = B_NR_ITEMS (tb->L[h]); /* number of items in L[h] */ - if ( tb->lnum[h] <= child_pos ) { - /* new items don't fall into L[h] or CFL[h] */ - internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); - /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]);*/ - child_pos -= tb->lnum[h]; - } else if ( tb->lnum[h] > child_pos + insert_num ) { - /* all new items fall into L[h] */ - internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num); - /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, - tb->lnum[h]-insert_num); - */ - /* insert insert_num keys and node-pointers into L[h] */ - bi.tb = tb; - bi.bi_bh = tb->L[h]; - bi.bi_parent = tb->FL[h]; - bi.bi_position = get_left_neighbor_position (tb, h); - internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next*/ n + child_pos + 1, - insert_num,insert_key,insert_ptr); - - insert_num = 0; - } else { - struct disk_child * dc; - - /* some items fall into L[h] or CFL[h], but some don't fall */ - internal_shift1_left(tb,h,child_pos+1); - /* calculate number of new items that fall into L[h] */ - k = tb->lnum[h] - child_pos - 1; - bi.tb = tb; - bi.bi_bh = tb->L[h]; - bi.bi_parent = tb->FL[h]; - bi.bi_position = get_left_neighbor_position (tb, h); - internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next,*/ n + child_pos + 1,k, - insert_key,insert_ptr); - - replace_lkey(tb,h,insert_key + k); - - /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ - dc = B_N_CHILD(tbSh, 0); - put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE (insert_ptr[k])); - put_dc_block_number( dc, insert_ptr[k]->b_blocknr ); - - do_balance_mark_internal_dirty (tb, tbSh, 0); - - k++; - insert_key += k; - insert_ptr += k; - insert_num -= k; - child_pos = 0; + struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h); + struct buffer_info bi; + int order; /* we return this: it is 0 if 
there is no S[h], else it is tb->S[h]->b_item_order */ + int insert_num, n, k; + struct buffer_head *S_new; + struct item_head new_insert_key; + struct buffer_head *new_insert_ptr = NULL; + struct item_head *new_insert_key_addr = insert_key; + + RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h); + + PROC_INFO_INC(tb->tb_sb, balance_at[h]); + + order = + (tbSh) ? PATH_H_POSITION(tb->tb_path, + h + 1) /*tb->S[h]->b_item_order */ : 0; + + /* Using insert_size[h] calculate the number insert_num of items + that must be inserted to or deleted from S[h]. */ + insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE)); + + /* Check whether insert_num is proper * */ + RFALSE(insert_num < -2 || insert_num > 2, + "incorrect number of items inserted to the internal node (%d)", + insert_num); + RFALSE(h > 1 && (insert_num > 1 || insert_num < -1), + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", + insert_num, h); + + /* Make balance in case insert_num < 0 */ + if (insert_num < 0) { + balance_internal_when_delete(tb, h, child_pos); + return order; } - } /* tb->lnum[h] > 0 */ - - if ( tb->rnum[h] > 0 ) { - /*shift rnum[h] items from S[h] to the right neighbor R[h]*/ - /* check how many of new items fall into R or CFR after shifting */ - n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ - if ( n - tb->rnum[h] >= child_pos ) - /* new items fall into S[h] */ - /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]);*/ - internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); - else - if ( n + insert_num - tb->rnum[h] < child_pos ) - { - /* all new items fall into R[h] */ - /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], - tb->rnum[h] - insert_num);*/ - internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num); - - /* insert insert_num keys and node-pointers into R[h] */ - bi.tb = tb; - bi.bi_bh = tb->R[h]; - bi.bi_parent = 
tb->FR[h]; - bi.bi_position = get_right_neighbor_position (tb, h); - internal_insert_childs (&bi, /*tb->R[h],tb->S[h-1]->b_next*/ child_pos - n - insert_num + tb->rnum[h] - 1, - insert_num,insert_key,insert_ptr); - insert_num = 0; - } - else - { - struct disk_child * dc; - - /* one of the items falls into CFR[h] */ - internal_shift1_right(tb,h,n - child_pos + 1); - /* calculate number of new items that fall into R[h] */ - k = tb->rnum[h] - n + child_pos - 1; - bi.tb = tb; - bi.bi_bh = tb->R[h]; - bi.bi_parent = tb->FR[h]; - bi.bi_position = get_right_neighbor_position (tb, h); - internal_insert_childs (&bi, /*tb->R[h], tb->R[h]->b_child,*/ 0, k, insert_key + 1, insert_ptr + 1); - replace_rkey(tb,h,insert_key + insert_num - k - 1); + k = 0; + if (tb->lnum[h] > 0) { + /* shift lnum[h] items from S[h] to the left neighbor L[h]. + check how many of new items fall into L[h] or CFL[h] after + shifting */ + n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */ + if (tb->lnum[h] <= child_pos) { + /* new items don't fall into L[h] or CFL[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h]); + /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */ + child_pos -= tb->lnum[h]; + } else if (tb->lnum[h] > child_pos + insert_num) { + /* all new items fall into L[h] */ + internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, + tb->lnum[h] - insert_num); + /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, + tb->lnum[h]-insert_num); + */ + /* insert insert_num keys and node-pointers into L[h] */ + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next */ + n + child_pos + 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into L[h] or CFL[h], but some don't fall */ + internal_shift1_left(tb, h, child_pos + 1); + /* calculate 
number of new items that fall into L[h] */ + k = tb->lnum[h] - child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->L[h], tb->S[h-1]->b_next, */ + n + child_pos + 1, k, + insert_key, insert_ptr); + + replace_lkey(tb, h, insert_key + k); + + /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ + dc = B_N_CHILD(tbSh, 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr[k]) - + B_FREE_SPACE(insert_ptr[k])); + put_dc_block_number(dc, insert_ptr[k]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tbSh, 0); + + k++; + insert_key += k; + insert_ptr += k; + insert_num -= k; + child_pos = 0; + } + } + /* tb->lnum[h] > 0 */ + if (tb->rnum[h] > 0) { + /*shift rnum[h] items from S[h] to the right neighbor R[h] */ + /* check how many of new items fall into R or CFR after shifting */ + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + if (n - tb->rnum[h] >= child_pos) + /* new items fall into S[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h]); + else if (n + insert_num - tb->rnum[h] < child_pos) { + /* all new items fall into R[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], + tb->rnum[h] - insert_num); */ + internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, + tb->rnum[h] - insert_num); + + /* insert insert_num keys and node-pointers into R[h] */ + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h],tb->S[h-1]->b_next */ + child_pos - n - insert_num + + tb->rnum[h] - 1, + insert_num, insert_key, + insert_ptr); + insert_num = 0; + } else { + struct disk_child *dc; + + /* one of the items falls into CFR[h] */ + internal_shift1_right(tb, h, n - child_pos + 1); + /* calculate number of new items 
that fall into R[h] */ + k = tb->rnum[h] - n + child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position(tb, h); + internal_insert_childs(&bi, + /*tb->R[h], tb->R[h]->b_child, */ + 0, k, insert_key + 1, + insert_ptr + 1); + + replace_rkey(tb, h, insert_key + insert_num - k - 1); + + /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */ + dc = B_N_CHILD(tb->R[h], 0); + put_dc_size(dc, + MAX_CHILD_SIZE(insert_ptr + [insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1])); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, tb->R[h], 0); + + insert_num -= (k + 1); + } + } - /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1]*/ - dc = B_N_CHILD(tb->R[h], 0); - put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - - B_FREE_SPACE (insert_ptr[insert_num-k-1])); - put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr ); + /** Fill new node that appears instead of S[h] **/ + RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); + RFALSE(tb->blknum[h] < 0, "blknum can not be < 0"); - do_balance_mark_internal_dirty (tb, tb->R[h],0); + if (!tb->blknum[h]) { /* node S[h] is empty now */ + RFALSE(!tbSh, "S[h] is equal NULL"); - insert_num -= (k + 1); - } - } + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return order; + } - /** Fill new node that appears instead of S[h] **/ - RFALSE( tb->blknum[h] > 2, "blknum can not be > 2 for internal level"); - RFALSE( tb->blknum[h] < 0, "blknum can not be < 0"); + if (!tbSh) { + /* create new root */ + struct disk_child *dc; + struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1); + struct block_head *blkh; - if ( ! tb->blknum[h] ) - { /* node S[h] is empty now */ - RFALSE( ! 
tbSh, "S[h] is equal NULL"); + if (tb->blknum[h] != 1) + reiserfs_panic(NULL, + "balance_internal: One new node required for creating the new root"); + /* S[h] = empty buffer from the list FEB. */ + tbSh = get_FEB(tb); + blkh = B_BLK_HEAD(tbSh); + set_blkh_level(blkh, h + 1); - /* do what is needed for buffer thrown from tree */ - reiserfs_invalidate_buffer(tb,tbSh); - return order; - } - - if ( ! tbSh ) { - /* create new root */ - struct disk_child * dc; - struct buffer_head * tbSh_1 = PATH_H_PBUFFER (tb->tb_path, h - 1); - struct block_head * blkh; - - - if ( tb->blknum[h] != 1 ) - reiserfs_panic(NULL, "balance_internal: One new node required for creating the new root"); - /* S[h] = empty buffer from the list FEB. */ - tbSh = get_FEB (tb); - blkh = B_BLK_HEAD(tbSh); - set_blkh_level( blkh, h + 1 ); - - /* Put the unique node-pointer to S[h] that points to S[h-1]. */ - - dc = B_N_CHILD(tbSh, 0); - put_dc_block_number( dc, tbSh_1->b_blocknr ); - put_dc_size( dc, (MAX_CHILD_SIZE (tbSh_1) - B_FREE_SPACE (tbSh_1))); - - tb->insert_size[h] -= DC_SIZE; - set_blkh_free_space( blkh, blkh_free_space(blkh) - DC_SIZE ); - - do_balance_mark_internal_dirty (tb, tbSh, 0); - - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - check_internal (tbSh); - /*&&&&&&&&&&&&&&&&&&&&&&&&*/ - - /* put new root into path structure */ - PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh; - - /* Change root in structure super block. */ - PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr ); - PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 ); - do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); - } - - if ( tb->blknum[h] == 2 ) { - int snum; - struct buffer_info dest_bi, src_bi; + /* Put the unique node-pointer to S[h] that points to S[h-1]. 
*/ + + dc = B_N_CHILD(tbSh, 0); + put_dc_block_number(dc, tbSh_1->b_blocknr); + put_dc_size(dc, + (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1))); + + tb->insert_size[h] -= DC_SIZE; + set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE); + do_balance_mark_internal_dirty(tb, tbSh, 0); - /* S_new = free buffer from list FEB */ - S_new = get_FEB(tb); - - set_blkh_level( B_BLK_HEAD(S_new), h + 1 ); - - dest_bi.tb = tb; - dest_bi.bi_bh = S_new; - dest_bi.bi_parent = NULL; - dest_bi.bi_position = 0; - src_bi.tb = tb; - src_bi.bi_bh = tbSh; - src_bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); - src_bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); - - n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ - snum = (insert_num + n + 1)/2; - if ( n - snum >= child_pos ) { - /* new items don't fall into S_new */ - /* store the delimiting key for the next level */ - /* new_insert_key = (n - snum)'th key in S[h] */ - memcpy (&new_insert_key,B_N_PDELIM_KEY(tbSh,n - snum), - KEY_SIZE); - /* last parameter is del_par */ - internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0); - /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0);*/ - } else if ( n + insert_num - snum < child_pos ) { - /* all new items fall into S_new */ - /* store the delimiting key for the next level */ - /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ - memcpy(&new_insert_key,B_N_PDELIM_KEY(tbSh,n + insert_num - snum), - KEY_SIZE); - /* last parameter is del_par */ - internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0); - /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0);*/ - - /* insert insert_num keys and node-pointers into S_new */ - internal_insert_childs (&dest_bi, /*S_new,tb->S[h-1]->b_next,*/child_pos - n - insert_num + snum - 1, - insert_num,insert_key,insert_ptr); - - insert_num = 0; - } else { - struct disk_child * dc; - - /* some items fall into S_new, but some don't fall */ - /* last 
parameter is del_par */ - internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1); - /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1);*/ - /* calculate number of new items that fall into S_new */ - k = snum - n + child_pos - 1; - - internal_insert_childs (&dest_bi, /*S_new,*/ 0, k, insert_key + 1, insert_ptr+1); - - /* new_insert_key = insert_key[insert_num - k - 1] */ - memcpy(&new_insert_key,insert_key + insert_num - k - 1, - KEY_SIZE); - /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ - - dc = B_N_CHILD(S_new,0); - put_dc_size( dc, (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - - B_FREE_SPACE(insert_ptr[insert_num-k-1])) ); - put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr ); - - do_balance_mark_internal_dirty (tb, S_new,0); - - insert_num -= (k + 1); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + check_internal(tbSh); + /*&&&&&&&&&&&&&&&&&&&&&&&& */ + + /* put new root into path structure */ + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = + tbSh; + + /* Change root in structure super block. */ + PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr); + PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1); + do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); } - /* new_insert_ptr = node_pointer to S_new */ - new_insert_ptr = S_new; - - RFALSE (!buffer_journaled(S_new) || buffer_journal_dirty(S_new) || - buffer_dirty (S_new), - "cm-00001: bad S_new (%b)", S_new); - - // S_new is released in unfix_nodes - } - - n = B_NR_ITEMS (tbSh); /*number of items in S[h] */ - - if ( 0 <= child_pos && child_pos <= n && insert_num > 0 ) { - bi.tb = tb; - bi.bi_bh = tbSh; - bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); - bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); - internal_insert_childs ( - &bi,/*tbSh,*/ - /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? 
tb->S[h-1]->b_next : tb->S[h]->b_child->b_next,*/ - child_pos,insert_num,insert_key,insert_ptr - ); + + if (tb->blknum[h] == 2) { + int snum; + struct buffer_info dest_bi, src_bi; + + /* S_new = free buffer from list FEB */ + S_new = get_FEB(tb); + + set_blkh_level(B_BLK_HEAD(S_new), h + 1); + + dest_bi.tb = tb; + dest_bi.bi_bh = S_new; + dest_bi.bi_parent = NULL; + dest_bi.bi_position = 0; + src_bi.tb = tb; + src_bi.bi_bh = tbSh; + src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + + n = B_NR_ITEMS(tbSh); /* number of items in S[h] */ + snum = (insert_num + n + 1) / 2; + if (n - snum >= child_pos) { + /* new items don't fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n - snum)'th key in S[h] */ + memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, snum, 0); + /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */ + } else if (n + insert_num - snum < child_pos) { + /* all new items fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ + memcpy(&new_insert_key, + B_N_PDELIM_KEY(tbSh, n + insert_num - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + snum - insert_num, 0); + /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */ + + /* insert insert_num keys and node-pointers into S_new */ + internal_insert_childs(&dest_bi, + /*S_new,tb->S[h-1]->b_next, */ + child_pos - n - insert_num + + snum - 1, + insert_num, insert_key, + insert_ptr); + + insert_num = 0; + } else { + struct disk_child *dc; + + /* some items fall into S_new, but some don't fall */ + /* last parameter is del_par */ + internal_move_pointers_items(&dest_bi, &src_bi, + LAST_TO_FIRST, + n - 
child_pos + 1, 1); + /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */ + /* calculate number of new items that fall into S_new */ + k = snum - n + child_pos - 1; + + internal_insert_childs(&dest_bi, /*S_new, */ 0, k, + insert_key + 1, insert_ptr + 1); + + /* new_insert_key = insert_key[insert_num - k - 1] */ + memcpy(&new_insert_key, insert_key + insert_num - k - 1, + KEY_SIZE); + /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ + + dc = B_N_CHILD(S_new, 0); + put_dc_size(dc, + (MAX_CHILD_SIZE + (insert_ptr[insert_num - k - 1]) - + B_FREE_SPACE(insert_ptr + [insert_num - k - 1]))); + put_dc_block_number(dc, + insert_ptr[insert_num - k - + 1]->b_blocknr); + + do_balance_mark_internal_dirty(tb, S_new, 0); + + insert_num -= (k + 1); + } + /* new_insert_ptr = node_pointer to S_new */ + new_insert_ptr = S_new; + + RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new) + || buffer_dirty(S_new), "cm-00001: bad S_new (%b)", + S_new); + + // S_new is released in unfix_nodes } + n = B_NR_ITEMS(tbSh); /*number of items in S[h] */ - memcpy (new_insert_key_addr,&new_insert_key,KEY_SIZE); + if (0 <= child_pos && child_pos <= n && insert_num > 0) { + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h); + bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1); + internal_insert_childs(&bi, /*tbSh, */ + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? 
tb->S[h-1]->b_next : tb->S[h]->b_child->b_next, */ + child_pos, insert_num, insert_key, + insert_ptr); + } + + memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE); insert_ptr[0] = new_insert_ptr; return order; - } - - - +} diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 2711dff1b7b..ff291c973a5 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,107 +18,109 @@ #include <linux/writeback.h> #include <linux/quotaops.h> -extern int reiserfs_default_io_size; /* default io size devuned in super.c */ +extern int reiserfs_default_io_size; /* default io size devuned in super.c */ static int reiserfs_commit_write(struct file *f, struct page *page, - unsigned from, unsigned to); + unsigned from, unsigned to); static int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to); -void reiserfs_delete_inode (struct inode * inode) +void reiserfs_delete_inode(struct inode *inode) { - /* We need blocks for transaction + (user+group) quota update (possibly delete) */ - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS; - struct reiserfs_transaction_handle th ; - - reiserfs_write_lock(inode->i_sb); + /* We need blocks for transaction + (user+group) quota update (possibly delete) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); + struct reiserfs_transaction_handle th; - /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ - if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ - down (&inode->i_sem); + reiserfs_write_lock(inode->i_sb); - reiserfs_delete_xattrs (inode); + /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. 
*/ + if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + down(&inode->i_sem); - if (journal_begin(&th, inode->i_sb, jbegin_count)) { - up (&inode->i_sem); - goto out; - } - reiserfs_update_inode_transaction(inode) ; + reiserfs_delete_xattrs(inode); - if (reiserfs_delete_object (&th, inode)) { - up (&inode->i_sem); - goto out; - } + if (journal_begin(&th, inode->i_sb, jbegin_count)) { + up(&inode->i_sem); + goto out; + } + reiserfs_update_inode_transaction(inode); - /* Do quota update inside a transaction for journaled quotas. We must do that - * after delete_object so that quota updates go into the same transaction as - * stat data deletion */ - DQUOT_FREE_INODE(inode); + if (reiserfs_delete_object(&th, inode)) { + up(&inode->i_sem); + goto out; + } - if (journal_end(&th, inode->i_sb, jbegin_count)) { - up (&inode->i_sem); - goto out; - } + /* Do quota update inside a transaction for journaled quotas. We must do that + * after delete_object so that quota updates go into the same transaction as + * stat data deletion */ + DQUOT_FREE_INODE(inode); + + if (journal_end(&th, inode->i_sb, jbegin_count)) { + up(&inode->i_sem); + goto out; + } - up (&inode->i_sem); + up(&inode->i_sem); - /* all items of file are deleted, so we can remove "save" link */ - remove_save_link (inode, 0/* not truncate */); /* we can't do anything - * about an error here */ - } else { - /* no object items are in the tree */ - ; - } -out: - clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */ - inode->i_blocks = 0; - reiserfs_write_unlock(inode->i_sb); + /* all items of file are deleted, so we can remove "save" link */ + remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything + * about an error here */ + } else { + /* no object items are in the tree */ + ; + } + out: + clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ + inode->i_blocks = 0; + 
reiserfs_write_unlock(inode->i_sb); } -static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, - loff_t offset, int type, int length ) +static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, + __u32 objectid, loff_t offset, int type, int length) { - key->version = version; + key->version = version; - key->on_disk_key.k_dir_id = dirid; - key->on_disk_key.k_objectid = objectid; - set_cpu_key_k_offset (key, offset); - set_cpu_key_k_type (key, type); - key->key_length = length; + key->on_disk_key.k_dir_id = dirid; + key->on_disk_key.k_objectid = objectid; + set_cpu_key_k_offset(key, offset); + set_cpu_key_k_type(key, type); + key->key_length = length; } - /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set offset and type of key */ -void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset, - int type, int length ) +void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset, + int type, int length) { - _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id), - le32_to_cpu (INODE_PKEY (inode)->k_objectid), - offset, type, length); + _make_cpu_key(key, get_inode_item_key_version(inode), + le32_to_cpu(INODE_PKEY(inode)->k_dir_id), + le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type, + length); } - // // when key is 0, do not set version and short key // -inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key, - int version, - loff_t offset, int type, int length, - int entry_count/*or ih_free_space*/) +inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key, + int version, + loff_t offset, int type, int length, + int entry_count /*or ih_free_space */ ) { - if (key) { - ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id); - ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid); - } - put_ih_version( ih, version ); - 
set_le_ih_k_offset (ih, offset); - set_le_ih_k_type (ih, type); - put_ih_item_len( ih, length ); - /* set_ih_free_space (ih, 0);*/ - // for directory items it is entry count, for directs and stat - // datas - 0xffff, for indirects - 0 - put_ih_entry_count( ih, entry_count ); + if (key) { + ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id); + ih->ih_key.k_objectid = + cpu_to_le32(key->on_disk_key.k_objectid); + } + put_ih_version(ih, version); + set_le_ih_k_offset(ih, offset); + set_le_ih_k_type(ih, type); + put_ih_item_len(ih, length); + /* set_ih_free_space (ih, 0); */ + // for directory items it is entry count, for directs and stat + // datas - 0xffff, for indirects - 0 + put_ih_entry_count(ih, entry_count); } // @@ -153,84 +155,84 @@ inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key ** to be unmapped, so that block_prepare_write will correctly call ** reiserfs_get_block to convert the tail into an unformatted node */ -static inline void fix_tail_page_for_writing(struct page *page) { - struct buffer_head *head, *next, *bh ; - - if (page && page_has_buffers(page)) { - head = page_buffers(page) ; - bh = head ; - do { - next = bh->b_this_page ; - if (buffer_mapped(bh) && bh->b_blocknr == 0) { - reiserfs_unmap_buffer(bh) ; - } - bh = next ; - } while (bh != head) ; - } +static inline void fix_tail_page_for_writing(struct page *page) +{ + struct buffer_head *head, *next, *bh; + + if (page && page_has_buffers(page)) { + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + } } /* reiserfs_get_block does not need to allocate a block only if it has been done already or non-hole position has been found in the indirect item */ -static inline int allocation_needed (int retval, b_blocknr_t allocated, - struct item_head * ih, - __le32 * item, int pos_in_item) +static inline int 
allocation_needed(int retval, b_blocknr_t allocated, + struct item_head *ih, + __le32 * item, int pos_in_item) { - if (allocated) - return 0; - if (retval == POSITION_FOUND && is_indirect_le_ih (ih) && - get_block_num(item, pos_in_item)) - return 0; - return 1; + if (allocated) + return 0; + if (retval == POSITION_FOUND && is_indirect_le_ih(ih) && + get_block_num(item, pos_in_item)) + return 0; + return 1; } -static inline int indirect_item_found (int retval, struct item_head * ih) +static inline int indirect_item_found(int retval, struct item_head *ih) { - return (retval == POSITION_FOUND) && is_indirect_le_ih (ih); + return (retval == POSITION_FOUND) && is_indirect_le_ih(ih); } - -static inline void set_block_dev_mapped (struct buffer_head * bh, - b_blocknr_t block, struct inode * inode) +static inline void set_block_dev_mapped(struct buffer_head *bh, + b_blocknr_t block, struct inode *inode) { map_bh(bh, inode->i_sb, block); } - // // files which were created in the earlier version can not be longer, // than 2 gb // -static int file_capable (struct inode * inode, long block) +static int file_capable(struct inode *inode, long block) { - if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file. - block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb - return 1; + if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file. 
+ block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb + return 1; - return 0; + return 0; } /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th, - struct inode *inode, struct path *path) { - struct super_block *s = th->t_super ; - int len = th->t_blocks_allocated ; - int err; - - BUG_ON (!th->t_trans_id); - BUG_ON (!th->t_refcount); - - /* we cannot restart while nested */ - if (th->t_refcount > 1) { - return 0 ; - } - pathrelse(path) ; - reiserfs_update_sd(th, inode) ; - err = journal_end(th, s, len) ; - if (!err) { - err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ; - if (!err) - reiserfs_update_inode_transaction(inode) ; - } - return err; + struct inode *inode, struct path *path) +{ + struct super_block *s = th->t_super; + int len = th->t_blocks_allocated; + int err; + + BUG_ON(!th->t_trans_id); + BUG_ON(!th->t_refcount); + + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return 0; + } + pathrelse(path); + reiserfs_update_sd(th, inode); + err = journal_end(th, s, len); + if (!err) { + err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6); + if (!err) + reiserfs_update_inode_transaction(inode); + } + return err; } // it is called by get_block when create == 0. Returns block number @@ -241,181 +243,192 @@ static int file_capable (struct inode * inode, long block) // Please improve the english/clarity in the comment above, as it is // hard to understand. 
-static int _get_block_create_0 (struct inode * inode, long block, - struct buffer_head * bh_result, - int args) +static int _get_block_create_0(struct inode *inode, long block, + struct buffer_head *bh_result, int args) { - INITIALIZE_PATH (path); - struct cpu_key key; - struct buffer_head * bh; - struct item_head * ih, tmp_ih; - int fs_gen ; - int blocknr; - char * p = NULL; - int chars; - int ret ; - int done = 0 ; - unsigned long offset ; - - // prepare the key to look for the 'block'-th block of file - make_cpu_key (&key, inode, - (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3); - -research: - if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) { - pathrelse (&path); - if (p) - kunmap(bh_result->b_page) ; - // We do not return -ENOENT if there is a hole but page is uptodate, because it means - // That there is some MMAPED data associated with it that is yet to be written to disk. - if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) { - return -ENOENT ; - } - return 0 ; - } - - // - bh = get_last_bh (&path); - ih = get_ih (&path); - if (is_indirect_le_ih (ih)) { - __le32 * ind_item = (__le32 *)B_I_PITEM (bh, ih); - - /* FIXME: here we could cache indirect item or part of it in - the inode to avoid search_by_key in case of subsequent - access to file */ - blocknr = get_block_num(ind_item, path.pos_in_item) ; - ret = 0 ; - if (blocknr) { - map_bh(bh_result, inode->i_sb, blocknr); - if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { - set_buffer_boundary(bh_result); - } - } else - // We do not return -ENOENT if there is a hole but page is uptodate, because it means - // That there is some MMAPED data associated with it that is yet to be written to disk. 
- if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) { - ret = -ENOENT ; - } - - pathrelse (&path); - if (p) - kunmap(bh_result->b_page) ; - return ret ; - } - - // requested data are in direct item(s) - if (!(args & GET_BLOCK_READ_DIRECT)) { - // we are called by bmap. FIXME: we can not map block of file - // when it is stored in direct item(s) - pathrelse (&path); - if (p) - kunmap(bh_result->b_page) ; - return -ENOENT; - } - - /* if we've got a direct item, and the buffer or page was uptodate, - ** we don't want to pull data off disk again. skip to the - ** end, where we map the buffer and return - */ - if (buffer_uptodate(bh_result)) { - goto finished ; - } else - /* - ** grab_tail_page can trigger calls to reiserfs_get_block on up to date - ** pages without any buffers. If the page is up to date, we don't want - ** read old data off disk. Set the up to date bit on the buffer instead - ** and jump to the end - */ - if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { + INITIALIZE_PATH(path); + struct cpu_key key; + struct buffer_head *bh; + struct item_head *ih, tmp_ih; + int fs_gen; + int blocknr; + char *p = NULL; + int chars; + int ret; + int result; + int done = 0; + unsigned long offset; + + // prepare the key to look for the 'block'-th block of file + make_cpu_key(&key, inode, + (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, + 3); + + research: + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) { + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + if (result == IO_ERROR) + return -EIO; + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. 
+ if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + return -ENOENT; + } + return 0; + } + // + bh = get_last_bh(&path); + ih = get_ih(&path); + if (is_indirect_le_ih(ih)) { + __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih); + + /* FIXME: here we could cache indirect item or part of it in + the inode to avoid search_by_key in case of subsequent + access to file */ + blocknr = get_block_num(ind_item, path.pos_in_item); + ret = 0; + if (blocknr) { + map_bh(bh_result, inode->i_sb, blocknr); + if (path.pos_in_item == + ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) { + set_buffer_boundary(bh_result); + } + } else + // We do not return -ENOENT if there is a hole but page is uptodate, because it means + // That there is some MMAPED data associated with it that is yet to be written to disk. + if ((args & GET_BLOCK_NO_HOLE) + && !PageUptodate(bh_result->b_page)) { + ret = -ENOENT; + } + + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return ret; + } + // requested data are in direct item(s) + if (!(args & GET_BLOCK_READ_DIRECT)) { + // we are called by bmap. FIXME: we can not map block of file + // when it is stored in direct item(s) + pathrelse(&path); + if (p) + kunmap(bh_result->b_page); + return -ENOENT; + } + + /* if we've got a direct item, and the buffer or page was uptodate, + ** we don't want to pull data off disk again. skip to the + ** end, where we map the buffer and return + */ + if (buffer_uptodate(bh_result)) { + goto finished; + } else + /* + ** grab_tail_page can trigger calls to reiserfs_get_block on up to date + ** pages without any buffers. If the page is up to date, we don't want + ** read old data off disk. 
Set the up to date bit on the buffer instead + ** and jump to the end + */ + if (!bh_result->b_page || PageUptodate(bh_result->b_page)) { set_buffer_uptodate(bh_result); - goto finished ; - } - - // read file tail into part of page - offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ; - fs_gen = get_generation(inode->i_sb) ; - copy_item_head (&tmp_ih, ih); - - /* we only want to kmap if we are reading the tail into the page. - ** this is not the common case, so we don't kmap until we are - ** sure we need to. But, this means the item might move if - ** kmap schedules - */ - if (!p) { - p = (char *)kmap(bh_result->b_page) ; - if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { - goto research; - } - } - p += offset ; - memset (p, 0, inode->i_sb->s_blocksize); - do { - if (!is_direct_le_ih (ih)) { - BUG (); - } - /* make sure we don't read more bytes than actually exist in - ** the file. This can happen in odd cases where i_size isn't - ** correct, and when direct item padding results in a few - ** extra bytes at the end of the direct item - */ - if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) - break ; - if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { - chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item; - done = 1 ; - } else { - chars = ih_item_len(ih) - path.pos_in_item; - } - memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars); - - if (done) - break ; - - p += chars; - - if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1)) - // we done, if read direct item is not the last item of - // node FIXME: we could try to check right delimiting key - // to see whether direct item continues in the right - // neighbor or rely on i_size - break; - - // update key to look for the next piece - set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars); - if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) - // we read something from tail, even if now we got 
IO_ERROR - break; - bh = get_last_bh (&path); - ih = get_ih (&path); - } while (1); - - flush_dcache_page(bh_result->b_page) ; - kunmap(bh_result->b_page) ; - -finished: - pathrelse (&path); - /* this buffer has valid data, but isn't valid for io. mapping it to - * block #0 tells the rest of reiserfs it just has a tail in it - */ - map_bh(bh_result, inode->i_sb, 0); - set_buffer_uptodate (bh_result); - return 0; -} + goto finished; + } + // read file tail into part of page + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + /* we only want to kmap if we are reading the tail into the page. + ** this is not the common case, so we don't kmap until we are + ** sure we need to. But, this means the item might move if + ** kmap schedules + */ + if (!p) { + p = (char *)kmap(bh_result->b_page); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + goto research; + } + } + p += offset; + memset(p, 0, inode->i_sb->s_blocksize); + do { + if (!is_direct_le_ih(ih)) { + BUG(); + } + /* make sure we don't read more bytes than actually exist in + ** the file. 
This can happen in odd cases where i_size isn't + ** correct, and when direct item padding results in a few + ** extra bytes at the end of the direct item + */ + if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size) + break; + if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) { + chars = + inode->i_size - (le_ih_k_offset(ih) - 1) - + path.pos_in_item; + done = 1; + } else { + chars = ih_item_len(ih) - path.pos_in_item; + } + memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars); + + if (done) + break; + + p += chars; + if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1)) + // we done, if read direct item is not the last item of + // node FIXME: we could try to check right delimiting key + // to see whether direct item continues in the right + // neighbor or rely on i_size + break; + + // update key to look for the next piece + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars); + result = search_for_position_by_key(inode->i_sb, &key, &path); + if (result != POSITION_FOUND) + // i/o error most likely + break; + bh = get_last_bh(&path); + ih = get_ih(&path); + } while (1); + + flush_dcache_page(bh_result->b_page); + kunmap(bh_result->b_page); + + finished: + pathrelse(&path); + + if (result == IO_ERROR) + return -EIO; + + /* this buffer has valid data, but isn't valid for io. mapping it to + * block #0 tells the rest of reiserfs it just has a tail in it + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_uptodate(bh_result); + return 0; +} // this is called to create file map. 
So, _get_block_create_0 will not // read direct item -static int reiserfs_bmap (struct inode * inode, sector_t block, - struct buffer_head * bh_result, int create) +static int reiserfs_bmap(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) { - if (!file_capable (inode, block)) - return -EFBIG; - - reiserfs_write_lock(inode->i_sb); - /* do not read the direct item */ - _get_block_create_0 (inode, block, bh_result, 0) ; - reiserfs_write_unlock(inode->i_sb); - return 0; + if (!file_capable(inode, block)) + return -EFBIG; + + reiserfs_write_lock(inode->i_sb); + /* do not read the direct item */ + _get_block_create_0(inode, block, bh_result, 0); + reiserfs_write_unlock(inode->i_sb); + return 0; } /* special version of get_block that is only used by grab_tail_page right @@ -435,9 +448,11 @@ static int reiserfs_bmap (struct inode * inode, sector_t block, ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, ** don't use this function. */ -static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block, - struct buffer_head * bh_result, int create) { - return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ; +static int reiserfs_get_block_create_0(struct inode *inode, sector_t block, + struct buffer_head *bh_result, + int create) +{ + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE); } /* This is special helper for reiserfs_get_block in case we are executing @@ -448,43 +463,42 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, struct buffer_head *bh_result, int create) { - int ret ; - - bh_result->b_page = NULL; - - /* We set the b_size before reiserfs_get_block call since it is - referenced in convert_tail_for_hole() that may be called from - reiserfs_get_block() */ - bh_result->b_size = (1 << inode->i_blkbits); - - ret = reiserfs_get_block(inode, iblock, bh_result, - create | GET_BLOCK_NO_DANGLE) ; - if (ret) - goto out; - - /* don't allow direct io onto tail 
pages */ - if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* make sure future calls to the direct io funcs for this offset - ** in the file fail by unmapping the buffer - */ - clear_buffer_mapped(bh_result); - ret = -EINVAL ; - } - /* Possible unpacked tail. Flush the data before pages have - disappeared */ - if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { - int err; - lock_kernel(); - err = reiserfs_commit_for_inode(inode); - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; - unlock_kernel(); - if (err < 0) - ret = err; - } -out: - return ret ; -} + int ret; + + bh_result->b_page = NULL; + + /* We set the b_size before reiserfs_get_block call since it is + referenced in convert_tail_for_hole() that may be called from + reiserfs_get_block() */ + bh_result->b_size = (1 << inode->i_blkbits); + + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE); + if (ret) + goto out; + /* don't allow direct io onto tail pages */ + if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* make sure future calls to the direct io funcs for this offset + ** in the file fail by unmapping the buffer + */ + clear_buffer_mapped(bh_result); + ret = -EINVAL; + } + /* Possible unpacked tail. Flush the data before pages have + disappeared */ + if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { + int err; + lock_kernel(); + err = reiserfs_commit_for_inode(inode); + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + unlock_kernel(); + if (err < 0) + ret = err; + } + out: + return ret; +} /* ** helper function for when reiserfs_get_block is called for a hole @@ -496,490 +510,547 @@ out: ** you should not be in a transaction, or have any paths held when you ** call this. 
*/ -static int convert_tail_for_hole(struct inode *inode, - struct buffer_head *bh_result, - loff_t tail_offset) { - unsigned long index ; - unsigned long tail_end ; - unsigned long tail_start ; - struct page * tail_page ; - struct page * hole_page = bh_result->b_page ; - int retval = 0 ; - - if ((tail_offset & (bh_result->b_size - 1)) != 1) - return -EIO ; - - /* always try to read until the end of the block */ - tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ; - tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ; - - index = tail_offset >> PAGE_CACHE_SHIFT ; - /* hole_page can be zero in case of direct_io, we are sure - that we cannot get here if we write with O_DIRECT into - tail page */ - if (!hole_page || index != hole_page->index) { - tail_page = grab_cache_page(inode->i_mapping, index) ; - retval = -ENOMEM; - if (!tail_page) { - goto out ; - } - } else { - tail_page = hole_page ; - } - - /* we don't have to make sure the conversion did not happen while - ** we were locking the page because anyone that could convert - ** must first take i_sem. - ** - ** We must fix the tail page for writing because it might have buffers - ** that are mapped, but have a block number of 0. This indicates tail - ** data that has been read directly into the page, and block_prepare_write - ** won't trigger a get_block in this case. 
- */ - fix_tail_page_for_writing(tail_page) ; - retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); - if (retval) - goto unlock ; - - /* tail conversion might change the data in the page */ - flush_dcache_page(tail_page) ; - - retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ; - -unlock: - if (tail_page != hole_page) { - unlock_page(tail_page) ; - page_cache_release(tail_page) ; - } -out: - return retval ; +static int convert_tail_for_hole(struct inode *inode, + struct buffer_head *bh_result, + loff_t tail_offset) +{ + unsigned long index; + unsigned long tail_end; + unsigned long tail_start; + struct page *tail_page; + struct page *hole_page = bh_result->b_page; + int retval = 0; + + if ((tail_offset & (bh_result->b_size - 1)) != 1) + return -EIO; + + /* always try to read until the end of the block */ + tail_start = tail_offset & (PAGE_CACHE_SIZE - 1); + tail_end = (tail_start | (bh_result->b_size - 1)) + 1; + + index = tail_offset >> PAGE_CACHE_SHIFT; + /* hole_page can be zero in case of direct_io, we are sure + that we cannot get here if we write with O_DIRECT into + tail page */ + if (!hole_page || index != hole_page->index) { + tail_page = grab_cache_page(inode->i_mapping, index); + retval = -ENOMEM; + if (!tail_page) { + goto out; + } + } else { + tail_page = hole_page; + } + + /* we don't have to make sure the conversion did not happen while + ** we were locking the page because anyone that could convert + ** must first take i_sem. + ** + ** We must fix the tail page for writing because it might have buffers + ** that are mapped, but have a block number of 0. This indicates tail + ** data that has been read directly into the page, and block_prepare_write + ** won't trigger a get_block in this case. 
+ */ + fix_tail_page_for_writing(tail_page); + retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); + if (retval) + goto unlock; + + /* tail conversion might change the data in the page */ + flush_dcache_page(tail_page); + + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end); + + unlock: + if (tail_page != hole_page) { + unlock_page(tail_page); + page_cache_release(tail_page); + } + out: + return retval; } static inline int _allocate_block(struct reiserfs_transaction_handle *th, - long block, - struct inode *inode, - b_blocknr_t *allocated_block_nr, - struct path * path, - int flags) { - BUG_ON (!th->t_trans_id); - + long block, + struct inode *inode, + b_blocknr_t * allocated_block_nr, + struct path *path, int flags) +{ + BUG_ON(!th->t_trans_id); + #ifdef REISERFS_PREALLOCATE - if (!(flags & GET_BLOCK_NO_ISEM)) { - return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block); - } + if (!(flags & GET_BLOCK_NO_ISEM)) { + return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, + path, block); + } #endif - return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block); + return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path, + block); } -int reiserfs_get_block (struct inode * inode, sector_t block, - struct buffer_head * bh_result, int create) +int reiserfs_get_block(struct inode *inode, sector_t block, + struct buffer_head *bh_result, int create) { - int repeat, retval = 0; - b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int - INITIALIZE_PATH(path); - int pos_in_item; - struct cpu_key key; - struct buffer_head * bh, * unbh = NULL; - struct item_head * ih, tmp_ih; - __le32 * item; - int done; - int fs_gen; - struct reiserfs_transaction_handle *th = NULL; - /* space reserved in transaction batch: - . 3 balancings in direct->indirect conversion - . 
1 block involved into reiserfs_update_sd() - XXX in practically impossible worst case direct2indirect() - can incur (much) more than 3 balancings. - quota update for user, group */ - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS; - int version; - int dangle = 1; - loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ; - - /* bad.... */ - reiserfs_write_lock(inode->i_sb); - version = get_inode_item_key_version (inode); - - if (block < 0) { - reiserfs_write_unlock(inode->i_sb); - return -EIO; - } + int repeat, retval = 0; + b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int + INITIALIZE_PATH(path); + int pos_in_item; + struct cpu_key key; + struct buffer_head *bh, *unbh = NULL; + struct item_head *ih, tmp_ih; + __le32 *item; + int done; + int fs_gen; + struct reiserfs_transaction_handle *th = NULL; + /* space reserved in transaction batch: + . 3 balancings in direct->indirect conversion + . 1 block involved into reiserfs_update_sd() + XXX in practically impossible worst case direct2indirect() + can incur (much) more than 3 balancings. + quota update for user, group */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 1 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); + int version; + int dangle = 1; + loff_t new_offset = + (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; + + /* bad.... 
*/ + reiserfs_write_lock(inode->i_sb); + version = get_inode_item_key_version(inode); - if (!file_capable (inode, block)) { - reiserfs_write_unlock(inode->i_sb); - return -EFBIG; - } - - /* if !create, we aren't changing the FS, so we don't need to - ** log anything, so we don't need to start a transaction - */ - if (!(create & GET_BLOCK_CREATE)) { - int ret ; - /* find number of block-th logical block of the file */ - ret = _get_block_create_0 (inode, block, bh_result, - create | GET_BLOCK_READ_DIRECT) ; - reiserfs_write_unlock(inode->i_sb); - return ret; - } - /* - * if we're already in a transaction, make sure to close - * any new transactions we start in this func - */ - if ((create & GET_BLOCK_NO_DANGLE) || - reiserfs_transaction_running(inode->i_sb)) - dangle = 0; - - /* If file is of such a size, that it might have a tail and tails are enabled - ** we should mark it as possibly needing tail packing on close - */ - if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) || - (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) ) - REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; - - /* set the key of the first byte in the 'block'-th block of file */ - make_cpu_key (&key, inode, new_offset, - TYPE_ANY, 3/*key length*/); - if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { -start_trans: - th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); - if (!th) { - retval = -ENOMEM; - goto failure; - } - reiserfs_update_inode_transaction(inode) ; - } - research: - - retval = search_for_position_by_key (inode->i_sb, &key, &path); - if (retval == IO_ERROR) { - retval = -EIO; - goto failure; - } - - bh = get_last_bh (&path); - ih = get_ih (&path); - item = get_item (&path); - pos_in_item = path.pos_in_item; - - fs_gen = get_generation (inode->i_sb); - copy_item_head (&tmp_ih, ih); - - if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) { - /* we have to allocate block for the 
unformatted node */ - if (!th) { - pathrelse(&path) ; - goto start_trans; - } - - repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create); - - if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { - /* restart the transaction to give the journal a chance to free - ** some blocks. releases the path, so we have to go back to - ** research if we succeed on the second try - */ - SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; - retval = restart_transaction(th, inode, &path) ; - if (retval) - goto failure; - repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create); - - if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { - goto research ; - } - if (repeat == QUOTA_EXCEEDED) - retval = -EDQUOT; - else - retval = -ENOSPC; - goto failure; - } - - if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { - goto research; - } - } - - if (indirect_item_found (retval, ih)) { - b_blocknr_t unfm_ptr; - /* 'block'-th block is in the file already (there is - corresponding cell in some indirect item). 
But it may be - zero unformatted node pointer (hole) */ - unfm_ptr = get_block_num (item, pos_in_item); - if (unfm_ptr == 0) { - /* use allocated block to plug the hole */ - reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; - if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { - reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; - goto research; - } - set_buffer_new(bh_result); - if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb)) - reiserfs_add_ordered_list(inode, bh_result); - put_block_num(item, pos_in_item, allocated_block_nr) ; - unfm_ptr = allocated_block_nr; - journal_mark_dirty (th, inode->i_sb, bh); - reiserfs_update_sd(th, inode) ; - } - set_block_dev_mapped(bh_result, unfm_ptr, inode); - pathrelse (&path); - retval = 0; - if (!dangle && th) - retval = reiserfs_end_persistent_transaction(th); + if (block < 0) { + reiserfs_write_unlock(inode->i_sb); + return -EIO; + } - reiserfs_write_unlock(inode->i_sb); - - /* the item was found, so new blocks were not added to the file - ** there is no need to make sure the inode is updated with this - ** transaction - */ - return retval; - } - - if (!th) { - pathrelse(&path) ; - goto start_trans; - } - - /* desired position is not found or is in the direct item. We have - to append file with holes up to 'block'-th block converting - direct items to indirect one if necessary */ - done = 0; - do { - if (is_statdata_le_ih (ih)) { - __le32 unp = 0; - struct cpu_key tmp_key; - - /* indirect item has to be inserted */ - make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT, - UNFM_P_SIZE, 0/* free_space */); - - if (cpu_key_k_offset (&key) == 1) { - /* we are going to add 'block'-th block to the file. 
Use - allocated block for that */ - unp = cpu_to_le32 (allocated_block_nr); - set_block_dev_mapped (bh_result, allocated_block_nr, inode); - set_buffer_new(bh_result); - done = 1; - } - tmp_key = key; // ;) - set_cpu_key_k_offset (&tmp_key, 1); - PATH_LAST_POSITION(&path) ++; - - retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp); - if (retval) { - reiserfs_free_block (th, inode, allocated_block_nr, 1); - goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST - } - //mark_tail_converted (inode); - } else if (is_direct_le_ih (ih)) { - /* direct item has to be converted */ - loff_t tail_offset; - - tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; - if (tail_offset == cpu_key_k_offset (&key)) { - /* direct item we just found fits into block we have - to map. Convert it into unformatted node: use - bh_result for the conversion */ - set_block_dev_mapped (bh_result, allocated_block_nr, inode); - unbh = bh_result; - done = 1; - } else { - /* we have to padd file tail stored in direct item(s) - up to block size and convert it to unformatted - node. 
FIXME: this should also get into page cache */ - - pathrelse(&path) ; - /* - * ugly, but we can only end the transaction if - * we aren't nested - */ - BUG_ON (!th->t_refcount); - if (th->t_refcount == 1) { - retval = reiserfs_end_persistent_transaction(th); - th = NULL; - if (retval) + if (!file_capable(inode, block)) { + reiserfs_write_unlock(inode->i_sb); + return -EFBIG; + } + + /* if !create, we aren't changing the FS, so we don't need to + ** log anything, so we don't need to start a transaction + */ + if (!(create & GET_BLOCK_CREATE)) { + int ret; + /* find number of block-th logical block of the file */ + ret = _get_block_create_0(inode, block, bh_result, + create | GET_BLOCK_READ_DIRECT); + reiserfs_write_unlock(inode->i_sb); + return ret; + } + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; + + /* If file is of such a size, that it might have a tail and tails are enabled + ** we should mark it as possibly needing tail packing on close + */ + if ((have_large_tails(inode->i_sb) + && inode->i_size < i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size < i_block_size(inode))) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask; + + /* set the key of the first byte in the 'block'-th block of file */ + make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ ); + if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { + start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; goto failure; } + reiserfs_update_inode_transaction(inode); + } + research: - retval = convert_tail_for_hole(inode, bh_result, tail_offset) ; - if (retval) { - if ( retval != -ENOSPC ) - reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ; - if (allocated_block_nr) { - /* 
the bitmap, the super, and the stat data == 3 */ - if (!th) - th = reiserfs_persistent_transaction(inode->i_sb,3); - if (th) - reiserfs_free_block (th,inode,allocated_block_nr,1); - } - goto failure ; - } - goto research ; - } - retval = direct2indirect (th, inode, &path, unbh, tail_offset); - if (retval) { - reiserfs_unmap_buffer(unbh); - reiserfs_free_block (th, inode, allocated_block_nr, 1); - goto failure; - } - /* it is important the set_buffer_uptodate is done after - ** the direct2indirect. The buffer might contain valid - ** data newer than the data on disk (read by readpage, changed, - ** and then sent here by writepage). direct2indirect needs - ** to know if unbh was already up to date, so it can decide - ** if the data in unbh needs to be replaced with data from - ** the disk - */ - set_buffer_uptodate (unbh); - - /* unbh->b_page == NULL in case of DIRECT_IO request, this means - buffer will disappear shortly, so it should not be added to - */ - if ( unbh->b_page ) { - /* we've converted the tail, so we must - ** flush unbh before the transaction commits - */ - reiserfs_add_tail_list(inode, unbh) ; - - /* mark it dirty now to prevent commit_write from adding - ** this buffer to the inode's dirty buffer list - */ - /* - * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
- * It's still atomic, but it sets the page dirty too, - * which makes it eligible for writeback at any time by the - * VM (which was also the case with __mark_buffer_dirty()) - */ - mark_buffer_dirty(unbh) ; - } - } else { - /* append indirect item with holes if needed, when appending - pointer to 'block'-th block use block, which is already - allocated */ - struct cpu_key tmp_key; - unp_t unf_single=0; // We use this in case we need to allocate only - // one block which is a fastpath - unp_t *un; - __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE; - __u64 blocks_needed; - - RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, - "vs-804: invalid position for append"); - /* indirect item has to be appended, set up key of that position */ - make_cpu_key (&tmp_key, inode, - le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize), - //pos_in_item * inode->i_sb->s_blocksize, - TYPE_INDIRECT, 3);// key type is unimportant - - blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits); - RFALSE( blocks_needed < 0, "green-805: invalid offset"); - - if ( blocks_needed == 1 ) { - un = &unf_single; - } else { - un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE, - GFP_ATOMIC); // We need to avoid scheduling. - if ( !un) { - un = &unf_single; - blocks_needed = 1; - max_to_insert = 0; - } else - memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert)); - } - if ( blocks_needed <= max_to_insert) { - /* we are going to add target block to the file. 
Use allocated - block for that */ - un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr); - set_block_dev_mapped (bh_result, allocated_block_nr, inode); - set_buffer_new(bh_result); - done = 1; - } else { - /* paste hole to the indirect item */ - /* If kmalloc failed, max_to_insert becomes zero and it means we - only have space for one block */ - blocks_needed=max_to_insert?max_to_insert:1; - } - retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed); - - if (blocks_needed != 1) - kfree(un); - - if (retval) { - reiserfs_free_block (th, inode, allocated_block_nr, 1); - goto failure; - } - if (!done) { - /* We need to mark new file size in case this function will be - interrupted/aborted later on. And we may do this only for - holes. */ - inode->i_size += inode->i_sb->s_blocksize * blocks_needed; - } - } - - if (done == 1) - break; - - /* this loop could log more blocks than we had originally asked - ** for. So, we have to allow the transaction to end if it is - ** too big or too full. Update the inode so things are - ** consistent if we crash before the function returns - ** - ** release the path so that anybody waiting on the path before - ** ending their transaction will be able to continue. - */ - if (journal_transaction_should_end(th, th->t_blocks_allocated)) { - retval = restart_transaction(th, inode, &path) ; - if (retval) - goto failure; - } - /* inserting indirect pointers for a hole can take a - ** long time. 
reschedule if needed - */ - cond_resched(); - - retval = search_for_position_by_key (inode->i_sb, &key, &path); + retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { - retval = -EIO; - goto failure; - } - if (retval == POSITION_FOUND) { - reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: " - "%K should not be found", &key); - retval = -EEXIST; - if (allocated_block_nr) - reiserfs_free_block (th, inode, allocated_block_nr, 1); - pathrelse(&path) ; - goto failure; - } - bh = get_last_bh (&path); - ih = get_ih (&path); - item = get_item (&path); + retval = -EIO; + goto failure; + } + + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); pos_in_item = path.pos_in_item; - } while (1); + fs_gen = get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (allocation_needed + (retval, allocated_block_nr, ih, item, pos_in_item)) { + /* we have to allocate block for the unformatted node */ + if (!th) { + pathrelse(&path); + goto start_trans; + } + + repeat = + _allocate_block(th, block, inode, &allocated_block_nr, + &path, create); + + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { + /* restart the transaction to give the journal a chance to free + ** some blocks. 
releases the path, so we have to go back to + ** research if we succeed on the second try + */ + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + repeat = + _allocate_block(th, block, inode, + &allocated_block_nr, NULL, create); + + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { + goto research; + } + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; + goto failure; + } + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + goto research; + } + } + + if (indirect_item_found(retval, ih)) { + b_blocknr_t unfm_ptr; + /* 'block'-th block is in the file already (there is + corresponding cell in some indirect item). But it may be + zero unformatted node pointer (hole) */ + unfm_ptr = get_block_num(item, pos_in_item); + if (unfm_ptr == 0) { + /* use allocated block to plug the hole */ + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + set_buffer_new(bh_result); + if (buffer_dirty(bh_result) + && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); + put_block_num(item, pos_in_item, allocated_block_nr); + unfm_ptr = allocated_block_nr; + journal_mark_dirty(th, inode->i_sb, bh); + reiserfs_update_sd(th, inode); + } + set_block_dev_mapped(bh_result, unfm_ptr, inode); + pathrelse(&path); + retval = 0; + if (!dangle && th) + retval = reiserfs_end_persistent_transaction(th); + + reiserfs_write_unlock(inode->i_sb); + + /* the item was found, so new blocks were not added to the file + ** there is no need to make sure the inode is updated with this + ** transaction + */ + return retval; + } + + if (!th) { + pathrelse(&path); + goto start_trans; + } + + /* desired position is not found or is in the direct item. 
We have + to append file with holes up to 'block'-th block converting + direct items to indirect one if necessary */ + done = 0; + do { + if (is_statdata_le_ih(ih)) { + __le32 unp = 0; + struct cpu_key tmp_key; + + /* indirect item has to be inserted */ + make_le_item_head(&tmp_ih, &key, version, 1, + TYPE_INDIRECT, UNFM_P_SIZE, + 0 /* free_space */ ); + + if (cpu_key_k_offset(&key) == 1) { + /* we are going to add 'block'-th block to the file. Use + allocated block for that */ + unp = cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } + tmp_key = key; // ;) + set_cpu_key_k_offset(&tmp_key, 1); + PATH_LAST_POSITION(&path)++; + + retval = + reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih, + inode, (char *)&unp); + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST + } + //mark_tail_converted (inode); + } else if (is_direct_le_ih(ih)) { + /* direct item has to be converted */ + loff_t tail_offset; + + tail_offset = + ((le_ih_k_offset(ih) - + 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + if (tail_offset == cpu_key_k_offset(&key)) { + /* direct item we just found fits into block we have + to map. Convert it into unformatted node: use + bh_result for the conversion */ + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + unbh = bh_result; + done = 1; + } else { + /* we have to padd file tail stored in direct item(s) + up to block size and convert it to unformatted + node. 
FIXME: this should also get into page cache */ + + pathrelse(&path); + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + BUG_ON(!th->t_refcount); + if (th->t_refcount == 1) { + retval = + reiserfs_end_persistent_transaction + (th); + th = NULL; + if (retval) + goto failure; + } + + retval = + convert_tail_for_hole(inode, bh_result, + tail_offset); + if (retval) { + if (retval != -ENOSPC) + reiserfs_warning(inode->i_sb, + "clm-6004: convert tail failed inode %lu, error %d", + inode->i_ino, + retval); + if (allocated_block_nr) { + /* the bitmap, the super, and the stat data == 3 */ + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb, 3); + if (th) + reiserfs_free_block(th, + inode, + allocated_block_nr, + 1); + } + goto failure; + } + goto research; + } + retval = + direct2indirect(th, inode, &path, unbh, + tail_offset); + if (retval) { + reiserfs_unmap_buffer(unbh); + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + /* it is important the set_buffer_uptodate is done after + ** the direct2indirect. The buffer might contain valid + ** data newer than the data on disk (read by readpage, changed, + ** and then sent here by writepage). direct2indirect needs + ** to know if unbh was already up to date, so it can decide + ** if the data in unbh needs to be replaced with data from + ** the disk + */ + set_buffer_uptodate(unbh); + + /* unbh->b_page == NULL in case of DIRECT_IO request, this means + buffer will disappear shortly, so it should not be added to + */ + if (unbh->b_page) { + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + reiserfs_add_tail_list(inode, unbh); + + /* mark it dirty now to prevent commit_write from adding + ** this buffer to the inode's dirty buffer list + */ + /* + * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty(). 
+ * It's still atomic, but it sets the page dirty too, + * which makes it eligible for writeback at any time by the + * VM (which was also the case with __mark_buffer_dirty()) + */ + mark_buffer_dirty(unbh); + } + } else { + /* append indirect item with holes if needed, when appending + pointer to 'block'-th block use block, which is already + allocated */ + struct cpu_key tmp_key; + unp_t unf_single = 0; // We use this in case we need to allocate only + // one block which is a fastpath + unp_t *un; + __u64 max_to_insert = + MAX_ITEM_LEN(inode->i_sb->s_blocksize) / + UNFM_P_SIZE; + __u64 blocks_needed; + + RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE, + "vs-804: invalid position for append"); + /* indirect item has to be appended, set up key of that position */ + make_cpu_key(&tmp_key, inode, + le_key_k_offset(version, + &(ih->ih_key)) + + op_bytes_number(ih, + inode->i_sb->s_blocksize), + //pos_in_item * inode->i_sb->s_blocksize, + TYPE_INDIRECT, 3); // key type is unimportant + + blocks_needed = + 1 + + ((cpu_key_k_offset(&key) - + cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> + s_blocksize_bits); + RFALSE(blocks_needed < 0, "green-805: invalid offset"); + + if (blocks_needed == 1) { + un = &unf_single; + } else { + un = kmalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC); // We need to avoid scheduling. + if (!un) { + un = &unf_single; + blocks_needed = 1; + max_to_insert = 0; + } else + memset(un, 0, + UNFM_P_SIZE * min(blocks_needed, + max_to_insert)); + } + if (blocks_needed <= max_to_insert) { + /* we are going to add target block to the file. Use allocated + block for that */ + un[blocks_needed - 1] = + cpu_to_le32(allocated_block_nr); + set_block_dev_mapped(bh_result, + allocated_block_nr, inode); + set_buffer_new(bh_result); + done = 1; + } else { + /* paste hole to the indirect item */ + /* If kmalloc failed, max_to_insert becomes zero and it means we + only have space for one block */ + blocks_needed = + max_to_insert ? 
max_to_insert : 1; + } + retval = + reiserfs_paste_into_item(th, &path, &tmp_key, inode, + (char *)un, + UNFM_P_SIZE * + blocks_needed); + + if (blocks_needed != 1) + kfree(un); + + if (retval) { + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + goto failure; + } + if (!done) { + /* We need to mark new file size in case this function will be + interrupted/aborted later on. And we may do this only for + holes. */ + inode->i_size += + inode->i_sb->s_blocksize * blocks_needed; + } + } - retval = 0; + if (done == 1) + break; - failure: - if (th && (!dangle || (retval && !th->t_trans_id))) { - int err; - if (th->t_trans_id) - reiserfs_update_sd(th, inode); - err = reiserfs_end_persistent_transaction(th); - if (err) - retval = err; - } + /* this loop could log more blocks than we had originally asked + ** for. So, we have to allow the transaction to end if it is + ** too big or too full. Update the inode so things are + ** consistent if we crash before the function returns + ** + ** release the path so that anybody waiting on the path before + ** ending their transaction will be able to continue. + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + retval = restart_transaction(th, inode, &path); + if (retval) + goto failure; + } + /* inserting indirect pointers for a hole can take a + ** long time. 
reschedule if needed + */ + cond_resched(); + + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, + "vs-825: reiserfs_get_block: " + "%K should not be found", &key); + retval = -EEXIST; + if (allocated_block_nr) + reiserfs_free_block(th, inode, + allocated_block_nr, 1); + pathrelse(&path); + goto failure; + } + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + } while (1); - reiserfs_write_unlock(inode->i_sb); - reiserfs_check_path(&path) ; - return retval; + retval = 0; + + failure: + if (th && (!dangle || (retval && !th->t_trans_id))) { + int err; + if (th->t_trans_id) + reiserfs_update_sd(th, inode); + err = reiserfs_end_persistent_transaction(th); + if (err) + retval = err; + } + + reiserfs_write_unlock(inode->i_sb); + reiserfs_check_path(&path); + return retval; } static int reiserfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); + return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); } /* Compute real number of used bytes by file @@ -987,51 +1058,56 @@ reiserfs_readpages(struct file *file, struct address_space *mapping, */ static int real_space_diff(struct inode *inode, int sd_size) { - int bytes; - loff_t blocksize = inode->i_sb->s_blocksize ; - - if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) - return sd_size ; - - /* End of file is also in full block with indirect reference, so round - ** up to the next block. - ** - ** there is just no way to know if the tail is actually packed - ** on the file, so we have to assume it isn't. 
When we pack the - ** tail, we add 4 bytes to pretend there really is an unformatted - ** node pointer - */ - bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size; - return bytes ; + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size; + + /* End of file is also in full block with indirect reference, so round + ** up to the next block. + ** + ** there is just no way to know if the tail is actually packed + ** on the file, so we have to assume it isn't. When we pack the + ** tail, we add 4 bytes to pretend there really is an unformatted + ** node pointer + */ + bytes = + ((inode->i_size + + (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + + sd_size; + return bytes; } static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, - int sd_size) + int sd_size) { - if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { - return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ; - } - return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9); + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + + (loff_t) (real_space_diff(inode, sd_size)); + } + return ((loff_t) real_space_diff(inode, sd_size)) + + (((loff_t) blocks) << 9); } /* Compute number of blocks used by file in ReiserFS counting */ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) { - loff_t bytes = inode_get_bytes(inode) ; - loff_t real_space = real_space_diff(inode, sd_size) ; - - /* keeps fsck and non-quota versions of reiserfs happy */ - if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { - bytes += (loff_t)511 ; - } - - /* files from before the quota patch might i_blocks such that - ** bytes < real_space. Deal with that here to prevent it from - ** going negative. 
- */ - if (bytes < real_space) - return 0 ; - return (bytes - real_space) >> 9; + loff_t bytes = inode_get_bytes(inode); + loff_t real_space = real_space_diff(inode, sd_size); + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t) 511; + } + + /* files from before the quota patch might i_blocks such that + ** bytes < real_space. Deal with that here to prevent it from + ** going negative. + */ + if (bytes < real_space) + return 0; + return (bytes - real_space) >> 9; } // @@ -1042,263 +1118,269 @@ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) // // called by read_locked_inode -static void init_inode (struct inode * inode, struct path * path) +static void init_inode(struct inode *inode, struct path *path) { - struct buffer_head * bh; - struct item_head * ih; - __u32 rdev; - //int version = ITEM_VERSION_1; - - bh = PATH_PLAST_BUFFER (path); - ih = PATH_PITEM_HEAD (path); - - - copy_key (INODE_PKEY (inode), &(ih->ih_key)); - inode->i_blksize = reiserfs_default_io_size; - - INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list )); - REISERFS_I(inode)->i_flags = 0; - REISERFS_I(inode)->i_prealloc_block = 0; - REISERFS_I(inode)->i_prealloc_count = 0; - REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_jl = NULL; - REISERFS_I(inode)->i_acl_access = NULL; - REISERFS_I(inode)->i_acl_default = NULL; - init_rwsem (&REISERFS_I(inode)->xattr_sem); - - if (stat_data_v1 (ih)) { - struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih); - unsigned long blocks; - - set_inode_item_key_version (inode, KEY_FORMAT_3_5); - set_inode_sd_version (inode, STAT_DATA_V1); - inode->i_mode = sd_v1_mode(sd); - inode->i_nlink = sd_v1_nlink(sd); - inode->i_uid = sd_v1_uid(sd); - inode->i_gid = sd_v1_gid(sd); - inode->i_size = sd_v1_size(sd); - inode->i_atime.tv_sec = sd_v1_atime(sd); - inode->i_mtime.tv_sec = sd_v1_mtime(sd); - inode->i_ctime.tv_sec = sd_v1_ctime(sd); - 
inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - - inode->i_blocks = sd_v1_blocks(sd); - inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id); - blocks = (inode->i_size + 511) >> 9; - blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9); - if (inode->i_blocks > blocks) { - // there was a bug in <=3.5.23 when i_blocks could take negative - // values. Starting from 3.5.17 this value could even be stored in - // stat data. For such files we set i_blocks based on file - // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be - // only updated if file's inode will ever change - inode->i_blocks = blocks; - } - - rdev = sd_v1_rdev(sd); - REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd); - /* an early bug in the quota code can give us an odd number for the - ** block count. This is incorrect, fix it here. - */ - if (inode->i_blocks & 1) { - inode->i_blocks++ ; - } - inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, - SD_V1_SIZE)); - /* nopack is initially zero for v1 objects. 
For v2 objects, - nopack is initialised from sd_attrs */ - REISERFS_I(inode)->i_flags &= ~i_nopack_mask; - } else { - // new stat data found, but object may have old items - // (directories and symlinks) - struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih); - - inode->i_mode = sd_v2_mode(sd); - inode->i_nlink = sd_v2_nlink(sd); - inode->i_uid = sd_v2_uid(sd); - inode->i_size = sd_v2_size(sd); - inode->i_gid = sd_v2_gid(sd); - inode->i_mtime.tv_sec = sd_v2_mtime(sd); - inode->i_atime.tv_sec = sd_v2_atime(sd); - inode->i_ctime.tv_sec = sd_v2_ctime(sd); - inode->i_ctime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_blocks = sd_v2_blocks(sd); - rdev = sd_v2_rdev(sd); - if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) ) - inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id); - else - inode->i_generation = sd_v2_generation(sd); + struct buffer_head *bh; + struct item_head *ih; + __u32 rdev; + //int version = ITEM_VERSION_1; + + bh = PATH_PLAST_BUFFER(path); + ih = PATH_PITEM_HEAD(path); + + copy_key(INODE_PKEY(inode), &(ih->ih_key)); + inode->i_blksize = reiserfs_default_io_size; + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem(&REISERFS_I(inode)->xattr_sem); + + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = + (struct stat_data_v1 *)B_I_PITEM(bh, ih); + unsigned long blocks; + + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + set_inode_sd_version(inode, STAT_DATA_V1); + inode->i_mode = sd_v1_mode(sd); + inode->i_nlink = sd_v1_nlink(sd); + inode->i_uid = sd_v1_uid(sd); + inode->i_gid = sd_v1_gid(sd); + inode->i_size = sd_v1_size(sd); + inode->i_atime.tv_sec = sd_v1_atime(sd); + 
inode->i_mtime.tv_sec = sd_v1_mtime(sd); + inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode->i_atime.tv_nsec = 0; + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + + inode->i_blocks = sd_v1_blocks(sd); + inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + blocks = (inode->i_size + 511) >> 9; + blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9); + if (inode->i_blocks > blocks) { + // there was a bug in <=3.5.23 when i_blocks could take negative + // values. Starting from 3.5.17 this value could even be stored in + // stat data. For such files we set i_blocks based on file + // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be + // only updated if file's inode will ever change + inode->i_blocks = blocks; + } - if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode)) - set_inode_item_key_version (inode, KEY_FORMAT_3_5); - else - set_inode_item_key_version (inode, KEY_FORMAT_3_6); - REISERFS_I(inode)->i_first_direct_byte = 0; - set_inode_sd_version (inode, STAT_DATA_V2); - inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, - SD_V2_SIZE)); - /* read persistent inode attributes from sd and initalise - generic inode flags from them */ - REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd ); - sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode ); - } - - pathrelse (path); - if (S_ISREG (inode->i_mode)) { - inode->i_op = &reiserfs_file_inode_operations; - inode->i_fop = &reiserfs_file_operations; - inode->i_mapping->a_ops = &reiserfs_address_space_operations ; - } else if (S_ISDIR (inode->i_mode)) { - inode->i_op = &reiserfs_dir_inode_operations; - inode->i_fop = &reiserfs_dir_operations; - } else if (S_ISLNK (inode->i_mode)) { - inode->i_op = &reiserfs_symlink_inode_operations; - inode->i_mapping->a_ops = &reiserfs_address_space_operations; - } else { - inode->i_blocks = 0; - inode->i_op = &reiserfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); - } -} + rdev = 
sd_v1_rdev(sd); + REISERFS_I(inode)->i_first_direct_byte = + sd_v1_first_direct_byte(sd); + /* an early bug in the quota code can give us an odd number for the + ** block count. This is incorrect, fix it here. + */ + if (inode->i_blocks & 1) { + inode->i_blocks++; + } + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); + /* nopack is initially zero for v1 objects. For v2 objects, + nopack is initialised from sd_attrs */ + REISERFS_I(inode)->i_flags &= ~i_nopack_mask; + } else { + // new stat data found, but object may have old items + // (directories and symlinks) + struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih); + + inode->i_mode = sd_v2_mode(sd); + inode->i_nlink = sd_v2_nlink(sd); + inode->i_uid = sd_v2_uid(sd); + inode->i_size = sd_v2_size(sd); + inode->i_gid = sd_v2_gid(sd); + inode->i_mtime.tv_sec = sd_v2_mtime(sd); + inode->i_atime.tv_sec = sd_v2_atime(sd); + inode->i_ctime.tv_sec = sd_v2_ctime(sd); + inode->i_ctime.tv_nsec = 0; + inode->i_mtime.tv_nsec = 0; + inode->i_atime.tv_nsec = 0; + inode->i_blocks = sd_v2_blocks(sd); + rdev = sd_v2_rdev(sd); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + inode->i_generation = + le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + else + inode->i_generation = sd_v2_generation(sd); + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + REISERFS_I(inode)->i_first_direct_byte = 0; + set_inode_sd_version(inode, STAT_DATA_V2); + inode_set_bytes(inode, + to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); + /* read persistent inode attributes from sd and initalise + generic inode flags from them */ + REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd); + sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode); + } + + pathrelse(path); + if (S_ISREG(inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + 
inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else { + inode->i_blocks = 0; + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + } +} // update new stat data with inode fields -static void inode2sd (void * sd, struct inode * inode, loff_t size) +static void inode2sd(void *sd, struct inode *inode, loff_t size) { - struct stat_data * sd_v2 = (struct stat_data *)sd; - __u16 flags; - - set_sd_v2_mode(sd_v2, inode->i_mode ); - set_sd_v2_nlink(sd_v2, inode->i_nlink ); - set_sd_v2_uid(sd_v2, inode->i_uid ); - set_sd_v2_size(sd_v2, size ); - set_sd_v2_gid(sd_v2, inode->i_gid ); - set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec ); - set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec ); - set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec ); - set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); - else - set_sd_v2_generation(sd_v2, inode->i_generation); - flags = REISERFS_I(inode)->i_attrs; - i_attrs_to_sd_attrs( inode, &flags ); - set_sd_v2_attrs( sd_v2, flags ); + struct stat_data *sd_v2 = (struct stat_data *)sd; + __u16 flags; + + set_sd_v2_mode(sd_v2, inode->i_mode); + set_sd_v2_nlink(sd_v2, inode->i_nlink); + set_sd_v2_uid(sd_v2, inode->i_uid); + set_sd_v2_size(sd_v2, size); + set_sd_v2_gid(sd_v2, inode->i_gid); + set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); + set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); + set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + 
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); + else + set_sd_v2_generation(sd_v2, inode->i_generation); + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, &flags); + set_sd_v2_attrs(sd_v2, flags); } - // used to copy inode's fields to old stat data -static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size) +static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) { - struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd; - - set_sd_v1_mode(sd_v1, inode->i_mode ); - set_sd_v1_uid(sd_v1, inode->i_uid ); - set_sd_v1_gid(sd_v1, inode->i_gid ); - set_sd_v1_nlink(sd_v1, inode->i_nlink ); - set_sd_v1_size(sd_v1, size ); - set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec ); - set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec ); - set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec ); - - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); - else - set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); - - // Sigh. i_first_direct_byte is back - set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte); -} + struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd; + + set_sd_v1_mode(sd_v1, inode->i_mode); + set_sd_v1_uid(sd_v1, inode->i_uid); + set_sd_v1_gid(sd_v1, inode->i_gid); + set_sd_v1_nlink(sd_v1, inode->i_nlink); + set_sd_v1_size(sd_v1, size); + set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); + set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); + set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); + else + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); + // Sigh. 
i_first_direct_byte is back + set_sd_v1_first_direct_byte(sd_v1, + REISERFS_I(inode)->i_first_direct_byte); +} /* NOTE, you must prepare the buffer head before sending it here, ** and then log it after the call */ -static void update_stat_data (struct path * path, struct inode * inode, - loff_t size) +static void update_stat_data(struct path *path, struct inode *inode, + loff_t size) { - struct buffer_head * bh; - struct item_head * ih; - - bh = PATH_PLAST_BUFFER (path); - ih = PATH_PITEM_HEAD (path); - - if (!is_statdata_le_ih (ih)) - reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h", - INODE_PKEY (inode), ih); - - if (stat_data_v1 (ih)) { - // path points to old stat data - inode2sd_v1 (B_I_PITEM (bh, ih), inode, size); - } else { - inode2sd (B_I_PITEM (bh, ih), inode, size); - } - - return; -} + struct buffer_head *bh; + struct item_head *ih; + + bh = PATH_PLAST_BUFFER(path); + ih = PATH_PITEM_HEAD(path); + + if (!is_statdata_le_ih(ih)) + reiserfs_panic(inode->i_sb, + "vs-13065: update_stat_data: key %k, found item %h", + INODE_PKEY(inode), ih); + + if (stat_data_v1(ih)) { + // path points to old stat data + inode2sd_v1(B_I_PITEM(bh, ih), inode, size); + } else { + inode2sd(B_I_PITEM(bh, ih), inode, size); + } + return; +} -void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th, - struct inode * inode, loff_t size) +void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t size) { - struct cpu_key key; - INITIALIZE_PATH(path); - struct buffer_head *bh ; - int fs_gen ; - struct item_head *ih, tmp_ih ; - int retval; - - BUG_ON (!th->t_trans_id); - - make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant - - for(;;) { - int pos; - /* look for the object's stat data */ - retval = search_item (inode->i_sb, &key, &path); - if (retval == IO_ERROR) { - reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: " - "i/o failure occurred trying to update %K 
stat data", - &key); - return; - } - if (retval == ITEM_NOT_FOUND) { - pos = PATH_LAST_POSITION (&path); - pathrelse(&path) ; - if (inode->i_nlink == 0) { - /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/ - return; - } - reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: " - "stat data of object %k (nlink == %d) not found (pos %d)", - INODE_PKEY (inode), inode->i_nlink, pos); - reiserfs_check_path(&path) ; - return; - } - - /* sigh, prepare_for_journal might schedule. When it schedules the - ** FS might change. We have to detect that, and loop back to the - ** search if the stat data item has moved - */ - bh = get_last_bh(&path) ; - ih = get_ih(&path) ; - copy_item_head (&tmp_ih, ih); - fs_gen = get_generation (inode->i_sb); - reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; - if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { - reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; - continue ; /* Stat_data item has been moved after scheduling. 
*/ - } - break; - } - update_stat_data (&path, inode, size); - journal_mark_dirty(th, th->t_super, bh) ; - pathrelse (&path); - return; + struct cpu_key key; + INITIALIZE_PATH(path); + struct buffer_head *bh; + int fs_gen; + struct item_head *ih, tmp_ih; + int retval; + + BUG_ON(!th->t_trans_id); + + make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); //key type is unimportant + + for (;;) { + int pos; + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + reiserfs_warning(inode->i_sb, + "vs-13050: reiserfs_update_sd: " + "i/o failure occurred trying to update %K stat data", + &key); + return; + } + if (retval == ITEM_NOT_FOUND) { + pos = PATH_LAST_POSITION(&path); + pathrelse(&path); + if (inode->i_nlink == 0) { + /*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */ + return; + } + reiserfs_warning(inode->i_sb, + "vs-13060: reiserfs_update_sd: " + "stat data of object %k (nlink == %d) not found (pos %d)", + INODE_PKEY(inode), inode->i_nlink, + pos); + reiserfs_check_path(&path); + return; + } + + /* sigh, prepare_for_journal might schedule. When it schedules the + ** FS might change. We have to detect that, and loop back to the + ** search if the stat data item has moved + */ + bh = get_last_bh(&path); + ih = get_ih(&path); + copy_item_head(&tmp_ih, ih); + fs_gen = get_generation(inode->i_sb); + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + continue; /* Stat_data item has been moved after scheduling. 
*/ + } + break; + } + update_stat_data(&path, inode, size); + journal_mark_dirty(th, th->t_super, bh); + pathrelse(&path); + return; } /* reiserfs_read_locked_inode is called to read the inode off disk, and it @@ -1307,9 +1389,10 @@ void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th, ** corresponding iput might try to delete whatever object the inode last ** represented. */ -static void reiserfs_make_bad_inode(struct inode *inode) { - memset(INODE_PKEY(inode), 0, KEY_SIZE); - make_bad_inode(inode); +static void reiserfs_make_bad_inode(struct inode *inode) +{ + memset(INODE_PKEY(inode), 0, KEY_SIZE); + make_bad_inode(inode); } // @@ -1317,77 +1400,79 @@ static void reiserfs_make_bad_inode(struct inode *inode) { // evolved as the prototype did // -int reiserfs_init_locked_inode (struct inode * inode, void *p) +int reiserfs_init_locked_inode(struct inode *inode, void *p) { - struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ; - inode->i_ino = args->objectid; - INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); - return 0; + struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p; + inode->i_ino = args->objectid; + INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid); + return 0; } /* looks for stat data in the tree, and fills up the fields of in-core inode stat data fields */ -void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args) +void reiserfs_read_locked_inode(struct inode *inode, + struct reiserfs_iget_args *args) { - INITIALIZE_PATH (path_to_sd); - struct cpu_key key; - unsigned long dirino; - int retval; - - dirino = args->dirid ; - - /* set version 1, version 2 could be used too, because stat data - key is the same in both versions */ - key.version = KEY_FORMAT_3_5; - key.on_disk_key.k_dir_id = dirino; - key.on_disk_key.k_objectid = inode->i_ino; - key.on_disk_key.k_offset = 0; - key.on_disk_key.k_type = 0; - - /* look for the object's stat data */ - retval = search_item 
(inode->i_sb, &key, &path_to_sd); - if (retval == IO_ERROR) { - reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: " - "i/o failure occurred trying to find stat data of %K", - &key); - reiserfs_make_bad_inode(inode) ; - return; - } - if (retval != ITEM_FOUND) { - /* a stale NFS handle can trigger this without it being an error */ - pathrelse (&path_to_sd); - reiserfs_make_bad_inode(inode) ; - inode->i_nlink = 0; - return; - } - - init_inode (inode, &path_to_sd); - - /* It is possible that knfsd is trying to access inode of a file - that is being removed from the disk by some other thread. As we - update sd on unlink all that is required is to check for nlink - here. This bug was first found by Sizif when debugging - SquidNG/Butterfly, forgotten, and found again after Philippe - Gramoulle <philippe.gramoulle@mmania.com> reproduced it. - - More logical fix would require changes in fs/inode.c:iput() to - remove inode from hash-table _after_ fs cleaned disk stuff up and - in iget() to return NULL if I_FREEING inode is found in - hash-table. */ - /* Currently there is one place where it's ok to meet inode with - nlink==0: processing of open-unlinked and half-truncated files - during mount (fs/reiserfs/super.c:finish_unfinished()). */ - if( ( inode -> i_nlink == 0 ) && - ! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) { - reiserfs_warning (inode->i_sb, - "vs-13075: reiserfs_read_locked_inode: " - "dead inode read from disk %K. " - "This is likely to be race with knfsd. 
Ignore", - &key ); - reiserfs_make_bad_inode( inode ); - } - - reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */ + INITIALIZE_PATH(path_to_sd); + struct cpu_key key; + unsigned long dirino; + int retval; + + dirino = args->dirid; + + /* set version 1, version 2 could be used too, because stat data + key is the same in both versions */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = dirino; + key.on_disk_key.k_objectid = inode->i_ino; + key.on_disk_key.k_offset = 0; + key.on_disk_key.k_type = 0; + + /* look for the object's stat data */ + retval = search_item(inode->i_sb, &key, &path_to_sd); + if (retval == IO_ERROR) { + reiserfs_warning(inode->i_sb, + "vs-13070: reiserfs_read_locked_inode: " + "i/o failure occurred trying to find stat data of %K", + &key); + reiserfs_make_bad_inode(inode); + return; + } + if (retval != ITEM_FOUND) { + /* a stale NFS handle can trigger this without it being an error */ + pathrelse(&path_to_sd); + reiserfs_make_bad_inode(inode); + inode->i_nlink = 0; + return; + } + + init_inode(inode, &path_to_sd); + + /* It is possible that knfsd is trying to access inode of a file + that is being removed from the disk by some other thread. As we + update sd on unlink all that is required is to check for nlink + here. This bug was first found by Sizif when debugging + SquidNG/Butterfly, forgotten, and found again after Philippe + Gramoulle <philippe.gramoulle@mmania.com> reproduced it. + + More logical fix would require changes in fs/inode.c:iput() to + remove inode from hash-table _after_ fs cleaned disk stuff up and + in iget() to return NULL if I_FREEING inode is found in + hash-table. */ + /* Currently there is one place where it's ok to meet inode with + nlink==0: processing of open-unlinked and half-truncated files + during mount (fs/reiserfs/super.c:finish_unfinished()). 
*/ + if ((inode->i_nlink == 0) && + !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) { + reiserfs_warning(inode->i_sb, + "vs-13075: reiserfs_read_locked_inode: " + "dead inode read from disk %K. " + "This is likely to be race with knfsd. Ignore", + &key); + reiserfs_make_bad_inode(inode); + } + + reiserfs_check_path(&path_to_sd); /* init inode should be relsing */ } @@ -1403,140 +1488,148 @@ void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args * inode numbers (objectids) are distinguished by parent directory ids. * */ -int reiserfs_find_actor( struct inode *inode, void *opaque ) +int reiserfs_find_actor(struct inode *inode, void *opaque) { - struct reiserfs_iget_args *args; + struct reiserfs_iget_args *args; - args = opaque; - /* args is already in CPU order */ - return (inode->i_ino == args->objectid) && - (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); + args = opaque; + /* args is already in CPU order */ + return (inode->i_ino == args->objectid) && + (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid); } -struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key) +struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key) { - struct inode * inode; - struct reiserfs_iget_args args ; - - args.objectid = key->on_disk_key.k_objectid ; - args.dirid = key->on_disk_key.k_dir_id ; - inode = iget5_locked (s, key->on_disk_key.k_objectid, - reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); - if (!inode) - return ERR_PTR(-ENOMEM) ; - - if (inode->i_state & I_NEW) { - reiserfs_read_locked_inode(inode, &args); - unlock_new_inode(inode); - } - - if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) { - /* either due to i/o error or a stale NFS handle */ - iput (inode); - inode = NULL; - } - return inode; + struct inode *inode; + struct reiserfs_iget_args args; + + args.objectid = key->on_disk_key.k_objectid; + args.dirid = key->on_disk_key.k_dir_id; + inode = 
iget5_locked(s, key->on_disk_key.k_objectid, + reiserfs_find_actor, reiserfs_init_locked_inode, + (void *)(&args)); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + reiserfs_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + + if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) { + /* either due to i/o error or a stale NFS handle */ + iput(inode); + inode = NULL; + } + return inode; } struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp) { - __u32 *data = vobjp; - struct cpu_key key ; - struct dentry *result; - struct inode *inode; - - key.on_disk_key.k_objectid = data[0] ; - key.on_disk_key.k_dir_id = data[1] ; - reiserfs_write_lock(sb); - inode = reiserfs_iget(sb, &key) ; - if (inode && !IS_ERR(inode) && data[2] != 0 && - data[2] != inode->i_generation) { - iput(inode) ; - inode = NULL ; - } - reiserfs_write_unlock(sb); - if (!inode) - inode = ERR_PTR(-ESTALE); - if (IS_ERR(inode)) - return ERR_PTR(PTR_ERR(inode)); - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; + __u32 *data = vobjp; + struct cpu_key key; + struct dentry *result; + struct inode *inode; + + key.on_disk_key.k_objectid = data[0]; + key.on_disk_key.k_dir_id = data[1]; + reiserfs_write_lock(sb); + inode = reiserfs_iget(sb, &key); + if (inode && !IS_ERR(inode) && data[2] != 0 && + data[2] != inode->i_generation) { + iput(inode); + inode = NULL; + } + reiserfs_write_unlock(sb); + if (!inode) + inode = ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_PTR(PTR_ERR(inode)); + result = d_alloc_anon(inode); + if (!result) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + return result; } -struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data, - int len, int fhtype, - int (*acceptable)(void *contect, struct dentry *de), - void *context) { - __u32 obj[3], parent[3]; - - /* fhtype happens to reflect the number of u32s encoded. 
- * due to a bug in earlier code, fhtype might indicate there - * are more u32s then actually fitted. - * so if fhtype seems to be more than len, reduce fhtype. - * Valid types are: - * 2 - objectid + dir_id - legacy support - * 3 - objectid + dir_id + generation - * 4 - objectid + dir_id + objectid and dirid of parent - legacy - * 5 - objectid + dir_id + generation + objectid and dirid of parent - * 6 - as above plus generation of directory - * 6 does not fit in NFSv2 handles - */ - if (fhtype > len) { - if (fhtype != 6 || len != 5) - reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd", - fhtype, len); - fhtype = 5; - } - - obj[0] = data[0]; - obj[1] = data[1]; - if (fhtype == 3 || fhtype >= 5) - obj[2] = data[2]; - else obj[2] = 0; /* generation number */ - - if (fhtype >= 4) { - parent[0] = data[fhtype>=5?3:2] ; - parent[1] = data[fhtype>=5?4:3] ; - if (fhtype == 6) - parent[2] = data[5]; - else parent[2] = 0; - } - return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent, - acceptable, context); -} +struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data, + int len, int fhtype, + int (*acceptable) (void *contect, + struct dentry * de), + void *context) +{ + __u32 obj[3], parent[3]; + + /* fhtype happens to reflect the number of u32s encoded. + * due to a bug in earlier code, fhtype might indicate there + * are more u32s then actually fitted. + * so if fhtype seems to be more than len, reduce fhtype. 
+ * Valid types are: + * 2 - objectid + dir_id - legacy support + * 3 - objectid + dir_id + generation + * 4 - objectid + dir_id + objectid and dirid of parent - legacy + * 5 - objectid + dir_id + generation + objectid and dirid of parent + * 6 - as above plus generation of directory + * 6 does not fit in NFSv2 handles + */ + if (fhtype > len) { + if (fhtype != 6 || len != 5) + reiserfs_warning(sb, + "nfsd/reiserfs, fhtype=%d, len=%d - odd", + fhtype, len); + fhtype = 5; + } -int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) { - struct inode *inode = dentry->d_inode ; - int maxlen = *lenp; - - if (maxlen < 3) - return 255 ; - - data[0] = inode->i_ino ; - data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ; - data[2] = inode->i_generation ; - *lenp = 3 ; - /* no room for directory info? return what we've stored so far */ - if (maxlen < 5 || ! need_parent) - return 3 ; - - spin_lock(&dentry->d_lock); - inode = dentry->d_parent->d_inode ; - data[3] = inode->i_ino ; - data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ; - *lenp = 5 ; - if (maxlen >= 6) { - data[5] = inode->i_generation ; - *lenp = 6 ; - } - spin_unlock(&dentry->d_lock); - return *lenp ; + obj[0] = data[0]; + obj[1] = data[1]; + if (fhtype == 3 || fhtype >= 5) + obj[2] = data[2]; + else + obj[2] = 0; /* generation number */ + + if (fhtype >= 4) { + parent[0] = data[fhtype >= 5 ? 3 : 2]; + parent[1] = data[fhtype >= 5 ? 4 : 3]; + if (fhtype == 6) + parent[2] = data[5]; + else + parent[2] = 0; + } + return sb->s_export_op->find_exported_dentry(sb, obj, + fhtype < 4 ? NULL : parent, + acceptable, context); } +int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, + int need_parent) +{ + struct inode *inode = dentry->d_inode; + int maxlen = *lenp; + + if (maxlen < 3) + return 255; + + data[0] = inode->i_ino; + data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + data[2] = inode->i_generation; + *lenp = 3; + /* no room for directory info? 
return what we've stored so far */ + if (maxlen < 5 || !need_parent) + return 3; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + data[3] = inode->i_ino; + data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); + *lenp = 5; + if (maxlen >= 6) { + data[5] = inode->i_generation; + *lenp = 6; + } + spin_unlock(&dentry->d_lock); + return *lenp; +} /* looks for stat data, then copies fields to it, marks the buffer containing stat data as dirty */ @@ -1545,120 +1638,127 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_p ** to properly mark inodes for datasync and such, but only actually ** does something when called for a synchronous update. */ -int reiserfs_write_inode (struct inode * inode, int do_sync) { - struct reiserfs_transaction_handle th ; - int jbegin_count = 1 ; - - if (inode->i_sb->s_flags & MS_RDONLY) - return -EROFS; - /* memory pressure can sometimes initiate write_inode calls with sync == 1, - ** these cases are just when the system needs ram, not when the - ** inode needs to reach disk for safety, and they can safely be - ** ignored because the altered inode has already been logged. - */ - if (do_sync && !(current->flags & PF_MEMALLOC)) { - reiserfs_write_lock(inode->i_sb); - if (!journal_begin(&th, inode->i_sb, jbegin_count)) { - reiserfs_update_sd (&th, inode); - journal_end_sync(&th, inode->i_sb, jbegin_count) ; - } - reiserfs_write_unlock(inode->i_sb); - } - return 0; +int reiserfs_write_inode(struct inode *inode, int do_sync) +{ + struct reiserfs_transaction_handle th; + int jbegin_count = 1; + + if (inode->i_sb->s_flags & MS_RDONLY) + return -EROFS; + /* memory pressure can sometimes initiate write_inode calls with sync == 1, + ** these cases are just when the system needs ram, not when the + ** inode needs to reach disk for safety, and they can safely be + ** ignored because the altered inode has already been logged. 
+ */ + if (do_sync && !(current->flags & PF_MEMALLOC)) { + reiserfs_write_lock(inode->i_sb); + if (!journal_begin(&th, inode->i_sb, jbegin_count)) { + reiserfs_update_sd(&th, inode); + journal_end_sync(&th, inode->i_sb, jbegin_count); + } + reiserfs_write_unlock(inode->i_sb); + } + return 0; } /* stat data of new object is inserted already, this inserts the item containing "." and ".." entries */ -static int reiserfs_new_directory (struct reiserfs_transaction_handle *th, - struct inode *inode, - struct item_head * ih, struct path * path, - struct inode * dir) +static int reiserfs_new_directory(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct item_head *ih, struct path *path, + struct inode *dir) { - struct super_block * sb = th->t_super; - char empty_dir [EMPTY_DIR_SIZE]; - char * body = empty_dir; - struct cpu_key key; - int retval; - - BUG_ON (!th->t_trans_id); - - _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id), - le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/); - - /* compose item head for new item. Directories consist of items of - old type (ITEM_VERSION_1). 
Do not set key (second arg is 0), it - is done by reiserfs_new_inode */ - if (old_format_only (sb)) { - make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); - - make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, - INODE_PKEY (dir)->k_dir_id, - INODE_PKEY (dir)->k_objectid ); - } else { - make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); - - make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, - INODE_PKEY (dir)->k_dir_id, - INODE_PKEY (dir)->k_objectid ); - } - - /* look for place in the tree for new item */ - retval = search_item (sb, &key, path); - if (retval == IO_ERROR) { - reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: " - "i/o failure occurred creating new directory"); - return -EIO; - } - if (retval == ITEM_FOUND) { - pathrelse (path); - reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: " - "object with this key exists (%k)", &(ih->ih_key)); - return -EEXIST; - } - - /* insert item, that is empty directory item */ - return reiserfs_insert_item (th, path, &key, ih, inode, body); -} + struct super_block *sb = th->t_super; + char empty_dir[EMPTY_DIR_SIZE]; + char *body = empty_dir; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET, + TYPE_DIRENTRY, 3 /*key length */ ); + + /* compose item head for new item. Directories consist of items of + old type (ITEM_VERSION_1). 
Do not set key (second arg is 0), it + is done by reiserfs_new_inode */ + if (old_format_only(sb)) { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); + + make_empty_dir_item_v1(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } else { + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, + TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); + + make_empty_dir_item(body, ih->ih_key.k_dir_id, + ih->ih_key.k_objectid, + INODE_PKEY(dir)->k_dir_id, + INODE_PKEY(dir)->k_objectid); + } + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: " + "i/o failure occurred creating new directory"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: " + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + /* insert item, that is empty directory item */ + return reiserfs_insert_item(th, path, &key, ih, inode, body); +} /* stat data of object has been inserted, this inserts the item containing the body of symlink */ -static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, - struct inode *inode, /* Inode of symlink */ - struct item_head * ih, - struct path * path, const char * symname, int item_len) +static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */ + struct item_head *ih, + struct path *path, const char *symname, + int item_len) { - struct super_block * sb = th->t_super; - struct cpu_key key; - int retval; - - BUG_ON (!th->t_trans_id); - - _make_cpu_key (&key, KEY_FORMAT_3_5, - le32_to_cpu (ih->ih_key.k_dir_id), - le32_to_cpu (ih->ih_key.k_objectid), - 1, TYPE_DIRECT, 3/*key length*/); - - make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/); - - /* 
look for place in the tree for new item */ - retval = search_item (sb, &key, path); - if (retval == IO_ERROR) { - reiserfs_warning (sb, "vs-13080: reiserfs_new_symlinik: " - "i/o failure occurred creating new symlink"); - return -EIO; - } - if (retval == ITEM_FOUND) { - pathrelse (path); - reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: " - "object with this key exists (%k)", &(ih->ih_key)); - return -EEXIST; - } - - /* insert item, that is body of symlink */ - return reiserfs_insert_item (th, path, &key, ih, inode, symname); -} + struct super_block *sb = th->t_super; + struct cpu_key key; + int retval; + + BUG_ON(!th->t_trans_id); + + _make_cpu_key(&key, KEY_FORMAT_3_5, + le32_to_cpu(ih->ih_key.k_dir_id), + le32_to_cpu(ih->ih_key.k_objectid), + 1, TYPE_DIRECT, 3 /*key length */ ); + + make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, + 0 /*free_space */ ); + + /* look for place in the tree for new item */ + retval = search_item(sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: " + "i/o failure occurred creating new symlink"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse(path); + reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: " + "object with this key exists (%k)", + &(ih->ih_key)); + return -EEXIST; + } + /* insert item, that is body of symlink */ + return reiserfs_insert_item(th, path, &key, ih, inode, symname); +} /* inserts the stat data into the tree, and then calls reiserfs_new_directory (to insert ".", ".." item if new object is @@ -1669,213 +1769,229 @@ static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, non-zero due to an error, we have to drop the quota previously allocated for the fresh inode. This can only be done outside a transaction, so if we return non-zero, we also end the transaction. 
*/ -int reiserfs_new_inode (struct reiserfs_transaction_handle *th, - struct inode * dir, int mode, - const char * symname, - /* 0 for regular, EMTRY_DIR_SIZE for dirs, - strlen (symname) for symlinks)*/ - loff_t i_size, struct dentry *dentry, - struct inode *inode) +int reiserfs_new_inode(struct reiserfs_transaction_handle *th, + struct inode *dir, int mode, const char *symname, + /* 0 for regular, EMTRY_DIR_SIZE for dirs, + strlen (symname) for symlinks) */ + loff_t i_size, struct dentry *dentry, + struct inode *inode) { - struct super_block * sb; - INITIALIZE_PATH (path_to_key); - struct cpu_key key; - struct item_head ih; - struct stat_data sd; - int retval; - int err; - - BUG_ON (!th->t_trans_id); - - if (DQUOT_ALLOC_INODE(inode)) { - err = -EDQUOT; - goto out_end_trans; - } - if (!dir || !dir->i_nlink) { - err = -EPERM; - goto out_bad_inode; - } - - sb = dir->i_sb; - - /* item head of new item */ - ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); - ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th)); - if (!ih.ih_key.k_objectid) { - err = -ENOMEM; - goto out_bad_inode ; - } - if (old_format_only (sb)) - /* not a perfect generation count, as object ids can be reused, but - ** this is as good as reiserfs can do right now. - ** note that the private part of inode isn't filled in yet, we have - ** to use the directory. 
- */ - inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid); - else + struct super_block *sb; + INITIALIZE_PATH(path_to_key); + struct cpu_key key; + struct item_head ih; + struct stat_data sd; + int retval; + int err; + + BUG_ON(!th->t_trans_id); + + if (DQUOT_ALLOC_INODE(inode)) { + err = -EDQUOT; + goto out_end_trans; + } + if (!dir || !dir->i_nlink) { + err = -EPERM; + goto out_bad_inode; + } + + sb = dir->i_sb; + + /* item head of new item */ + ih.ih_key.k_dir_id = reiserfs_choose_packing(dir); + ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th)); + if (!ih.ih_key.k_objectid) { + err = -ENOMEM; + goto out_bad_inode; + } + if (old_format_only(sb)) + /* not a perfect generation count, as object ids can be reused, but + ** this is as good as reiserfs can do right now. + ** note that the private part of inode isn't filled in yet, we have + ** to use the directory. + */ + inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid); + else #if defined( USE_INODE_GENERATION_COUNTER ) - inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); + inode->i_generation = + le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation); #else - inode->i_generation = ++event; + inode->i_generation = ++event; #endif - /* fill stat data */ - inode->i_nlink = (S_ISDIR (mode) ? 2 : 1); - - /* uid and gid must already be set by the caller for quota init */ - - /* symlink cannot be immutable or append only, right? */ - if( S_ISLNK( inode -> i_mode ) ) - inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND ); - - inode->i_mtime = inode->i_atime = inode->i_ctime = - CURRENT_TIME_SEC; - inode->i_size = i_size; - inode->i_blocks = 0; - inode->i_bytes = 0; - REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 
1 : - U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/; - - INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list )); - REISERFS_I(inode)->i_flags = 0; - REISERFS_I(inode)->i_prealloc_block = 0; - REISERFS_I(inode)->i_prealloc_count = 0; - REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_jl = NULL; - REISERFS_I(inode)->i_attrs = - REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; - sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode ); - REISERFS_I(inode)->i_acl_access = NULL; - REISERFS_I(inode)->i_acl_default = NULL; - init_rwsem (&REISERFS_I(inode)->xattr_sem); - - if (old_format_only (sb)) - make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); - else - make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); - - /* key to search for correct place for new stat data */ - _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id), - le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/); - - /* find proper place for inserting of stat data */ - retval = search_item (sb, &key, &path_to_key); - if (retval == IO_ERROR) { - err = -EIO; - goto out_bad_inode; - } - if (retval == ITEM_FOUND) { - pathrelse (&path_to_key); - err = -EEXIST; - goto out_bad_inode; - } - if (old_format_only (sb)) { - if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { - pathrelse (&path_to_key); - /* i_uid or i_gid is too big to be stored in stat data v3.5 */ - err = -EINVAL; - goto out_bad_inode; - } - inode2sd_v1 (&sd, inode, inode->i_size); - } else { - inode2sd (&sd, inode, inode->i_size); - } - // these do not go to on-disk stat data - inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid); - inode->i_blksize = reiserfs_default_io_size; - - // store in in-core inode the key of stat data and version all - // object items will have (directory items will have old offset - // format, other new objects will consist of new items) - memcpy (INODE_PKEY (inode), &(ih.ih_key), 
KEY_SIZE); - if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode)) - set_inode_item_key_version (inode, KEY_FORMAT_3_5); - else - set_inode_item_key_version (inode, KEY_FORMAT_3_6); - if (old_format_only (sb)) - set_inode_sd_version (inode, STAT_DATA_V1); - else - set_inode_sd_version (inode, STAT_DATA_V2); - - /* insert the stat data into the tree */ + /* fill stat data */ + inode->i_nlink = (S_ISDIR(mode) ? 2 : 1); + + /* uid and gid must already be set by the caller for quota init */ + + /* symlink cannot be immutable or append only, right? */ + if (S_ISLNK(inode->i_mode)) + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; + inode->i_size = i_size; + inode->i_blocks = 0; + inode->i_bytes = 0; + REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : + U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ; + + INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list)); + REISERFS_I(inode)->i_flags = 0; + REISERFS_I(inode)->i_prealloc_block = 0; + REISERFS_I(inode)->i_prealloc_count = 0; + REISERFS_I(inode)->i_trans_id = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_attrs = + REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; + sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode); + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem(&REISERFS_I(inode)->xattr_sem); + + if (old_format_only(sb)) + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); + + /* key to search for correct place for new stat data */ + _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), + le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, + TYPE_STAT_DATA, 3 /*key length */ ); + + /* find proper place for inserting of stat data */ + retval = search_item(sb, &key, &path_to_key); + if (retval == IO_ERROR) { + err = 
-EIO; + goto out_bad_inode; + } + if (retval == ITEM_FOUND) { + pathrelse(&path_to_key); + err = -EEXIST; + goto out_bad_inode; + } + if (old_format_only(sb)) { + if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) { + pathrelse(&path_to_key); + /* i_uid or i_gid is too big to be stored in stat data v3.5 */ + err = -EINVAL; + goto out_bad_inode; + } + inode2sd_v1(&sd, inode, inode->i_size); + } else { + inode2sd(&sd, inode, inode->i_size); + } + // these do not go to on-disk stat data + inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + inode->i_blksize = reiserfs_default_io_size; + + // store in in-core inode the key of stat data and version all + // object items will have (directory items will have old offset + // format, other new objects will consist of new items) + memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); + if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode)) + set_inode_item_key_version(inode, KEY_FORMAT_3_5); + else + set_inode_item_key_version(inode, KEY_FORMAT_3_6); + if (old_format_only(sb)) + set_inode_sd_version(inode, STAT_DATA_V1); + else + set_inode_sd_version(inode, STAT_DATA_V2); + + /* insert the stat data into the tree */ #ifdef DISPLACE_NEW_PACKING_LOCALITIES - if (REISERFS_I(dir)->new_packing_locality) - th->displace_new_blocks = 1; + if (REISERFS_I(dir)->new_packing_locality) + th->displace_new_blocks = 1; #endif - retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd)); - if (retval) { - err = retval; - reiserfs_check_path(&path_to_key) ; - goto out_bad_inode; - } - + retval = + reiserfs_insert_item(th, &path_to_key, &key, &ih, inode, + (char *)(&sd)); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + goto out_bad_inode; + } #ifdef DISPLACE_NEW_PACKING_LOCALITIES - if (!th->displace_new_blocks) - REISERFS_I(dir)->new_packing_locality = 0; + if (!th->displace_new_blocks) + REISERFS_I(dir)->new_packing_locality = 0; #endif - if (S_ISDIR(mode)) { - /* insert item with "." and ".." 
*/ - retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir); - } - - if (S_ISLNK(mode)) { - /* insert body of symlink */ - if (!old_format_only (sb)) - i_size = ROUND_UP(i_size); - retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size); - } - if (retval) { - err = retval; - reiserfs_check_path(&path_to_key) ; - journal_end(th, th->t_super, th->t_blocks_allocated); - goto out_inserted_sd; - } - - /* XXX CHECK THIS */ - if (reiserfs_posixacl (inode->i_sb)) { - retval = reiserfs_inherit_default_acl (dir, dentry, inode); - if (retval) { - err = retval; - reiserfs_check_path(&path_to_key) ; - journal_end(th, th->t_super, th->t_blocks_allocated); - goto out_inserted_sd; - } - } else if (inode->i_sb->s_flags & MS_POSIXACL) { - reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, " - "but vfs thinks they are!"); - } else if (is_reiserfs_priv_object (dir)) { - reiserfs_mark_inode_private (inode); - } - - insert_inode_hash (inode); - reiserfs_update_sd(th, inode); - reiserfs_check_path(&path_to_key) ; - - return 0; + if (S_ISDIR(mode)) { + /* insert item with "." and ".." 
*/ + retval = + reiserfs_new_directory(th, inode, &ih, &path_to_key, dir); + } + + if (S_ISLNK(mode)) { + /* insert body of symlink */ + if (!old_format_only(sb)) + i_size = ROUND_UP(i_size); + retval = + reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname, + i_size); + } + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + + /* XXX CHECK THIS */ + if (reiserfs_posixacl(inode->i_sb)) { + retval = reiserfs_inherit_default_acl(dir, dentry, inode); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key); + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, " + "but vfs thinks they are!"); + } else if (is_reiserfs_priv_object(dir)) { + reiserfs_mark_inode_private(inode); + } + + insert_inode_hash(inode); + reiserfs_update_sd(th, inode); + reiserfs_check_path(&path_to_key); + + return 0; /* it looks like you can easily compress these two goto targets into * one. Keeping it like this doesn't actually hurt anything, and they * are place holders for what the quota code actually needs. 
*/ -out_bad_inode: - /* Invalidate the object, nothing was inserted yet */ - INODE_PKEY(inode)->k_objectid = 0; - - /* Quota change must be inside a transaction for journaling */ - DQUOT_FREE_INODE(inode); - -out_end_trans: - journal_end(th, th->t_super, th->t_blocks_allocated) ; - /* Drop can be outside and it needs more credits so it's better to have it outside */ - DQUOT_DROP(inode); - inode->i_flags |= S_NOQUOTA; - make_bad_inode(inode); - -out_inserted_sd: - inode->i_nlink = 0; - th->t_trans_id = 0; /* so the caller can't use this handle later */ - iput(inode); - return err; + out_bad_inode: + /* Invalidate the object, nothing was inserted yet */ + INODE_PKEY(inode)->k_objectid = 0; + + /* Quota change must be inside a transaction for journaling */ + DQUOT_FREE_INODE(inode); + + out_end_trans: + journal_end(th, th->t_super, th->t_blocks_allocated); + /* Drop can be outside and it needs more credits so it's better to have it outside */ + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; + make_bad_inode(inode); + + out_inserted_sd: + inode->i_nlink = 0; + th->t_trans_id = 0; /* so the caller can't use this handle later */ + + /* If we were inheriting an ACL, we need to release the lock so that + * iput doesn't deadlock in reiserfs_delete_xattrs. The locking + * code really needs to be reworked, but this will take care of it + * for now. -jeffm */ + if (REISERFS_I(dir)->i_acl_default && !IS_ERR(REISERFS_I(dir)->i_acl_default)) { + reiserfs_write_unlock_xattrs(dir->i_sb); + iput(inode); + reiserfs_write_lock_xattrs(dir->i_sb); + } else + iput(inode); + return err; } /* @@ -1891,77 +2007,78 @@ out_inserted_sd: ** ** on failure, nonzero is returned, page_result and bh_result are untouched. 
*/ -static int grab_tail_page(struct inode *p_s_inode, - struct page **page_result, - struct buffer_head **bh_result) { - - /* we want the page with the last byte in the file, - ** not the page that will hold the next byte for appending - */ - unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ; - unsigned long pos = 0 ; - unsigned long start = 0 ; - unsigned long blocksize = p_s_inode->i_sb->s_blocksize ; - unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ; - struct buffer_head *bh ; - struct buffer_head *head ; - struct page * page ; - int error ; - - /* we know that we are only called with inode->i_size > 0. - ** we also know that a file tail can never be as big as a block - ** If i_size % blocksize == 0, our file is currently block aligned - ** and it won't need converting or zeroing after a truncate. - */ - if ((offset & (blocksize - 1)) == 0) { - return -ENOENT ; - } - page = grab_cache_page(p_s_inode->i_mapping, index) ; - error = -ENOMEM ; - if (!page) { - goto out ; - } - /* start within the page of the last block in the file */ - start = (offset / blocksize) * blocksize ; - - error = block_prepare_write(page, start, offset, - reiserfs_get_block_create_0) ; - if (error) - goto unlock ; - - head = page_buffers(page) ; - bh = head; - do { - if (pos >= start) { - break ; - } - bh = bh->b_this_page ; - pos += blocksize ; - } while(bh != head) ; - - if (!buffer_uptodate(bh)) { - /* note, this should never happen, prepare_write should - ** be taking care of this for us. 
If the buffer isn't up to date, - ** I've screwed up the code to find the buffer, or the code to - ** call prepare_write - */ - reiserfs_warning (p_s_inode->i_sb, - "clm-6000: error reading block %lu on dev %s", - bh->b_blocknr, - reiserfs_bdevname (p_s_inode->i_sb)) ; - error = -EIO ; - goto unlock ; - } - *bh_result = bh ; - *page_result = page ; - -out: - return error ; - -unlock: - unlock_page(page) ; - page_cache_release(page) ; - return error ; +static int grab_tail_page(struct inode *p_s_inode, + struct page **page_result, + struct buffer_head **bh_result) +{ + + /* we want the page with the last byte in the file, + ** not the page that will hold the next byte for appending + */ + unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long pos = 0; + unsigned long start = 0; + unsigned long blocksize = p_s_inode->i_sb->s_blocksize; + unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1); + struct buffer_head *bh; + struct buffer_head *head; + struct page *page; + int error; + + /* we know that we are only called with inode->i_size > 0. + ** we also know that a file tail can never be as big as a block + ** If i_size % blocksize == 0, our file is currently block aligned + ** and it won't need converting or zeroing after a truncate. + */ + if ((offset & (blocksize - 1)) == 0) { + return -ENOENT; + } + page = grab_cache_page(p_s_inode->i_mapping, index); + error = -ENOMEM; + if (!page) { + goto out; + } + /* start within the page of the last block in the file */ + start = (offset / blocksize) * blocksize; + + error = block_prepare_write(page, start, offset, + reiserfs_get_block_create_0); + if (error) + goto unlock; + + head = page_buffers(page); + bh = head; + do { + if (pos >= start) { + break; + } + bh = bh->b_this_page; + pos += blocksize; + } while (bh != head); + + if (!buffer_uptodate(bh)) { + /* note, this should never happen, prepare_write should + ** be taking care of this for us. 
If the buffer isn't up to date, + ** I've screwed up the code to find the buffer, or the code to + ** call prepare_write + */ + reiserfs_warning(p_s_inode->i_sb, + "clm-6000: error reading block %lu on dev %s", + bh->b_blocknr, + reiserfs_bdevname(p_s_inode->i_sb)); + error = -EIO; + goto unlock; + } + *bh_result = bh; + *page_result = page; + + out: + return error; + + unlock: + unlock_page(page); + page_cache_release(page); + return error; } /* @@ -1970,235 +2087,247 @@ unlock: ** ** some code taken from block_truncate_page */ -int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) { - struct reiserfs_transaction_handle th ; - /* we want the offset for the first byte after the end of the file */ - unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ; - unsigned blocksize = p_s_inode->i_sb->s_blocksize ; - unsigned length ; - struct page *page = NULL ; - int error ; - struct buffer_head *bh = NULL ; - - reiserfs_write_lock(p_s_inode->i_sb); - - if (p_s_inode->i_size > 0) { - if ((error = grab_tail_page(p_s_inode, &page, &bh))) { - // -ENOENT means we truncated past the end of the file, - // and get_block_create_0 could not find a block to read in, - // which is ok. - if (error != -ENOENT) - reiserfs_warning (p_s_inode->i_sb, - "clm-6001: grab_tail_page failed %d", - error); - page = NULL ; - bh = NULL ; - } - } - - /* so, if page != NULL, we have a buffer head for the offset at - ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, - ** then we have an unformatted node. Otherwise, we have a direct item, - ** and no zeroing is required on disk. We zero after the truncate, - ** because the truncate might pack the item anyway - ** (it will unmap bh if it packs). - */ - /* it is enough to reserve space in transaction for 2 balancings: - one for "save" link adding and another for the first - cut_from_item. 
1 is for update_sd */ - error = journal_begin (&th, p_s_inode->i_sb, - JOURNAL_PER_BALANCE_CNT * 2 + 1); - if (error) - goto out; - reiserfs_update_inode_transaction(p_s_inode) ; - if (update_timestamps) - /* we are doing real truncate: if the system crashes before the last - transaction of truncating gets committed - on reboot the file - either appears truncated properly or not truncated at all */ - add_save_link (&th, p_s_inode, 1); - error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ; - if (error) - goto out; - error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); - if (error) - goto out; - - if (update_timestamps) { - error = remove_save_link (p_s_inode, 1/* truncate */); - if (error) - goto out; - } - - if (page) { - length = offset & (blocksize - 1) ; - /* if we are not on a block boundary */ - if (length) { - char *kaddr; - - length = blocksize - length ; - kaddr = kmap_atomic(page, KM_USER0) ; - memset(kaddr + offset, 0, length) ; - flush_dcache_page(page) ; - kunmap_atomic(kaddr, KM_USER0) ; - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - mark_buffer_dirty(bh) ; - } - } - unlock_page(page) ; - page_cache_release(page) ; - } - - reiserfs_write_unlock(p_s_inode->i_sb); - return 0; -out: - if (page) { - unlock_page (page); - page_cache_release (page); - } - reiserfs_write_unlock(p_s_inode->i_sb); - return error; -} +int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) +{ + struct reiserfs_transaction_handle th; + /* we want the offset for the first byte after the end of the file */ + unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1); + unsigned blocksize = p_s_inode->i_sb->s_blocksize; + unsigned length; + struct page *page = NULL; + int error; + struct buffer_head *bh = NULL; + + reiserfs_write_lock(p_s_inode->i_sb); + + if (p_s_inode->i_size > 0) { + if ((error = grab_tail_page(p_s_inode, &page, &bh))) { + // -ENOENT means we truncated past the end of the file, + // and 
get_block_create_0 could not find a block to read in, + // which is ok. + if (error != -ENOENT) + reiserfs_warning(p_s_inode->i_sb, + "clm-6001: grab_tail_page failed %d", + error); + page = NULL; + bh = NULL; + } + } -static int map_block_for_writepage(struct inode *inode, - struct buffer_head *bh_result, - unsigned long block) { - struct reiserfs_transaction_handle th ; - int fs_gen ; - struct item_head tmp_ih ; - struct item_head *ih ; - struct buffer_head *bh ; - __le32 *item ; - struct cpu_key key ; - INITIALIZE_PATH(path) ; - int pos_in_item ; - int jbegin_count = JOURNAL_PER_BALANCE_CNT ; - loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; - int retval ; - int use_get_block = 0 ; - int bytes_copied = 0 ; - int copy_size ; - int trans_running = 0; - - /* catch places below that try to log something without starting a trans */ - th.t_trans_id = 0; - - if (!buffer_uptodate(bh_result)) { - return -EIO; - } - - kmap(bh_result->b_page) ; -start_over: - reiserfs_write_lock(inode->i_sb); - make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ; - -research: - retval = search_for_position_by_key(inode->i_sb, &key, &path) ; - if (retval != POSITION_FOUND) { - use_get_block = 1; - goto out ; - } - - bh = get_last_bh(&path) ; - ih = get_ih(&path) ; - item = get_item(&path) ; - pos_in_item = path.pos_in_item ; - - /* we've found an unformatted node */ - if (indirect_item_found(retval, ih)) { - if (bytes_copied > 0) { - reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d", - bytes_copied) ; - } - if (!get_block_num(item, pos_in_item)) { - /* crap, we are writing to a hole */ - use_get_block = 1; - goto out ; - } - set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode); - } else if (is_direct_le_ih(ih)) { - char *p ; - p = page_address(bh_result->b_page) ; - p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ; - copy_size = ih_item_len(ih) - pos_in_item; - - fs_gen = get_generation(inode->i_sb) ; - copy_item_head(&tmp_ih, ih) ; - - if 
(!trans_running) { - /* vs-3050 is gone, no need to drop the path */ - retval = journal_begin(&th, inode->i_sb, jbegin_count) ; - if (retval) - goto out; - reiserfs_update_inode_transaction(inode) ; - trans_running = 1; - if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { - reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; - goto research; - } - } - - reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; - - if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { - reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; - goto research; - } - - memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ; - - journal_mark_dirty(&th, inode->i_sb, bh) ; - bytes_copied += copy_size ; - set_block_dev_mapped(bh_result, 0, inode); - - /* are there still bytes left? */ - if (bytes_copied < bh_result->b_size && - (byte_offset + bytes_copied) < inode->i_size) { - set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ; - goto research ; - } - } else { - reiserfs_warning (inode->i_sb, - "clm-6003: bad item inode %lu, device %s", - inode->i_ino, reiserfs_bdevname (inode->i_sb)) ; - retval = -EIO ; - goto out ; - } - retval = 0 ; - -out: - pathrelse(&path) ; - if (trans_running) { - int err = journal_end(&th, inode->i_sb, jbegin_count) ; - if (err) - retval = err; - trans_running = 0; - } - reiserfs_write_unlock(inode->i_sb); - - /* this is where we fill in holes in the file. */ - if (use_get_block) { - retval = reiserfs_get_block(inode, block, bh_result, - GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM | - GET_BLOCK_NO_DANGLE); - if (!retval) { - if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) { - /* get_block failed to find a mapped unformatted node. 
*/ - use_get_block = 0 ; - goto start_over ; - } - } - } - kunmap(bh_result->b_page) ; - - if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { - /* we've copied data from the page into the direct item, so the - * buffer in the page is now clean, mark it to reflect that. + /* so, if page != NULL, we have a buffer head for the offset at + ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + ** then we have an unformatted node. Otherwise, we have a direct item, + ** and no zeroing is required on disk. We zero after the truncate, + ** because the truncate might pack the item anyway + ** (it will unmap bh if it packs). */ - lock_buffer(bh_result); - clear_buffer_dirty(bh_result); - unlock_buffer(bh_result); - } - return retval ; + /* it is enough to reserve space in transaction for 2 balancings: + one for "save" link adding and another for the first + cut_from_item. 1 is for update_sd */ + error = journal_begin(&th, p_s_inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + reiserfs_update_inode_transaction(p_s_inode); + if (update_timestamps) + /* we are doing real truncate: if the system crashes before the last + transaction of truncating gets committed - on reboot the file + either appears truncated properly or not truncated at all */ + add_save_link(&th, p_s_inode, 1); + error = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps); + if (error) + goto out; + error = + journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1); + if (error) + goto out; + + if (update_timestamps) { + error = remove_save_link(p_s_inode, 1 /* truncate */ ); + if (error) + goto out; + } + + if (page) { + length = offset & (blocksize - 1); + /* if we are not on a block boundary */ + if (length) { + char *kaddr; + + length = blocksize - length; + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (buffer_mapped(bh) && 
bh->b_blocknr != 0) { + mark_buffer_dirty(bh); + } + } + unlock_page(page); + page_cache_release(page); + } + + reiserfs_write_unlock(p_s_inode->i_sb); + return 0; + out: + if (page) { + unlock_page(page); + page_cache_release(page); + } + reiserfs_write_unlock(p_s_inode->i_sb); + return error; +} + +static int map_block_for_writepage(struct inode *inode, + struct buffer_head *bh_result, + unsigned long block) +{ + struct reiserfs_transaction_handle th; + int fs_gen; + struct item_head tmp_ih; + struct item_head *ih; + struct buffer_head *bh; + __le32 *item; + struct cpu_key key; + INITIALIZE_PATH(path); + int pos_in_item; + int jbegin_count = JOURNAL_PER_BALANCE_CNT; + loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1; + int retval; + int use_get_block = 0; + int bytes_copied = 0; + int copy_size; + int trans_running = 0; + + /* catch places below that try to log something without starting a trans */ + th.t_trans_id = 0; + + if (!buffer_uptodate(bh_result)) { + return -EIO; + } + + kmap(bh_result->b_page); + start_over: + reiserfs_write_lock(inode->i_sb); + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3); + + research: + retval = search_for_position_by_key(inode->i_sb, &key, &path); + if (retval != POSITION_FOUND) { + use_get_block = 1; + goto out; + } + + bh = get_last_bh(&path); + ih = get_ih(&path); + item = get_item(&path); + pos_in_item = path.pos_in_item; + + /* we've found an unformatted node */ + if (indirect_item_found(retval, ih)) { + if (bytes_copied > 0) { + reiserfs_warning(inode->i_sb, + "clm-6002: bytes_copied %d", + bytes_copied); + } + if (!get_block_num(item, pos_in_item)) { + /* crap, we are writing to a hole */ + use_get_block = 1; + goto out; + } + set_block_dev_mapped(bh_result, + get_block_num(item, pos_in_item), inode); + } else if (is_direct_le_ih(ih)) { + char *p; + p = page_address(bh_result->b_page); + p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1); + copy_size = ih_item_len(ih) - pos_in_item; + + fs_gen = 
get_generation(inode->i_sb); + copy_item_head(&tmp_ih, ih); + + if (!trans_running) { + /* vs-3050 is gone, no need to drop the path */ + retval = journal_begin(&th, inode->i_sb, jbegin_count); + if (retval) + goto out; + reiserfs_update_inode_transaction(inode); + trans_running = 1; + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, + bh); + goto research; + } + } + + reiserfs_prepare_for_journal(inode->i_sb, bh, 1); + + if (fs_changed(fs_gen, inode->i_sb) + && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh); + goto research; + } + + memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, + copy_size); + + journal_mark_dirty(&th, inode->i_sb, bh); + bytes_copied += copy_size; + set_block_dev_mapped(bh_result, 0, inode); + + /* are there still bytes left? */ + if (bytes_copied < bh_result->b_size && + (byte_offset + bytes_copied) < inode->i_size) { + set_cpu_key_k_offset(&key, + cpu_key_k_offset(&key) + + copy_size); + goto research; + } + } else { + reiserfs_warning(inode->i_sb, + "clm-6003: bad item inode %lu, device %s", + inode->i_ino, reiserfs_bdevname(inode->i_sb)); + retval = -EIO; + goto out; + } + retval = 0; + + out: + pathrelse(&path); + if (trans_running) { + int err = journal_end(&th, inode->i_sb, jbegin_count); + if (err) + retval = err; + trans_running = 0; + } + reiserfs_write_unlock(inode->i_sb); + + /* this is where we fill in holes in the file. */ + if (use_get_block) { + retval = reiserfs_get_block(inode, block, bh_result, + GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM + | GET_BLOCK_NO_DANGLE); + if (!retval) { + if (!buffer_mapped(bh_result) + || bh_result->b_blocknr == 0) { + /* get_block failed to find a mapped unformatted node. 
*/ + use_get_block = 0; + goto start_over; + } + } + } + kunmap(bh_result->b_page); + + if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { + /* we've copied data from the page into the direct item, so the + * buffer in the page is now clean, mark it to reflect that. + */ + lock_buffer(bh_result); + clear_buffer_dirty(bh_result); + unlock_buffer(bh_result); + } + return retval; } /* @@ -2206,383 +2335,390 @@ out: * start/recovery path as __block_write_full_page, along with special * code to handle reiserfs tails. */ -static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; - unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; - int error = 0; - unsigned long block ; - struct buffer_head *head, *bh; - int partial = 0 ; - int nr = 0; - int checked = PageChecked(page); - struct reiserfs_transaction_handle th; - struct super_block *s = inode->i_sb; - int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; - th.t_trans_id = 0; - - /* The page dirty bit is cleared before writepage is called, which - * means we have to tell create_empty_buffers to make dirty buffers - * The page really should be up to date at this point, so tossing - * in the BH_Uptodate is just a sanity check. 
- */ - if (!page_has_buffers(page)) { - create_empty_buffers(page, s->s_blocksize, - (1 << BH_Dirty) | (1 << BH_Uptodate)); - } - head = page_buffers(page) ; - - /* last page in the file, zero out any contents past the - ** last byte in the file - */ - if (page->index >= end_index) { - char *kaddr; - unsigned last_offset; - - last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ; - /* no file contents in this page */ - if (page->index >= end_index + 1 || !last_offset) { - unlock_page(page); - return 0; - } - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ; - flush_dcache_page(page) ; - kunmap_atomic(kaddr, KM_USER0) ; - } - bh = head ; - block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ; - /* first map all the buffers, logging any direct items we find */ - do { - if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || - (buffer_mapped(bh) && bh->b_blocknr == 0))) { - /* not mapped yet, or it points to a direct item, search - * the btree for the mapping info, and log any direct - * items found - */ - if ((error = map_block_for_writepage(inode, bh, block))) { - goto fail ; - } - } - bh = bh->b_this_page; - block++; - } while(bh != head) ; - - /* - * we start the transaction after map_block_for_writepage, - * because it can create holes in the file (an unbounded operation). 
- * starting it here, we can make a reliable estimate for how many - * blocks we're going to log - */ - if (checked) { - ClearPageChecked(page); - reiserfs_write_lock(s); - error = journal_begin(&th, s, bh_per_page + 1); - if (error) { - reiserfs_write_unlock(s); - goto fail; - } - reiserfs_update_inode_transaction(inode); - } - /* now go through and lock any dirty buffers on the page */ - do { - get_bh(bh); - if (!buffer_mapped(bh)) - continue; - if (buffer_mapped(bh) && bh->b_blocknr == 0) - continue; +static int reiserfs_write_full_page(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT; + int error = 0; + unsigned long block; + struct buffer_head *head, *bh; + int partial = 0; + int nr = 0; + int checked = PageChecked(page); + struct reiserfs_transaction_handle th; + struct super_block *s = inode->i_sb; + int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; + th.t_trans_id = 0; + + /* The page dirty bit is cleared before writepage is called, which + * means we have to tell create_empty_buffers to make dirty buffers + * The page really should be up to date at this point, so tossing + * in the BH_Uptodate is just a sanity check. 
+ */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, s->s_blocksize, + (1 << BH_Dirty) | (1 << BH_Uptodate)); + } + head = page_buffers(page); - if (checked) { - reiserfs_prepare_for_journal(s, bh, 1); - journal_mark_dirty(&th, s, bh); - continue; + /* last page in the file, zero out any contents past the + ** last byte in the file + */ + if (page->index >= end_index) { + char *kaddr; + unsigned last_offset; + + last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1); + /* no file contents in this page */ + if (page->index >= end_index + 1 || !last_offset) { + unlock_page(page); + return 0; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); } - /* from this point on, we know the buffer is mapped to a - * real block and not a direct item + bh = head; + block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); + /* first map all the buffers, logging any direct items we find */ + do { + if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) || + (buffer_mapped(bh) + && bh->b_blocknr == + 0))) { + /* not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail; + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + /* + * we start the transaction after map_block_for_writepage, + * because it can create holes in the file (an unbounded operation). 
+ * starting it here, we can make a reliable estimate for how many + * blocks we're going to log */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { - lock_buffer(bh); - } else { - if (test_set_buffer_locked(bh)) { - redirty_page_for_writepage(wbc, page); - continue; - } + if (checked) { + ClearPageChecked(page); + reiserfs_write_lock(s); + error = journal_begin(&th, s, bh_per_page + 1); + if (error) { + reiserfs_write_unlock(s); + goto fail; + } + reiserfs_update_inode_transaction(inode); } - if (test_clear_buffer_dirty(bh)) { - mark_buffer_async_write(bh); - } else { - unlock_buffer(bh); + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + if (checked) { + reiserfs_prepare_for_journal(s, bh, 1); + journal_mark_dirty(&th, s, bh); + continue; + } + /* from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + lock_buffer(bh); + } else { + if (test_set_buffer_locked(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (checked) { + error = journal_end(&th, s, bh_per_page + 1); + reiserfs_write_unlock(s); + if (error) + goto fail; } - } while((bh = bh->b_this_page) != head); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); - if (checked) { - error = journal_end(&th, s, bh_per_page + 1); - reiserfs_write_unlock(s); - if (error) - goto fail; - } - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - - /* - * since any buffer might be the only dirty buffer on the page, - * the first submit_bh can bring the page out of writeback. - * be careful with the buffers. 
- */ - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); - nr++; - } - put_bh(bh); - bh = next; - } while(bh != head); - - error = 0; -done: - if (nr == 0) { - /* - * if this page only had a direct item, it is very possible for - * no io to be required without there being an error. Or, - * someone else could have locked them and sent them down the - * pipe without locking the page + /* + * since any buffer might be the only dirty buffer on the page, + * the first submit_bh can bring the page out of writeback. + * be careful with the buffers. */ - bh = head ; do { - if (!buffer_uptodate(bh)) { - partial = 1; - break; - } - bh = bh->b_this_page; - } while(bh != head); - if (!partial) - SetPageUptodate(page); - end_page_writeback(page); - } - return error; - -fail: - /* catches various errors, we need to make sure any valid dirty blocks - * get to the media. The page is currently locked and not marked for - * writeback - */ - ClearPageUptodate(page); - bh = head; - do { - get_bh(bh); - if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { - lock_buffer(bh); - mark_buffer_async_write(bh); - } else { - /* - * clear any dirty bits that might have come from getting - * attached to a dirty page - */ - clear_buffer_dirty(bh); - } - bh = bh->b_this_page; - } while(bh != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - do { - struct buffer_head *next = bh->b_this_page; - if (buffer_async_write(bh)) { - clear_buffer_dirty(bh); - submit_bh(WRITE, bh); - nr++; - } - put_bh(bh); - bh = next; - } while(bh != head); - goto done; -} + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + error = 0; + done: + if (nr == 0) { + /* + * if this page only had a direct item, it is very possible for + * no io to be required without there being an error. 
Or, + * someone else could have locked them and sent them down the + * pipe without locking the page + */ + bh = head; + do { + if (!buffer_uptodate(bh)) { + partial = 1; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (!partial) + SetPageUptodate(page); + end_page_writeback(page); + } + return error; -static int reiserfs_readpage (struct file *f, struct page * page) -{ - return block_read_full_page (page, reiserfs_get_block); + fail: + /* catches various errors, we need to make sure any valid dirty blocks + * get to the media. The page is currently locked and not marked for + * writeback + */ + ClearPageUptodate(page); + bh = head; + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * clear any dirty bits that might have come from getting + * attached to a dirty page + */ + clear_buffer_dirty(bh); + } + bh = bh->b_this_page; + } while (bh != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr++; + } + put_bh(bh); + bh = next; + } while (bh != head); + goto done; } +static int reiserfs_readpage(struct file *f, struct page *page) +{ + return block_read_full_page(page, reiserfs_get_block); +} -static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) +static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; - reiserfs_wait_on_write_block(inode->i_sb) ; - return reiserfs_write_full_page(page, wbc) ; + struct inode *inode = page->mapping->host; + reiserfs_wait_on_write_block(inode->i_sb); + return reiserfs_write_full_page(page, wbc); } static int reiserfs_prepare_write(struct file *f, struct page *page, - unsigned from, unsigned to) { - struct inode *inode = 
page->mapping->host ; - int ret; - int old_ref = 0; - - reiserfs_wait_on_write_block(inode->i_sb) ; - fix_tail_page_for_writing(page) ; - if (reiserfs_transaction_running(inode->i_sb)) { - struct reiserfs_transaction_handle *th; - th = (struct reiserfs_transaction_handle *)current->journal_info; - BUG_ON (!th->t_refcount); - BUG_ON (!th->t_trans_id); - old_ref = th->t_refcount; - th->t_refcount++; - } - - ret = block_prepare_write(page, from, to, reiserfs_get_block) ; - if (ret && reiserfs_transaction_running(inode->i_sb)) { - struct reiserfs_transaction_handle *th = current->journal_info; - /* this gets a little ugly. If reiserfs_get_block returned an - * error and left a transacstion running, we've got to close it, - * and we've got to free handle if it was a persistent transaction. - * - * But, if we had nested into an existing transaction, we need - * to just drop the ref count on the handle. - * - * If old_ref == 0, the transaction is from reiserfs_get_block, - * and it was a persistent trans. Otherwise, it was nested above. - */ - if (th->t_refcount > old_ref) { - if (old_ref) - th->t_refcount--; - else { - int err; - reiserfs_write_lock(inode->i_sb); - err = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); - if (err) - ret = err; - } + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + int ret; + int old_ref = 0; + + reiserfs_wait_on_write_block(inode->i_sb); + fix_tail_page_for_writing(page); + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current-> + journal_info; + BUG_ON(!th->t_refcount); + BUG_ON(!th->t_trans_id); + old_ref = th->t_refcount; + th->t_refcount++; } - } - return ret; -} + ret = block_prepare_write(page, from, to, reiserfs_get_block); + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. 
If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else { + int err; + reiserfs_write_lock(inode->i_sb); + err = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (err) + ret = err; + } + } + } + return ret; +} -static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) { - return generic_block_bmap(as, block, reiserfs_bmap) ; +static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) +{ + return generic_block_bmap(as, block, reiserfs_bmap); } -static int reiserfs_commit_write(struct file *f, struct page *page, - unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - int ret = 0; - int update_sd = 0; - struct reiserfs_transaction_handle *th = NULL; - - reiserfs_wait_on_write_block(inode->i_sb) ; - if (reiserfs_transaction_running(inode->i_sb)) { - th = current->journal_info; - } - reiserfs_commit_page(inode, page, from, to); - - /* generic_commit_write does this for us, but does not update the - ** transaction tracking stuff when the size changes. So, we have - ** to do the i_size updates here. 
- */ - if (pos > inode->i_size) { - struct reiserfs_transaction_handle myth ; - reiserfs_write_lock(inode->i_sb); - /* If the file have grown beyond the border where it - can have a tail, unmark it as needing a tail - packing */ - if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) || - (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) ) - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; - - ret = journal_begin(&myth, inode->i_sb, 1) ; - if (ret) { - reiserfs_write_unlock(inode->i_sb); - goto journal_error; - } - reiserfs_update_inode_transaction(inode) ; - inode->i_size = pos ; - reiserfs_update_sd(&myth, inode) ; - update_sd = 1; - ret = journal_end(&myth, inode->i_sb, 1) ; - reiserfs_write_unlock(inode->i_sb); - if (ret) - goto journal_error; - } - if (th) { - reiserfs_write_lock(inode->i_sb); - if (!update_sd) - reiserfs_update_sd(th, inode) ; - ret = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); - if (ret) - goto out; - } - - /* we test for O_SYNC here so we can commit the transaction - ** for any packed tails the file might have had - */ - if (f && (f->f_flags & O_SYNC)) { - reiserfs_write_lock(inode->i_sb); - ret = reiserfs_commit_for_inode(inode) ; - reiserfs_write_unlock(inode->i_sb); - } -out: - return ret ; +static int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; + + reiserfs_wait_on_write_block(inode->i_sb); + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); -journal_error: - if (th) { - reiserfs_write_lock(inode->i_sb); - if (!update_sd) - reiserfs_update_sd(th, inode) ; - ret = reiserfs_end_persistent_transaction(th); - 
reiserfs_write_unlock(inode->i_sb); - } + /* generic_commit_write does this for us, but does not update the + ** transaction tracking stuff when the size changes. So, we have + ** to do the i_size updates here. + */ + if (pos > inode->i_size) { + struct reiserfs_transaction_handle myth; + reiserfs_write_lock(inode->i_sb); + /* If the file have grown beyond the border where it + can have a tail, unmark it as needing a tail + packing */ + if ((have_large_tails(inode->i_sb) + && inode->i_size > i_block_size(inode) * 4) + || (have_small_tails(inode->i_sb) + && inode->i_size > i_block_size(inode))) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; + + ret = journal_begin(&myth, inode->i_sb, 1); + if (ret) { + reiserfs_write_unlock(inode->i_sb); + goto journal_error; + } + reiserfs_update_inode_transaction(inode); + inode->i_size = pos; + reiserfs_update_sd(&myth, inode); + update_sd = 1; + ret = journal_end(&myth, inode->i_sb, 1); + reiserfs_write_unlock(inode->i_sb); + if (ret) + goto journal_error; + } + if (th) { + reiserfs_write_lock(inode->i_sb); + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + if (ret) + goto out; + } + + /* we test for O_SYNC here so we can commit the transaction + ** for any packed tails the file might have had + */ + if (f && (f->f_flags & O_SYNC)) { + reiserfs_write_lock(inode->i_sb); + ret = reiserfs_commit_for_inode(inode); + reiserfs_write_unlock(inode->i_sb); + } + out: + return ret; - return ret; + journal_error: + if (th) { + reiserfs_write_lock(inode->i_sb); + if (!update_sd) + reiserfs_update_sd(th, inode); + ret = reiserfs_end_persistent_transaction(th); + reiserfs_write_unlock(inode->i_sb); + } + + return ret; } -void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode ) +void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) { - if( reiserfs_attrs( inode -> i_sb ) ) { - if( sd_attrs & REISERFS_SYNC_FL ) - inode -> i_flags |= 
S_SYNC; + if (reiserfs_attrs(inode->i_sb)) { + if (sd_attrs & REISERFS_SYNC_FL) + inode->i_flags |= S_SYNC; else - inode -> i_flags &= ~S_SYNC; - if( sd_attrs & REISERFS_IMMUTABLE_FL ) - inode -> i_flags |= S_IMMUTABLE; + inode->i_flags &= ~S_SYNC; + if (sd_attrs & REISERFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; else - inode -> i_flags &= ~S_IMMUTABLE; - if( sd_attrs & REISERFS_APPEND_FL ) - inode -> i_flags |= S_APPEND; + inode->i_flags &= ~S_IMMUTABLE; + if (sd_attrs & REISERFS_APPEND_FL) + inode->i_flags |= S_APPEND; else - inode -> i_flags &= ~S_APPEND; - if( sd_attrs & REISERFS_NOATIME_FL ) - inode -> i_flags |= S_NOATIME; + inode->i_flags &= ~S_APPEND; + if (sd_attrs & REISERFS_NOATIME_FL) + inode->i_flags |= S_NOATIME; else - inode -> i_flags &= ~S_NOATIME; - if( sd_attrs & REISERFS_NOTAIL_FL ) + inode->i_flags &= ~S_NOATIME; + if (sd_attrs & REISERFS_NOTAIL_FL) REISERFS_I(inode)->i_flags |= i_nopack_mask; else REISERFS_I(inode)->i_flags &= ~i_nopack_mask; } } -void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs ) +void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) { - if( reiserfs_attrs( inode -> i_sb ) ) { - if( inode -> i_flags & S_IMMUTABLE ) + if (reiserfs_attrs(inode->i_sb)) { + if (inode->i_flags & S_IMMUTABLE) *sd_attrs |= REISERFS_IMMUTABLE_FL; else *sd_attrs &= ~REISERFS_IMMUTABLE_FL; - if( inode -> i_flags & S_SYNC ) + if (inode->i_flags & S_SYNC) *sd_attrs |= REISERFS_SYNC_FL; else *sd_attrs &= ~REISERFS_SYNC_FL; - if( inode -> i_flags & S_NOATIME ) + if (inode->i_flags & S_NOATIME) *sd_attrs |= REISERFS_NOATIME_FL; else *sd_attrs &= ~REISERFS_NOATIME_FL; - if( REISERFS_I(inode)->i_flags & i_nopack_mask ) + if (REISERFS_I(inode)->i_flags & i_nopack_mask) *sd_attrs |= REISERFS_NOTAIL_FL; else *sd_attrs &= ~REISERFS_NOTAIL_FL; @@ -2594,106 +2730,107 @@ void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs ) */ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) { - int ret = 1 ; - 
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; - - spin_lock(&j->j_dirty_buffers_lock) ; - if (!buffer_mapped(bh)) { - goto free_jh; - } - /* the page is locked, and the only places that log a data buffer - * also lock the page. - */ - if (reiserfs_file_data_log(inode)) { - /* - * very conservative, leave the buffer pinned if - * anyone might need it. - */ - if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { - ret = 0 ; - } - } else - if (buffer_dirty(bh) || buffer_locked(bh)) { - struct reiserfs_journal_list *jl; - struct reiserfs_jh *jh = bh->b_private; - - /* why is this safe? - * reiserfs_setattr updates i_size in the on disk - * stat data before allowing vmtruncate to be called. - * - * If buffer was put onto the ordered list for this - * transaction, we know for sure either this transaction - * or an older one already has updated i_size on disk, - * and this ordered data won't be referenced in the file - * if we crash. - * - * if the buffer was put onto the ordered list for an older - * transaction, we need to leave it around + int ret = 1; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + + spin_lock(&j->j_dirty_buffers_lock); + if (!buffer_mapped(bh)) { + goto free_jh; + } + /* the page is locked, and the only places that log a data buffer + * also lock the page. */ - if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) - ret = 0; - } -free_jh: - if (ret && bh->b_private) { - reiserfs_free_jh(bh); - } - spin_unlock(&j->j_dirty_buffers_lock) ; - return ret ; + if (reiserfs_file_data_log(inode)) { + /* + * very conservative, leave the buffer pinned if + * anyone might need it. + */ + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + ret = 0; + } + } else if (buffer_dirty(bh) || buffer_locked(bh)) { + struct reiserfs_journal_list *jl; + struct reiserfs_jh *jh = bh->b_private; + + /* why is this safe? + * reiserfs_setattr updates i_size in the on disk + * stat data before allowing vmtruncate to be called. 
+ * + * If buffer was put onto the ordered list for this + * transaction, we know for sure either this transaction + * or an older one already has updated i_size on disk, + * and this ordered data won't be referenced in the file + * if we crash. + * + * if the buffer was put onto the ordered list for an older + * transaction, we need to leave it around + */ + if (jh && (jl = jh->jl) + && jl != SB_JOURNAL(inode->i_sb)->j_current_jl) + ret = 0; + } + free_jh: + if (ret && bh->b_private) { + reiserfs_free_jh(bh); + } + spin_unlock(&j->j_dirty_buffers_lock); + return ret; } /* clm -- taken from fs/buffer.c:block_invalidate_page */ static int reiserfs_invalidatepage(struct page *page, unsigned long offset) { - struct buffer_head *head, *bh, *next; - struct inode *inode = page->mapping->host; - unsigned int curr_off = 0; - int ret = 1; + struct buffer_head *head, *bh, *next; + struct inode *inode = page->mapping->host; + unsigned int curr_off = 0; + int ret = 1; - BUG_ON(!PageLocked(page)); + BUG_ON(!PageLocked(page)); - if (offset == 0) - ClearPageChecked(page); + if (offset == 0) + ClearPageChecked(page); - if (!page_has_buffers(page)) - goto out; + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; - head = page_buffers(page); - bh = head; - do { - unsigned int next_off = curr_off + bh->b_size; - next = bh->b_this_page; + /* + * is this block fully invalidated? + */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); /* - * is this block fully invalidated? + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. 
*/ - if (offset <= curr_off) { - if (invalidatepage_can_drop(inode, bh)) - reiserfs_unmap_buffer(bh); - else - ret = 0; - } - curr_off = next_off; - bh = next; - } while (bh != head); - - /* - * We release buffers only if the entire page is being invalidated. - * The get_block cached value has been unconditionally invalidated, - * so real IO is not possible anymore. - */ - if (!offset && ret) - ret = try_to_release_page(page, 0); -out: - return ret; + if (!offset && ret) + ret = try_to_release_page(page, 0); + out: + return ret; } -static int reiserfs_set_page_dirty(struct page *page) { - struct inode *inode = page->mapping->host; - if (reiserfs_file_data_log(inode)) { - SetPageChecked(page); - return __set_page_dirty_nobuffers(page); - } - return __set_page_dirty_buffers(page); +static int reiserfs_set_page_dirty(struct page *page) +{ + struct inode *inode = page->mapping->host; + if (reiserfs_file_data_log(inode)) { + SetPageChecked(page); + return __set_page_dirty_nobuffers(page); + } + return __set_page_dirty_buffers(page); } /* @@ -2707,140 +2844,152 @@ static int reiserfs_set_page_dirty(struct page *page) { */ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) { - struct inode *inode = page->mapping->host ; - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; - struct buffer_head *head ; - struct buffer_head *bh ; - int ret = 1 ; - - WARN_ON(PageChecked(page)); - spin_lock(&j->j_dirty_buffers_lock) ; - head = page_buffers(page) ; - bh = head ; - do { - if (bh->b_private) { - if (!buffer_dirty(bh) && !buffer_locked(bh)) { - reiserfs_free_jh(bh); - } else { - ret = 0 ; - break ; - } - } - bh = bh->b_this_page ; - } while (bh != head) ; - if (ret) - ret = try_to_free_buffers(page) ; - spin_unlock(&j->j_dirty_buffers_lock) ; - return ret ; + struct inode *inode = page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + struct buffer_head *head; + struct buffer_head *bh; + int ret = 1; + + WARN_ON(PageChecked(page)); 
+ spin_lock(&j->j_dirty_buffers_lock); + head = page_buffers(page); + bh = head; + do { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { + ret = 0; + break; + } + } + bh = bh->b_this_page; + } while (bh != head); + if (ret) + ret = try_to_free_buffers(page); + spin_unlock(&j->j_dirty_buffers_lock); + return ret; } /* We thank Mingming Cao for helping us understand in great detail what to do in this section of the code. */ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, unsigned long nr_segs) + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; - return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, - offset, nr_segs, reiserfs_get_blocks_direct_io, NULL); + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + reiserfs_get_blocks_direct_io, NULL); } -int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode ; - int error ; - unsigned int ia_valid = attr->ia_valid; - reiserfs_write_lock(inode->i_sb); - if (attr->ia_valid & ATTR_SIZE) { - /* version 2 items will be caught by the s_maxbytes check - ** done for us in vmtruncate - */ - if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && - attr->ia_size > MAX_NON_LFS) { - error = -EFBIG ; - goto out; - } - /* fill in hole pointers in the expanding truncate case. 
*/ - if (attr->ia_size > inode->i_size) { - error = generic_cont_expand(inode, attr->ia_size) ; - if (REISERFS_I(inode)->i_prealloc_count > 0) { - int err; - struct reiserfs_transaction_handle th ; - /* we're changing at most 2 bitmaps, inode + super */ - err = journal_begin(&th, inode->i_sb, 4) ; - if (!err) { - reiserfs_discard_prealloc (&th, inode); - err = journal_end(&th, inode->i_sb, 4) ; +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error; + unsigned int ia_valid = attr->ia_valid; + reiserfs_write_lock(inode->i_sb); + if (attr->ia_valid & ATTR_SIZE) { + /* version 2 items will be caught by the s_maxbytes check + ** done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + error = -EFBIG; + goto out; + } + /* fill in hole pointers in the expanding truncate case. */ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand(inode, attr->ia_size); + if (REISERFS_I(inode)->i_prealloc_count > 0) { + int err; + struct reiserfs_transaction_handle th; + /* we're changing at most 2 bitmaps, inode + super */ + err = journal_begin(&th, inode->i_sb, 4); + if (!err) { + reiserfs_discard_prealloc(&th, inode); + err = journal_end(&th, inode->i_sb, 4); + } + if (err) + error = err; + } + if (error) + goto out; } - if (err) - error = err; - } - if (error) - goto out; } - } - if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || - ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && - (get_inode_sd_version (inode) == STAT_DATA_V1)) { + if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && + (get_inode_sd_version(inode) == STAT_DATA_V1)) { /* stat data of format v3.5 has 16 bit uid and gid */ - error = -EINVAL; - goto out; + error = -EINVAL; + goto out; } - error = inode_change_ok(inode, attr) ; - if (!error) { - if ((ia_valid & 
ATTR_UID && attr->ia_uid != inode->i_uid) || - (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { - error = reiserfs_chown_xattrs (inode, attr); - - if (!error) { - struct reiserfs_transaction_handle th; - - /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ - journal_begin(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); - error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; - if (error) { - journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); - goto out; - } - /* Update corresponding info in inode so that everything is in - * one transaction */ - if (attr->ia_valid & ATTR_UID) - inode->i_uid = attr->ia_uid; - if (attr->ia_valid & ATTR_GID) - inode->i_gid = attr->ia_gid; - mark_inode_dirty(inode); - journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2); + error = inode_change_ok(inode, attr); + if (!error) { + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = reiserfs_chown_xattrs(inode, attr); + + if (!error) { + struct reiserfs_transaction_handle th; + int jbegin_count = + 2 * + (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) + + REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) + + 2; + + /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */ + error = + journal_begin(&th, inode->i_sb, + jbegin_count); + if (error) + goto out; + error = + DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; + if (error) { + journal_end(&th, inode->i_sb, + jbegin_count); + goto out; + } + /* Update corresponding info in inode so that everything is in + * one transaction */ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + mark_inode_dirty(inode); + error = + journal_end(&th, inode->i_sb, jbegin_count); + } } - } - if (!error) - error = inode_setattr(inode, attr) ; - } - + if (!error) + error = inode_setattr(inode, attr); + } - if (!error && reiserfs_posixacl (inode->i_sb)) { - if (attr->ia_valid & ATTR_MODE) - error = reiserfs_acl_chmod (inode); - } + if (!error && reiserfs_posixacl(inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod(inode); + } -out: - reiserfs_write_unlock(inode->i_sb); - return error ; + out: + reiserfs_write_unlock(inode->i_sb); + return error; } - - struct address_space_operations reiserfs_address_space_operations = { - .writepage = reiserfs_writepage, - .readpage = reiserfs_readpage, - .readpages = reiserfs_readpages, - .releasepage = reiserfs_releasepage, - .invalidatepage = reiserfs_invalidatepage, - .sync_page = block_sync_page, - .prepare_write = reiserfs_prepare_write, - .commit_write = reiserfs_commit_write, - .bmap = reiserfs_aop_bmap, - .direct_IO = reiserfs_direct_IO, - .set_page_dirty = reiserfs_set_page_dirty, -} ; + .writepage = reiserfs_writepage, + .readpage = reiserfs_readpage, + .readpages = reiserfs_readpages, + .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, + .sync_page = block_sync_page, + .prepare_write = reiserfs_prepare_write, + .commit_write = reiserfs_commit_write, + .bmap = reiserfs_aop_bmap, + .direct_IO = reiserfs_direct_IO, + .set_page_dirty = reiserfs_set_page_dirty, +}; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 94dc42475a0..81fc00285f6 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -9,7 +9,7 @@ #include <linux/pagemap.h> #include 
<linux/smp_lock.h> -static int reiserfs_unpack (struct inode * inode, struct file * filp); +static int reiserfs_unpack(struct inode *inode, struct file *filp); /* ** reiserfs_ioctl - handler for ioctl for inode @@ -19,63 +19,72 @@ static int reiserfs_unpack (struct inode * inode, struct file * filp); ** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION ** 3) That's all for a while ... */ -int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, + unsigned long arg) { unsigned int flags; switch (cmd) { - case REISERFS_IOC_UNPACK: - if( S_ISREG( inode -> i_mode ) ) { - if (arg) - return reiserfs_unpack (inode, filp); + case REISERFS_IOC_UNPACK: + if (S_ISREG(inode->i_mode)) { + if (arg) + return reiserfs_unpack(inode, filp); else return 0; } else return -ENOTTY; - /* following two cases are taken from fs/ext2/ioctl.c by Remy - Card (card@masi.ibp.fr) */ + /* following two cases are taken from fs/ext2/ioctl.c by Remy + Card (card@masi.ibp.fr) */ case REISERFS_IOC_GETFLAGS: - flags = REISERFS_I(inode) -> i_attrs; - i_attrs_to_sd_attrs( inode, ( __u16 * ) &flags ); - return put_user(flags, (int __user *) arg); - case REISERFS_IOC_SETFLAGS: { - if (IS_RDONLY(inode)) - return -EROFS; + if (!reiserfs_attrs(inode->i_sb)) + return -ENOTTY; - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; + flags = REISERFS_I(inode)->i_attrs; + i_attrs_to_sd_attrs(inode, (__u16 *) & flags); + return put_user(flags, (int __user *)arg); + case REISERFS_IOC_SETFLAGS:{ + if (!reiserfs_attrs(inode->i_sb)) + return -ENOTTY; - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (IS_RDONLY(inode)) + return -EROFS; - if ( ( ( flags ^ REISERFS_I(inode) -> i_attrs) & ( REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) && - !capable( CAP_LINUX_IMMUTABLE ) ) - return -EPERM; - - if( ( flags & REISERFS_NOTAIL_FL ) && - S_ISREG( inode -> 
i_mode ) ) { + if ((current->fsuid != inode->i_uid) + && !capable(CAP_FOWNER)) + return -EPERM; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + + if (((flags ^ REISERFS_I(inode)-> + i_attrs) & (REISERFS_IMMUTABLE_FL | + REISERFS_APPEND_FL)) + && !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + if ((flags & REISERFS_NOTAIL_FL) && + S_ISREG(inode->i_mode)) { int result; - result = reiserfs_unpack( inode, filp ); - if( result ) + result = reiserfs_unpack(inode, filp); + if (result) return result; + } + sd_attrs_to_i_attrs(flags, inode); + REISERFS_I(inode)->i_attrs = flags; + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + return 0; } - sd_attrs_to_i_attrs( flags, inode ); - REISERFS_I(inode) -> i_attrs = flags; - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - return 0; - } case REISERFS_IOC_GETVERSION: - return put_user(inode->i_generation, (int __user *) arg); + return put_user(inode->i_generation, (int __user *)arg); case REISERFS_IOC_SETVERSION: if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) return -EPERM; if (IS_RDONLY(inode)) return -EROFS; - if (get_user(inode->i_generation, (int __user *) arg)) - return -EFAULT; + if (get_user(inode->i_generation, (int __user *)arg)) + return -EFAULT; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); return 0; @@ -89,63 +98,65 @@ int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, ** Function try to convert tail from direct item into indirect. 
** It set up nopack attribute in the REISERFS_I(inode)->nopack */ -static int reiserfs_unpack (struct inode * inode, struct file * filp) +static int reiserfs_unpack(struct inode *inode, struct file *filp) { - int retval = 0; - int index ; - struct page *page ; - struct address_space *mapping ; - unsigned long write_from ; - unsigned long blocksize = inode->i_sb->s_blocksize ; - - if (inode->i_size == 0) { - REISERFS_I(inode)->i_flags |= i_nopack_mask; - return 0 ; - } - /* ioctl already done */ - if (REISERFS_I(inode)->i_flags & i_nopack_mask) { - return 0 ; - } - reiserfs_write_lock(inode->i_sb); - - /* we need to make sure nobody is changing the file size beneath - ** us - */ - down(&inode->i_sem) ; - - write_from = inode->i_size & (blocksize - 1) ; - /* if we are on a block boundary, we are already unpacked. */ - if ( write_from == 0) { + int retval = 0; + int index; + struct page *page; + struct address_space *mapping; + unsigned long write_from; + unsigned long blocksize = inode->i_sb->s_blocksize; + + if (inode->i_size == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + return 0; + } + /* ioctl already done */ + if (REISERFS_I(inode)->i_flags & i_nopack_mask) { + return 0; + } + reiserfs_write_lock(inode->i_sb); + + /* we need to make sure nobody is changing the file size beneath + ** us + */ + down(&inode->i_sem); + + write_from = inode->i_size & (blocksize - 1); + /* if we are on a block boundary, we are already unpacked. */ + if (write_from == 0) { + REISERFS_I(inode)->i_flags |= i_nopack_mask; + goto out; + } + + /* we unpack by finding the page with the tail, and calling + ** reiserfs_prepare_write on that page. This will force a + ** reiserfs_get_block to unpack the tail for us. 
+ */ + index = inode->i_size >> PAGE_CACHE_SHIFT; + mapping = inode->i_mapping; + page = grab_cache_page(mapping, index); + retval = -ENOMEM; + if (!page) { + goto out; + } + retval = + mapping->a_ops->prepare_write(NULL, page, write_from, write_from); + if (retval) + goto out_unlock; + + /* conversion can change page contents, must flush */ + flush_dcache_page(page); + retval = + mapping->a_ops->commit_write(NULL, page, write_from, write_from); REISERFS_I(inode)->i_flags |= i_nopack_mask; - goto out ; - } - - /* we unpack by finding the page with the tail, and calling - ** reiserfs_prepare_write on that page. This will force a - ** reiserfs_get_block to unpack the tail for us. - */ - index = inode->i_size >> PAGE_CACHE_SHIFT ; - mapping = inode->i_mapping ; - page = grab_cache_page(mapping, index) ; - retval = -ENOMEM; - if (!page) { - goto out ; - } - retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ; - if (retval) - goto out_unlock ; - - /* conversion can change page contents, must flush */ - flush_dcache_page(page) ; - retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ; - REISERFS_I(inode)->i_flags |= i_nopack_mask; - -out_unlock: - unlock_page(page) ; - page_cache_release(page) ; - -out: - up(&inode->i_sem) ; - reiserfs_write_unlock(inode->i_sb); - return retval; + + out_unlock: + unlock_page(page); + page_cache_release(page); + + out: + up(&inode->i_sem); + reiserfs_write_unlock(inode->i_sb); + return retval; } diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index 0ce33db1acd..e237cd668e5 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -14,776 +14,741 @@ ////////////////////////////////////////////////////////////////////////////// // stat data functions // -static int sd_bytes_number (struct item_head * ih, int block_size) +static int sd_bytes_number(struct item_head *ih, int block_size) { - return 0; + return 0; } -static void sd_decrement_key (struct cpu_key * key) +static 
void sd_decrement_key(struct cpu_key *key) { - key->on_disk_key.k_objectid --; - set_cpu_key_k_type (key, TYPE_ANY); - set_cpu_key_k_offset(key, (loff_t)(-1)); + key->on_disk_key.k_objectid--; + set_cpu_key_k_type(key, TYPE_ANY); + set_cpu_key_k_offset(key, (loff_t) (-1)); } -static int sd_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) +static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { - return 0; + return 0; } - - -static char * print_time (time_t t) +static char *print_time(time_t t) { - static char timebuf[256]; + static char timebuf[256]; - sprintf (timebuf, "%ld", t); - return timebuf; + sprintf(timebuf, "%ld", t); + return timebuf; } - -static void sd_print_item (struct item_head * ih, char * item) +static void sd_print_item(struct item_head *ih, char *item) { - printk ("\tmode | size | nlinks | first direct | mtime\n"); - if (stat_data_v1 (ih)) { - struct stat_data_v1 * sd = (struct stat_data_v1 *)item; + printk("\tmode | size | nlinks | first direct | mtime\n"); + if (stat_data_v1(ih)) { + struct stat_data_v1 *sd = (struct stat_data_v1 *)item; - printk ("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), - sd_v1_size(sd), sd_v1_nlink(sd), sd_v1_first_direct_byte(sd), - print_time( sd_v1_mtime(sd) ) ); - } else { - struct stat_data * sd = (struct stat_data *)item; + printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), + sd_v1_size(sd), sd_v1_nlink(sd), + sd_v1_first_direct_byte(sd), + print_time(sd_v1_mtime(sd))); + } else { + struct stat_data *sd = (struct stat_data *)item; - printk ("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), - (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), - sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); - } + printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), + (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), + sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); + } } -static void sd_check_item (struct item_head * ih, char * item) +static void sd_check_item(struct 
item_head *ih, char *item) { - // FIXME: type something here! + // FIXME: type something here! } - -static int sd_create_vi (struct virtual_node * vn, - struct virtual_item * vi, - int is_affected, - int insert_size) +static int sd_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) { - vi->vi_index = TYPE_STAT_DATA; - //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? - return 0; + vi->vi_index = TYPE_STAT_DATA; + //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? + return 0; } - -static int sd_check_left (struct virtual_item * vi, int free, - int start_skip, int end_skip) +static int sd_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) { - if (start_skip || end_skip) - BUG (); - return -1; + if (start_skip || end_skip) + BUG(); + return -1; } - -static int sd_check_right (struct virtual_item * vi, int free) +static int sd_check_right(struct virtual_item *vi, int free) { - return -1; + return -1; } -static int sd_part_size (struct virtual_item * vi, int first, int count) +static int sd_part_size(struct virtual_item *vi, int first, int count) { - if (count) - BUG (); - return 0; + if (count) + BUG(); + return 0; } -static int sd_unit_num (struct virtual_item * vi) +static int sd_unit_num(struct virtual_item *vi) { - return vi->vi_item_len - IH_SIZE; + return vi->vi_item_len - IH_SIZE; } - -static void sd_print_vi (struct virtual_item * vi) +static void sd_print_vi(struct virtual_item *vi) { - reiserfs_warning (NULL, "STATDATA, index %d, type 0x%x, %h", - vi->vi_index, vi->vi_type, vi->vi_ih); + reiserfs_warning(NULL, "STATDATA, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); } static struct item_operations stat_data_ops = { - .bytes_number = sd_bytes_number, - .decrement_key = sd_decrement_key, - .is_left_mergeable = sd_is_left_mergeable, - .print_item = sd_print_item, - .check_item = sd_check_item, - - .create_vi = sd_create_vi, - .check_left = sd_check_left, - .check_right 
= sd_check_right, - .part_size = sd_part_size, - .unit_num = sd_unit_num, - .print_vi = sd_print_vi + .bytes_number = sd_bytes_number, + .decrement_key = sd_decrement_key, + .is_left_mergeable = sd_is_left_mergeable, + .print_item = sd_print_item, + .check_item = sd_check_item, + + .create_vi = sd_create_vi, + .check_left = sd_check_left, + .check_right = sd_check_right, + .part_size = sd_part_size, + .unit_num = sd_unit_num, + .print_vi = sd_print_vi }; - - ////////////////////////////////////////////////////////////////////////////// // direct item functions // -static int direct_bytes_number (struct item_head * ih, int block_size) +static int direct_bytes_number(struct item_head *ih, int block_size) { - return ih_item_len(ih); + return ih_item_len(ih); } - // FIXME: this should probably switch to indirect as well -static void direct_decrement_key (struct cpu_key * key) +static void direct_decrement_key(struct cpu_key *key) { - cpu_key_k_offset_dec (key); - if (cpu_key_k_offset (key) == 0) - set_cpu_key_k_type (key, TYPE_STAT_DATA); + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); } - -static int direct_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) +static int direct_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) { - int version = le_key_version (key); - return ((le_key_k_offset (version, key) & (bsize - 1)) != 1); + int version = le_key_version(key); + return ((le_key_k_offset(version, key) & (bsize - 1)) != 1); } - -static void direct_print_item (struct item_head * ih, char * item) +static void direct_print_item(struct item_head *ih, char *item) { - int j = 0; + int j = 0; // return; - printk ("\""); - while (j < ih_item_len(ih)) - printk ("%c", item[j++]); - printk ("\"\n"); + printk("\""); + while (j < ih_item_len(ih)) + printk("%c", item[j++]); + printk("\"\n"); } - -static void direct_check_item (struct item_head * ih, char * item) +static void 
direct_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + // FIXME: type something here! } - -static int direct_create_vi (struct virtual_node * vn, - struct virtual_item * vi, - int is_affected, - int insert_size) +static int direct_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) { - vi->vi_index = TYPE_DIRECT; - //vi->vi_type |= VI_TYPE_DIRECT; - return 0; + vi->vi_index = TYPE_DIRECT; + //vi->vi_type |= VI_TYPE_DIRECT; + return 0; } -static int direct_check_left (struct virtual_item * vi, int free, - int start_skip, int end_skip) +static int direct_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) { - int bytes; + int bytes; - bytes = free - free % 8; - return bytes ?: -1; + bytes = free - free % 8; + return bytes ? : -1; } - -static int direct_check_right (struct virtual_item * vi, int free) +static int direct_check_right(struct virtual_item *vi, int free) { - return direct_check_left (vi, free, 0, 0); + return direct_check_left(vi, free, 0, 0); } -static int direct_part_size (struct virtual_item * vi, int first, int count) +static int direct_part_size(struct virtual_item *vi, int first, int count) { - return count; + return count; } - -static int direct_unit_num (struct virtual_item * vi) +static int direct_unit_num(struct virtual_item *vi) { - return vi->vi_item_len - IH_SIZE; + return vi->vi_item_len - IH_SIZE; } - -static void direct_print_vi (struct virtual_item * vi) +static void direct_print_vi(struct virtual_item *vi) { - reiserfs_warning (NULL, "DIRECT, index %d, type 0x%x, %h", - vi->vi_index, vi->vi_type, vi->vi_ih); + reiserfs_warning(NULL, "DIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); } static struct item_operations direct_ops = { - .bytes_number = direct_bytes_number, - .decrement_key = direct_decrement_key, - .is_left_mergeable = direct_is_left_mergeable, - .print_item = direct_print_item, - .check_item = 
direct_check_item, - - .create_vi = direct_create_vi, - .check_left = direct_check_left, - .check_right = direct_check_right, - .part_size = direct_part_size, - .unit_num = direct_unit_num, - .print_vi = direct_print_vi + .bytes_number = direct_bytes_number, + .decrement_key = direct_decrement_key, + .is_left_mergeable = direct_is_left_mergeable, + .print_item = direct_print_item, + .check_item = direct_check_item, + + .create_vi = direct_create_vi, + .check_left = direct_check_left, + .check_right = direct_check_right, + .part_size = direct_part_size, + .unit_num = direct_unit_num, + .print_vi = direct_print_vi }; - - ////////////////////////////////////////////////////////////////////////////// // indirect item functions // -static int indirect_bytes_number (struct item_head * ih, int block_size) +static int indirect_bytes_number(struct item_head *ih, int block_size) { - return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); + return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); } - // decrease offset, if it becomes 0, change type to stat data -static void indirect_decrement_key (struct cpu_key * key) +static void indirect_decrement_key(struct cpu_key *key) { - cpu_key_k_offset_dec (key); - if (cpu_key_k_offset (key) == 0) - set_cpu_key_k_type (key, TYPE_STAT_DATA); + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); } - // if it is not first item of the body, then it is mergeable -static int indirect_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) +static int indirect_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) { - int version = le_key_version (key); - return (le_key_k_offset (version, key) != 1); + int version = le_key_version(key); + return (le_key_k_offset(version, key) != 1); } - // printing of indirect item -static void start_new_sequence (__u32 * start, int * len, __u32 new) +static void start_new_sequence(__u32 
* start, int *len, __u32 new) { - *start = new; - *len = 1; + *start = new; + *len = 1; } - -static int sequence_finished (__u32 start, int * len, __u32 new) +static int sequence_finished(__u32 start, int *len, __u32 new) { - if (start == INT_MAX) - return 1; + if (start == INT_MAX) + return 1; - if (start == 0 && new == 0) { - (*len) ++; - return 0; - } - if (start != 0 && (start + *len) == new) { - (*len) ++; - return 0; - } - return 1; + if (start == 0 && new == 0) { + (*len)++; + return 0; + } + if (start != 0 && (start + *len) == new) { + (*len)++; + return 0; + } + return 1; } -static void print_sequence (__u32 start, int len) +static void print_sequence(__u32 start, int len) { - if (start == INT_MAX) - return; + if (start == INT_MAX) + return; - if (len == 1) - printk (" %d", start); - else - printk (" %d(%d)", start, len); + if (len == 1) + printk(" %d", start); + else + printk(" %d(%d)", start, len); } - -static void indirect_print_item (struct item_head * ih, char * item) +static void indirect_print_item(struct item_head *ih, char *item) { - int j; - __le32 * unp; - __u32 prev = INT_MAX; - int num; + int j; + __le32 *unp; + __u32 prev = INT_MAX; + int num; - unp = (__le32 *)item; + unp = (__le32 *) item; - if (ih_item_len(ih) % UNFM_P_SIZE) - reiserfs_warning (NULL, "indirect_print_item: invalid item len"); + if (ih_item_len(ih) % UNFM_P_SIZE) + reiserfs_warning(NULL, "indirect_print_item: invalid item len"); - printk ("%d pointers\n[ ", (int)I_UNFM_NUM (ih)); - for (j = 0; j < I_UNFM_NUM (ih); j ++) { - if (sequence_finished (prev, &num, get_block_num(unp, j))) { - print_sequence (prev, num); - start_new_sequence (&prev, &num, get_block_num(unp, j)); + printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); + for (j = 0; j < I_UNFM_NUM(ih); j++) { + if (sequence_finished(prev, &num, get_block_num(unp, j))) { + print_sequence(prev, num); + start_new_sequence(&prev, &num, get_block_num(unp, j)); + } } - } - print_sequence (prev, num); - printk ("]\n"); + 
print_sequence(prev, num); + printk("]\n"); } -static void indirect_check_item (struct item_head * ih, char * item) +static void indirect_check_item(struct item_head *ih, char *item) { - // FIXME: type something here! + // FIXME: type something here! } - -static int indirect_create_vi (struct virtual_node * vn, - struct virtual_item * vi, - int is_affected, - int insert_size) +static int indirect_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) { - vi->vi_index = TYPE_INDIRECT; - //vi->vi_type |= VI_TYPE_INDIRECT; - return 0; + vi->vi_index = TYPE_INDIRECT; + //vi->vi_type |= VI_TYPE_INDIRECT; + return 0; } -static int indirect_check_left (struct virtual_item * vi, int free, - int start_skip, int end_skip) +static int indirect_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) { - int bytes; + int bytes; - bytes = free - free % UNFM_P_SIZE; - return bytes ?: -1; + bytes = free - free % UNFM_P_SIZE; + return bytes ? : -1; } - -static int indirect_check_right (struct virtual_item * vi, int free) +static int indirect_check_right(struct virtual_item *vi, int free) { - return indirect_check_left (vi, free, 0, 0); + return indirect_check_left(vi, free, 0, 0); } - - // return size in bytes of 'units' units. 
If first == 0 - calculate from the head (left), otherwise - from tail (right) -static int indirect_part_size (struct virtual_item * vi, int first, int units) +static int indirect_part_size(struct virtual_item *vi, int first, int units) { - // unit of indirect item is byte (yet) - return units; + // unit of indirect item is byte (yet) + return units; } -static int indirect_unit_num (struct virtual_item * vi) +static int indirect_unit_num(struct virtual_item *vi) { - // unit of indirect item is byte (yet) - return vi->vi_item_len - IH_SIZE; + // unit of indirect item is byte (yet) + return vi->vi_item_len - IH_SIZE; } -static void indirect_print_vi (struct virtual_item * vi) +static void indirect_print_vi(struct virtual_item *vi) { - reiserfs_warning (NULL, "INDIRECT, index %d, type 0x%x, %h", - vi->vi_index, vi->vi_type, vi->vi_ih); + reiserfs_warning(NULL, "INDIRECT, index %d, type 0x%x, %h", + vi->vi_index, vi->vi_type, vi->vi_ih); } static struct item_operations indirect_ops = { - .bytes_number = indirect_bytes_number, - .decrement_key = indirect_decrement_key, - .is_left_mergeable = indirect_is_left_mergeable, - .print_item = indirect_print_item, - .check_item = indirect_check_item, - - .create_vi = indirect_create_vi, - .check_left = indirect_check_left, - .check_right = indirect_check_right, - .part_size = indirect_part_size, - .unit_num = indirect_unit_num, - .print_vi = indirect_print_vi + .bytes_number = indirect_bytes_number, + .decrement_key = indirect_decrement_key, + .is_left_mergeable = indirect_is_left_mergeable, + .print_item = indirect_print_item, + .check_item = indirect_check_item, + + .create_vi = indirect_create_vi, + .check_left = indirect_check_left, + .check_right = indirect_check_right, + .part_size = indirect_part_size, + .unit_num = indirect_unit_num, + .print_vi = indirect_print_vi }; - ////////////////////////////////////////////////////////////////////////////// // direntry functions // - -static int direntry_bytes_number (struct 
item_head * ih, int block_size) +static int direntry_bytes_number(struct item_head *ih, int block_size) { - reiserfs_warning (NULL, "vs-16090: direntry_bytes_number: " - "bytes number is asked for direntry"); - return 0; -} - -static void direntry_decrement_key (struct cpu_key * key) -{ - cpu_key_k_offset_dec (key); - if (cpu_key_k_offset (key) == 0) - set_cpu_key_k_type (key, TYPE_STAT_DATA); + reiserfs_warning(NULL, "vs-16090: direntry_bytes_number: " + "bytes number is asked for direntry"); + return 0; } - -static int direntry_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) +static void direntry_decrement_key(struct cpu_key *key) { - if (le32_to_cpu (key->u.k_offset_v1.k_offset) == DOT_OFFSET) - return 0; - return 1; - + cpu_key_k_offset_dec(key); + if (cpu_key_k_offset(key) == 0) + set_cpu_key_k_type(key, TYPE_STAT_DATA); } - -static void direntry_print_item (struct item_head * ih, char * item) +static int direntry_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) { - int i; - int namelen; - struct reiserfs_de_head * deh; - char * name; - static char namebuf [80]; - - - printk ("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status"); + if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET) + return 0; + return 1; - deh = (struct reiserfs_de_head *)item; +} - for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { - namelen = (i ? 
(deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh); - name = item + deh_location(deh); - if (name[namelen-1] == 0) - namelen = strlen (name); - namebuf[0] = '"'; - if (namelen > sizeof (namebuf) - 3) { - strncpy (namebuf + 1, name, sizeof (namebuf) - 3); - namebuf[sizeof (namebuf) - 2] = '"'; - namebuf[sizeof (namebuf) - 1] = 0; - } else { - memcpy (namebuf + 1, name, namelen); - namebuf[namelen + 1] = '"'; - namebuf[namelen + 2] = 0; +static void direntry_print_item(struct item_head *ih, char *item) +{ + int i; + int namelen; + struct reiserfs_de_head *deh; + char *name; + static char namebuf[80]; + + printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", + "Key of pointed object", "Hash", "Gen number", "Status"); + + deh = (struct reiserfs_de_head *)item; + + for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + namelen = + (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - + deh_location(deh); + name = item + deh_location(deh); + if (name[namelen - 1] == 0) + namelen = strlen(name); + namebuf[0] = '"'; + if (namelen > sizeof(namebuf) - 3) { + strncpy(namebuf + 1, name, sizeof(namebuf) - 3); + namebuf[sizeof(namebuf) - 2] = '"'; + namebuf[sizeof(namebuf) - 1] = 0; + } else { + memcpy(namebuf + 1, name, namelen); + namebuf[namelen + 1] = '"'; + namebuf[namelen + 2] = 0; + } + + printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + i, namebuf, + deh_dir_id(deh), deh_objectid(deh), + GET_HASH_VALUE(deh_offset(deh)), + GET_GENERATION_NUMBER((deh_offset(deh))), + (de_hidden(deh)) ? "HIDDEN" : "VISIBLE"); } - - printk ("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", - i, namebuf, - deh_dir_id(deh), deh_objectid(deh), - GET_HASH_VALUE (deh_offset (deh)), GET_GENERATION_NUMBER ((deh_offset (deh))), - (de_hidden (deh)) ? 
"HIDDEN" : "VISIBLE"); - } } - -static void direntry_check_item (struct item_head * ih, char * item) +static void direntry_check_item(struct item_head *ih, char *item) { - int i; - struct reiserfs_de_head * deh; + int i; + struct reiserfs_de_head *deh; - // FIXME: type something here! - deh = (struct reiserfs_de_head *)item; - for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { - ; - } + // FIXME: type something here! + deh = (struct reiserfs_de_head *)item; + for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) { + ; + } } - - #define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 /* * function returns old entry number in directory item in real node * using new entry number in virtual item in virtual node */ -static inline int old_entry_num (int is_affected, int virtual_entry_num, int pos_in_item, int mode) +static inline int old_entry_num(int is_affected, int virtual_entry_num, + int pos_in_item, int mode) { - if ( mode == M_INSERT || mode == M_DELETE) - return virtual_entry_num; - - if (!is_affected) - /* cut or paste is applied to another item */ - return virtual_entry_num; - - if (virtual_entry_num < pos_in_item) - return virtual_entry_num; + if (mode == M_INSERT || mode == M_DELETE) + return virtual_entry_num; - if (mode == M_CUT) - return virtual_entry_num + 1; + if (!is_affected) + /* cut or paste is applied to another item */ + return virtual_entry_num; - RFALSE( mode != M_PASTE || virtual_entry_num == 0, - "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", mode); - - return virtual_entry_num - 1; -} + if (virtual_entry_num < pos_in_item) + return virtual_entry_num; + if (mode == M_CUT) + return virtual_entry_num + 1; + RFALSE(mode != M_PASTE || virtual_entry_num == 0, + "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", + mode); + return virtual_entry_num - 1; +} /* Create an array of sizes of directory entries for virtual item. Return space used by an item. 
FIXME: no control over consuming of space used by this item handler */ -static int direntry_create_vi (struct virtual_node * vn, - struct virtual_item * vi, - int is_affected, - int insert_size) -{ - struct direntry_uarea * dir_u = vi->vi_uarea; - int i, j; - int size = sizeof (struct direntry_uarea); - struct reiserfs_de_head * deh; - - vi->vi_index = TYPE_DIRENTRY; - - if (!(vi->vi_ih) || !vi->vi_item) - BUG (); - - - dir_u->flags = 0; - if (le_ih_k_offset (vi->vi_ih) == DOT_OFFSET) - dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; - - deh = (struct reiserfs_de_head *)(vi->vi_item); - - - /* virtual directory item have this amount of entry after */ - dir_u->entry_count = ih_entry_count (vi->vi_ih) + - ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : - (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); - - for (i = 0; i < dir_u->entry_count; i ++) { - j = old_entry_num (is_affected, i, vn->vn_pos_in_item, vn->vn_mode); - dir_u->entry_sizes[i] = (j ? deh_location( &(deh[j - 1]) ) : - ih_item_len (vi->vi_ih)) - - deh_location( &(deh[j])) + DEH_SIZE; - } - - size += (dir_u->entry_count * sizeof (short)); - - /* set size of pasted entry */ - if (is_affected && vn->vn_mode == M_PASTE) - dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; +static int direntry_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + int i, j; + int size = sizeof(struct direntry_uarea); + struct reiserfs_de_head *deh; + vi->vi_index = TYPE_DIRENTRY; + + if (!(vi->vi_ih) || !vi->vi_item) + BUG(); + + dir_u->flags = 0; + if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; + + deh = (struct reiserfs_de_head *)(vi->vi_item); + + /* virtual directory item have this amount of entry after */ + dir_u->entry_count = ih_entry_count(vi->vi_ih) + + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : + (vn->vn_mode == M_PASTE ? 
1 : 0)) : 0); + + for (i = 0; i < dir_u->entry_count; i++) { + j = old_entry_num(is_affected, i, vn->vn_pos_in_item, + vn->vn_mode); + dir_u->entry_sizes[i] = + (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) - + deh_location(&(deh[j])) + DEH_SIZE; + } + + size += (dir_u->entry_count * sizeof(short)); + + /* set size of pasted entry */ + if (is_affected && vn->vn_mode == M_PASTE) + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; #ifdef CONFIG_REISERFS_CHECK - /* compare total size of entries with item length */ - { - int k, l; - - l = 0; - for (k = 0; k < dir_u->entry_count; k ++) - l += dir_u->entry_sizes[k]; - - if (l + IH_SIZE != vi->vi_item_len + - ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? insert_size : 0) ) { - reiserfs_panic (NULL, "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item", - vn->vn_mode, insert_size); + /* compare total size of entries with item length */ + { + int k, l; + + l = 0; + for (k = 0; k < dir_u->entry_count; k++) + l += dir_u->entry_sizes[k]; + + if (l + IH_SIZE != vi->vi_item_len + + ((is_affected + && (vn->vn_mode == M_PASTE + || vn->vn_mode == M_CUT)) ? 
insert_size : 0)) { + reiserfs_panic(NULL, + "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item", + vn->vn_mode, insert_size); + } } - } #endif - return size; - + return size; } - // // return number of entries which may fit into specified amount of // free space, or -1 if free space is not enough even for 1 entry // -static int direntry_check_left (struct virtual_item * vi, int free, - int start_skip, int end_skip) +static int direntry_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) { - int i; - int entries = 0; - struct direntry_uarea * dir_u = vi->vi_uarea; + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; - for (i = start_skip; i < dir_u->entry_count - end_skip; i ++) { - if (dir_u->entry_sizes[i] > free) - /* i-th entry doesn't fit into the remaining free space */ - break; - - free -= dir_u->entry_sizes[i]; - entries ++; - } + for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; - if (entries == dir_u->entry_count) { - reiserfs_panic (NULL, "free space %d, entry_count %d\n", free, dir_u->entry_count); - } + free -= dir_u->entry_sizes[i]; + entries++; + } - /* "." and ".." can not be separated from each other */ - if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries < 2) - entries = 0; - - return entries ?: -1; -} + if (entries == dir_u->entry_count) { + reiserfs_panic(NULL, "free space %d, entry_count %d\n", free, + dir_u->entry_count); + } + /* "." and ".." can not be separated from each other */ + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries < 2) + entries = 0; -static int direntry_check_right (struct virtual_item * vi, int free) + return entries ? 
: -1; +} + +static int direntry_check_right(struct virtual_item *vi, int free) { - int i; - int entries = 0; - struct direntry_uarea * dir_u = vi->vi_uarea; - - for (i = dir_u->entry_count - 1; i >= 0; i --) { - if (dir_u->entry_sizes[i] > free) - /* i-th entry doesn't fit into the remaining free space */ - break; - - free -= dir_u->entry_sizes[i]; - entries ++; - } - if (entries == dir_u->entry_count) - BUG (); + int i; + int entries = 0; + struct direntry_uarea *dir_u = vi->vi_uarea; - /* "." and ".." can not be separated from each other */ - if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries > dir_u->entry_count - 2) - entries = dir_u->entry_count - 2; + for (i = dir_u->entry_count - 1; i >= 0; i--) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; - return entries ?: -1; -} + free -= dir_u->entry_sizes[i]; + entries++; + } + if (entries == dir_u->entry_count) + BUG(); + /* "." and ".." can not be separated from each other */ + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) + && entries > dir_u->entry_count - 2) + entries = dir_u->entry_count - 2; + + return entries ? 
: -1; +} /* sum of entry sizes between from-th and to-th entries including both edges */ -static int direntry_part_size (struct virtual_item * vi, int first, int count) +static int direntry_part_size(struct virtual_item *vi, int first, int count) { - int i, retval; - int from, to; - struct direntry_uarea * dir_u = vi->vi_uarea; - - retval = 0; - if (first == 0) - from = 0; - else - from = dir_u->entry_count - count; - to = from + count - 1; + int i, retval; + int from, to; + struct direntry_uarea *dir_u = vi->vi_uarea; - for (i = from; i <= to; i ++) - retval += dir_u->entry_sizes[i]; + retval = 0; + if (first == 0) + from = 0; + else + from = dir_u->entry_count - count; + to = from + count - 1; - return retval; -} + for (i = from; i <= to; i++) + retval += dir_u->entry_sizes[i]; -static int direntry_unit_num (struct virtual_item * vi) -{ - struct direntry_uarea * dir_u = vi->vi_uarea; - - return dir_u->entry_count; + return retval; } +static int direntry_unit_num(struct virtual_item *vi) +{ + struct direntry_uarea *dir_u = vi->vi_uarea; + return dir_u->entry_count; +} -static void direntry_print_vi (struct virtual_item * vi) +static void direntry_print_vi(struct virtual_item *vi) { - int i; - struct direntry_uarea * dir_u = vi->vi_uarea; + int i; + struct direntry_uarea *dir_u = vi->vi_uarea; - reiserfs_warning (NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", - vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); - printk ("%d entries: ", dir_u->entry_count); - for (i = 0; i < dir_u->entry_count; i ++) - printk ("%d ", dir_u->entry_sizes[i]); - printk ("\n"); + reiserfs_warning(NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); + printk("%d entries: ", dir_u->entry_count); + for (i = 0; i < dir_u->entry_count; i++) + printk("%d ", dir_u->entry_sizes[i]); + printk("\n"); } static struct item_operations direntry_ops = { - .bytes_number = direntry_bytes_number, - .decrement_key = 
direntry_decrement_key, - .is_left_mergeable = direntry_is_left_mergeable, - .print_item = direntry_print_item, - .check_item = direntry_check_item, - - .create_vi = direntry_create_vi, - .check_left = direntry_check_left, - .check_right = direntry_check_right, - .part_size = direntry_part_size, - .unit_num = direntry_unit_num, - .print_vi = direntry_print_vi + .bytes_number = direntry_bytes_number, + .decrement_key = direntry_decrement_key, + .is_left_mergeable = direntry_is_left_mergeable, + .print_item = direntry_print_item, + .check_item = direntry_check_item, + + .create_vi = direntry_create_vi, + .check_left = direntry_check_left, + .check_right = direntry_check_right, + .part_size = direntry_part_size, + .unit_num = direntry_unit_num, + .print_vi = direntry_print_vi }; - ////////////////////////////////////////////////////////////////////////////// // Error catching functions to catch errors caused by incorrect item types. // -static int errcatch_bytes_number (struct item_head * ih, int block_size) +static int errcatch_bytes_number(struct item_head *ih, int block_size) { - reiserfs_warning (NULL, "green-16001: Invalid item type observed, run fsck ASAP"); - return 0; + reiserfs_warning(NULL, + "green-16001: Invalid item type observed, run fsck ASAP"); + return 0; } -static void errcatch_decrement_key (struct cpu_key * key) +static void errcatch_decrement_key(struct cpu_key *key) { - reiserfs_warning (NULL, "green-16002: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, + "green-16002: Invalid item type observed, run fsck ASAP"); } - -static int errcatch_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize) +static int errcatch_is_left_mergeable(struct reiserfs_key *key, + unsigned long bsize) { - reiserfs_warning (NULL, "green-16003: Invalid item type observed, run fsck ASAP"); - return 0; + reiserfs_warning(NULL, + "green-16003: Invalid item type observed, run fsck ASAP"); + return 0; } - -static void errcatch_print_item 
(struct item_head * ih, char * item) +static void errcatch_print_item(struct item_head *ih, char *item) { - reiserfs_warning (NULL, "green-16004: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, + "green-16004: Invalid item type observed, run fsck ASAP"); } - -static void errcatch_check_item (struct item_head * ih, char * item) +static void errcatch_check_item(struct item_head *ih, char *item) { - reiserfs_warning (NULL, "green-16005: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, + "green-16005: Invalid item type observed, run fsck ASAP"); } -static int errcatch_create_vi (struct virtual_node * vn, - struct virtual_item * vi, - int is_affected, - int insert_size) +static int errcatch_create_vi(struct virtual_node *vn, + struct virtual_item *vi, + int is_affected, int insert_size) { - reiserfs_warning (NULL, "green-16006: Invalid item type observed, run fsck ASAP"); - return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where - // this operation is called from is of return type void. + reiserfs_warning(NULL, + "green-16006: Invalid item type observed, run fsck ASAP"); + return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where + // this operation is called from is of return type void. 
} -static int errcatch_check_left (struct virtual_item * vi, int free, - int start_skip, int end_skip) +static int errcatch_check_left(struct virtual_item *vi, int free, + int start_skip, int end_skip) { - reiserfs_warning (NULL, "green-16007: Invalid item type observed, run fsck ASAP"); - return -1; + reiserfs_warning(NULL, + "green-16007: Invalid item type observed, run fsck ASAP"); + return -1; } - -static int errcatch_check_right (struct virtual_item * vi, int free) +static int errcatch_check_right(struct virtual_item *vi, int free) { - reiserfs_warning (NULL, "green-16008: Invalid item type observed, run fsck ASAP"); - return -1; + reiserfs_warning(NULL, + "green-16008: Invalid item type observed, run fsck ASAP"); + return -1; } -static int errcatch_part_size (struct virtual_item * vi, int first, int count) +static int errcatch_part_size(struct virtual_item *vi, int first, int count) { - reiserfs_warning (NULL, "green-16009: Invalid item type observed, run fsck ASAP"); - return 0; + reiserfs_warning(NULL, + "green-16009: Invalid item type observed, run fsck ASAP"); + return 0; } -static int errcatch_unit_num (struct virtual_item * vi) +static int errcatch_unit_num(struct virtual_item *vi) { - reiserfs_warning (NULL, "green-16010: Invalid item type observed, run fsck ASAP"); - return 0; + reiserfs_warning(NULL, + "green-16010: Invalid item type observed, run fsck ASAP"); + return 0; } -static void errcatch_print_vi (struct virtual_item * vi) +static void errcatch_print_vi(struct virtual_item *vi) { - reiserfs_warning (NULL, "green-16011: Invalid item type observed, run fsck ASAP"); + reiserfs_warning(NULL, + "green-16011: Invalid item type observed, run fsck ASAP"); } static struct item_operations errcatch_ops = { - errcatch_bytes_number, - errcatch_decrement_key, - errcatch_is_left_mergeable, - errcatch_print_item, - errcatch_check_item, - - errcatch_create_vi, - errcatch_check_left, - errcatch_check_right, - errcatch_part_size, - errcatch_unit_num, - 
errcatch_print_vi + errcatch_bytes_number, + errcatch_decrement_key, + errcatch_is_left_mergeable, + errcatch_print_item, + errcatch_check_item, + + errcatch_create_vi, + errcatch_check_left, + errcatch_check_right, + errcatch_part_size, + errcatch_unit_num, + errcatch_print_vi }; - - ////////////////////////////////////////////////////////////////////////////// // // #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) - do not compile +#error Item types must use disk-format assigned values. #endif -struct item_operations * item_ops [TYPE_ANY + 1] = { - &stat_data_ops, - &indirect_ops, - &direct_ops, - &direntry_ops, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ +struct item_operations *item_ops[TYPE_ANY + 1] = { + &stat_data_ops, + &indirect_ops, + &direct_ops, + &direntry_ops, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ }; - - - - diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 3072cfdee95..ca7989b04be 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -55,7 +55,6 @@ #include <linux/writeback.h> #include <linux/blkdev.h> - /* gets a struct reiserfs_journal_list * from a list head */ #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ j_list)) @@ -69,55 +68,61 @@ static int reiserfs_mounted_fs_count; static struct workqueue_struct *commit_wq; -#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit - structs at 4k */ -#define BUFNR 64 /*read ahead */ +#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit + structs at 4k */ +#define BUFNR 64 /*read ahead */ /* cnode stat bits. Move these into reiserfs_fs.h */ #define BLOCK_FREED 2 /* this block was freed, and can't be written. 
*/ -#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ +#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ #define BLOCK_DIRTIED 5 - /* journal list state bits */ #define LIST_TOUCHED 1 #define LIST_DIRTY 2 -#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ +#define LIST_COMMIT_PENDING 4 /* someone will commit this list */ /* flags for do_journal_end */ #define FLUSH_ALL 1 /* flush commit and real blocks */ #define COMMIT_NOW 2 /* end and commit this transaction */ -#define WAIT 4 /* wait for the log blocks to hit the disk*/ - -static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ; -static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; -static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; -static int can_dirty(struct reiserfs_journal_cnode *cn) ; -static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks); -static int release_journal_dev( struct super_block *super, - struct reiserfs_journal *journal ); +#define WAIT 4 /* wait for the log blocks to hit the disk */ + +static int do_journal_end(struct reiserfs_transaction_handle *, + struct super_block *, unsigned long nblocks, + int flags); +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int flush_commit_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall); +static int can_dirty(struct reiserfs_journal_cnode *cn); +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks); +static int release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal); static int 
dirty_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl); + struct reiserfs_journal_list *jl); static void flush_async_commits(void *p); static void queue_log_writer(struct super_block *s); /* values for join in do_journal_begin_r */ enum { - JBEGIN_REG = 0, /* regular journal begin */ - JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ - JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ + JBEGIN_REG = 0, /* regular journal begin */ + JBEGIN_JOIN = 1, /* join the running transaction if at all possible */ + JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */ }; static int do_journal_begin_r(struct reiserfs_transaction_handle *th, - struct super_block * p_s_sb, - unsigned long nblocks,int join); + struct super_block *p_s_sb, + unsigned long nblocks, int join); -static void init_journal_hash(struct super_block *p_s_sb) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - memset(journal->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; +static void init_journal_hash(struct super_block *p_s_sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + memset(journal->j_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); } /* @@ -125,149 +130,159 @@ static void init_journal_hash(struct super_block *p_s_sb) { ** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for ** more details. 
*/ -static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { - if (bh) { - clear_buffer_dirty(bh); - clear_buffer_journal_test(bh); - } - return 0 ; +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) +{ + if (bh) { + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + } + return 0; } static void disable_barrier(struct super_block *s) { - REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); - printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s)); -} - -static struct reiserfs_bitmap_node * -allocate_bitmap_node(struct super_block *p_s_sb) { - struct reiserfs_bitmap_node *bn ; - static int id; - - bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, p_s_sb) ; - if (!bn) { - return NULL ; - } - bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb) ; - if (!bn->data) { - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; - return NULL ; - } - bn->id = id++ ; - memset(bn->data, 0, p_s_sb->s_blocksize) ; - INIT_LIST_HEAD(&bn->list) ; - return bn ; -} - -static struct reiserfs_bitmap_node * -get_bitmap_node(struct super_block *p_s_sb) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_bitmap_node *bn = NULL; - struct list_head *entry = journal->j_bitmap_nodes.next ; - - journal->j_used_bitmap_nodes++ ; -repeat: - - if(entry != &journal->j_bitmap_nodes) { - bn = list_entry(entry, struct reiserfs_bitmap_node, list) ; - list_del(entry) ; - memset(bn->data, 0, p_s_sb->s_blocksize) ; - journal->j_free_bitmap_nodes-- ; - return bn ; - } - bn = allocate_bitmap_node(p_s_sb) ; - if (!bn) { - yield(); - goto repeat ; - } - return bn ; + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); + printk("reiserfs: disabling flush barriers on %s\n", + reiserfs_bdevname(s)); +} + +static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block + *p_s_sb) +{ + struct reiserfs_bitmap_node *bn; + static int id; + + bn = 
reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, + p_s_sb); + if (!bn) { + return NULL; + } + bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb); + if (!bn->data) { + reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + return NULL; + } + bn->id = id++; + memset(bn->data, 0, p_s_sb->s_blocksize); + INIT_LIST_HEAD(&bn->list); + return bn; +} + +static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_bitmap_node *bn = NULL; + struct list_head *entry = journal->j_bitmap_nodes.next; + + journal->j_used_bitmap_nodes++; + repeat: + + if (entry != &journal->j_bitmap_nodes) { + bn = list_entry(entry, struct reiserfs_bitmap_node, list); + list_del(entry); + memset(bn->data, 0, p_s_sb->s_blocksize); + journal->j_free_bitmap_nodes--; + return bn; + } + bn = allocate_bitmap_node(p_s_sb); + if (!bn) { + yield(); + goto repeat; + } + return bn; } static inline void free_bitmap_node(struct super_block *p_s_sb, - struct reiserfs_bitmap_node *bn) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - journal->j_used_bitmap_nodes-- ; - if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ; - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; - } else { - list_add(&bn->list, &journal->j_bitmap_nodes) ; - journal->j_free_bitmap_nodes++ ; - } -} - -static void allocate_bitmap_nodes(struct super_block *p_s_sb) { - int i ; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_bitmap_node *bn = NULL ; - for (i = 0 ; i < REISERFS_MIN_BITMAP_NODES ; i++) { - bn = allocate_bitmap_node(p_s_sb) ; - if (bn) { - list_add(&bn->list, &journal->j_bitmap_nodes) ; - journal->j_free_bitmap_nodes++ ; - } else { - break ; // this is ok, we'll try again when more are needed - } - } + struct reiserfs_bitmap_node *bn) +{ + struct reiserfs_journal 
*journal = SB_JOURNAL(p_s_sb); + journal->j_used_bitmap_nodes--; + if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { + reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); + reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + } else { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } +} + +static void allocate_bitmap_nodes(struct super_block *p_s_sb) +{ + int i; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_bitmap_node *bn = NULL; + for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) { + bn = allocate_bitmap_node(p_s_sb); + if (bn) { + list_add(&bn->list, &journal->j_bitmap_nodes); + journal->j_free_bitmap_nodes++; + } else { + break; // this is ok, we'll try again when more are needed + } + } } static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block, - struct reiserfs_list_bitmap *jb) { - int bmap_nr = block / (p_s_sb->s_blocksize << 3) ; - int bit_nr = block % (p_s_sb->s_blocksize << 3) ; + struct reiserfs_list_bitmap *jb) +{ + int bmap_nr = block / (p_s_sb->s_blocksize << 3); + int bit_nr = block % (p_s_sb->s_blocksize << 3); - if (!jb->bitmaps[bmap_nr]) { - jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb) ; - } - set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data) ; - return 0 ; + if (!jb->bitmaps[bmap_nr]) { + jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb); + } + set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data); + return 0; } static void cleanup_bitmap_list(struct super_block *p_s_sb, - struct reiserfs_list_bitmap *jb) { - int i; - if (jb->bitmaps == NULL) - return; - - for (i = 0 ; i < SB_BMAP_NR(p_s_sb) ; i++) { - if (jb->bitmaps[i]) { - free_bitmap_node(p_s_sb, jb->bitmaps[i]) ; - jb->bitmaps[i] = NULL ; - } - } + struct reiserfs_list_bitmap *jb) +{ + int i; + if (jb->bitmaps == NULL) + return; + + for (i = 0; i < SB_BMAP_NR(p_s_sb); i++) { + if (jb->bitmaps[i]) { + free_bitmap_node(p_s_sb, jb->bitmaps[i]); + jb->bitmaps[i] = NULL; 
+ } + } } /* ** only call this on FS unmount. */ static int free_list_bitmaps(struct super_block *p_s_sb, - struct reiserfs_list_bitmap *jb_array) { - int i ; - struct reiserfs_list_bitmap *jb ; - for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { - jb = jb_array + i ; - jb->journal_list = NULL ; - cleanup_bitmap_list(p_s_sb, jb) ; - vfree(jb->bitmaps) ; - jb->bitmaps = NULL ; - } - return 0; -} - -static int free_bitmap_nodes(struct super_block *p_s_sb) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct list_head *next = journal->j_bitmap_nodes.next ; - struct reiserfs_bitmap_node *bn ; - - while(next != &journal->j_bitmap_nodes) { - bn = list_entry(next, struct reiserfs_bitmap_node, list) ; - list_del(next) ; - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ; - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ; - next = journal->j_bitmap_nodes.next ; - journal->j_free_bitmap_nodes-- ; - } - - return 0 ; + struct reiserfs_list_bitmap *jb_array) +{ + int i; + struct reiserfs_list_bitmap *jb; + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + cleanup_bitmap_list(p_s_sb, jb); + vfree(jb->bitmaps); + jb->bitmaps = NULL; + } + return 0; +} + +static int free_bitmap_nodes(struct super_block *p_s_sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct list_head *next = journal->j_bitmap_nodes.next; + struct reiserfs_bitmap_node *bn; + + while (next != &journal->j_bitmap_nodes) { + bn = list_entry(next, struct reiserfs_bitmap_node, list); + list_del(next); + reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); + reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + next = journal->j_bitmap_nodes.next; + journal->j_free_bitmap_nodes--; + } + + return 0; } /* @@ -275,59 +290,65 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) { ** jb_array is the array to be filled in. 
*/ int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, - struct reiserfs_list_bitmap *jb_array, - int bmap_nr) { - int i ; - int failed = 0 ; - struct reiserfs_list_bitmap *jb ; - int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *) ; - - for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { - jb = jb_array + i ; - jb->journal_list = NULL ; - jb->bitmaps = vmalloc( mem ) ; - if (!jb->bitmaps) { - reiserfs_warning(p_s_sb, "clm-2000, unable to allocate bitmaps for journal lists") ; - failed = 1; - break ; - } - memset(jb->bitmaps, 0, mem) ; - } - if (failed) { - free_list_bitmaps(p_s_sb, jb_array) ; - return -1 ; - } - return 0 ; + struct reiserfs_list_bitmap *jb_array, + int bmap_nr) +{ + int i; + int failed = 0; + struct reiserfs_list_bitmap *jb; + int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *); + + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + jb = jb_array + i; + jb->journal_list = NULL; + jb->bitmaps = vmalloc(mem); + if (!jb->bitmaps) { + reiserfs_warning(p_s_sb, + "clm-2000, unable to allocate bitmaps for journal lists"); + failed = 1; + break; + } + memset(jb->bitmaps, 0, mem); + } + if (failed) { + free_list_bitmaps(p_s_sb, jb_array); + return -1; + } + return 0; } /* ** find an available list bitmap. 
If you can't find one, flush a commit list ** and try again */ -static struct reiserfs_list_bitmap * -get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { - int i,j ; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_list_bitmap *jb = NULL ; - - for (j = 0 ; j < (JOURNAL_NUM_BITMAPS * 3) ; j++) { - i = journal->j_list_bitmap_index ; - journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS ; - jb = journal->j_list_bitmap + i ; - if (journal->j_list_bitmap[i].journal_list) { - flush_commit_list(p_s_sb, journal->j_list_bitmap[i].journal_list, 1) ; - if (!journal->j_list_bitmap[i].journal_list) { - break ; - } - } else { - break ; - } - } - if (jb->journal_list) { /* double check to make sure if flushed correctly */ - return NULL ; - } - jb->journal_list = jl ; - return jb ; +static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb, + struct reiserfs_journal_list + *jl) +{ + int i, j; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_list_bitmap *jb = NULL; + + for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) { + i = journal->j_list_bitmap_index; + journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS; + jb = journal->j_list_bitmap + i; + if (journal->j_list_bitmap[i].journal_list) { + flush_commit_list(p_s_sb, + journal->j_list_bitmap[i]. 
+ journal_list, 1); + if (!journal->j_list_bitmap[i].journal_list) { + break; + } + } else { + break; + } + } + if (jb->journal_list) { /* double check to make sure if flushed correctly */ + return NULL; + } + jb->journal_list = jl; + return jb; } /* @@ -335,104 +356,114 @@ get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { ** Uses the cnode->next and cnode->prev pointers ** returns NULL on failure */ -static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { - struct reiserfs_journal_cnode *head ; - int i ; - if (num_cnodes <= 0) { - return NULL ; - } - head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; - if (!head) { - return NULL ; - } - memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; - head[0].prev = NULL ; - head[0].next = head + 1 ; - for (i = 1 ; i < num_cnodes; i++) { - head[i].prev = head + (i - 1) ; - head[i].next = head + (i + 1) ; /* if last one, overwrite it after the if */ - } - head[num_cnodes -1].next = NULL ; - return head ; +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) +{ + struct reiserfs_journal_cnode *head; + int i; + if (num_cnodes <= 0) { + return NULL; + } + head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)); + if (!head) { + return NULL; + } + memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)); + head[0].prev = NULL; + head[0].next = head + 1; + for (i = 1; i < num_cnodes; i++) { + head[i].prev = head + (i - 1); + head[i].next = head + (i + 1); /* if last one, overwrite it after the if */ + } + head[num_cnodes - 1].next = NULL; + return head; } /* ** pulls a cnode off the free list, or returns NULL on failure */ -static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) { - struct reiserfs_journal_cnode *cn ; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - - reiserfs_check_lock_depth(p_s_sb, "get_cnode") ; - - if (journal->j_cnode_free <= 0) { - return NULL ; - } - 
journal->j_cnode_used++ ; - journal->j_cnode_free-- ; - cn = journal->j_cnode_free_list ; - if (!cn) { - return cn ; - } - if (cn->next) { - cn->next->prev = NULL ; - } - journal->j_cnode_free_list = cn->next ; - memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; - return cn ; +static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + + reiserfs_check_lock_depth(p_s_sb, "get_cnode"); + + if (journal->j_cnode_free <= 0) { + return NULL; + } + journal->j_cnode_used++; + journal->j_cnode_free--; + cn = journal->j_cnode_free_list; + if (!cn) { + return cn; + } + if (cn->next) { + cn->next->prev = NULL; + } + journal->j_cnode_free_list = cn->next; + memset(cn, 0, sizeof(struct reiserfs_journal_cnode)); + return cn; } /* ** returns a cnode to the free list */ -static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); +static void free_cnode(struct super_block *p_s_sb, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - reiserfs_check_lock_depth(p_s_sb, "free_cnode") ; + reiserfs_check_lock_depth(p_s_sb, "free_cnode"); - journal->j_cnode_used-- ; - journal->j_cnode_free++ ; - /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ - cn->next = journal->j_cnode_free_list ; - if (journal->j_cnode_free_list) { - journal->j_cnode_free_list->prev = cn ; - } - cn->prev = NULL ; /* not needed with the memset, but I might kill the memset, and forget to do this */ - journal->j_cnode_free_list = cn ; + journal->j_cnode_used--; + journal->j_cnode_free++; + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ + cn->next = journal->j_cnode_free_list; + if (journal->j_cnode_free_list) { + journal->j_cnode_free_list->prev = cn; + } + cn->prev = NULL; /* not needed with the memset, but I might kill the memset, and forget to 
do this */ + journal->j_cnode_free_list = cn; } -static void clear_prepared_bits(struct buffer_head *bh) { - clear_buffer_journal_prepared (bh); - clear_buffer_journal_restore_dirty (bh); +static void clear_prepared_bits(struct buffer_head *bh) +{ + clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); } /* utility function to force a BUG if it is called without the big ** kernel lock held. caller is the string printed just before calling BUG() */ -void reiserfs_check_lock_depth(struct super_block *sb, char *caller) { +void reiserfs_check_lock_depth(struct super_block *sb, char *caller) +{ #ifdef CONFIG_SMP - if (current->lock_depth < 0) { - reiserfs_panic (sb, "%s called without kernel lock held", caller) ; - } + if (current->lock_depth < 0) { + reiserfs_panic(sb, "%s called without kernel lock held", + caller); + } #else - ; + ; #endif } /* return a cnode with same dev, block number and size in table, or null if not found */ -static inline struct reiserfs_journal_cnode * -get_journal_hash_dev(struct super_block *sb, - struct reiserfs_journal_cnode **table, - long bl) +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct + super_block + *sb, + struct + reiserfs_journal_cnode + **table, + long bl) { - struct reiserfs_journal_cnode *cn ; - cn = journal_hash(table, sb, bl) ; - while(cn) { - if (cn->blocknr == bl && cn->sb == sb) - return cn ; - cn = cn->hnext ; - } - return (struct reiserfs_journal_cnode *)0 ; + struct reiserfs_journal_cnode *cn; + cn = journal_hash(table, sb, bl); + while (cn) { + if (cn->blocknr == bl && cn->sb == sb) + return cn; + cn = cn->hnext; + } + return (struct reiserfs_journal_cnode *)0; } /* @@ -454,91 +485,103 @@ get_journal_hash_dev(struct super_block *sb, ** */ int reiserfs_in_journal(struct super_block *p_s_sb, - int bmap_nr, int bit_nr, int search_all, - b_blocknr_t *next_zero_bit) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_cnode *cn ; - struct 
reiserfs_list_bitmap *jb ; - int i ; - unsigned long bl; - - *next_zero_bit = 0 ; /* always start this at zero. */ - - PROC_INFO_INC( p_s_sb, journal.in_journal ); - /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. - ** if we crash before the transaction that freed it commits, this transaction won't - ** have committed either, and the block will never be written - */ - if (search_all) { - for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { - PROC_INFO_INC( p_s_sb, journal.in_journal_bitmap ); - jb = journal->j_list_bitmap + i ; - if (jb->journal_list && jb->bitmaps[bmap_nr] && - test_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data)) { - *next_zero_bit = find_next_zero_bit((unsigned long *) - (jb->bitmaps[bmap_nr]->data), - p_s_sb->s_blocksize << 3, bit_nr+1) ; - return 1 ; - } - } - } - - bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr; - /* is it in any old transactions? */ - if (search_all && (cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) { - return 1; - } - - /* is it in the current transaction. This should never happen */ - if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) { - BUG(); - return 1; - } - - PROC_INFO_INC( p_s_sb, journal.in_journal_reusable ); - /* safe for reuse */ - return 0 ; + int bmap_nr, int bit_nr, int search_all, + b_blocknr_t * next_zero_bit) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_cnode *cn; + struct reiserfs_list_bitmap *jb; + int i; + unsigned long bl; + + *next_zero_bit = 0; /* always start this at zero. */ + + PROC_INFO_INC(p_s_sb, journal.in_journal); + /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. 
+ ** if we crash before the transaction that freed it commits, this transaction won't + ** have committed either, and the block will never be written + */ + if (search_all) { + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + PROC_INFO_INC(p_s_sb, journal.in_journal_bitmap); + jb = journal->j_list_bitmap + i; + if (jb->journal_list && jb->bitmaps[bmap_nr] && + test_bit(bit_nr, + (unsigned long *)jb->bitmaps[bmap_nr]-> + data)) { + *next_zero_bit = + find_next_zero_bit((unsigned long *) + (jb->bitmaps[bmap_nr]-> + data), + p_s_sb->s_blocksize << 3, + bit_nr + 1); + return 1; + } + } + } + + bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr; + /* is it in any old transactions? */ + if (search_all + && (cn = + get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) { + return 1; + } + + /* is it in the current transaction. This should never happen */ + if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) { + BUG(); + return 1; + } + + PROC_INFO_INC(p_s_sb, journal.in_journal_reusable); + /* safe for reuse */ + return 0; } /* insert cn into table */ -static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { - struct reiserfs_journal_cnode *cn_orig ; +static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, + struct reiserfs_journal_cnode *cn) +{ + struct reiserfs_journal_cnode *cn_orig; - cn_orig = journal_hash(table, cn->sb, cn->blocknr) ; - cn->hnext = cn_orig ; - cn->hprev = NULL ; - if (cn_orig) { - cn_orig->hprev = cn ; - } - journal_hash(table, cn->sb, cn->blocknr) = cn ; + cn_orig = journal_hash(table, cn->sb, cn->blocknr); + cn->hnext = cn_orig; + cn->hprev = NULL; + if (cn_orig) { + cn_orig->hprev = cn; + } + journal_hash(table, cn->sb, cn->blocknr) = cn; } /* lock the current transaction */ -inline static void lock_journal(struct super_block *p_s_sb) { - PROC_INFO_INC( p_s_sb, journal.lock_journal ); - down(&SB_JOURNAL(p_s_sb)->j_lock); +static inline void 
lock_journal(struct super_block *p_s_sb) +{ + PROC_INFO_INC(p_s_sb, journal.lock_journal); + down(&SB_JOURNAL(p_s_sb)->j_lock); } /* unlock the current transaction */ -inline static void unlock_journal(struct super_block *p_s_sb) { - up(&SB_JOURNAL(p_s_sb)->j_lock); +static inline void unlock_journal(struct super_block *p_s_sb) +{ + up(&SB_JOURNAL(p_s_sb)->j_lock); } static inline void get_journal_list(struct reiserfs_journal_list *jl) { - jl->j_refcount++; + jl->j_refcount++; } static inline void put_journal_list(struct super_block *s, - struct reiserfs_journal_list *jl) + struct reiserfs_journal_list *jl) { - if (jl->j_refcount < 1) { - reiserfs_panic (s, "trans id %lu, refcount at %d", jl->j_trans_id, - jl->j_refcount); - } - if (--jl->j_refcount == 0) - reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); + if (jl->j_refcount < 1) { + reiserfs_panic(s, "trans id %lu, refcount at %d", + jl->j_trans_id, jl->j_refcount); + } + if (--jl->j_refcount == 0) + reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); } /* @@ -546,354 +589,375 @@ static inline void put_journal_list(struct super_block *s, ** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a ** transaction. 
*/ -static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { +static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, + struct reiserfs_journal_list *jl) +{ - struct reiserfs_list_bitmap *jb = jl->j_list_bitmap ; - if (jb) { - cleanup_bitmap_list(p_s_sb, jb) ; - } - jl->j_list_bitmap->journal_list = NULL ; - jl->j_list_bitmap = NULL ; + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap; + if (jb) { + cleanup_bitmap_list(p_s_sb, jb); + } + jl->j_list_bitmap->journal_list = NULL; + jl->j_list_bitmap = NULL; } static int journal_list_still_alive(struct super_block *s, - unsigned long trans_id) -{ - struct reiserfs_journal *journal = SB_JOURNAL (s); - struct list_head *entry = &journal->j_journal_list; - struct reiserfs_journal_list *jl; - - if (!list_empty(entry)) { - jl = JOURNAL_LIST_ENTRY(entry->next); - if (jl->j_trans_id <= trans_id) { - return 1; - } - } - return 0; -} - -static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - - if (buffer_journaled(bh)) { - reiserfs_warning(NULL, "clm-2084: pinned buffer %lu:%s sent to disk", - bh->b_blocknr, bdevname(bh->b_bdev, b)) ; - } - if (uptodate) - set_buffer_uptodate(bh) ; - else - clear_buffer_uptodate(bh) ; - unlock_buffer(bh) ; - put_bh(bh) ; -} - -static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) { - if (uptodate) - set_buffer_uptodate(bh) ; - else - clear_buffer_uptodate(bh) ; - unlock_buffer(bh) ; - put_bh(bh) ; -} - -static void submit_logged_buffer(struct buffer_head *bh) { - get_bh(bh) ; - bh->b_end_io = reiserfs_end_buffer_io_sync ; - clear_buffer_journal_new (bh); - clear_buffer_dirty(bh) ; - if (!test_clear_buffer_journal_test (bh)) - BUG(); - if (!buffer_uptodate(bh)) - BUG(); - submit_bh(WRITE, bh) ; -} - -static void submit_ordered_buffer(struct buffer_head *bh) { - get_bh(bh) ; - bh->b_end_io = reiserfs_end_ordered_io; - clear_buffer_dirty(bh) ; - if 
(!buffer_uptodate(bh)) - BUG(); - submit_bh(WRITE, bh) ; -} - -static int submit_barrier_buffer(struct buffer_head *bh) { - get_bh(bh) ; - bh->b_end_io = reiserfs_end_ordered_io; - clear_buffer_dirty(bh) ; - if (!buffer_uptodate(bh)) - BUG(); - return submit_bh(WRITE_BARRIER, bh) ; + unsigned long trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct list_head *entry = &journal->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (buffer_journaled(bh)) { + reiserfs_warning(NULL, + "clm-2084: pinned buffer %lu:%s sent to disk", + bh->b_blocknr, bdevname(bh->b_bdev, b)); + } + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); + put_bh(bh); +} + +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) +{ + if (uptodate) + set_buffer_uptodate(bh); + else + clear_buffer_uptodate(bh); + unlock_buffer(bh); + put_bh(bh); +} + +static void submit_logged_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_buffer_io_sync; + clear_buffer_journal_new(bh); + clear_buffer_dirty(bh); + if (!test_clear_buffer_journal_test(bh)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +static void submit_ordered_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh); +} + +static int submit_barrier_buffer(struct buffer_head *bh) +{ + get_bh(bh); + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh); + if (!buffer_uptodate(bh)) + BUG(); + return submit_bh(WRITE_BARRIER, bh); } static void check_barrier_completion(struct super_block *s, - struct buffer_head *bh) { - if 
(buffer_eopnotsupp(bh)) { - clear_buffer_eopnotsupp(bh); - disable_barrier(s); - set_buffer_uptodate(bh); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - } + struct buffer_head *bh) +{ + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + disable_barrier(s); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } } #define CHUNK_SIZE 32 struct buffer_chunk { - struct buffer_head *bh[CHUNK_SIZE]; - int nr; + struct buffer_head *bh[CHUNK_SIZE]; + int nr; }; -static void write_chunk(struct buffer_chunk *chunk) { - int i; - for (i = 0; i < chunk->nr ; i++) { - submit_logged_buffer(chunk->bh[i]) ; - } - chunk->nr = 0; +static void write_chunk(struct buffer_chunk *chunk) +{ + int i; + get_fs_excl(); + for (i = 0; i < chunk->nr; i++) { + submit_logged_buffer(chunk->bh[i]); + } + chunk->nr = 0; + put_fs_excl(); } -static void write_ordered_chunk(struct buffer_chunk *chunk) { - int i; - for (i = 0; i < chunk->nr ; i++) { - submit_ordered_buffer(chunk->bh[i]) ; - } - chunk->nr = 0; +static void write_ordered_chunk(struct buffer_chunk *chunk) +{ + int i; + get_fs_excl(); + for (i = 0; i < chunk->nr; i++) { + submit_ordered_buffer(chunk->bh[i]); + } + chunk->nr = 0; + put_fs_excl(); } static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, - spinlock_t *lock, - void (fn)(struct buffer_chunk *)) + spinlock_t * lock, void (fn) (struct buffer_chunk *)) { - int ret = 0; - if (chunk->nr >= CHUNK_SIZE) - BUG(); - chunk->bh[chunk->nr++] = bh; - if (chunk->nr >= CHUNK_SIZE) { - ret = 1; - if (lock) - spin_unlock(lock); - fn(chunk); - if (lock) - spin_lock(lock); - } - return ret; + int ret = 0; + if (chunk->nr >= CHUNK_SIZE) + BUG(); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) { + ret = 1; + if (lock) + spin_unlock(lock); + fn(chunk); + if (lock) + spin_lock(lock); + } + return ret; } - static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); -static struct reiserfs_jh *alloc_jh(void) { - struct reiserfs_jh *jh; - 
while(1) { - jh = kmalloc(sizeof(*jh), GFP_NOFS); - if (jh) { - atomic_inc(&nr_reiserfs_jh); - return jh; +static struct reiserfs_jh *alloc_jh(void) +{ + struct reiserfs_jh *jh; + while (1) { + jh = kmalloc(sizeof(*jh), GFP_NOFS); + if (jh) { + atomic_inc(&nr_reiserfs_jh); + return jh; + } + yield(); } - yield(); - } } /* * we want to free the jh when the buffer has been written * and waited on */ -void reiserfs_free_jh(struct buffer_head *bh) { - struct reiserfs_jh *jh; - - jh = bh->b_private; - if (jh) { - bh->b_private = NULL; - jh->bh = NULL; - list_del_init(&jh->list); - kfree(jh); - if (atomic_read(&nr_reiserfs_jh) <= 0) - BUG(); - atomic_dec(&nr_reiserfs_jh); - put_bh(bh); - } +void reiserfs_free_jh(struct buffer_head *bh) +{ + struct reiserfs_jh *jh; + + jh = bh->b_private; + if (jh) { + bh->b_private = NULL; + jh->bh = NULL; + list_del_init(&jh->list); + kfree(jh); + if (atomic_read(&nr_reiserfs_jh) <= 0) + BUG(); + atomic_dec(&nr_reiserfs_jh); + put_bh(bh); + } } static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, - int tail) + int tail) { - struct reiserfs_jh *jh; + struct reiserfs_jh *jh; - if (bh->b_private) { - spin_lock(&j->j_dirty_buffers_lock); - if (!bh->b_private) { - spin_unlock(&j->j_dirty_buffers_lock); - goto no_jh; + if (bh->b_private) { + spin_lock(&j->j_dirty_buffers_lock); + if (!bh->b_private) { + spin_unlock(&j->j_dirty_buffers_lock); + goto no_jh; + } + jh = bh->b_private; + list_del_init(&jh->list); + } else { + no_jh: + get_bh(bh); + jh = alloc_jh(); + spin_lock(&j->j_dirty_buffers_lock); + /* buffer must be locked for __add_jh, should be able to have + * two adds at the same time + */ + if (bh->b_private) + BUG(); + jh->bh = bh; + bh->b_private = jh; } - jh = bh->b_private; - list_del_init(&jh->list); - } else { -no_jh: - get_bh(bh); - jh = alloc_jh(); - spin_lock(&j->j_dirty_buffers_lock); - /* buffer must be locked for __add_jh, should be able to have - * two adds at the same time - */ - if 
(bh->b_private) - BUG(); - jh->bh = bh; - bh->b_private = jh; - } - jh->jl = j->j_current_jl; - if (tail) - list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); - else { - list_add_tail(&jh->list, &jh->jl->j_bh_list); - } - spin_unlock(&j->j_dirty_buffers_lock); - return 0; + jh->jl = j->j_current_jl; + if (tail) + list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); + else { + list_add_tail(&jh->list, &jh->jl->j_bh_list); + } + spin_unlock(&j->j_dirty_buffers_lock); + return 0; } -int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { - return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); } -int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { - return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) +{ + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); } #define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) -static int write_ordered_buffers(spinlock_t *lock, +static int write_ordered_buffers(spinlock_t * lock, struct reiserfs_journal *j, - struct reiserfs_journal_list *jl, + struct reiserfs_journal_list *jl, struct list_head *list) { - struct buffer_head *bh; - struct reiserfs_jh *jh; - int ret = j->j_errno; - struct buffer_chunk chunk; - struct list_head tmp; - INIT_LIST_HEAD(&tmp); - - chunk.nr = 0; - spin_lock(lock); - while(!list_empty(list)) { - jh = JH_ENTRY(list->next); - bh = jh->bh; - get_bh(bh); - if (test_set_buffer_locked(bh)) { - if (!buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); - goto loop_next; - } - spin_unlock(lock); - if (chunk.nr) + struct buffer_head *bh; + struct reiserfs_jh *jh; + int ret = j->j_errno; + struct buffer_chunk chunk; + struct list_head tmp; + INIT_LIST_HEAD(&tmp); + + chunk.nr = 0; + spin_lock(lock); + while (!list_empty(list)) { + jh = JH_ENTRY(list->next); + 
bh = jh->bh; + get_bh(bh); + if (test_set_buffer_locked(bh)) { + if (!buffer_dirty(bh)) { + list_del_init(&jh->list); + list_add(&jh->list, &tmp); + goto loop_next; + } + spin_unlock(lock); + if (chunk.nr) + write_ordered_chunk(&chunk); + wait_on_buffer(bh); + cond_resched(); + spin_lock(lock); + goto loop_next; + } + if (buffer_dirty(bh)) { + list_del_init(&jh->list); + list_add(&jh->list, &tmp); + add_to_chunk(&chunk, bh, lock, write_ordered_chunk); + } else { + reiserfs_free_jh(bh); + unlock_buffer(bh); + } + loop_next: + put_bh(bh); + cond_resched_lock(lock); + } + if (chunk.nr) { + spin_unlock(lock); write_ordered_chunk(&chunk); - wait_on_buffer(bh); - cond_resched(); - spin_lock(lock); - goto loop_next; - } - if (buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); - add_to_chunk(&chunk, bh, lock, write_ordered_chunk); - } else { - reiserfs_free_jh(bh); - unlock_buffer(bh); + spin_lock(lock); } -loop_next: - put_bh(bh); - cond_resched_lock(lock); - } - if (chunk.nr) { - spin_unlock(lock); - write_ordered_chunk(&chunk); - spin_lock(lock); - } - while(!list_empty(&tmp)) { - jh = JH_ENTRY(tmp.prev); - bh = jh->bh; - get_bh(bh); - reiserfs_free_jh(bh); - - if (buffer_locked(bh)) { - spin_unlock(lock); - wait_on_buffer(bh); - spin_lock(lock); + while (!list_empty(&tmp)) { + jh = JH_ENTRY(tmp.prev); + bh = jh->bh; + get_bh(bh); + reiserfs_free_jh(bh); + + if (buffer_locked(bh)) { + spin_unlock(lock); + wait_on_buffer(bh); + spin_lock(lock); + } + if (!buffer_uptodate(bh)) { + ret = -EIO; + } + put_bh(bh); + cond_resched_lock(lock); } - if (!buffer_uptodate(bh)) { - ret = -EIO; - } - put_bh(bh); - cond_resched_lock(lock); - } - spin_unlock(lock); - return ret; -} - -static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) { - struct reiserfs_journal *journal = SB_JOURNAL (s); - struct reiserfs_journal_list *other_jl; - struct reiserfs_journal_list *first_jl; - struct list_head *entry; - unsigned long trans_id = 
jl->j_trans_id; - unsigned long other_trans_id; - unsigned long first_trans_id; - -find_first: - /* - * first we walk backwards to find the oldest uncommitted transation - */ - first_jl = jl; - entry = jl->j_list.prev; - while(1) { - other_jl = JOURNAL_LIST_ENTRY(entry); - if (entry == &journal->j_journal_list || - atomic_read(&other_jl->j_older_commits_done)) - break; - - first_jl = other_jl; - entry = other_jl->j_list.prev; - } - - /* if we didn't find any older uncommitted transactions, return now */ - if (first_jl == jl) { - return 0; - } - - first_trans_id = first_jl->j_trans_id; + spin_unlock(lock); + return ret; +} - entry = &first_jl->j_list; - while(1) { - other_jl = JOURNAL_LIST_ENTRY(entry); - other_trans_id = other_jl->j_trans_id; +static int flush_older_commits(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal_list *first_jl; + struct list_head *entry; + unsigned long trans_id = jl->j_trans_id; + unsigned long other_trans_id; + unsigned long first_trans_id; + + find_first: + /* + * first we walk backwards to find the oldest uncommitted transation + */ + first_jl = jl; + entry = jl->j_list.prev; + while (1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + if (entry == &journal->j_journal_list || + atomic_read(&other_jl->j_older_commits_done)) + break; - if (other_trans_id < trans_id) { - if (atomic_read(&other_jl->j_commit_left) != 0) { - flush_commit_list(s, other_jl, 0); + first_jl = other_jl; + entry = other_jl->j_list.prev; + } - /* list we were called with is gone, return */ - if (!journal_list_still_alive(s, trans_id)) - return 1; + /* if we didn't find any older uncommitted transactions, return now */ + if (first_jl == jl) { + return 0; + } - /* the one we just flushed is gone, this means all - * older lists are also gone, so first_jl is no longer - * valid either. Go back to the beginning. 
- */ - if (!journal_list_still_alive(s, other_trans_id)) { - goto find_first; + first_trans_id = first_jl->j_trans_id; + + entry = &first_jl->j_list; + while (1) { + other_jl = JOURNAL_LIST_ENTRY(entry); + other_trans_id = other_jl->j_trans_id; + + if (other_trans_id < trans_id) { + if (atomic_read(&other_jl->j_commit_left) != 0) { + flush_commit_list(s, other_jl, 0); + + /* list we were called with is gone, return */ + if (!journal_list_still_alive(s, trans_id)) + return 1; + + /* the one we just flushed is gone, this means all + * older lists are also gone, so first_jl is no longer + * valid either. Go back to the beginning. + */ + if (!journal_list_still_alive + (s, other_trans_id)) { + goto find_first; + } + } + entry = entry->next; + if (entry == &journal->j_journal_list) + return 0; + } else { + return 0; } - } - entry = entry->next; - if (entry == &journal->j_journal_list) - return 0; - } else { - return 0; } - } - return 0; + return 0; } -int reiserfs_async_progress_wait(struct super_block *s) { - DEFINE_WAIT(wait); - struct reiserfs_journal *j = SB_JOURNAL(s); - if (atomic_read(&j->j_async_throttle)) - blk_congestion_wait(WRITE, HZ/10); - return 0; +int reiserfs_async_progress_wait(struct super_block *s) +{ + DEFINE_WAIT(wait); + struct reiserfs_journal *j = SB_JOURNAL(s); + if (atomic_read(&j->j_async_throttle)) + blk_congestion_wait(WRITE, HZ / 10); + return 0; } /* @@ -903,209 +967,225 @@ int reiserfs_async_progress_wait(struct super_block *s) { ** Before the commit block can by written, every other log block must be safely on disk ** */ -static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { - int i; - int bn ; - struct buffer_head *tbh = NULL ; - unsigned long trans_id = jl->j_trans_id; - struct reiserfs_journal *journal = SB_JOURNAL (s); - int barrier = 0; - int retval = 0; - - reiserfs_check_lock_depth(s, "flush_commit_list") ; - - if (atomic_read(&jl->j_older_commits_done)) { - return 0 ; - } - - /* 
before we can put our commit blocks on disk, we have to make sure everyone older than - ** us is on disk too - */ - BUG_ON (jl->j_len <= 0); - BUG_ON (trans_id == journal->j_trans_id); - - get_journal_list(jl); - if (flushall) { - if (flush_older_commits(s, jl) == 1) { - /* list disappeared during flush_older_commits. return */ - goto put_jl; - } - } - - /* make sure nobody is trying to flush this one at the same time */ - down(&jl->j_commit_lock); - if (!journal_list_still_alive(s, trans_id)) { - up(&jl->j_commit_lock); - goto put_jl; - } - BUG_ON (jl->j_trans_id == 0); - - /* this commit is done, exit */ - if (atomic_read(&(jl->j_commit_left)) <= 0) { - if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1) ; - } - up(&jl->j_commit_lock); - goto put_jl; - } - - if (!list_empty(&jl->j_bh_list)) { - unlock_kernel(); - write_ordered_buffers(&journal->j_dirty_buffers_lock, - journal, jl, &jl->j_bh_list); - lock_kernel(); - } - BUG_ON (!list_empty(&jl->j_bh_list)); - /* - * for the description block and all the log blocks, submit any buffers - * that haven't already reached the disk - */ - atomic_inc(&journal->j_async_throttle); - for (i = 0 ; i < (jl->j_len + 1) ; i++) { - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % - SB_ONDISK_JOURNAL_SIZE(s); - tbh = journal_find_get_block(s, bn) ; - if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ - ll_rw_block(WRITE, 1, &tbh) ; - put_bh(tbh) ; - } - atomic_dec(&journal->j_async_throttle); - - /* wait on everything written so far before writing the commit - * if we are in barrier mode, send the commit down now - */ - barrier = reiserfs_barrier_flush(s); - if (barrier) { - int ret; - lock_buffer(jl->j_commit_bh); - ret = submit_barrier_buffer(jl->j_commit_bh); - if (ret == -EOPNOTSUPP) { - set_buffer_uptodate(jl->j_commit_bh); - disable_barrier(s); - barrier = 0; - } - } - for (i = 0 ; i < (jl->j_len + 1) ; i++) { - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + - (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; 
- tbh = journal_find_get_block(s, bn) ; - wait_on_buffer(tbh) ; - // since we're using ll_rw_blk above, it might have skipped over - // a locked buffer. Double check here - // - if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ - sync_dirty_buffer(tbh); - if (unlikely (!buffer_uptodate(tbh))) { +static int flush_commit_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) +{ + int i; + int bn; + struct buffer_head *tbh = NULL; + unsigned long trans_id = jl->j_trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int barrier = 0; + int retval = 0; + + reiserfs_check_lock_depth(s, "flush_commit_list"); + + if (atomic_read(&jl->j_older_commits_done)) { + return 0; + } + + get_fs_excl(); + + /* before we can put our commit blocks on disk, we have to make sure everyone older than + ** us is on disk too + */ + BUG_ON(jl->j_len <= 0); + BUG_ON(trans_id == journal->j_trans_id); + + get_journal_list(jl); + if (flushall) { + if (flush_older_commits(s, jl) == 1) { + /* list disappeared during flush_older_commits. 
return */ + goto put_jl; + } + } + + /* make sure nobody is trying to flush this one at the same time */ + down(&jl->j_commit_lock); + if (!journal_list_still_alive(s, trans_id)) { + up(&jl->j_commit_lock); + goto put_jl; + } + BUG_ON(jl->j_trans_id == 0); + + /* this commit is done, exit */ + if (atomic_read(&(jl->j_commit_left)) <= 0) { + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1); + } + up(&jl->j_commit_lock); + goto put_jl; + } + + if (!list_empty(&jl->j_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + lock_kernel(); + } + BUG_ON(!list_empty(&jl->j_bh_list)); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk + */ + atomic_inc(&journal->j_async_throttle); + for (i = 0; i < (jl->j_len + 1); i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % + SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ + ll_rw_block(WRITE, 1, &tbh); + put_bh(tbh); + } + atomic_dec(&journal->j_async_throttle); + + /* wait on everything written so far before writing the commit + * if we are in barrier mode, send the commit down now + */ + barrier = reiserfs_barrier_flush(s); + if (barrier) { + int ret; + lock_buffer(jl->j_commit_bh); + ret = submit_barrier_buffer(jl->j_commit_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(jl->j_commit_bh); + disable_barrier(s); + barrier = 0; + } + } + for (i = 0; i < (jl->j_len + 1); i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); + tbh = journal_find_get_block(s, bn); + wait_on_buffer(tbh); + // since we're using ll_rw_blk above, it might have skipped over + // a locked buffer. 
Double check here + // + if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ + sync_dirty_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-601, buffer write failed") ; + reiserfs_warning(s, "journal-601, buffer write failed"); #endif - retval = -EIO; - } - put_bh(tbh) ; /* once for journal_find_get_block */ - put_bh(tbh) ; /* once due to original getblk in do_journal_end */ - atomic_dec(&(jl->j_commit_left)) ; - } - - BUG_ON (atomic_read(&(jl->j_commit_left)) != 1); - - if (!barrier) { - if (buffer_dirty(jl->j_commit_bh)) - BUG(); - mark_buffer_dirty(jl->j_commit_bh) ; - sync_dirty_buffer(jl->j_commit_bh) ; - } else - wait_on_buffer(jl->j_commit_bh); - - check_barrier_completion(s, jl->j_commit_bh); - - /* If there was a write error in the journal - we can't commit this - * transaction - it will be invalid and, if successful, will just end - * up propogating the write error out to the filesystem. */ - if (unlikely (!buffer_uptodate(jl->j_commit_bh))) { + retval = -EIO; + } + put_bh(tbh); /* once for journal_find_get_block */ + put_bh(tbh); /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)); + } + + BUG_ON(atomic_read(&(jl->j_commit_left)) != 1); + + if (!barrier) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh); + sync_dirty_buffer(jl->j_commit_bh); + } else + wait_on_buffer(jl->j_commit_bh); + + check_barrier_completion(s, jl->j_commit_bh); + + /* If there was a write error in the journal - we can't commit this + * transaction - it will be invalid and, if successful, will just end + * up propogating the write error out to the filesystem. 
*/ + if (unlikely(!buffer_uptodate(jl->j_commit_bh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-615: buffer write failed") ; + reiserfs_warning(s, "journal-615: buffer write failed"); #endif - retval = -EIO; - } - bforget(jl->j_commit_bh) ; - if (journal->j_last_commit_id != 0 && - (jl->j_trans_id - journal->j_last_commit_id) != 1) { - reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", - journal->j_last_commit_id, - jl->j_trans_id); - } - journal->j_last_commit_id = jl->j_trans_id; - - /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ - cleanup_freed_for_journal_list(s, jl) ; - - retval = retval ? retval : journal->j_errno; - - /* mark the metadata dirty */ - if (!retval) - dirty_one_transaction(s, jl); - atomic_dec(&(jl->j_commit_left)) ; - - if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1) ; - } - up(&jl->j_commit_lock); -put_jl: - put_journal_list(s, jl); - - if (retval) - reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__); - return retval; + retval = -EIO; + } + bforget(jl->j_commit_bh); + if (journal->j_last_commit_id != 0 && + (jl->j_trans_id - journal->j_last_commit_id) != 1) { + reiserfs_warning(s, "clm-2200: last commit %lu, current %lu", + journal->j_last_commit_id, jl->j_trans_id); + } + journal->j_last_commit_id = jl->j_trans_id; + + /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ + cleanup_freed_for_journal_list(s, jl); + + retval = retval ? 
retval : journal->j_errno; + + /* mark the metadata dirty */ + if (!retval) + dirty_one_transaction(s, jl); + atomic_dec(&(jl->j_commit_left)); + + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1); + } + up(&jl->j_commit_lock); + put_jl: + put_journal_list(s, jl); + + if (retval) + reiserfs_abort(s, retval, "Journal write error in %s", + __FUNCTION__); + put_fs_excl(); + return retval; } /* ** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or ** returns NULL if it can't find anything */ -static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) { - struct super_block *sb = cn->sb; - b_blocknr_t blocknr = cn->blocknr ; +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct + reiserfs_journal_cnode + *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; - cn = cn->hprev ; - while(cn) { - if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { - return cn->jlist ; - } - cn = cn->hprev ; - } - return NULL ; + cn = cn->hprev; + while (cn) { + if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) { + return cn->jlist; + } + cn = cn->hprev; + } + return NULL; } -static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **, -struct reiserfs_journal_list *, unsigned long, int); +static void remove_journal_hash(struct super_block *, + struct reiserfs_journal_cnode **, + struct reiserfs_journal_list *, unsigned long, + int); /* ** once all the real blocks have been flushed, it is safe to remove them from the ** journal list for this transaction. Aside from freeing the cnode, this also allows the ** block to be reallocated for data blocks if it had been deleted. 
*/ -static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, int debug) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_cnode *cn, *last ; - cn = jl->j_realblock ; - - /* which is better, to lock once around the whole loop, or - ** to lock for each call to remove_journal_hash? - */ - while(cn) { - if (cn->blocknr != 0) { - if (debug) { - reiserfs_warning (p_s_sb, "block %u, bh is %d, state %ld", cn->blocknr, - cn->bh ? 1: 0, cn->state) ; - } - cn->state = 0 ; - remove_journal_hash(p_s_sb, journal->j_list_hash_table, jl, cn->blocknr, 1) ; - } - last = cn ; - cn = cn->next ; - free_cnode(p_s_sb, last) ; - } - jl->j_realblock = NULL ; +static void remove_all_from_journal_list(struct super_block *p_s_sb, + struct reiserfs_journal_list *jl, + int debug) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_cnode *cn, *last; + cn = jl->j_realblock; + + /* which is better, to lock once around the whole loop, or + ** to lock for each call to remove_journal_hash? + */ + while (cn) { + if (cn->blocknr != 0) { + if (debug) { + reiserfs_warning(p_s_sb, + "block %u, bh is %d, state %ld", + cn->blocknr, cn->bh ? 
1 : 0, + cn->state); + } + cn->state = 0; + remove_journal_hash(p_s_sb, journal->j_list_hash_table, + jl, cn->blocknr, 1); + } + last = cn; + cn = cn->next; + free_cnode(p_s_sb, last); + } + jl->j_realblock = NULL; } /* @@ -1115,98 +1195,107 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reis ** called by flush_journal_list, before it calls remove_all_from_journal_list ** */ -static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) { - struct reiserfs_journal_header *jh ; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); +static int _update_journal_header_block(struct super_block *p_s_sb, + unsigned long offset, + unsigned long trans_id) +{ + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - if (reiserfs_is_journal_aborted (journal)) - return -EIO; + if (reiserfs_is_journal_aborted(journal)) + return -EIO; - if (trans_id >= journal->j_last_flush_trans_id) { - if (buffer_locked((journal->j_header_bh))) { - wait_on_buffer((journal->j_header_bh)) ; - if (unlikely (!buffer_uptodate(journal->j_header_bh))) { + if (trans_id >= journal->j_last_flush_trans_id) { + if (buffer_locked((journal->j_header_bh))) { + wait_on_buffer((journal->j_header_bh)); + if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning (p_s_sb, "journal-699: buffer write failed") ; + reiserfs_warning(p_s_sb, + "journal-699: buffer write failed"); #endif - return -EIO; - } - } - journal->j_last_flush_trans_id = trans_id ; - journal->j_first_unflushed_offset = offset ; - jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ; - jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; - jh->j_first_unflushed_offset = cpu_to_le32(offset) ; - jh->j_mount_id = cpu_to_le32(journal->j_mount_id) ; - - if (reiserfs_barrier_flush(p_s_sb)) { - int ret; - lock_buffer(journal->j_header_bh); - ret = 
submit_barrier_buffer(journal->j_header_bh); - if (ret == -EOPNOTSUPP) { - set_buffer_uptodate(journal->j_header_bh); - disable_barrier(p_s_sb); - goto sync; - } - wait_on_buffer(journal->j_header_bh); - check_barrier_completion(p_s_sb, journal->j_header_bh); - } else { -sync: - set_buffer_dirty(journal->j_header_bh) ; - sync_dirty_buffer(journal->j_header_bh) ; - } - if (!buffer_uptodate(journal->j_header_bh)) { - reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay"); - return -EIO ; - } - } - return 0 ; -} - -static int update_journal_header_block(struct super_block *p_s_sb, - unsigned long offset, - unsigned long trans_id) { - return _update_journal_header_block(p_s_sb, offset, trans_id); + return -EIO; + } + } + journal->j_last_flush_trans_id = trans_id; + journal->j_first_unflushed_offset = offset; + jh = (struct reiserfs_journal_header *)(journal->j_header_bh-> + b_data); + jh->j_last_flush_trans_id = cpu_to_le32(trans_id); + jh->j_first_unflushed_offset = cpu_to_le32(offset); + jh->j_mount_id = cpu_to_le32(journal->j_mount_id); + + if (reiserfs_barrier_flush(p_s_sb)) { + int ret; + lock_buffer(journal->j_header_bh); + ret = submit_barrier_buffer(journal->j_header_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(journal->j_header_bh); + disable_barrier(p_s_sb); + goto sync; + } + wait_on_buffer(journal->j_header_bh); + check_barrier_completion(p_s_sb, journal->j_header_bh); + } else { + sync: + set_buffer_dirty(journal->j_header_bh); + sync_dirty_buffer(journal->j_header_bh); + } + if (!buffer_uptodate(journal->j_header_bh)) { + reiserfs_warning(p_s_sb, + "journal-837: IO error during journal replay"); + return -EIO; + } + } + return 0; } + +static int update_journal_header_block(struct super_block *p_s_sb, + unsigned long offset, + unsigned long trans_id) +{ + return _update_journal_header_block(p_s_sb, offset, trans_id); +} + /* ** flush any and all journal lists older than you are ** can only be called from flush_journal_list */ 
static int flush_older_journal_lists(struct super_block *p_s_sb, - struct reiserfs_journal_list *jl) -{ - struct list_head *entry; - struct reiserfs_journal_list *other_jl ; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - unsigned long trans_id = jl->j_trans_id; - - /* we know we are the only ones flushing things, no extra race - * protection is required. - */ -restart: - entry = journal->j_journal_list.next; - /* Did we wrap? */ - if (entry == &journal->j_journal_list) - return 0; - other_jl = JOURNAL_LIST_ENTRY(entry); - if (other_jl->j_trans_id < trans_id) { - BUG_ON (other_jl->j_refcount <= 0); - /* do not flush all */ - flush_journal_list(p_s_sb, other_jl, 0) ; - - /* other_jl is now deleted from the list */ - goto restart; - } - return 0 ; + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + unsigned long trans_id = jl->j_trans_id; + + /* we know we are the only ones flushing things, no extra race + * protection is required. + */ + restart: + entry = journal->j_journal_list.next; + /* Did we wrap? 
*/ + if (entry == &journal->j_journal_list) + return 0; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + BUG_ON(other_jl->j_refcount <= 0); + /* do not flush all */ + flush_journal_list(p_s_sb, other_jl, 0); + + /* other_jl is now deleted from the list */ + goto restart; + } + return 0; } static void del_from_work_list(struct super_block *s, - struct reiserfs_journal_list *jl) { - struct reiserfs_journal *journal = SB_JOURNAL (s); - if (!list_empty(&jl->j_working_list)) { - list_del_init(&jl->j_working_list); - journal->j_num_work_lists--; - } + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + journal->j_num_work_lists--; + } } /* flush a journal list, both commit and real blocks @@ -1218,383 +1307,407 @@ static void del_from_work_list(struct super_block *s, ** and the journal is locked. That means it can only be called from ** do_journal_end, or by journal_release */ -static int flush_journal_list(struct super_block *s, - struct reiserfs_journal_list *jl, int flushall) { - struct reiserfs_journal_list *pjl ; - struct reiserfs_journal_cnode *cn, *last ; - int count ; - int was_jwait = 0 ; - int was_dirty = 0 ; - struct buffer_head *saved_bh ; - unsigned long j_len_saved = jl->j_len ; - struct reiserfs_journal *journal = SB_JOURNAL (s); - int err = 0; - - BUG_ON (j_len_saved <= 0); - - if (atomic_read(&journal->j_wcount) != 0) { - reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d", - atomic_read(&journal->j_wcount)) ; - } - BUG_ON (jl->j_trans_id == 0); - - /* if flushall == 0, the lock is already held */ - if (flushall) { - down(&journal->j_flush_sem); - } else if (!down_trylock(&journal->j_flush_sem)) { - BUG(); - } - - count = 0 ; - if (j_len_saved > journal->j_trans_max) { - reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id); 
- return 0 ; - } - - /* if all the work is already done, get out of here */ - if (atomic_read(&(jl->j_nonzerolen)) <= 0 && - atomic_read(&(jl->j_commit_left)) <= 0) { - goto flush_older_and_return ; - } - - /* start by putting the commit list on disk. This will also flush - ** the commit lists of any olders transactions - */ - flush_commit_list(s, jl, 1) ; - - if (!(jl->j_state & LIST_DIRTY) && !reiserfs_is_journal_aborted (journal)) - BUG(); - - /* are we done now? */ - if (atomic_read(&(jl->j_nonzerolen)) <= 0 && - atomic_read(&(jl->j_commit_left)) <= 0) { - goto flush_older_and_return ; - } - - /* loop through each cnode, see if we need to write it, - ** or wait on a more recent transaction, or just ignore it - */ - if (atomic_read(&(journal->j_wcount)) != 0) { - reiserfs_panic(s, "journal-844: panic journal list is flushing, wcount is not 0\n") ; - } - cn = jl->j_realblock ; - while(cn) { - was_jwait = 0 ; - was_dirty = 0 ; - saved_bh = NULL ; - /* blocknr of 0 is no longer in the hash, ignore it */ - if (cn->blocknr == 0) { - goto free_cnode ; - } - - /* This transaction failed commit. Don't write out to the disk */ - if (!(jl->j_state & LIST_DIRTY)) - goto free_cnode; - - pjl = find_newer_jl_for_cn(cn) ; - /* the order is important here. 
We check pjl to make sure we - ** don't clear BH_JDirty_wait if we aren't the one writing this - ** block to disk - */ - if (!pjl && cn->bh) { - saved_bh = cn->bh ; - - /* we do this to make sure nobody releases the buffer while - ** we are working with it - */ - get_bh(saved_bh) ; - - if (buffer_journal_dirty(saved_bh)) { - BUG_ON (!can_dirty (cn)); - was_jwait = 1 ; - was_dirty = 1 ; - } else if (can_dirty(cn)) { - /* everything with !pjl && jwait should be writable */ - BUG(); - } - } - - /* if someone has this block in a newer transaction, just make - ** sure they are commited, and don't try writing it to disk - */ - if (pjl) { - if (atomic_read(&pjl->j_commit_left)) - flush_commit_list(s, pjl, 1) ; - goto free_cnode ; - } - - /* bh == NULL when the block got to disk on its own, OR, - ** the block got freed in a future transaction - */ - if (saved_bh == NULL) { - goto free_cnode ; - } - - /* this should never happen. kupdate_one_transaction has this list - ** locked while it works, so we should never see a buffer here that - ** is not marked JDirty_wait - */ - if ((!was_jwait) && !buffer_locked(saved_bh)) { - reiserfs_warning (s, "journal-813: BAD! buffer %llu %cdirty %cjwait, " - "not in a newer tranasction", - (unsigned long long)saved_bh->b_blocknr, - was_dirty ? ' ' : '!', was_jwait ? 
' ' : '!') ; - } - if (was_dirty) { - /* we inc again because saved_bh gets decremented at free_cnode */ - get_bh(saved_bh) ; - set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - lock_buffer(saved_bh); - BUG_ON (cn->blocknr != saved_bh->b_blocknr); - if (buffer_dirty(saved_bh)) - submit_logged_buffer(saved_bh) ; - else - unlock_buffer(saved_bh); - count++ ; - } else { - reiserfs_warning (s, "clm-2082: Unable to flush buffer %llu in %s", - (unsigned long long)saved_bh->b_blocknr, __FUNCTION__); - } -free_cnode: - last = cn ; - cn = cn->next ; - if (saved_bh) { - /* we incremented this to keep others from taking the buffer head away */ - put_bh(saved_bh) ; - if (atomic_read(&(saved_bh->b_count)) < 0) { - reiserfs_warning (s, "journal-945: saved_bh->b_count < 0"); - } - } - } - if (count > 0) { - cn = jl->j_realblock ; - while(cn) { - if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { - if (!cn->bh) { - reiserfs_panic(s, "journal-1011: cn->bh is NULL\n") ; - } - wait_on_buffer(cn->bh) ; - if (!cn->bh) { - reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ; - } - if (unlikely (!buffer_uptodate(cn->bh))) { +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) +{ + struct reiserfs_journal_list *pjl; + struct reiserfs_journal_cnode *cn, *last; + int count; + int was_jwait = 0; + int was_dirty = 0; + struct buffer_head *saved_bh; + unsigned long j_len_saved = jl->j_len; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int err = 0; + + BUG_ON(j_len_saved <= 0); + + if (atomic_read(&journal->j_wcount) != 0) { + reiserfs_warning(s, + "clm-2048: flush_journal_list called with wcount %d", + atomic_read(&journal->j_wcount)); + } + BUG_ON(jl->j_trans_id == 0); + + /* if flushall == 0, the lock is already held */ + if (flushall) { + down(&journal->j_flush_sem); + } else if (!down_trylock(&journal->j_flush_sem)) { + BUG(); + } + + count = 0; + if (j_len_saved > journal->j_trans_max) { + reiserfs_panic(s, + "journal-715: 
flush_journal_list, length is %lu, trans id %lu\n", + j_len_saved, jl->j_trans_id); + return 0; + } + + get_fs_excl(); + + /* if all the work is already done, get out of here */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return; + } + + /* start by putting the commit list on disk. This will also flush + ** the commit lists of any olders transactions + */ + flush_commit_list(s, jl, 1); + + if (!(jl->j_state & LIST_DIRTY) + && !reiserfs_is_journal_aborted(journal)) + BUG(); + + /* are we done now? */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return; + } + + /* loop through each cnode, see if we need to write it, + ** or wait on a more recent transaction, or just ignore it + */ + if (atomic_read(&(journal->j_wcount)) != 0) { + reiserfs_panic(s, + "journal-844: panic journal list is flushing, wcount is not 0\n"); + } + cn = jl->j_realblock; + while (cn) { + was_jwait = 0; + was_dirty = 0; + saved_bh = NULL; + /* blocknr of 0 is no longer in the hash, ignore it */ + if (cn->blocknr == 0) { + goto free_cnode; + } + + /* This transaction failed commit. Don't write out to the disk */ + if (!(jl->j_state & LIST_DIRTY)) + goto free_cnode; + + pjl = find_newer_jl_for_cn(cn); + /* the order is important here. 
We check pjl to make sure we + ** don't clear BH_JDirty_wait if we aren't the one writing this + ** block to disk + */ + if (!pjl && cn->bh) { + saved_bh = cn->bh; + + /* we do this to make sure nobody releases the buffer while + ** we are working with it + */ + get_bh(saved_bh); + + if (buffer_journal_dirty(saved_bh)) { + BUG_ON(!can_dirty(cn)); + was_jwait = 1; + was_dirty = 1; + } else if (can_dirty(cn)) { + /* everything with !pjl && jwait should be writable */ + BUG(); + } + } + + /* if someone has this block in a newer transaction, just make + ** sure they are commited, and don't try writing it to disk + */ + if (pjl) { + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1); + goto free_cnode; + } + + /* bh == NULL when the block got to disk on its own, OR, + ** the block got freed in a future transaction + */ + if (saved_bh == NULL) { + goto free_cnode; + } + + /* this should never happen. kupdate_one_transaction has this list + ** locked while it works, so we should never see a buffer here that + ** is not marked JDirty_wait + */ + if ((!was_jwait) && !buffer_locked(saved_bh)) { + reiserfs_warning(s, + "journal-813: BAD! buffer %llu %cdirty %cjwait, " + "not in a newer tranasction", + (unsigned long long)saved_bh-> + b_blocknr, was_dirty ? ' ' : '!', + was_jwait ? 
' ' : '!'); + } + if (was_dirty) { + /* we inc again because saved_bh gets decremented at free_cnode */ + get_bh(saved_bh); + set_bit(BLOCK_NEEDS_FLUSH, &cn->state); + lock_buffer(saved_bh); + BUG_ON(cn->blocknr != saved_bh->b_blocknr); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh); + else + unlock_buffer(saved_bh); + count++; + } else { + reiserfs_warning(s, + "clm-2082: Unable to flush buffer %llu in %s", + (unsigned long long)saved_bh-> + b_blocknr, __FUNCTION__); + } + free_cnode: + last = cn; + cn = cn->next; + if (saved_bh) { + /* we incremented this to keep others from taking the buffer head away */ + put_bh(saved_bh); + if (atomic_read(&(saved_bh->b_count)) < 0) { + reiserfs_warning(s, + "journal-945: saved_bh->b_count < 0"); + } + } + } + if (count > 0) { + cn = jl->j_realblock; + while (cn) { + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { + if (!cn->bh) { + reiserfs_panic(s, + "journal-1011: cn->bh is NULL\n"); + } + wait_on_buffer(cn->bh); + if (!cn->bh) { + reiserfs_panic(s, + "journal-1012: cn->bh is NULL\n"); + } + if (unlikely(!buffer_uptodate(cn->bh))) { #ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(s, "journal-949: buffer write failed\n") ; + reiserfs_warning(s, + "journal-949: buffer write failed\n"); #endif - err = -EIO; - } - /* note, we must clear the JDirty_wait bit after the up to date - ** check, otherwise we race against our flushpage routine - */ - BUG_ON (!test_clear_buffer_journal_dirty (cn->bh)); - - /* undo the inc from journal_mark_dirty */ - put_bh(cn->bh) ; - brelse(cn->bh) ; - } - cn = cn->next ; - } - } - - if (err) - reiserfs_abort (s, -EIO, "Write error while pushing transaction to disk in %s", __FUNCTION__); -flush_older_and_return: - - - /* before we can update the journal header block, we _must_ flush all - ** real blocks from all older transactions to disk. 
This is because - ** once the header block is updated, this transaction will not be - ** replayed after a crash - */ - if (flushall) { - flush_older_journal_lists(s, jl); - } - - err = journal->j_errno; - /* before we can remove everything from the hash tables for this - ** transaction, we must make sure it can never be replayed - ** - ** since we are only called from do_journal_end, we know for sure there - ** are no allocations going on while we are flushing journal lists. So, - ** we only need to update the journal header block for the last list - ** being flushed - */ - if (!err && flushall) { - err = update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ; - if (err) - reiserfs_abort (s, -EIO, "Write error while updating journal header in %s", __FUNCTION__); - } - remove_all_from_journal_list(s, jl, 0) ; - list_del_init(&jl->j_list); - journal->j_num_lists--; - del_from_work_list(s, jl); - - if (journal->j_last_flush_id != 0 && - (jl->j_trans_id - journal->j_last_flush_id) != 1) { - reiserfs_warning(s, "clm-2201: last flush %lu, current %lu", - journal->j_last_flush_id, - jl->j_trans_id); - } - journal->j_last_flush_id = jl->j_trans_id; - - /* not strictly required since we are freeing the list, but it should - * help find code using dead lists later on - */ - jl->j_len = 0 ; - atomic_set(&(jl->j_nonzerolen), 0) ; - jl->j_start = 0 ; - jl->j_realblock = NULL ; - jl->j_commit_bh = NULL ; - jl->j_trans_id = 0 ; - jl->j_state = 0; - put_journal_list(s, jl); - if (flushall) - up(&journal->j_flush_sem); - return err ; -} + err = -EIO; + } + /* note, we must clear the JDirty_wait bit after the up to date + ** check, otherwise we race against our flushpage routine + */ + BUG_ON(!test_clear_buffer_journal_dirty + (cn->bh)); + + /* undo the inc from journal_mark_dirty */ + put_bh(cn->bh); + brelse(cn->bh); + } + cn = cn->next; + } + } + + if (err) + reiserfs_abort(s, -EIO, + "Write error while pushing transaction to 
disk in %s", + __FUNCTION__); + flush_older_and_return: + + /* before we can update the journal header block, we _must_ flush all + ** real blocks from all older transactions to disk. This is because + ** once the header block is updated, this transaction will not be + ** replayed after a crash + */ + if (flushall) { + flush_older_journal_lists(s, jl); + } + + err = journal->j_errno; + /* before we can remove everything from the hash tables for this + ** transaction, we must make sure it can never be replayed + ** + ** since we are only called from do_journal_end, we know for sure there + ** are no allocations going on while we are flushing journal lists. So, + ** we only need to update the journal header block for the last list + ** being flushed + */ + if (!err && flushall) { + err = + update_journal_header_block(s, + (jl->j_start + jl->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(s), + jl->j_trans_id); + if (err) + reiserfs_abort(s, -EIO, + "Write error while updating journal header in %s", + __FUNCTION__); + } + remove_all_from_journal_list(s, jl, 0); + list_del_init(&jl->j_list); + journal->j_num_lists--; + del_from_work_list(s, jl); + + if (journal->j_last_flush_id != 0 && + (jl->j_trans_id - journal->j_last_flush_id) != 1) { + reiserfs_warning(s, "clm-2201: last flush %lu, current %lu", + journal->j_last_flush_id, jl->j_trans_id); + } + journal->j_last_flush_id = jl->j_trans_id; + + /* not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ + jl->j_len = 0; + atomic_set(&(jl->j_nonzerolen), 0); + jl->j_start = 0; + jl->j_realblock = NULL; + jl->j_commit_bh = NULL; + jl->j_trans_id = 0; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + up(&journal->j_flush_sem); + put_fs_excl(); + return err; +} static int write_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl, + struct reiserfs_journal_list *jl, struct buffer_chunk *chunk) { - struct reiserfs_journal_cnode *cn; - int 
ret = 0 ; - - jl->j_state |= LIST_TOUCHED; - del_from_work_list(s, jl); - if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { - return 0; - } - - cn = jl->j_realblock ; - while(cn) { - /* if the blocknr == 0, this has been cleared from the hash, - ** skip it - */ - if (cn->blocknr == 0) { - goto next ; - } - if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { - struct buffer_head *tmp_bh; - /* we can race against journal_mark_freed when we try - * to lock_buffer(cn->bh), so we have to inc the buffer - * count, and recheck things after locking - */ - tmp_bh = cn->bh; - get_bh(tmp_bh); - lock_buffer(tmp_bh); - if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { - if (!buffer_journal_dirty(tmp_bh) || - buffer_journal_prepared(tmp_bh)) - BUG(); - add_to_chunk(chunk, tmp_bh, NULL, write_chunk); - ret++; - } else { - /* note, cn->bh might be null now */ - unlock_buffer(tmp_bh); - } - put_bh(tmp_bh); - } -next: - cn = cn->next ; - cond_resched(); - } - return ret ; + struct reiserfs_journal_cnode *cn; + int ret = 0; + + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) { + return 0; + } + + cn = jl->j_realblock; + while (cn) { + /* if the blocknr == 0, this has been cleared from the hash, + ** skip it + */ + if (cn->blocknr == 0) { + goto next; + } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + buffer_journal_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh, NULL, write_chunk); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } + next: + cn = cn->next; + cond_resched(); + 
} + return ret; } /* used by flush_commit_list */ static int dirty_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl) -{ - struct reiserfs_journal_cnode *cn; - struct reiserfs_journal_list *pjl; - int ret = 0 ; - - jl->j_state |= LIST_DIRTY; - cn = jl->j_realblock ; - while(cn) { - /* look for a more recent transaction that logged this - ** buffer. Only the most recent transaction with a buffer in - ** it is allowed to send that buffer to disk - */ - pjl = find_newer_jl_for_cn(cn) ; - if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) - { - BUG_ON (!can_dirty(cn)); - /* if the buffer is prepared, it will either be logged - * or restored. If restored, we need to make sure - * it actually gets marked dirty - */ - clear_buffer_journal_new (cn->bh); - if (buffer_journal_prepared (cn->bh)) { - set_buffer_journal_restore_dirty (cn->bh); - } else { - set_buffer_journal_test (cn->bh); - mark_buffer_dirty(cn->bh); - } - } - cn = cn->next ; - } - return ret ; + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock; + while (cn) { + /* look for a more recent transaction that logged this + ** buffer. Only the most recent transaction with a buffer in + ** it is allowed to send that buffer to disk + */ + pjl = find_newer_jl_for_cn(cn); + if (!pjl && cn->blocknr && cn->bh + && buffer_journal_dirty(cn->bh)) { + BUG_ON(!can_dirty(cn)); + /* if the buffer is prepared, it will either be logged + * or restored. 
If restored, we need to make sure + * it actually gets marked dirty + */ + clear_buffer_journal_new(cn->bh); + if (buffer_journal_prepared(cn->bh)) { + set_buffer_journal_restore_dirty(cn->bh); + } else { + set_buffer_journal_test(cn->bh); + mark_buffer_dirty(cn->bh); + } + } + cn = cn->next; + } + return ret; } static int kupdate_transactions(struct super_block *s, - struct reiserfs_journal_list *jl, - struct reiserfs_journal_list **next_jl, - unsigned long *next_trans_id, - int num_blocks, - int num_trans) { - int ret = 0; - int written = 0 ; - int transactions_flushed = 0; - unsigned long orig_trans_id = jl->j_trans_id; - struct buffer_chunk chunk; - struct list_head *entry; - struct reiserfs_journal *journal = SB_JOURNAL (s); - chunk.nr = 0; - - down(&journal->j_flush_sem); - if (!journal_list_still_alive(s, orig_trans_id)) { - goto done; - } - - /* we've got j_flush_sem held, nobody is going to delete any - * of these lists out from underneath us - */ - while((num_trans && transactions_flushed < num_trans) || - (!num_trans && written < num_blocks)) { - - if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || - atomic_read(&jl->j_commit_left) || !(jl->j_state & LIST_DIRTY)) - { - del_from_work_list(s, jl); - break; - } - ret = write_one_transaction(s, jl, &chunk); - - if (ret < 0) - goto done; - transactions_flushed++; - written += ret; - entry = jl->j_list.next; - - /* did we wrap? 
*/ - if (entry == &journal->j_journal_list) { - break; - } - jl = JOURNAL_LIST_ENTRY(entry); - - /* don't bother with older transactions */ - if (jl->j_trans_id <= orig_trans_id) - break; - } - if (chunk.nr) { - write_chunk(&chunk); - } - -done: - up(&journal->j_flush_sem); - return ret; + struct reiserfs_journal_list *jl, + struct reiserfs_journal_list **next_jl, + unsigned long *next_trans_id, + int num_blocks, int num_trans) +{ + int ret = 0; + int written = 0; + int transactions_flushed = 0; + unsigned long orig_trans_id = jl->j_trans_id; + struct buffer_chunk chunk; + struct list_head *entry; + struct reiserfs_journal *journal = SB_JOURNAL(s); + chunk.nr = 0; + + down(&journal->j_flush_sem); + if (!journal_list_still_alive(s, orig_trans_id)) { + goto done; + } + + /* we've got j_flush_sem held, nobody is going to delete any + * of these lists out from underneath us + */ + while ((num_trans && transactions_flushed < num_trans) || + (!num_trans && written < num_blocks)) { + + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || + atomic_read(&jl->j_commit_left) + || !(jl->j_state & LIST_DIRTY)) { + del_from_work_list(s, jl); + break; + } + ret = write_one_transaction(s, jl, &chunk); + + if (ret < 0) + goto done; + transactions_flushed++; + written += ret; + entry = jl->j_list.next; + + /* did we wrap? 
*/ + if (entry == &journal->j_journal_list) { + break; + } + jl = JOURNAL_LIST_ENTRY(entry); + + /* don't bother with older transactions */ + if (jl->j_trans_id <= orig_trans_id) + break; + } + if (chunk.nr) { + write_chunk(&chunk); + } + + done: + up(&journal->j_flush_sem); + return ret; } /* for o_sync and fsync heavy applications, they tend to use @@ -1607,47 +1720,48 @@ done: ** list updates the header block */ static int flush_used_journal_lists(struct super_block *s, - struct reiserfs_journal_list *jl) { - unsigned long len = 0; - unsigned long cur_len; - int ret; - int i; - int limit = 256; - struct reiserfs_journal_list *tjl; - struct reiserfs_journal_list *flush_jl; - unsigned long trans_id; - struct reiserfs_journal *journal = SB_JOURNAL (s); - - flush_jl = tjl = jl; - - /* in data logging mode, try harder to flush a lot of blocks */ - if (reiserfs_data_log(s)) - limit = 1024; - /* flush for 256 transactions or limit blocks, whichever comes first */ - for(i = 0 ; i < 256 && len < limit ; i++) { - if (atomic_read(&tjl->j_commit_left) || - tjl->j_trans_id < jl->j_trans_id) { - break; - } - cur_len = atomic_read(&tjl->j_nonzerolen); - if (cur_len > 0) { - tjl->j_state &= ~LIST_TOUCHED; - } - len += cur_len; - flush_jl = tjl; - if (tjl->j_list.next == &journal->j_journal_list) - break; - tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); - } - /* try to find a group of blocks we can flush across all the - ** transactions, but only bother if we've actually spanned - ** across multiple lists - */ - if (flush_jl != jl) { - ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); - } - flush_journal_list(s, flush_jl, 1); - return 0; + struct reiserfs_journal_list *jl) +{ + unsigned long len = 0; + unsigned long cur_len; + int ret; + int i; + int limit = 256; + struct reiserfs_journal_list *tjl; + struct reiserfs_journal_list *flush_jl; + unsigned long trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(s); + + flush_jl = tjl = jl; + + /* in data logging mode, 
try harder to flush a lot of blocks */ + if (reiserfs_data_log(s)) + limit = 1024; + /* flush for 256 transactions or limit blocks, whichever comes first */ + for (i = 0; i < 256 && len < limit; i++) { + if (atomic_read(&tjl->j_commit_left) || + tjl->j_trans_id < jl->j_trans_id) { + break; + } + cur_len = atomic_read(&tjl->j_nonzerolen); + if (cur_len > 0) { + tjl->j_state &= ~LIST_TOUCHED; + } + len += cur_len; + flush_jl = tjl; + if (tjl->j_list.next == &journal->j_journal_list) + break; + tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next); + } + /* try to find a group of blocks we can flush across all the + ** transactions, but only bother if we've actually spanned + ** across multiple lists + */ + if (flush_jl != jl) { + ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + } + flush_journal_list(s, flush_jl, 1); + return 0; } /* @@ -1655,207 +1769,248 @@ static int flush_used_journal_lists(struct super_block *s, ** only touchs the hnext and hprev pointers. */ void remove_journal_hash(struct super_block *sb, - struct reiserfs_journal_cnode **table, - struct reiserfs_journal_list *jl, - unsigned long block, int remove_freed) -{ - struct reiserfs_journal_cnode *cur ; - struct reiserfs_journal_cnode **head ; - - head= &(journal_hash(table, sb, block)) ; - if (!head) { - return ; - } - cur = *head ; - while(cur) { - if (cur->blocknr == block && cur->sb == sb && (jl == NULL || jl == cur->jlist) && - (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { - if (cur->hnext) { - cur->hnext->hprev = cur->hprev ; - } - if (cur->hprev) { - cur->hprev->hnext = cur->hnext ; - } else { - *head = cur->hnext ; - } - cur->blocknr = 0 ; - cur->sb = NULL ; - cur->state = 0 ; - if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ - atomic_dec(&(cur->jlist->j_nonzerolen)) ; - cur->bh = NULL ; - cur->jlist = NULL ; - } - cur = cur->hnext ; - } -} - -static void free_journal_ram(struct super_block *p_s_sb) { - struct reiserfs_journal *journal 
= SB_JOURNAL(p_s_sb); - reiserfs_kfree(journal->j_current_jl, - sizeof(struct reiserfs_journal_list), p_s_sb); - journal->j_num_lists--; - - vfree(journal->j_cnode_free_orig) ; - free_list_bitmaps(p_s_sb, journal->j_list_bitmap) ; - free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */ - if (journal->j_header_bh) { - brelse(journal->j_header_bh) ; - } - /* j_header_bh is on the journal dev, make sure not to release the journal - * dev until we brelse j_header_bh - */ - release_journal_dev(p_s_sb, journal); - vfree(journal) ; + struct reiserfs_journal_cnode **table, + struct reiserfs_journal_list *jl, + unsigned long block, int remove_freed) +{ + struct reiserfs_journal_cnode *cur; + struct reiserfs_journal_cnode **head; + + head = &(journal_hash(table, sb, block)); + if (!head) { + return; + } + cur = *head; + while (cur) { + if (cur->blocknr == block && cur->sb == sb + && (jl == NULL || jl == cur->jlist) + && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { + if (cur->hnext) { + cur->hnext->hprev = cur->hprev; + } + if (cur->hprev) { + cur->hprev->hnext = cur->hnext; + } else { + *head = cur->hnext; + } + cur->blocknr = 0; + cur->sb = NULL; + cur->state = 0; + if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ + atomic_dec(&(cur->jlist->j_nonzerolen)); + cur->bh = NULL; + cur->jlist = NULL; + } + cur = cur->hnext; + } +} + +static void free_journal_ram(struct super_block *p_s_sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + reiserfs_kfree(journal->j_current_jl, + sizeof(struct reiserfs_journal_list), p_s_sb); + journal->j_num_lists--; + + vfree(journal->j_cnode_free_orig); + free_list_bitmaps(p_s_sb, journal->j_list_bitmap); + free_bitmap_nodes(p_s_sb); /* must be after free_list_bitmaps */ + if (journal->j_header_bh) { + brelse(journal->j_header_bh); + } + /* j_header_bh is on the journal dev, make sure not to release the journal + * dev until we brelse j_header_bh + */ + 
release_journal_dev(p_s_sb, journal); + vfree(journal); } /* ** call on unmount. Only set error to 1 if you haven't made your way out ** of read_super() yet. Any other caller must keep error at 0. */ -static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) { - struct reiserfs_transaction_handle myth ; - int flushed = 0; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - - /* we only want to flush out transactions if we were called with error == 0 - */ - if (!error && !(p_s_sb->s_flags & MS_RDONLY)) { - /* end the current trans */ - BUG_ON (!th->t_trans_id); - do_journal_end(th, p_s_sb,10, FLUSH_ALL) ; - - /* make sure something gets logged to force our way into the flush code */ - if (!journal_join(&myth, p_s_sb, 1)) { - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&myth, p_s_sb,1, FLUSH_ALL) ; - flushed = 1; - } - } - - /* this also catches errors during the do_journal_end above */ - if (!error && reiserfs_is_journal_aborted(journal)) { - memset(&myth, 0, sizeof(myth)); - if (!journal_join_abort(&myth, p_s_sb, 1)) { - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL) ; - } - } - - reiserfs_mounted_fs_count-- ; - /* wait for all commits to finish */ - cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work); - flush_workqueue(commit_wq); - if (!reiserfs_mounted_fs_count) { - destroy_workqueue(commit_wq); - commit_wq = NULL; - } - - free_journal_ram(p_s_sb) ; - - return 0 ; +static int do_journal_release(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, int error) +{ + struct reiserfs_transaction_handle myth; + int flushed = 0; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + + /* we only want to flush out transactions if we were called with error == 0 
+ */ + if (!error && !(p_s_sb->s_flags & MS_RDONLY)) { + /* end the current trans */ + BUG_ON(!th->t_trans_id); + do_journal_end(th, p_s_sb, 10, FLUSH_ALL); + + /* make sure something gets logged to force our way into the flush code */ + if (!journal_join(&myth, p_s_sb, 1)) { + reiserfs_prepare_for_journal(p_s_sb, + SB_BUFFER_WITH_SB(p_s_sb), + 1); + journal_mark_dirty(&myth, p_s_sb, + SB_BUFFER_WITH_SB(p_s_sb)); + do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL); + flushed = 1; + } + } + + /* this also catches errors during the do_journal_end above */ + if (!error && reiserfs_is_journal_aborted(journal)) { + memset(&myth, 0, sizeof(myth)); + if (!journal_join_abort(&myth, p_s_sb, 1)) { + reiserfs_prepare_for_journal(p_s_sb, + SB_BUFFER_WITH_SB(p_s_sb), + 1); + journal_mark_dirty(&myth, p_s_sb, + SB_BUFFER_WITH_SB(p_s_sb)); + do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL); + } + } + + reiserfs_mounted_fs_count--; + /* wait for all commits to finish */ + cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work); + flush_workqueue(commit_wq); + if (!reiserfs_mounted_fs_count) { + destroy_workqueue(commit_wq); + commit_wq = NULL; + } + + free_journal_ram(p_s_sb); + + return 0; } /* ** call on unmount. flush all journal trans, release all alloc'd ram */ -int journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) { - return do_journal_release(th, p_s_sb, 0) ; +int journal_release(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb) +{ + return do_journal_release(th, p_s_sb, 0); } + /* ** only call from an error condition inside reiserfs_read_super! */ -int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) { - return do_journal_release(th, p_s_sb, 1) ; +int journal_release_error(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb) +{ + return do_journal_release(th, p_s_sb, 1); } /* compares description block with commit block. 
returns 1 if they differ, 0 if they are the same */ -static int journal_compare_desc_commit(struct super_block *p_s_sb, struct reiserfs_journal_desc *desc, - struct reiserfs_journal_commit *commit) { - if (get_commit_trans_id (commit) != get_desc_trans_id (desc) || - get_commit_trans_len (commit) != get_desc_trans_len (desc) || - get_commit_trans_len (commit) > SB_JOURNAL(p_s_sb)->j_trans_max || - get_commit_trans_len (commit) <= 0 - ) { - return 1 ; - } - return 0 ; +static int journal_compare_desc_commit(struct super_block *p_s_sb, + struct reiserfs_journal_desc *desc, + struct reiserfs_journal_commit *commit) +{ + if (get_commit_trans_id(commit) != get_desc_trans_id(desc) || + get_commit_trans_len(commit) != get_desc_trans_len(desc) || + get_commit_trans_len(commit) > SB_JOURNAL(p_s_sb)->j_trans_max || + get_commit_trans_len(commit) <= 0) { + return 1; + } + return 0; } + /* returns 0 if it did not find a description block ** returns -1 if it found a corrupt commit block ** returns 1 if both desc and commit were valid */ -static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, unsigned long *oldest_invalid_trans_id, unsigned long *newest_mount_id) { - struct reiserfs_journal_desc *desc ; - struct reiserfs_journal_commit *commit ; - struct buffer_head *c_bh ; - unsigned long offset ; - - if (!d_bh) - return 0 ; - - desc = (struct reiserfs_journal_desc *)d_bh->b_data ; - if (get_desc_trans_len(desc) > 0 && !memcmp(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8)) { - if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-986: transaction " - "is valid returning because trans_id %d is greater than " - "oldest_invalid %lu", get_desc_trans_id(desc), - *oldest_invalid_trans_id); - return 0 ; - } - if (newest_mount_id && *newest_mount_id > get_desc_mount_id (desc)) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 
"journal-1087: transaction " - "is valid returning because mount_id %d is less than " - "newest_mount_id %lu", get_desc_mount_id (desc), - *newest_mount_id) ; - return -1 ; - } - if ( get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max ) { - reiserfs_warning(p_s_sb, "journal-2018: Bad transaction length %d encountered, ignoring transaction", get_desc_trans_len(desc)); - return -1 ; - } - offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; - - /* ok, we have a journal description block, lets see if the transaction was valid */ - c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - ((offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; - if (!c_bh) - return 0 ; - commit = (struct reiserfs_journal_commit *)c_bh->b_data ; - if (journal_compare_desc_commit(p_s_sb, desc, commit)) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, - "journal_transaction_is_valid, commit offset %ld had bad " - "time %d or length %d", - c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - get_commit_trans_id (commit), - get_commit_trans_len(commit)); - brelse(c_bh) ; - if (oldest_invalid_trans_id) { - *oldest_invalid_trans_id = get_desc_trans_id(desc) ; - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1004: " - "transaction_is_valid setting oldest invalid trans_id " - "to %d", get_desc_trans_id(desc)) ; - } - return -1; - } - brelse(c_bh) ; - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid " - "transaction start offset %llu, len %d id %d", - d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - get_desc_trans_len(desc), get_desc_trans_id(desc)) ; - return 1 ; - } else { - return 0 ; - } -} - -static void brelse_array(struct buffer_head **heads, int num) { - int i ; - for (i = 0 ; i < num ; i++) { - brelse(heads[i]) ; - } +static int journal_transaction_is_valid(struct super_block *p_s_sb, + struct buffer_head *d_bh, + unsigned long *oldest_invalid_trans_id, + unsigned long *newest_mount_id) +{ + 
struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; + unsigned long offset; + + if (!d_bh) + return 0; + + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (get_desc_trans_len(desc) > 0 + && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) { + if (oldest_invalid_trans_id && *oldest_invalid_trans_id + && get_desc_trans_id(desc) > *oldest_invalid_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-986: transaction " + "is valid returning because trans_id %d is greater than " + "oldest_invalid %lu", + get_desc_trans_id(desc), + *oldest_invalid_trans_id); + return 0; + } + if (newest_mount_id + && *newest_mount_id > get_desc_mount_id(desc)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1087: transaction " + "is valid returning because mount_id %d is less than " + "newest_mount_id %lu", + get_desc_mount_id(desc), + *newest_mount_id); + return -1; + } + if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) { + reiserfs_warning(p_s_sb, + "journal-2018: Bad transaction length %d encountered, ignoring transaction", + get_desc_trans_len(desc)); + return -1; + } + offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); + + /* ok, we have a journal description block, lets see if the transaction was valid */ + c_bh = + journal_bread(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((offset + get_desc_trans_len(desc) + + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + if (!c_bh) + return 0; + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(p_s_sb, desc, commit)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal_transaction_is_valid, commit offset %ld had bad " + "time %d or length %d", + c_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + if (oldest_invalid_trans_id) { + *oldest_invalid_trans_id = + get_desc_trans_id(desc); + 
reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1004: " + "transaction_is_valid setting oldest invalid trans_id " + "to %d", + get_desc_trans_id(desc)); + } + return -1; + } + brelse(c_bh); + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1006: found valid " + "transaction start offset %llu, len %d id %d", + d_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_desc_trans_len(desc), + get_desc_trans_id(desc)); + return 1; + } else { + return 0; + } +} + +static void brelse_array(struct buffer_head **heads, int num) +{ + int i; + for (i = 0; i < num; i++) { + brelse(heads[i]); + } } /* @@ -1863,149 +2018,202 @@ static void brelse_array(struct buffer_head **heads, int num) { ** this either reads in a replays a transaction, or returns because the transaction ** is invalid, or too old. */ -static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start, - unsigned long oldest_trans_id, unsigned long newest_mount_id) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_desc *desc ; - struct reiserfs_journal_commit *commit ; - unsigned long trans_id = 0 ; - struct buffer_head *c_bh ; - struct buffer_head *d_bh ; - struct buffer_head **log_blocks = NULL ; - struct buffer_head **real_blocks = NULL ; - unsigned long trans_offset ; - int i; - int trans_half; - - d_bh = journal_bread(p_s_sb, cur_dblock) ; - if (!d_bh) - return 1 ; - desc = (struct reiserfs_journal_desc *)d_bh->b_data ; - trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " - "journal_read_transaction, offset %llu, len %d mount_id %d", - d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - get_desc_trans_len(desc), get_desc_mount_id(desc)) ; - if (get_desc_trans_id(desc) < oldest_trans_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: " - "journal_read_trans skipping because %lu is too old", - 
cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ; - brelse(d_bh) ; - return 1 ; - } - if (get_desc_mount_id(desc) != newest_mount_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: " - "journal_read_trans skipping because %d is != " - "newest_mount_id %lu", get_desc_mount_id(desc), - newest_mount_id) ; - brelse(d_bh) ; - return 1 ; - } - c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - ((trans_offset + get_desc_trans_len(desc) + 1) % - SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; - if (!c_bh) { - brelse(d_bh) ; - return 1 ; - } - commit = (struct reiserfs_journal_commit *)c_bh->b_data ; - if (journal_compare_desc_commit(p_s_sb, desc, commit)) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, " - "commit offset %llu had bad time %d or length %d", - c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - get_commit_trans_id(commit), get_commit_trans_len(commit)); - brelse(c_bh) ; - brelse(d_bh) ; - return 1; - } - trans_id = get_desc_trans_id(desc) ; - /* now we know we've got a good transaction, and it was inside the valid time ranges */ - log_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ; - real_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ; - if (!log_blocks || !real_blocks) { - brelse(c_bh) ; - brelse(d_bh) ; - reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS") ; - return -1 ; - } - /* get all the buffer heads */ - trans_half = journal_trans_half (p_s_sb->s_blocksize) ; - for(i = 0 ; i < get_desc_trans_len(desc) ; i++) { - log_blocks[i] = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + (trans_offset + 1 + i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); - if (i < trans_half) { - 
real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(desc->j_realblock[i])) ; - } else { - real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(commit->j_realblock[i - trans_half])) ; - } - if ( real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb) ) { - reiserfs_warning(p_s_sb, "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem"); - goto abort_replay; - } - /* make sure we don't try to replay onto log or reserved area */ - if (is_block_in_log_or_reserved_area(p_s_sb, real_blocks[i]->b_blocknr)) { - reiserfs_warning(p_s_sb, "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block") ; -abort_replay: - brelse_array(log_blocks, i) ; - brelse_array(real_blocks, i) ; - brelse(c_bh) ; - brelse(d_bh) ; - reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - return -1 ; - } - } - /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(READ, get_desc_trans_len(desc), log_blocks) ; - for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { - wait_on_buffer(log_blocks[i]) ; - if (!buffer_uptodate(log_blocks[i])) { - reiserfs_warning(p_s_sb, "journal-1212: REPLAY FAILURE fsck required! 
buffer write failed") ; - brelse_array(log_blocks + i, get_desc_trans_len(desc) - i) ; - brelse_array(real_blocks, get_desc_trans_len(desc)) ; - brelse(c_bh) ; - brelse(d_bh) ; - reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - return -1 ; - } - memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size) ; - set_buffer_uptodate(real_blocks[i]) ; - brelse(log_blocks[i]) ; - } - /* flush out the real blocks */ - for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { - set_buffer_dirty(real_blocks[i]) ; - ll_rw_block(WRITE, 1, real_blocks + i) ; - } - for (i = 0 ; i < get_desc_trans_len(desc) ; i++) { - wait_on_buffer(real_blocks[i]) ; - if (!buffer_uptodate(real_blocks[i])) { - reiserfs_warning(p_s_sb, "journal-1226: REPLAY FAILURE, fsck required! buffer write failed") ; - brelse_array(real_blocks + i, get_desc_trans_len(desc) - i) ; - brelse(c_bh) ; - brelse(d_bh) ; - reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ; - return -1 ; - } - brelse(real_blocks[i]) ; - } - cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((trans_offset + get_desc_trans_len(desc) + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ; - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal " - "start to offset %ld", - cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ; - - /* init starting values for the first transaction, in case this is the last transaction to be replayed. 
*/ - journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; - journal->j_last_flush_trans_id = trans_id ; - journal->j_trans_id = trans_id + 1; - brelse(c_bh) ; - brelse(d_bh) ; - reiserfs_kfree(log_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ; - reiserfs_kfree(real_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ; - return 0 ; +static int journal_read_transaction(struct super_block *p_s_sb, + unsigned long cur_dblock, + unsigned long oldest_start, + unsigned long oldest_trans_id, + unsigned long newest_mount_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + unsigned long trans_id = 0; + struct buffer_head *c_bh; + struct buffer_head *d_bh; + struct buffer_head **log_blocks = NULL; + struct buffer_head **real_blocks = NULL; + unsigned long trans_offset; + int i; + int trans_half; + + d_bh = journal_bread(p_s_sb, cur_dblock); + if (!d_bh) + return 1; + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " + "journal_read_transaction, offset %llu, len %d mount_id %d", + d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_desc_trans_len(desc), get_desc_mount_id(desc)); + if (get_desc_trans_id(desc) < oldest_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: " + "journal_read_trans skipping because %lu is too old", + cur_dblock - + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)); + brelse(d_bh); + return 1; + } + if (get_desc_mount_id(desc) != newest_mount_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: " + "journal_read_trans skipping because %d is != " + "newest_mount_id %lu", get_desc_mount_id(desc), + newest_mount_id); + brelse(d_bh); + return 1; + } + c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + 
((trans_offset + get_desc_trans_len(desc) + 1) % + SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + if (!c_bh) { + brelse(d_bh); + return 1; + } + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + if (journal_compare_desc_commit(p_s_sb, desc, commit)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal_read_transaction, " + "commit offset %llu had bad time %d or length %d", + c_bh->b_blocknr - + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + get_commit_trans_id(commit), + get_commit_trans_len(commit)); + brelse(c_bh); + brelse(d_bh); + return 1; + } + trans_id = get_desc_trans_id(desc); + /* now we know we've got a good transaction, and it was inside the valid time ranges */ + log_blocks = + reiserfs_kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); + real_blocks = + reiserfs_kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); + if (!log_blocks || !real_blocks) { + brelse(c_bh); + brelse(d_bh); + reiserfs_kfree(log_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + reiserfs_kfree(real_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + reiserfs_warning(p_s_sb, + "journal-1169: kmalloc failed, unable to mount FS"); + return -1; + } + /* get all the buffer heads */ + trans_half = journal_trans_half(p_s_sb->s_blocksize); + for (i = 0; i < get_desc_trans_len(desc); i++) { + log_blocks[i] = + journal_getblk(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + (trans_offset + 1 + + i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + if (i < trans_half) { + real_blocks[i] = + sb_getblk(p_s_sb, + le32_to_cpu(desc->j_realblock[i])); + } else { + real_blocks[i] = + sb_getblk(p_s_sb, + le32_to_cpu(commit-> + j_realblock[i - trans_half])); + } + if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { + reiserfs_warning(p_s_sb, + "journal-1207: REPLAY FAILURE fsck required! 
Block to replay is outside of filesystem"); + goto abort_replay; + } + /* make sure we don't try to replay onto log or reserved area */ + if (is_block_in_log_or_reserved_area + (p_s_sb, real_blocks[i]->b_blocknr)) { + reiserfs_warning(p_s_sb, + "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block"); + abort_replay: + brelse_array(log_blocks, i); + brelse_array(real_blocks, i); + brelse(c_bh); + brelse(d_bh); + reiserfs_kfree(log_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + reiserfs_kfree(real_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + return -1; + } + } + /* read in the log blocks, memcpy to the corresponding real block */ + ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); + for (i = 0; i < get_desc_trans_len(desc); i++) { + wait_on_buffer(log_blocks[i]); + if (!buffer_uptodate(log_blocks[i])) { + reiserfs_warning(p_s_sb, + "journal-1212: REPLAY FAILURE fsck required! buffer write failed"); + brelse_array(log_blocks + i, + get_desc_trans_len(desc) - i); + brelse_array(real_blocks, get_desc_trans_len(desc)); + brelse(c_bh); + brelse(d_bh); + reiserfs_kfree(log_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + reiserfs_kfree(real_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + return -1; + } + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, + real_blocks[i]->b_size); + set_buffer_uptodate(real_blocks[i]); + brelse(log_blocks[i]); + } + /* flush out the real blocks */ + for (i = 0; i < get_desc_trans_len(desc); i++) { + set_buffer_dirty(real_blocks[i]); + ll_rw_block(WRITE, 1, real_blocks + i); + } + for (i = 0; i < get_desc_trans_len(desc); i++) { + wait_on_buffer(real_blocks[i]); + if (!buffer_uptodate(real_blocks[i])) { + reiserfs_warning(p_s_sb, + "journal-1226: REPLAY FAILURE, fsck required! 
buffer write failed"); + brelse_array(real_blocks + i, + get_desc_trans_len(desc) - i); + brelse(c_bh); + brelse(d_bh); + reiserfs_kfree(log_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + reiserfs_kfree(real_blocks, + get_desc_trans_len(desc) * + sizeof(struct buffer_head *), p_s_sb); + return -1; + } + brelse(real_blocks[i]); + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((trans_offset + get_desc_trans_len(desc) + + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1095: setting journal " "start to offset %ld", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)); + + /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ + journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); + journal->j_last_flush_trans_id = trans_id; + journal->j_trans_id = trans_id + 1; + brelse(c_bh); + brelse(d_bh); + reiserfs_kfree(log_blocks, + le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), + p_s_sb); + reiserfs_kfree(real_blocks, + le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), + p_s_sb); + return 0; } /* This function reads blocks starting from block and to max_block of bufsize @@ -2014,39 +2222,39 @@ abort_replay: Right now it is only used from journal code. But later we might use it from other places. Note: Do not use journal_getblk/sb_getblk functions here! 
*/ -static struct buffer_head * reiserfs_breada (struct block_device *dev, int block, int bufsize, - unsigned int max_block) +static struct buffer_head *reiserfs_breada(struct block_device *dev, int block, + int bufsize, unsigned int max_block) { - struct buffer_head * bhlist[BUFNR]; + struct buffer_head *bhlist[BUFNR]; unsigned int blocks = BUFNR; - struct buffer_head * bh; + struct buffer_head *bh; int i, j; - - bh = __getblk (dev, block, bufsize ); - if (buffer_uptodate (bh)) - return (bh); - + + bh = __getblk(dev, block, bufsize); + if (buffer_uptodate(bh)) + return (bh); + if (block + BUFNR > max_block) { blocks = max_block - block; } bhlist[0] = bh; j = 1; for (i = 1; i < blocks; i++) { - bh = __getblk (dev, block + i, bufsize); - if (buffer_uptodate (bh)) { - brelse (bh); + bh = __getblk(dev, block + i, bufsize); + if (buffer_uptodate(bh)) { + brelse(bh); break; - } - else bhlist[j++] = bh; + } else + bhlist[j++] = bh; } - ll_rw_block (READ, j, bhlist); - for(i = 1; i < j; i++) - brelse (bhlist[i]); + ll_rw_block(READ, j, bhlist); + for (i = 1; i < j; i++) + brelse(bhlist[i]); bh = bhlist[0]; - wait_on_buffer (bh); - if (buffer_uptodate (bh)) + wait_on_buffer(bh); + if (buffer_uptodate(bh)) return bh; - brelse (bh); + brelse(bh); return NULL; } @@ -2059,218 +2267,250 @@ static struct buffer_head * reiserfs_breada (struct block_device *dev, int block ** ** On exit, it sets things up so the first transaction will work correctly. 
*/ -static int journal_read(struct super_block *p_s_sb) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_desc *desc ; - unsigned long oldest_trans_id = 0; - unsigned long oldest_invalid_trans_id = 0 ; - time_t start ; - unsigned long oldest_start = 0; - unsigned long cur_dblock = 0 ; - unsigned long newest_mount_id = 9 ; - struct buffer_head *d_bh ; - struct reiserfs_journal_header *jh ; - int valid_journal_header = 0 ; - int replay_count = 0 ; - int continue_replay = 1 ; - int ret ; - char b[BDEVNAME_SIZE]; - - cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ; - reiserfs_info (p_s_sb, "checking transaction log (%s)\n", - bdevname(journal->j_dev_bd, b)); - start = get_seconds(); - - /* step 1, read in the journal header block. Check the transaction it says - ** is the first unflushed, and if that transaction is not valid, - ** replay is done - */ - journal->j_header_bh = journal_bread(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - SB_ONDISK_JOURNAL_SIZE(p_s_sb)); - if (!journal->j_header_bh) { - return 1 ; - } - jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ; - if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && - le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) && - le32_to_cpu(jh->j_last_flush_trans_id) > 0) { - oldest_start = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - le32_to_cpu(jh->j_first_unflushed_offset) ; - oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; - newest_mount_id = le32_to_cpu(jh->j_mount_id); - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1153: found in " - "header: first_unflushed_offset %d, last_flushed_trans_id " - "%lu", le32_to_cpu(jh->j_first_unflushed_offset), - le32_to_cpu(jh->j_last_flush_trans_id)) ; - valid_journal_header = 1 ; - - /* now, we try to read the first unflushed offset. If it is not valid, - ** there is nothing more we can do, and it makes no sense to read - ** through the whole log. 
- */ - d_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + le32_to_cpu(jh->j_first_unflushed_offset)) ; - ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL) ; - if (!ret) { - continue_replay = 0 ; - } - brelse(d_bh) ; - goto start_log_replay; - } - - if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { - reiserfs_warning (p_s_sb, - "clm-2076: device is readonly, unable to replay log") ; - return -1 ; - } - - /* ok, there are transactions that need to be replayed. start with the first log block, find - ** all the valid transactions, and pick out the oldest. - */ - while(continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { - /* Note that it is required for blocksize of primary fs device and journal - device to be the same */ - d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, p_s_sb->s_blocksize, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ; - ret = journal_transaction_is_valid(p_s_sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id) ; - if (ret == 1) { - desc = (struct reiserfs_journal_desc *)d_bh->b_data ; - if (oldest_start == 0) { /* init all oldest_ values */ - oldest_trans_id = get_desc_trans_id(desc) ; - oldest_start = d_bh->b_blocknr ; - newest_mount_id = get_desc_mount_id(desc) ; - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " - "oldest_start to offset %llu, trans_id %lu", - oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - oldest_trans_id) ; - } else if (oldest_trans_id > get_desc_trans_id(desc)) { - /* one we just read was older */ - oldest_trans_id = get_desc_trans_id(desc) ; - oldest_start = d_bh->b_blocknr ; - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting " - "oldest_start to offset %lu, trans_id %lu", - oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - oldest_trans_id) ; - } - if (newest_mount_id < get_desc_mount_id(desc)) { - newest_mount_id = get_desc_mount_id(desc) ; +static int 
journal_read(struct super_block *p_s_sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_desc *desc; + unsigned long oldest_trans_id = 0; + unsigned long oldest_invalid_trans_id = 0; + time_t start; + unsigned long oldest_start = 0; + unsigned long cur_dblock = 0; + unsigned long newest_mount_id = 9; + struct buffer_head *d_bh; + struct reiserfs_journal_header *jh; + int valid_journal_header = 0; + int replay_count = 0; + int continue_replay = 1; + int ret; + char b[BDEVNAME_SIZE]; + + cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb); + reiserfs_info(p_s_sb, "checking transaction log (%s)\n", + bdevname(journal->j_dev_bd, b)); + start = get_seconds(); + + /* step 1, read in the journal header block. Check the transaction it says + ** is the first unflushed, and if that transaction is not valid, + ** replay is done + */ + journal->j_header_bh = journal_bread(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + if (!journal->j_header_bh) { + return 1; + } + jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); + if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && + le32_to_cpu(jh->j_first_unflushed_offset) < + SB_ONDISK_JOURNAL_SIZE(p_s_sb) + && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { + oldest_start = + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + le32_to_cpu(jh->j_first_unflushed_offset); + oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; + newest_mount_id = le32_to_cpu(jh->j_mount_id); + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1153: found in " + "header: first_unflushed_offset %d, last_flushed_trans_id " + "%lu", le32_to_cpu(jh->j_first_unflushed_offset), + le32_to_cpu(jh->j_last_flush_trans_id)); + valid_journal_header = 1; + + /* now, we try to read the first unflushed offset. If it is not valid, + ** there is nothing more we can do, and it makes no sense to read + ** through the whole log. 
+ */ + d_bh = + journal_bread(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + le32_to_cpu(jh->j_first_unflushed_offset)); + ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL); + if (!ret) { + continue_replay = 0; + } + brelse(d_bh); + goto start_log_replay; + } + + if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) { + reiserfs_warning(p_s_sb, + "clm-2076: device is readonly, unable to replay log"); + return -1; + } + + /* ok, there are transactions that need to be replayed. start with the first log block, find + ** all the valid transactions, and pick out the oldest. + */ + while (continue_replay + && cur_dblock < + (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) { + /* Note that it is required for blocksize of primary fs device and journal + device to be the same */ + d_bh = + reiserfs_breada(journal->j_dev_bd, cur_dblock, + p_s_sb->s_blocksize, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + ret = + journal_transaction_is_valid(p_s_sb, d_bh, + &oldest_invalid_trans_id, + &newest_mount_id); + if (ret == 1) { + desc = (struct reiserfs_journal_desc *)d_bh->b_data; + if (oldest_start == 0) { /* init all oldest_ values */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + newest_mount_id = get_desc_mount_id(desc); + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1179: Setting " + "oldest_start to offset %llu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (p_s_sb), oldest_trans_id); + } else if (oldest_trans_id > get_desc_trans_id(desc)) { + /* one we just read was older */ + oldest_trans_id = get_desc_trans_id(desc); + oldest_start = d_bh->b_blocknr; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1180: Resetting " + "oldest_start to offset %lu, trans_id %lu", + oldest_start - + SB_ONDISK_JOURNAL_1st_BLOCK + (p_s_sb), oldest_trans_id); + } + if (newest_mount_id < get_desc_mount_id(desc)) { + newest_mount_id = get_desc_mount_id(desc); 
+ reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1299: Setting " + "newest_mount_id to %d", + get_desc_mount_id(desc)); + } + cur_dblock += get_desc_trans_len(desc) + 2; + } else { + cur_dblock++; + } + brelse(d_bh); + } + + start_log_replay: + cur_dblock = oldest_start; + if (oldest_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1206: Starting replay " + "from offset %llu, trans_id %lu", + cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + oldest_trans_id); + + } + replay_count = 0; + while (continue_replay && oldest_trans_id > 0) { + ret = + journal_read_transaction(p_s_sb, cur_dblock, oldest_start, + oldest_trans_id, newest_mount_id); + if (ret < 0) { + return ret; + } else if (ret != 0) { + break; + } + cur_dblock = + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start; + replay_count++; + if (cur_dblock == oldest_start) + break; + } + + if (oldest_trans_id == 0) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal-1225: No valid " "transactions found"); + } + /* j_start does not get set correctly if we don't replay any transactions. 
+ ** if we had a valid journal_header, set j_start to the first unflushed transaction value, + ** copy the trans_id from the header + */ + if (valid_journal_header && replay_count == 0) { + journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset); + journal->j_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id) + 1; + journal->j_last_flush_trans_id = + le32_to_cpu(jh->j_last_flush_trans_id); + journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; + } else { + journal->j_mount_id = newest_mount_id + 1; + } reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " - "newest_mount_id to %d", get_desc_mount_id(desc)); - } - cur_dblock += get_desc_trans_len(desc) + 2 ; - } else { - cur_dblock++ ; - } - brelse(d_bh) ; - } - -start_log_replay: - cur_dblock = oldest_start ; - if (oldest_trans_id) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " - "from offset %llu, trans_id %lu", - cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - oldest_trans_id) ; - - } - replay_count = 0 ; - while(continue_replay && oldest_trans_id > 0) { - ret = journal_read_transaction(p_s_sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id) ; - if (ret < 0) { - return ret ; - } else if (ret != 0) { - break ; - } - cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start ; - replay_count++ ; - if (cur_dblock == oldest_start) - break; - } - - if (oldest_trans_id == 0) { - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " - "transactions found") ; - } - /* j_start does not get set correctly if we don't replay any transactions. 
- ** if we had a valid journal_header, set j_start to the first unflushed transaction value, - ** copy the trans_id from the header - */ - if (valid_journal_header && replay_count == 0) { - journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset) ; - journal->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; - journal->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ; - journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; - } else { - journal->j_mount_id = newest_mount_id + 1 ; - } - reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " - "newest_mount_id to %lu", journal->j_mount_id) ; - journal->j_first_unflushed_offset = journal->j_start ; - if (replay_count > 0) { - reiserfs_info (p_s_sb, "replayed %d transactions in %lu seconds\n", - replay_count, get_seconds() - start) ; - } - if (!bdev_read_only(p_s_sb->s_bdev) && - _update_journal_header_block(p_s_sb, journal->j_start, - journal->j_last_flush_trans_id)) - { - /* replay failed, caller must call free_journal_ram and abort - ** the mount - */ - return -1 ; - } - return 0 ; + "newest_mount_id to %lu", journal->j_mount_id); + journal->j_first_unflushed_offset = journal->j_start; + if (replay_count > 0) { + reiserfs_info(p_s_sb, + "replayed %d transactions in %lu seconds\n", + replay_count, get_seconds() - start); + } + if (!bdev_read_only(p_s_sb->s_bdev) && + _update_journal_header_block(p_s_sb, journal->j_start, + journal->j_last_flush_trans_id)) { + /* replay failed, caller must call free_journal_ram and abort + ** the mount + */ + return -1; + } + return 0; } static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { - struct reiserfs_journal_list *jl; -retry: - jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s); - if (!jl) { - yield(); - goto retry; - } - memset(jl, 0, sizeof(*jl)); - INIT_LIST_HEAD(&jl->j_list); - INIT_LIST_HEAD(&jl->j_working_list); - INIT_LIST_HEAD(&jl->j_tail_bh_list); - INIT_LIST_HEAD(&jl->j_bh_list); 
- sema_init(&jl->j_commit_lock, 1); - SB_JOURNAL(s)->j_num_lists++; - get_journal_list(jl); - return jl; -} - -static void journal_list_init(struct super_block *p_s_sb) { - SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); -} - -static int release_journal_dev( struct super_block *super, - struct reiserfs_journal *journal ) -{ - int result; - - result = 0; - - if( journal -> j_dev_file != NULL ) { - result = filp_close( journal -> j_dev_file, NULL ); - journal -> j_dev_file = NULL; - journal -> j_dev_bd = NULL; - } else if( journal -> j_dev_bd != NULL ) { - result = blkdev_put( journal -> j_dev_bd ); - journal -> j_dev_bd = NULL; - } - - if( result != 0 ) { - reiserfs_warning(super, "sh-457: release_journal_dev: Cannot release journal device: %i", result ); - } - return result; -} - -static int journal_init_dev( struct super_block *super, - struct reiserfs_journal *journal, - const char *jdev_name ) + struct reiserfs_journal_list *jl; + retry: + jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, + s); + if (!jl) { + yield(); + goto retry; + } + memset(jl, 0, sizeof(*jl)); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); + INIT_LIST_HEAD(&jl->j_tail_bh_list); + INIT_LIST_HEAD(&jl->j_bh_list); + sema_init(&jl->j_commit_lock, 1); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; +} + +static void journal_list_init(struct super_block *p_s_sb) +{ + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); +} + +static int release_journal_dev(struct super_block *super, + struct reiserfs_journal *journal) +{ + int result; + + result = 0; + + if (journal->j_dev_file != NULL) { + result = filp_close(journal->j_dev_file, NULL); + journal->j_dev_file = NULL; + journal->j_dev_bd = NULL; + } else if (journal->j_dev_bd != NULL) { + result = blkdev_put(journal->j_dev_bd); + journal->j_dev_bd = NULL; + } + + if (result != 0) { + reiserfs_warning(super, + "sh-457: release_journal_dev: Cannot release journal 
device: %i", + result); + } + return result; +} + +static int journal_init_dev(struct super_block *super, + struct reiserfs_journal *journal, + const char *jdev_name) { int result; dev_t jdev; @@ -2279,50 +2519,51 @@ static int journal_init_dev( struct super_block *super, result = 0; - journal -> j_dev_bd = NULL; - journal -> j_dev_file = NULL; - jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ? - new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; + journal->j_dev_bd = NULL; + journal->j_dev_file = NULL; + jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? + new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; if (bdev_read_only(super->s_bdev)) - blkdev_mode = FMODE_READ; + blkdev_mode = FMODE_READ; /* there is no "jdev" option and journal is on separate device */ - if( ( !jdev_name || !jdev_name[ 0 ] ) ) { + if ((!jdev_name || !jdev_name[0])) { journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; - reiserfs_warning (super, "sh-458: journal_init_dev: " - "cannot init journal device '%s': %i", - __bdevname(jdev, b), result ); + reiserfs_warning(super, "sh-458: journal_init_dev: " + "cannot init journal device '%s': %i", + __bdevname(jdev, b), result); return result; } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); return 0; } - journal -> j_dev_file = filp_open( jdev_name, 0, 0 ); - if( !IS_ERR( journal -> j_dev_file ) ) { + journal->j_dev_file = filp_open(jdev_name, 0, 0); + if (!IS_ERR(journal->j_dev_file)) { struct inode *jdev_inode = journal->j_dev_file->f_mapping->host; - if( !S_ISBLK( jdev_inode -> i_mode ) ) { + if (!S_ISBLK(jdev_inode->i_mode)) { reiserfs_warning(super, "journal_init_dev: '%s' is " - "not a block device", jdev_name ); + "not a block device", jdev_name); result = -ENOTBLK; - release_journal_dev( super, journal ); - } else { + release_journal_dev(super, journal); + } else { /* ok */ journal->j_dev_bd = 
I_BDEV(jdev_inode); set_blocksize(journal->j_dev_bd, super->s_blocksize); - reiserfs_info(super, "journal_init_dev: journal device: %s\n", + reiserfs_info(super, + "journal_init_dev: journal device: %s\n", bdevname(journal->j_dev_bd, b)); } } else { - result = PTR_ERR( journal -> j_dev_file ); - journal -> j_dev_file = NULL; - reiserfs_warning (super, - "journal_init_dev: Cannot open '%s': %i", - jdev_name, result ); + result = PTR_ERR(journal->j_dev_file); + journal->j_dev_file = NULL; + reiserfs_warning(super, + "journal_init_dev: Cannot open '%s': %i", + jdev_name, result); } return result; } @@ -2330,193 +2571,214 @@ static int journal_init_dev( struct super_block *super, /* ** must be called once on fs mount. calls journal_read for you */ -int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_format, unsigned int commit_max_age) { - int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2 ; - struct buffer_head *bhjh; - struct reiserfs_super_block * rs; - struct reiserfs_journal_header *jh; - struct reiserfs_journal *journal; - struct reiserfs_journal_list *jl; - char b[BDEVNAME_SIZE]; - - journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ; - if (!journal) { - reiserfs_warning (p_s_sb, "journal-1256: unable to get memory for journal structure") ; - return 1 ; - } - memset(journal, 0, sizeof(struct reiserfs_journal)) ; - INIT_LIST_HEAD(&journal->j_bitmap_nodes) ; - INIT_LIST_HEAD (&journal->j_prealloc_list); - INIT_LIST_HEAD(&journal->j_working_list); - INIT_LIST_HEAD(&journal->j_journal_list); - journal->j_persistent_trans = 0; - if (reiserfs_allocate_list_bitmaps(p_s_sb, - journal->j_list_bitmap, - SB_BMAP_NR(p_s_sb))) - goto free_and_return ; - allocate_bitmap_nodes(p_s_sb) ; - - /* reserved for journal area support */ - SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? 
- REISERFS_OLD_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + - SB_BMAP_NR(p_s_sb) + 1 : - REISERFS_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + 2); - - /* Sanity check to see is the standard journal fitting withing first bitmap - (actual for small blocksizes) */ - if ( !SB_ONDISK_JOURNAL_DEVICE( p_s_sb ) && - (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8) ) { - reiserfs_warning (p_s_sb, "journal-1393: journal does not fit for area " - "addressed by first of bitmap blocks. It starts at " - "%u and its size is %u. Block size %ld", - SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), - SB_ONDISK_JOURNAL_SIZE(p_s_sb), p_s_sb->s_blocksize); - goto free_and_return; - } - - if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) { - reiserfs_warning (p_s_sb, "sh-462: unable to initialize jornal device"); - goto free_and_return; - } - - rs = SB_DISK_SUPER_BLOCK(p_s_sb); - - /* read journal header */ - bhjh = journal_bread(p_s_sb, - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); - if (!bhjh) { - reiserfs_warning (p_s_sb, "sh-459: unable to read journal header"); - goto free_and_return; - } - jh = (struct reiserfs_journal_header *)(bhjh->b_data); - - /* make sure that journal matches to the super block */ - if (is_reiserfs_jr(rs) && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) { - reiserfs_warning (p_s_sb, "sh-460: journal header magic %x " - "(device %s) does not match to magic found in super " - "block %x", - jh->jh_journal.jp_journal_magic, - bdevname( journal->j_dev_bd, b), - sb_jp_journal_magic(rs)); - brelse (bhjh); - goto free_and_return; - } - - journal->j_trans_max = le32_to_cpu (jh->jh_journal.jp_journal_trans_max); - journal->j_max_batch = le32_to_cpu (jh->jh_journal.jp_journal_max_batch); - journal->j_max_commit_age = le32_to_cpu (jh->jh_journal.jp_journal_max_commit_age); - journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; - - if (journal->j_trans_max) { - /* make 
sure these parameters are available, assign it if they are not */ - __u32 initial = journal->j_trans_max; - __u32 ratio = 1; - - if (p_s_sb->s_blocksize < 4096) - ratio = 4096 / p_s_sb->s_blocksize; - - if (SB_ONDISK_JOURNAL_SIZE(p_s_sb)/journal->j_trans_max < JOURNAL_MIN_RATIO) - journal->j_trans_max = SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO; - if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio) - journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT / ratio; - if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio) - journal->j_trans_max = JOURNAL_TRANS_MIN_DEFAULT / ratio; - - if (journal->j_trans_max != initial) - reiserfs_warning (p_s_sb, "sh-461: journal_init: wrong transaction max size (%u). Changed to %u", - initial, journal->j_trans_max); - - journal->j_max_batch = journal->j_trans_max* - JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT; - } - - if (!journal->j_trans_max) { - /*we have the file system was created by old version of mkreiserfs - so this field contains zero value */ - journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT ; - journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT ; - journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE ; - - /* for blocksize >= 4096 - max transaction size is 1024. 
For block size < 4096 - trans max size is decreased proportionally */ - if (p_s_sb->s_blocksize < 4096) { - journal->j_trans_max /= (4096 / p_s_sb->s_blocksize) ; - journal->j_max_batch = (journal->j_trans_max) * 9 / 10 ; - } - } - - journal->j_default_max_commit_age = journal->j_max_commit_age; - - if (commit_max_age != 0) { - journal->j_max_commit_age = commit_max_age; - journal->j_max_trans_age = commit_max_age; - } - - reiserfs_info (p_s_sb, "journal params: device %s, size %u, " - "journal first block %u, max trans len %u, max batch %u, " - "max commit age %u, max trans age %u\n", - bdevname( journal->j_dev_bd, b), - SB_ONDISK_JOURNAL_SIZE(p_s_sb), - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), - journal->j_trans_max, - journal->j_max_batch, - journal->j_max_commit_age, - journal->j_max_trans_age); - - brelse (bhjh); - - journal->j_list_bitmap_index = 0 ; - journal_list_init(p_s_sb) ; - - memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; - - INIT_LIST_HEAD(&journal->j_dirty_buffers) ; - spin_lock_init(&journal->j_dirty_buffers_lock) ; - - journal->j_start = 0 ; - journal->j_len = 0 ; - journal->j_len_alloc = 0 ; - atomic_set(&(journal->j_wcount), 0) ; - atomic_set(&(journal->j_async_throttle), 0) ; - journal->j_bcount = 0 ; - journal->j_trans_start_time = 0 ; - journal->j_last = NULL ; - journal->j_first = NULL ; - init_waitqueue_head(&(journal->j_join_wait)) ; - sema_init(&journal->j_lock, 1); - sema_init(&journal->j_flush_sem, 1); - - journal->j_trans_id = 10 ; - journal->j_mount_id = 10 ; - journal->j_state = 0 ; - atomic_set(&(journal->j_jlock), 0) ; - journal->j_cnode_free_list = allocate_cnodes(num_cnodes) ; - journal->j_cnode_free_orig = journal->j_cnode_free_list ; - journal->j_cnode_free = journal->j_cnode_free_list ? 
num_cnodes : 0 ; - journal->j_cnode_used = 0 ; - journal->j_must_wait = 0 ; - - init_journal_hash(p_s_sb) ; - jl = journal->j_current_jl; - jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); - if (!jl->j_list_bitmap) { - reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0") ; - goto free_and_return; - } - if (journal_read(p_s_sb) < 0) { - reiserfs_warning(p_s_sb, "Replay Failure, unable to mount") ; - goto free_and_return; - } - - reiserfs_mounted_fs_count++ ; - if (reiserfs_mounted_fs_count <= 1) - commit_wq = create_workqueue("reiserfs"); - - INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); - return 0 ; -free_and_return: - free_journal_ram(p_s_sb); - return 1; +int journal_init(struct super_block *p_s_sb, const char *j_dev_name, + int old_format, unsigned int commit_max_age) +{ + int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2; + struct buffer_head *bhjh; + struct reiserfs_super_block *rs; + struct reiserfs_journal_header *jh; + struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; + char b[BDEVNAME_SIZE]; + + journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal)); + if (!journal) { + reiserfs_warning(p_s_sb, + "journal-1256: unable to get memory for journal structure"); + return 1; + } + memset(journal, 0, sizeof(struct reiserfs_journal)); + INIT_LIST_HEAD(&journal->j_bitmap_nodes); + INIT_LIST_HEAD(&journal->j_prealloc_list); + INIT_LIST_HEAD(&journal->j_working_list); + INIT_LIST_HEAD(&journal->j_journal_list); + journal->j_persistent_trans = 0; + if (reiserfs_allocate_list_bitmaps(p_s_sb, + journal->j_list_bitmap, + SB_BMAP_NR(p_s_sb))) + goto free_and_return; + allocate_bitmap_nodes(p_s_sb); + + /* reserved for journal area support */ + SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ? 
+ REISERFS_OLD_DISK_OFFSET_IN_BYTES + / p_s_sb->s_blocksize + + SB_BMAP_NR(p_s_sb) + + 1 : + REISERFS_DISK_OFFSET_IN_BYTES / + p_s_sb->s_blocksize + 2); + + /* Sanity check to see is the standard journal fitting withing first bitmap + (actual for small blocksizes) */ + if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) && + (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) { + reiserfs_warning(p_s_sb, + "journal-1393: journal does not fit for area " + "addressed by first of bitmap blocks. It starts at " + "%u and its size is %u. Block size %ld", + SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb), + SB_ONDISK_JOURNAL_SIZE(p_s_sb), + p_s_sb->s_blocksize); + goto free_and_return; + } + + if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) { + reiserfs_warning(p_s_sb, + "sh-462: unable to initialize jornal device"); + goto free_and_return; + } + + rs = SB_DISK_SUPER_BLOCK(p_s_sb); + + /* read journal header */ + bhjh = journal_bread(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + SB_ONDISK_JOURNAL_SIZE(p_s_sb)); + if (!bhjh) { + reiserfs_warning(p_s_sb, + "sh-459: unable to read journal header"); + goto free_and_return; + } + jh = (struct reiserfs_journal_header *)(bhjh->b_data); + + /* make sure that journal matches to the super block */ + if (is_reiserfs_jr(rs) + && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != + sb_jp_journal_magic(rs))) { + reiserfs_warning(p_s_sb, + "sh-460: journal header magic %x " + "(device %s) does not match to magic found in super " + "block %x", jh->jh_journal.jp_journal_magic, + bdevname(journal->j_dev_bd, b), + sb_jp_journal_magic(rs)); + brelse(bhjh); + goto free_and_return; + } + + journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max); + journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch); + journal->j_max_commit_age = + le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age); + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + + if (journal->j_trans_max) { + /* 
make sure these parameters are available, assign it if they are not */ + __u32 initial = journal->j_trans_max; + __u32 ratio = 1; + + if (p_s_sb->s_blocksize < 4096) + ratio = 4096 / p_s_sb->s_blocksize; + + if (SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max < + JOURNAL_MIN_RATIO) + journal->j_trans_max = + SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO; + if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio) + journal->j_trans_max = + JOURNAL_TRANS_MAX_DEFAULT / ratio; + if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio) + journal->j_trans_max = + JOURNAL_TRANS_MIN_DEFAULT / ratio; + + if (journal->j_trans_max != initial) + reiserfs_warning(p_s_sb, + "sh-461: journal_init: wrong transaction max size (%u). Changed to %u", + initial, journal->j_trans_max); + + journal->j_max_batch = journal->j_trans_max * + JOURNAL_MAX_BATCH_DEFAULT / JOURNAL_TRANS_MAX_DEFAULT; + } + + if (!journal->j_trans_max) { + /*we have the file system was created by old version of mkreiserfs + so this field contains zero value */ + journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT; + journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT; + journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE; + + /* for blocksize >= 4096 - max transaction size is 1024. 
For block size < 4096 + trans max size is decreased proportionally */ + if (p_s_sb->s_blocksize < 4096) { + journal->j_trans_max /= (4096 / p_s_sb->s_blocksize); + journal->j_max_batch = (journal->j_trans_max) * 9 / 10; + } + } + + journal->j_default_max_commit_age = journal->j_max_commit_age; + + if (commit_max_age != 0) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } + + reiserfs_info(p_s_sb, "journal params: device %s, size %u, " + "journal first block %u, max trans len %u, max batch %u, " + "max commit age %u, max trans age %u\n", + bdevname(journal->j_dev_bd, b), + SB_ONDISK_JOURNAL_SIZE(p_s_sb), + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), + journal->j_trans_max, + journal->j_max_batch, + journal->j_max_commit_age, journal->j_max_trans_age); + + brelse(bhjh); + + journal->j_list_bitmap_index = 0; + journal_list_init(p_s_sb); + + memset(journal->j_list_hash_table, 0, + JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)); + + INIT_LIST_HEAD(&journal->j_dirty_buffers); + spin_lock_init(&journal->j_dirty_buffers_lock); + + journal->j_start = 0; + journal->j_len = 0; + journal->j_len_alloc = 0; + atomic_set(&(journal->j_wcount), 0); + atomic_set(&(journal->j_async_throttle), 0); + journal->j_bcount = 0; + journal->j_trans_start_time = 0; + journal->j_last = NULL; + journal->j_first = NULL; + init_waitqueue_head(&(journal->j_join_wait)); + sema_init(&journal->j_lock, 1); + sema_init(&journal->j_flush_sem, 1); + + journal->j_trans_id = 10; + journal->j_mount_id = 10; + journal->j_state = 0; + atomic_set(&(journal->j_jlock), 0); + journal->j_cnode_free_list = allocate_cnodes(num_cnodes); + journal->j_cnode_free_orig = journal->j_cnode_free_list; + journal->j_cnode_free = journal->j_cnode_free_list ? 
num_cnodes : 0; + journal->j_cnode_used = 0; + journal->j_must_wait = 0; + + init_journal_hash(p_s_sb); + jl = journal->j_current_jl; + jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); + if (!jl->j_list_bitmap) { + reiserfs_warning(p_s_sb, + "journal-2005, get_list_bitmap failed for journal list 0"); + goto free_and_return; + } + if (journal_read(p_s_sb) < 0) { + reiserfs_warning(p_s_sb, "Replay Failure, unable to mount"); + goto free_and_return; + } + + reiserfs_mounted_fs_count++; + if (reiserfs_mounted_fs_count <= 1) + commit_wq = create_workqueue("reiserfs"); + + INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); + return 0; + free_and_return: + free_journal_ram(p_s_sb); + return 1; } /* @@ -2524,96 +2786,102 @@ free_and_return: ** be used by delete to make sure they don't write more than can fit inside a single ** transaction */ -int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { - struct reiserfs_journal *journal = SB_JOURNAL (th->t_super); - time_t now = get_seconds() ; - /* cannot restart while nested */ - BUG_ON (!th->t_trans_id); - if (th->t_refcount > 1) - return 0 ; - if ( journal->j_must_wait > 0 || - (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || - atomic_read(&(journal->j_jlock)) || - (now - journal->j_trans_start_time) > journal->j_max_trans_age || - journal->j_cnode_free < (journal->j_trans_max * 3)) { - return 1 ; - } - return 0 ; +int journal_transaction_should_end(struct reiserfs_transaction_handle *th, + int new_alloc) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + time_t now = get_seconds(); + /* cannot restart while nested */ + BUG_ON(!th->t_trans_id); + if (th->t_refcount > 1) + return 0; + if (journal->j_must_wait > 0 || + (journal->j_len_alloc + new_alloc) >= journal->j_max_batch || + atomic_read(&(journal->j_jlock)) || + (now - journal->j_trans_start_time) > journal->j_max_trans_age || + journal->j_cnode_free < (journal->j_trans_max * 3)) { + return 1; + } 
+ return 0; } /* this must be called inside a transaction, and requires the ** kernel_lock to be held */ -void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { - struct reiserfs_journal *journal = SB_JOURNAL (th->t_super); - BUG_ON (!th->t_trans_id); - journal->j_must_wait = 1 ; - set_bit(J_WRITERS_BLOCKED, &journal->j_state) ; - return ; +void reiserfs_block_writes(struct reiserfs_transaction_handle *th) +{ + struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); + BUG_ON(!th->t_trans_id); + journal->j_must_wait = 1; + set_bit(J_WRITERS_BLOCKED, &journal->j_state); + return; } /* this must be called without a transaction started, and does not ** require BKL */ -void reiserfs_allow_writes(struct super_block *s) { - struct reiserfs_journal *journal = SB_JOURNAL (s); - clear_bit(J_WRITERS_BLOCKED, &journal->j_state) ; - wake_up(&journal->j_join_wait) ; +void reiserfs_allow_writes(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + clear_bit(J_WRITERS_BLOCKED, &journal->j_state); + wake_up(&journal->j_join_wait); } /* this must be called without a transaction started, and does not ** require BKL */ -void reiserfs_wait_on_write_block(struct super_block *s) { - struct reiserfs_journal *journal = SB_JOURNAL (s); - wait_event(journal->j_join_wait, - !test_bit(J_WRITERS_BLOCKED, &journal->j_state)) ; -} - -static void queue_log_writer(struct super_block *s) { - wait_queue_t wait; - struct reiserfs_journal *journal = SB_JOURNAL (s); - set_bit(J_WRITERS_QUEUED, &journal->j_state); - - /* - * we don't want to use wait_event here because - * we only want to wait once. 
- */ - init_waitqueue_entry(&wait, current); - add_wait_queue(&journal->j_join_wait, &wait); - set_current_state(TASK_UNINTERRUPTIBLE); - if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) - schedule(); - current->state = TASK_RUNNING; - remove_wait_queue(&journal->j_join_wait, &wait); -} - -static void wake_queued_writers(struct super_block *s) { - struct reiserfs_journal *journal = SB_JOURNAL (s); - if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) - wake_up(&journal->j_join_wait); -} - -static void let_transaction_grow(struct super_block *sb, - unsigned long trans_id) -{ - struct reiserfs_journal *journal = SB_JOURNAL (sb); - unsigned long bcount = journal->j_bcount; - while(1) { +void reiserfs_wait_on_write_block(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + wait_event(journal->j_join_wait, + !test_bit(J_WRITERS_BLOCKED, &journal->j_state)); +} + +static void queue_log_writer(struct super_block *s) +{ + wait_queue_t wait; + struct reiserfs_journal *journal = SB_JOURNAL(s); + set_bit(J_WRITERS_QUEUED, &journal->j_state); + + /* + * we don't want to use wait_event here because + * we only want to wait once. 
+ */ + init_waitqueue_entry(&wait, current); + add_wait_queue(&journal->j_join_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(1); - journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; - while ((atomic_read(&journal->j_wcount) > 0 || - atomic_read(&journal->j_jlock)) && - journal->j_trans_id == trans_id) { - queue_log_writer(sb); + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) + schedule(); + current->state = TASK_RUNNING; + remove_wait_queue(&journal->j_join_wait, &wait); +} + +static void wake_queued_writers(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state)) + wake_up(&journal->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, unsigned long trans_id) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + unsigned long bcount = journal->j_bcount; + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; + while ((atomic_read(&journal->j_wcount) > 0 || + atomic_read(&journal->j_jlock)) && + journal->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (journal->j_trans_id != trans_id) + break; + if (bcount == journal->j_bcount) + break; + bcount = journal->j_bcount; } - if (journal->j_trans_id != trans_id) - break; - if (bcount == journal->j_bcount) - break; - bcount = journal->j_bcount; - } } /* join == true if you must join an existing transaction. @@ -2622,221 +2890,244 @@ static void let_transaction_grow(struct super_block *sb, ** this will block until the transaction is joinable. send the number of blocks you ** expect to use in nblocks. 
*/ -static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) { - time_t now = get_seconds() ; - int old_trans_id ; - struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - struct reiserfs_transaction_handle myth; - int sched_count = 0; - int retval; - - reiserfs_check_lock_depth(p_s_sb, "journal_begin") ; - - PROC_INFO_INC( p_s_sb, journal.journal_being ); - /* set here for journal_join */ - th->t_refcount = 1; - th->t_super = p_s_sb ; - -relock: - lock_journal(p_s_sb) ; - if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted (journal)) { - unlock_journal (p_s_sb); - retval = journal->j_errno; - goto out_fail; - } - journal->j_bcount++; - - if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { - unlock_journal(p_s_sb) ; - reiserfs_wait_on_write_block(p_s_sb) ; - PROC_INFO_INC( p_s_sb, journal.journal_relock_writers ); - goto relock ; - } - now = get_seconds(); - - /* if there is no room in the journal OR - ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning - ** we don't sleep if there aren't other writers - */ - - if ( (!join && journal->j_must_wait > 0) || - ( !join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) || - (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 && - (now - journal->j_trans_start_time) > journal->j_max_trans_age) || - (!join && atomic_read(&journal->j_jlock)) || - (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { - - old_trans_id = journal->j_trans_id; - unlock_journal(p_s_sb) ; /* allow others to finish this transaction */ - - if (!join && (journal->j_len_alloc + nblocks + 2) >= - journal->j_max_batch && - ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) - { - if (atomic_read(&journal->j_wcount) > 10) { - sched_count++; - queue_log_writer(p_s_sb); - goto relock; - } - } - /* don't mess with joining the transaction if all we have 
to do is - * wait for someone else to do a commit - */ - if (atomic_read(&journal->j_jlock)) { - while (journal->j_trans_id == old_trans_id && - atomic_read(&journal->j_jlock)) { - queue_log_writer(p_s_sb); - } - goto relock; - } - retval = journal_join(&myth, p_s_sb, 1) ; - if (retval) - goto out_fail; - - /* someone might have ended the transaction while we joined */ - if (old_trans_id != journal->j_trans_id) { - retval = do_journal_end(&myth, p_s_sb, 1, 0) ; - } else { - retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ; - } - - if (retval) - goto out_fail; - - PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount ); - goto relock ; - } - /* we are the first writer, set trans_id */ - if (journal->j_trans_start_time == 0) { - journal->j_trans_start_time = get_seconds(); - } - atomic_inc(&(journal->j_wcount)) ; - journal->j_len_alloc += nblocks ; - th->t_blocks_logged = 0 ; - th->t_blocks_allocated = nblocks ; - th->t_trans_id = journal->j_trans_id ; - unlock_journal(p_s_sb) ; - INIT_LIST_HEAD (&th->t_list); - return 0 ; - -out_fail: - memset (th, 0, sizeof (*th)); - /* Re-set th->t_super, so we can properly keep track of how many - * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later */ - th->t_super = p_s_sb; - return retval; -} - -struct reiserfs_transaction_handle * -reiserfs_persistent_transaction(struct super_block *s, int nblocks) { - int ret ; - struct reiserfs_transaction_handle *th ; - - /* if we're nesting into an existing transaction. 
It will be - ** persistent on its own - */ - if (reiserfs_transaction_running(s)) { - th = current->journal_info ; - th->t_refcount++ ; - if (th->t_refcount < 2) { - BUG() ; - } - return th ; - } - th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ; - if (!th) - return NULL; - ret = journal_begin(th, s, nblocks) ; - if (ret) { - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ; - return NULL; - } - - SB_JOURNAL(s)->j_persistent_trans++; - return th ; -} - -int -reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) { - struct super_block *s = th->t_super; - int ret = 0; - if (th->t_trans_id) - ret = journal_end(th, th->t_super, th->t_blocks_allocated); - else - ret = -EIO; - if (th->t_refcount == 0) { - SB_JOURNAL(s)->j_persistent_trans--; - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ; - } - return ret; -} - -static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { - struct reiserfs_transaction_handle *cur_th = current->journal_info; - - /* this keeps do_journal_end from NULLing out the current->journal_info - ** pointer - */ - th->t_handle_save = cur_th ; - if (cur_th && cur_th->t_refcount > 1) { - BUG() ; - } - return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN) ; -} - -int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { - struct reiserfs_transaction_handle *cur_th = current->journal_info; - - /* this keeps do_journal_end from NULLing out the current->journal_info - ** pointer - */ - th->t_handle_save = cur_th ; - if (cur_th && cur_th->t_refcount > 1) { - BUG() ; - } - return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT) ; -} - -int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { - struct reiserfs_transaction_handle *cur_th = current->journal_info ; - int ret ; - - 
th->t_handle_save = NULL ; - if (cur_th) { - /* we are nesting into the current transaction */ - if (cur_th->t_super == p_s_sb) { - BUG_ON (!cur_th->t_refcount); - cur_th->t_refcount++ ; - memcpy(th, cur_th, sizeof(*th)); - if (th->t_refcount <= 1) - reiserfs_warning (p_s_sb, "BAD: refcount <= 1, but journal_info != 0"); - return 0; +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks, + int join) +{ + time_t now = get_seconds(); + int old_trans_id; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; + int retval; + + reiserfs_check_lock_depth(p_s_sb, "journal_begin"); + if (nblocks > journal->j_trans_max) + BUG(); + + PROC_INFO_INC(p_s_sb, journal.journal_being); + /* set here for journal_join */ + th->t_refcount = 1; + th->t_super = p_s_sb; + + relock: + lock_journal(p_s_sb); + if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) { + unlock_journal(p_s_sb); + retval = journal->j_errno; + goto out_fail; + } + journal->j_bcount++; + + if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { + unlock_journal(p_s_sb); + reiserfs_wait_on_write_block(p_s_sb); + PROC_INFO_INC(p_s_sb, journal.journal_relock_writers); + goto relock; + } + now = get_seconds(); + + /* if there is no room in the journal OR + ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning + ** we don't sleep if there aren't other writers + */ + + if ((!join && journal->j_must_wait > 0) || + (!join + && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) + || (!join && atomic_read(&journal->j_wcount) > 0 + && journal->j_trans_start_time > 0 + && (now - journal->j_trans_start_time) > + journal->j_max_trans_age) || (!join + && atomic_read(&journal->j_jlock)) + || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) { + + old_trans_id = journal->j_trans_id; + unlock_journal(p_s_sb); /* 
allow others to finish this transaction */ + + if (!join && (journal->j_len_alloc + nblocks + 2) >= + journal->j_max_batch && + ((journal->j_len + nblocks + 2) * 100) < + (journal->j_len_alloc * 75)) { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(p_s_sb); + goto relock; + } + } + /* don't mess with joining the transaction if all we have to do is + * wait for someone else to do a commit + */ + if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(p_s_sb); + } + goto relock; + } + retval = journal_join(&myth, p_s_sb, 1); + if (retval) + goto out_fail; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != journal->j_trans_id) { + retval = do_journal_end(&myth, p_s_sb, 1, 0); + } else { + retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW); + } + + if (retval) + goto out_fail; + + PROC_INFO_INC(p_s_sb, journal.journal_relock_wcount); + goto relock; + } + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); + } + atomic_inc(&(journal->j_wcount)); + journal->j_len_alloc += nblocks; + th->t_blocks_logged = 0; + th->t_blocks_allocated = nblocks; + th->t_trans_id = journal->j_trans_id; + unlock_journal(p_s_sb); + INIT_LIST_HEAD(&th->t_list); + get_fs_excl(); + return 0; + + out_fail: + memset(th, 0, sizeof(*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = p_s_sb; + return retval; +} + +struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct + super_block + *s, + int nblocks) +{ + int ret; + struct reiserfs_transaction_handle *th; + + /* if we're nesting into an existing transaction. 
It will be + ** persistent on its own + */ + if (reiserfs_transaction_running(s)) { + th = current->journal_info; + th->t_refcount++; + if (th->t_refcount < 2) { + BUG(); + } + return th; + } + th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), + GFP_NOFS, s); + if (!th) + return NULL; + ret = journal_begin(th, s, nblocks); + if (ret) { + reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), + s); + return NULL; + } + + SB_JOURNAL(s)->j_persistent_trans++; + return th; +} + +int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) +{ + struct super_block *s = th->t_super; + int ret = 0; + if (th->t_trans_id) + ret = journal_end(th, th->t_super, th->t_blocks_allocated); + else + ret = -EIO; + if (th->t_refcount == 0) { + SB_JOURNAL(s)->j_persistent_trans--; + reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), + s); + } + return ret; +} + +static int journal_join(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th; + if (cur_th && cur_th->t_refcount > 1) { + BUG(); + } + return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN); +} + +int journal_join_abort(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + + /* this keeps do_journal_end from NULLing out the current->journal_info + ** pointer + */ + th->t_handle_save = cur_th; + if (cur_th && cur_th->t_refcount > 1) { + BUG(); + } + return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT); +} + +int journal_begin(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks) +{ + struct reiserfs_transaction_handle *cur_th = current->journal_info; + int ret; + + 
th->t_handle_save = NULL; + if (cur_th) { + /* we are nesting into the current transaction */ + if (cur_th->t_super == p_s_sb) { + BUG_ON(!cur_th->t_refcount); + cur_th->t_refcount++; + memcpy(th, cur_th, sizeof(*th)); + if (th->t_refcount <= 1) + reiserfs_warning(p_s_sb, + "BAD: refcount <= 1, but journal_info != 0"); + return 0; + } else { + /* we've ended up with a handle from a different filesystem. + ** save it and restore on journal_end. This should never + ** really happen... + */ + reiserfs_warning(p_s_sb, + "clm-2100: nesting info a different FS"); + th->t_handle_save = current->journal_info; + current->journal_info = th; + } } else { - /* we've ended up with a handle from a different filesystem. - ** save it and restore on journal_end. This should never - ** really happen... - */ - reiserfs_warning(p_s_sb, "clm-2100: nesting info a different FS") ; - th->t_handle_save = current->journal_info ; - current->journal_info = th; - } - } else { - current->journal_info = th; - } - ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG) ; - if (current->journal_info != th) - BUG() ; + current->journal_info = th; + } + ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG); + if (current->journal_info != th) + BUG(); - /* I guess this boils down to being the reciprocal of clm-2100 above. - * If do_journal_begin_r fails, we need to put it back, since journal_end - * won't be called to do it. */ - if (ret) - current->journal_info = th->t_handle_save; - else - BUG_ON (!th->t_refcount); + /* I guess this boils down to being the reciprocal of clm-2100 above. + * If do_journal_begin_r fails, we need to put it back, since journal_end + * won't be called to do it. 
*/ + if (ret) + current->journal_info = th->t_handle_save; + else + BUG_ON(!th->t_refcount); - return ret ; + return ret; } /* @@ -2848,129 +3139,140 @@ int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * ** ** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. */ -int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_cnode *cn = NULL; - int count_already_incd = 0 ; - int prepared = 0 ; - BUG_ON (!th->t_trans_id); - - PROC_INFO_INC( p_s_sb, journal.mark_dirty ); - if (th->t_trans_id != journal->j_trans_id) { - reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", - th->t_trans_id, journal->j_trans_id); - } - - p_s_sb->s_dirt = 1; - - prepared = test_clear_buffer_journal_prepared (bh); - clear_buffer_journal_restore_dirty (bh); - /* already in this transaction, we are done */ - if (buffer_journaled(bh)) { - PROC_INFO_INC( p_s_sb, journal.mark_dirty_already ); - return 0 ; - } - - /* this must be turned into a panic instead of a warning. We can't allow - ** a dirty or journal_dirty or locked buffer to be logged, as some changes - ** could get to disk too early. NOT GOOD. - */ - if (!prepared || buffer_dirty(bh)) { - reiserfs_warning (p_s_sb, "journal-1777: buffer %llu bad state " - "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", - (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', - buffer_locked(bh) ? ' ' : '!', - buffer_dirty(bh) ? ' ' : '!', - buffer_journal_dirty(bh) ? ' ' : '!') ; - } - - if (atomic_read(&(journal->j_wcount)) <= 0) { - reiserfs_warning (p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d", atomic_read(&(journal->j_wcount))) ; - return 1 ; - } - /* this error means I've screwed up, and we've overflowed the transaction. - ** Nothing can be done here, except make the FS readonly or panic. 
- */ - if (journal->j_len >= journal->j_trans_max) { - reiserfs_panic(th->t_super, "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", journal->j_len) ; - } - - if (buffer_journal_dirty(bh)) { - count_already_incd = 1 ; - PROC_INFO_INC( p_s_sb, journal.mark_dirty_notjournal ); - clear_buffer_journal_dirty (bh); - } - - if (journal->j_len > journal->j_len_alloc) { - journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT ; - } - - set_buffer_journaled (bh); - - /* now put this guy on the end */ - if (!cn) { - cn = get_cnode(p_s_sb) ; - if (!cn) { - reiserfs_panic(p_s_sb, "get_cnode failed!\n"); - } - - if (th->t_blocks_logged == th->t_blocks_allocated) { - th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT ; - journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT ; - } - th->t_blocks_logged++ ; - journal->j_len++ ; - - cn->bh = bh ; - cn->blocknr = bh->b_blocknr ; - cn->sb = p_s_sb; - cn->jlist = NULL ; - insert_journal_hash(journal->j_hash_table, cn) ; - if (!count_already_incd) { - get_bh(bh) ; - } - } - cn->next = NULL ; - cn->prev = journal->j_last ; - cn->bh = bh ; - if (journal->j_last) { - journal->j_last->next = cn ; - journal->j_last = cn ; - } else { - journal->j_first = cn ; - journal->j_last = cn ; - } - return 0 ; -} - -int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { - if (!current->journal_info && th->t_refcount > 1) - reiserfs_warning (p_s_sb, "REISER-NESTING: th NULL, refcount %d", - th->t_refcount); - - if (!th->t_trans_id) { - WARN_ON (1); - return -EIO; - } - - th->t_refcount--; - if (th->t_refcount > 0) { - struct reiserfs_transaction_handle *cur_th = current->journal_info ; - - /* we aren't allowed to close a nested transaction on a different - ** filesystem from the one in the task struct - */ - if (cur_th->t_super != th->t_super) - BUG() ; - - if (th != cur_th) { - memcpy(current->journal_info, th, sizeof(*th)); - th->t_trans_id = 0; - } - return 0; - } else { - return 
do_journal_end(th, p_s_sb, nblocks, 0) ; - } +int journal_mark_dirty(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, struct buffer_head *bh) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_cnode *cn = NULL; + int count_already_incd = 0; + int prepared = 0; + BUG_ON(!th->t_trans_id); + + PROC_INFO_INC(p_s_sb, journal.mark_dirty); + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, + "journal-1577: handle trans id %ld != current trans id %ld\n", + th->t_trans_id, journal->j_trans_id); + } + + p_s_sb->s_dirt = 1; + + prepared = test_clear_buffer_journal_prepared(bh); + clear_buffer_journal_restore_dirty(bh); + /* already in this transaction, we are done */ + if (buffer_journaled(bh)) { + PROC_INFO_INC(p_s_sb, journal.mark_dirty_already); + return 0; + } + + /* this must be turned into a panic instead of a warning. We can't allow + ** a dirty or journal_dirty or locked buffer to be logged, as some changes + ** could get to disk too early. NOT GOOD. + */ + if (!prepared || buffer_dirty(bh)) { + reiserfs_warning(p_s_sb, "journal-1777: buffer %llu bad state " + "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT", + (unsigned long long)bh->b_blocknr, + prepared ? ' ' : '!', + buffer_locked(bh) ? ' ' : '!', + buffer_dirty(bh) ? ' ' : '!', + buffer_journal_dirty(bh) ? ' ' : '!'); + } + + if (atomic_read(&(journal->j_wcount)) <= 0) { + reiserfs_warning(p_s_sb, + "journal-1409: journal_mark_dirty returning because j_wcount was %d", + atomic_read(&(journal->j_wcount))); + return 1; + } + /* this error means I've screwed up, and we've overflowed the transaction. + ** Nothing can be done here, except make the FS readonly or panic. 
+ */ + if (journal->j_len >= journal->j_trans_max) { + reiserfs_panic(th->t_super, + "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", + journal->j_len); + } + + if (buffer_journal_dirty(bh)) { + count_already_incd = 1; + PROC_INFO_INC(p_s_sb, journal.mark_dirty_notjournal); + clear_buffer_journal_dirty(bh); + } + + if (journal->j_len > journal->j_len_alloc) { + journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT; + } + + set_buffer_journaled(bh); + + /* now put this guy on the end */ + if (!cn) { + cn = get_cnode(p_s_sb); + if (!cn) { + reiserfs_panic(p_s_sb, "get_cnode failed!\n"); + } + + if (th->t_blocks_logged == th->t_blocks_allocated) { + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT; + journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT; + } + th->t_blocks_logged++; + journal->j_len++; + + cn->bh = bh; + cn->blocknr = bh->b_blocknr; + cn->sb = p_s_sb; + cn->jlist = NULL; + insert_journal_hash(journal->j_hash_table, cn); + if (!count_already_incd) { + get_bh(bh); + } + } + cn->next = NULL; + cn->prev = journal->j_last; + cn->bh = bh; + if (journal->j_last) { + journal->j_last->next = cn; + journal->j_last = cn; + } else { + journal->j_first = cn; + journal->j_last = cn; + } + return 0; +} + +int journal_end(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks) +{ + if (!current->journal_info && th->t_refcount > 1) + reiserfs_warning(p_s_sb, "REISER-NESTING: th NULL, refcount %d", + th->t_refcount); + + if (!th->t_trans_id) { + WARN_ON(1); + return -EIO; + } + + th->t_refcount--; + if (th->t_refcount > 0) { + struct reiserfs_transaction_handle *cur_th = + current->journal_info; + + /* we aren't allowed to close a nested transaction on a different + ** filesystem from the one in the task struct + */ + if (cur_th->t_super != th->t_super) + BUG(); + + if (th != cur_th) { + memcpy(current->journal_info, th, sizeof(*th)); + th->t_trans_id = 0; + } + return 0; + } else { + return do_journal_end(th, 
p_s_sb, nblocks, 0); + } } /* removes from the current transaction, relsing and descrementing any counters. @@ -2980,47 +3282,51 @@ int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_ ** ** returns 1 if it cleaned and relsed the buffer. 0 otherwise */ -static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t blocknr, int already_cleaned) { - struct buffer_head *bh ; - struct reiserfs_journal_cnode *cn ; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - int ret = 0; - - cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr) ; - if (!cn || !cn->bh) { - return ret ; - } - bh = cn->bh ; - if (cn->prev) { - cn->prev->next = cn->next ; - } - if (cn->next) { - cn->next->prev = cn->prev ; - } - if (cn == journal->j_first) { - journal->j_first = cn->next ; - } - if (cn == journal->j_last) { - journal->j_last = cn->prev ; - } - if (bh) - remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, bh->b_blocknr, 0) ; - clear_buffer_journaled (bh); /* don't log this one */ - - if (!already_cleaned) { - clear_buffer_journal_dirty (bh); - clear_buffer_dirty(bh); - clear_buffer_journal_test (bh); - put_bh(bh) ; - if (atomic_read(&(bh->b_count)) < 0) { - reiserfs_warning (p_s_sb, "journal-1752: remove from trans, b_count < 0"); - } - ret = 1 ; - } - journal->j_len-- ; - journal->j_len_alloc-- ; - free_cnode(p_s_sb, cn) ; - return ret ; +static int remove_from_transaction(struct super_block *p_s_sb, + b_blocknr_t blocknr, int already_cleaned) +{ + struct buffer_head *bh; + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + int ret = 0; + + cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); + if (!cn || !cn->bh) { + return ret; + } + bh = cn->bh; + if (cn->prev) { + cn->prev->next = cn->next; + } + if (cn->next) { + cn->next->prev = cn->prev; + } + if (cn == journal->j_first) { + journal->j_first = cn->next; + } + if (cn == journal->j_last) { + journal->j_last 
= cn->prev; + } + if (bh) + remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); + clear_buffer_journaled(bh); /* don't log this one */ + + if (!already_cleaned) { + clear_buffer_journal_dirty(bh); + clear_buffer_dirty(bh); + clear_buffer_journal_test(bh); + put_bh(bh); + if (atomic_read(&(bh->b_count)) < 0) { + reiserfs_warning(p_s_sb, + "journal-1752: remove from trans, b_count < 0"); + } + ret = 1; + } + journal->j_len--; + journal->j_len_alloc--; + free_cnode(p_s_sb, cn); + return ret; } /* @@ -3033,120 +3339,129 @@ static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t block ** blocks for a given transaction on disk ** */ -static int can_dirty(struct reiserfs_journal_cnode *cn) { - struct super_block *sb = cn->sb; - b_blocknr_t blocknr = cn->blocknr ; - struct reiserfs_journal_cnode *cur = cn->hprev ; - int can_dirty = 1 ; - - /* first test hprev. These are all newer than cn, so any node here - ** with the same block number and dev means this node can't be sent - ** to disk right now. - */ - while(cur && can_dirty) { - if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && - cur->blocknr == blocknr) { - can_dirty = 0 ; - } - cur = cur->hprev ; - } - /* then test hnext. These are all older than cn. As long as they - ** are committed to the log, it is safe to write cn to disk - */ - cur = cn->hnext ; - while(cur && can_dirty) { - if (cur->jlist && cur->jlist->j_len > 0 && - atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && - cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { - can_dirty = 0 ; - } - cur = cur->hnext ; - } - return can_dirty ; +static int can_dirty(struct reiserfs_journal_cnode *cn) +{ + struct super_block *sb = cn->sb; + b_blocknr_t blocknr = cn->blocknr; + struct reiserfs_journal_cnode *cur = cn->hprev; + int can_dirty = 1; + + /* first test hprev. 
These are all newer than cn, so any node here + ** with the same block number and dev means this node can't be sent + ** to disk right now. + */ + while (cur && can_dirty) { + if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && + cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hprev; + } + /* then test hnext. These are all older than cn. As long as they + ** are committed to the log, it is safe to write cn to disk + */ + cur = cn->hnext; + while (cur && can_dirty) { + if (cur->jlist && cur->jlist->j_len > 0 && + atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && + cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) { + can_dirty = 0; + } + cur = cur->hnext; + } + return can_dirty; } /* syncs the commit blocks, but does not force the real buffers to disk ** will wait until the current transaction is done/commited before returning */ -int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); +int journal_end_sync(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - BUG_ON (!th->t_trans_id); - /* you can sync while nested, very, very bad */ - if (th->t_refcount > 1) { - BUG() ; - } - if (journal->j_len == 0) { - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - } - return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ; + BUG_ON(!th->t_trans_id); + /* you can sync while nested, very, very bad */ + if (th->t_refcount > 1) { + BUG(); + } + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), + 1); + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); + } + return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT); } /* ** writeback the pending async commits to disk */ -static void 
flush_async_commits(void *p) { - struct super_block *p_s_sb = p; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_list *jl; - struct list_head *entry; - - lock_kernel(); - if (!list_empty(&journal->j_journal_list)) { - /* last entry is the youngest, commit it and you get everything */ - entry = journal->j_journal_list.prev; - jl = JOURNAL_LIST_ENTRY(entry); - flush_commit_list(p_s_sb, jl, 1); - } - unlock_kernel(); - /* - * this is a little racey, but there's no harm in missing - * the filemap_fdata_write - */ - if (!atomic_read(&journal->j_async_throttle) && !reiserfs_is_journal_aborted (journal)) { - atomic_inc(&journal->j_async_throttle); - filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); - atomic_dec(&journal->j_async_throttle); - } +static void flush_async_commits(void *p) +{ + struct super_block *p_s_sb = p; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_list *jl; + struct list_head *entry; + + lock_kernel(); + if (!list_empty(&journal->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = journal->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(p_s_sb, jl, 1); + } + unlock_kernel(); + /* + * this is a little racey, but there's no harm in missing + * the filemap_fdata_write + */ + if (!atomic_read(&journal->j_async_throttle) + && !reiserfs_is_journal_aborted(journal)) { + atomic_inc(&journal->j_async_throttle); + filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); + atomic_dec(&journal->j_async_throttle); + } } /* ** flushes any old transactions to disk ** ends the current transaction if it is too old */ -int reiserfs_flush_old_commits(struct super_block *p_s_sb) { - time_t now ; - struct reiserfs_transaction_handle th ; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - - now = get_seconds(); - /* safety check so we don't flush while we are replaying the log during - * mount - */ - if 
(list_empty(&journal->j_journal_list)) { - return 0 ; - } - - /* check the current transaction. If there are no writers, and it is - * too old, finish it, and force the commit blocks to disk - */ - if (atomic_read(&journal->j_wcount) <= 0 && - journal->j_trans_start_time > 0 && - journal->j_len > 0 && - (now - journal->j_trans_start_time) > journal->j_max_trans_age) - { - if (!journal_join(&th, p_s_sb, 1)) { - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - - /* we're only being called from kreiserfsd, it makes no sense to do - ** an async commit so that kreiserfsd can do it later - */ - do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; - } - } - return p_s_sb->s_dirt; +int reiserfs_flush_old_commits(struct super_block *p_s_sb) +{ + time_t now; + struct reiserfs_transaction_handle th; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + + now = get_seconds(); + /* safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&journal->j_journal_list)) { + return 0; + } + + /* check the current transaction. 
If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&journal->j_wcount) <= 0 && + journal->j_trans_start_time > 0 && + journal->j_len > 0 && + (now - journal->j_trans_start_time) > journal->j_max_trans_age) { + if (!journal_join(&th, p_s_sb, 1)) { + reiserfs_prepare_for_journal(p_s_sb, + SB_BUFFER_WITH_SB(p_s_sb), + 1); + journal_mark_dirty(&th, p_s_sb, + SB_BUFFER_WITH_SB(p_s_sb)); + + /* we're only being called from kreiserfsd, it makes no sense to do + ** an async commit so that kreiserfsd can do it later + */ + do_journal_end(&th, p_s_sb, 1, COMMIT_NOW | WAIT); + } + } + return p_s_sb->s_dirt; } /* @@ -3160,101 +3475,108 @@ int reiserfs_flush_old_commits(struct super_block *p_s_sb) { ** ** Note, we can't allow the journal_end to proceed while there are still writers in the log. */ -static int check_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, - unsigned long nblocks, int flags) { - - time_t now ; - int flush = flags & FLUSH_ALL ; - int commit_now = flags & COMMIT_NOW ; - int wait_on_commit = flags & WAIT ; - struct reiserfs_journal_list *jl; - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - - BUG_ON (!th->t_trans_id); - - if (th->t_trans_id != journal->j_trans_id) { - reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", - th->t_trans_id, journal->j_trans_id); - } - - journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged) ; - if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. 
unmounting might not call begin */ - atomic_dec(&(journal->j_wcount)) ; - } - - /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released - ** will be dealt with by next transaction that actually writes something, but should be taken - ** care of in this trans - */ - if (journal->j_len == 0) { - BUG(); - } - /* if wcount > 0, and we are called to with flush or commit_now, - ** we wait on j_join_wait. We will wake up when the last writer has - ** finished the transaction, and started it on its way to the disk. - ** Then, we flush the commit or journal list, and just return 0 - ** because the rest of journal end was already done for this transaction. - */ - if (atomic_read(&(journal->j_wcount)) > 0) { - if (flush || commit_now) { - unsigned trans_id ; - - jl = journal->j_current_jl; - trans_id = jl->j_trans_id; - if (wait_on_commit) - jl->j_state |= LIST_COMMIT_PENDING; - atomic_set(&(journal->j_jlock), 1) ; - if (flush) { - journal->j_next_full_flush = 1 ; - } - unlock_journal(p_s_sb) ; - - /* sleep while the current transaction is still j_jlocked */ - while(journal->j_trans_id == trans_id) { - if (atomic_read(&journal->j_jlock)) { - queue_log_writer(p_s_sb); - } else { - lock_journal(p_s_sb); - if (journal->j_trans_id == trans_id) { - atomic_set(&(journal->j_jlock), 1) ; - } - unlock_journal(p_s_sb); - } - } - if (journal->j_trans_id == trans_id) { - BUG(); - } - if (commit_now && journal_list_still_alive(p_s_sb, trans_id) && - wait_on_commit) - { - flush_commit_list(p_s_sb, jl, 1) ; - } - return 0 ; - } - unlock_journal(p_s_sb) ; - return 0 ; - } - - /* deal with old transactions where we are the last writers */ - now = get_seconds(); - if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { - commit_now = 1 ; - journal->j_next_async_flush = 1 ; - } - /* don't batch when someone is waiting on j_join_wait */ - /* don't batch when syncing the commit or flushing the whole trans */ - if (!(journal->j_must_wait > 0) 
&& !(atomic_read(&(journal->j_jlock))) && !flush && !commit_now && - (journal->j_len < journal->j_max_batch) && - journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) { - journal->j_bcount++ ; - unlock_journal(p_s_sb) ; - return 0 ; - } - - if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { - reiserfs_panic(p_s_sb, "journal-003: journal_end: j_start (%ld) is too high\n", journal->j_start) ; - } - return 1 ; +static int check_journal_end(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks, + int flags) +{ + + time_t now; + int flush = flags & FLUSH_ALL; + int commit_now = flags & COMMIT_NOW; + int wait_on_commit = flags & WAIT; + struct reiserfs_journal_list *jl; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + + BUG_ON(!th->t_trans_id); + + if (th->t_trans_id != journal->j_trans_id) { + reiserfs_panic(th->t_super, + "journal-1577: handle trans id %ld != current trans id %ld\n", + th->t_trans_id, journal->j_trans_id); + } + + journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged); + if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */ + atomic_dec(&(journal->j_wcount)); + } + + /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released + ** will be dealt with by next transaction that actually writes something, but should be taken + ** care of in this trans + */ + if (journal->j_len == 0) { + BUG(); + } + /* if wcount > 0, and we are called to with flush or commit_now, + ** we wait on j_join_wait. We will wake up when the last writer has + ** finished the transaction, and started it on its way to the disk. + ** Then, we flush the commit or journal list, and just return 0 + ** because the rest of journal end was already done for this transaction. 
+ */ + if (atomic_read(&(journal->j_wcount)) > 0) { + if (flush || commit_now) { + unsigned trans_id; + + jl = journal->j_current_jl; + trans_id = jl->j_trans_id; + if (wait_on_commit) + jl->j_state |= LIST_COMMIT_PENDING; + atomic_set(&(journal->j_jlock), 1); + if (flush) { + journal->j_next_full_flush = 1; + } + unlock_journal(p_s_sb); + + /* sleep while the current transaction is still j_jlocked */ + while (journal->j_trans_id == trans_id) { + if (atomic_read(&journal->j_jlock)) { + queue_log_writer(p_s_sb); + } else { + lock_journal(p_s_sb); + if (journal->j_trans_id == trans_id) { + atomic_set(&(journal->j_jlock), + 1); + } + unlock_journal(p_s_sb); + } + } + if (journal->j_trans_id == trans_id) { + BUG(); + } + if (commit_now + && journal_list_still_alive(p_s_sb, trans_id) + && wait_on_commit) { + flush_commit_list(p_s_sb, jl, 1); + } + return 0; + } + unlock_journal(p_s_sb); + return 0; + } + + /* deal with old transactions where we are the last writers */ + now = get_seconds(); + if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { + commit_now = 1; + journal->j_next_async_flush = 1; + } + /* don't batch when someone is waiting on j_join_wait */ + /* don't batch when syncing the commit or flushing the whole trans */ + if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) + && !flush && !commit_now && (journal->j_len < journal->j_max_batch) + && journal->j_len_alloc < journal->j_max_batch + && journal->j_cnode_free > (journal->j_trans_max * 3)) { + journal->j_bcount++; + unlock_journal(p_s_sb); + return 0; + } + + if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { + reiserfs_panic(p_s_sb, + "journal-003: journal_end: j_start (%ld) is too high\n", + journal->j_start); + } + return 1; } /* @@ -3271,83 +3593,95 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe ** ** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. 
*/ -int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, b_blocknr_t blocknr) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_cnode *cn = NULL ; - struct buffer_head *bh = NULL ; - struct reiserfs_list_bitmap *jb = NULL ; - int cleaned = 0 ; - BUG_ON (!th->t_trans_id); - - cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); - if (cn && cn->bh) { - bh = cn->bh ; - get_bh(bh) ; - } - /* if it is journal new, we just remove it from this transaction */ - if (bh && buffer_journal_new(bh)) { - clear_buffer_journal_new (bh); - clear_prepared_bits(bh) ; - reiserfs_clean_and_file_buffer(bh) ; - cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; - } else { - /* set the bit for this block in the journal bitmap for this transaction */ - jb = journal->j_current_jl->j_list_bitmap; - if (!jb) { - reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ; - } - set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ; - - /* Note, the entire while loop is not allowed to schedule. 
*/ - - if (bh) { - clear_prepared_bits(bh) ; - reiserfs_clean_and_file_buffer(bh) ; - } - cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; - - /* find all older transactions with this block, make sure they don't try to write it out */ - cn = get_journal_hash_dev(p_s_sb,journal->j_list_hash_table, blocknr) ; - while (cn) { - if (p_s_sb == cn->sb && blocknr == cn->blocknr) { - set_bit(BLOCK_FREED, &cn->state) ; - if (cn->bh) { - if (!cleaned) { - /* remove_from_transaction will brelse the buffer if it was - ** in the current trans - */ - clear_buffer_journal_dirty (cn->bh); - clear_buffer_dirty(cn->bh); - clear_buffer_journal_test(cn->bh); - cleaned = 1 ; - put_bh(cn->bh) ; - if (atomic_read(&(cn->bh->b_count)) < 0) { - reiserfs_warning (p_s_sb, "journal-2138: cn->bh->b_count < 0"); - } - } - if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ - atomic_dec(&(cn->jlist->j_nonzerolen)) ; - } - cn->bh = NULL ; - } - } - cn = cn->hnext ; - } - } - - if (bh) { - put_bh(bh) ; /* get_hash grabs the buffer */ - if (atomic_read(&(bh->b_count)) < 0) { - reiserfs_warning (p_s_sb, "journal-2165: bh->b_count < 0"); - } - } - return 0 ; -} - -void reiserfs_update_inode_transaction(struct inode *inode) { - struct reiserfs_journal *journal = SB_JOURNAL (inode->i_sb); - REISERFS_I(inode)->i_jl = journal->j_current_jl; - REISERFS_I(inode)->i_trans_id = journal->j_trans_id ; +int journal_mark_freed(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, b_blocknr_t blocknr) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_cnode *cn = NULL; + struct buffer_head *bh = NULL; + struct reiserfs_list_bitmap *jb = NULL; + int cleaned = 0; + BUG_ON(!th->t_trans_id); + + cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh; + get_bh(bh); + } + /* if it is journal new, we just remove it from this transaction */ + if (bh && buffer_journal_new(bh)) { + 
clear_buffer_journal_new(bh); + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned); + } else { + /* set the bit for this block in the journal bitmap for this transaction */ + jb = journal->j_current_jl->j_list_bitmap; + if (!jb) { + reiserfs_panic(p_s_sb, + "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n"); + } + set_bit_in_list_bitmap(p_s_sb, blocknr, jb); + + /* Note, the entire while loop is not allowed to schedule. */ + + if (bh) { + clear_prepared_bits(bh); + reiserfs_clean_and_file_buffer(bh); + } + cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned); + + /* find all older transactions with this block, make sure they don't try to write it out */ + cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, + blocknr); + while (cn) { + if (p_s_sb == cn->sb && blocknr == cn->blocknr) { + set_bit(BLOCK_FREED, &cn->state); + if (cn->bh) { + if (!cleaned) { + /* remove_from_transaction will brelse the buffer if it was + ** in the current trans + */ + clear_buffer_journal_dirty(cn-> + bh); + clear_buffer_dirty(cn->bh); + clear_buffer_journal_test(cn-> + bh); + cleaned = 1; + put_bh(cn->bh); + if (atomic_read + (&(cn->bh->b_count)) < 0) { + reiserfs_warning(p_s_sb, + "journal-2138: cn->bh->b_count < 0"); + } + } + if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ + atomic_dec(& + (cn->jlist-> + j_nonzerolen)); + } + cn->bh = NULL; + } + } + cn = cn->hnext; + } + } + + if (bh) { + put_bh(bh); /* get_hash grabs the buffer */ + if (atomic_read(&(bh->b_count)) < 0) { + reiserfs_warning(p_s_sb, + "journal-2165: bh->b_count < 0"); + } + } + return 0; +} + +void reiserfs_update_inode_transaction(struct inode *inode) +{ + struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb); + REISERFS_I(inode)->i_jl = journal->j_current_jl; + REISERFS_I(inode)->i_trans_id = journal->j_trans_id; } /* @@ -3355,99 +3689,102 @@ void 
reiserfs_update_inode_transaction(struct inode *inode) { * if a transaction was actually committed and the barrier was done */ static int __commit_trans_jl(struct inode *inode, unsigned long id, - struct reiserfs_journal_list *jl) + struct reiserfs_journal_list *jl) { - struct reiserfs_transaction_handle th ; - struct super_block *sb = inode->i_sb ; - struct reiserfs_journal *journal = SB_JOURNAL (sb); - int ret = 0; + struct reiserfs_transaction_handle th; + struct super_block *sb = inode->i_sb; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + int ret = 0; + + /* is it from the current transaction, or from an unknown transaction? */ + if (id == journal->j_trans_id) { + jl = journal->j_current_jl; + /* try to let other writers come in and grow this transaction */ + let_transaction_grow(sb, id); + if (journal->j_trans_id != id) { + goto flush_commit_only; + } - /* is it from the current transaction, or from an unknown transaction? */ - if (id == journal->j_trans_id) { - jl = journal->j_current_jl; - /* try to let other writers come in and grow this transaction */ - let_transaction_grow(sb, id); - if (journal->j_trans_id != id) { - goto flush_commit_only; - } + ret = journal_begin(&th, sb, 1); + if (ret) + return ret; + + /* someone might have ended this transaction while we joined */ + if (journal->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), + 1); + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)); + ret = journal_end(&th, sb, 1); + goto flush_commit_only; + } - ret = journal_begin(&th, sb, 1) ; - if (ret) - return ret; + ret = journal_end_sync(&th, sb, 1); + if (!ret) + ret = 1; - /* someone might have ended this transaction while we joined */ - if (journal->j_trans_id != id) { - reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ; - journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ; - ret = journal_end(&th, sb, 1) ; - goto flush_commit_only; + } else { + /* this gets tricky, we have to make sure the journal list 
in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ + flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; + flush_commit_list(sb, jl, 1); + if (journal->j_errno) + ret = journal->j_errno; + } } + /* otherwise the list is gone, and long since committed */ + return ret; +} - ret = journal_end_sync(&th, sb, 1) ; - if (!ret) - ret = 1; +int reiserfs_commit_for_inode(struct inode *inode) +{ + unsigned long id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; - } else { - /* this gets tricky, we have to make sure the journal list in - * the inode still exists. We know the list is still around - * if we've got a larger transaction id than the oldest list + /* for the whole inode, assume unset id means it was + * changed in the current transaction. More conservative */ -flush_commit_only: - if (journal_list_still_alive(inode->i_sb, id)) { - /* - * we only set ret to 1 when we know for sure - * the barrier hasn't been started yet on the commit - * block. - */ - if (atomic_read(&jl->j_commit_left) > 1) - ret = 1; - flush_commit_list(sb, jl, 1) ; - if (journal->j_errno) - ret = journal->j_errno; - } - } - /* otherwise the list is gone, and long since committed */ - return ret; -} - -int reiserfs_commit_for_inode(struct inode *inode) { - unsigned long id = REISERFS_I(inode)->i_trans_id; - struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; - - /* for the whole inode, assume unset id means it was - * changed in the current transaction. 
More conservative - */ - if (!id || !jl) { - reiserfs_update_inode_transaction(inode) ; - id = REISERFS_I(inode)->i_trans_id; - /* jl will be updated in __commit_trans_jl */ - } - - return __commit_trans_jl(inode, id, jl); -} - -void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, - struct buffer_head *bh) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - PROC_INFO_INC( p_s_sb, journal.restore_prepared ); - if (!bh) { - return ; - } - if (test_clear_buffer_journal_restore_dirty (bh) && - buffer_journal_dirty(bh)) { - struct reiserfs_journal_cnode *cn; - cn = get_journal_hash_dev(p_s_sb, - journal->j_list_hash_table, - bh->b_blocknr); - if (cn && can_dirty(cn)) { - set_buffer_journal_test (bh); - mark_buffer_dirty(bh); - } - } - clear_buffer_journal_prepared (bh); -} - -extern struct tree_balance *cur_tb ; + if (!id || !jl) { + reiserfs_update_inode_transaction(inode); + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + return __commit_trans_jl(inode, id, jl); +} + +void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, + struct buffer_head *bh) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + PROC_INFO_INC(p_s_sb, journal.restore_prepared); + if (!bh) { + return; + } + if (test_clear_buffer_journal_restore_dirty(bh) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + cn = get_journal_hash_dev(p_s_sb, + journal->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_buffer_journal_test(bh); + mark_buffer_dirty(bh); + } + } + clear_buffer_journal_prepared(bh); +} + +extern struct tree_balance *cur_tb; /* ** before we can change a metadata block, we have to make sure it won't ** be written to disk while we are altering it. 
So, we must: @@ -3456,39 +3793,41 @@ extern struct tree_balance *cur_tb ; ** */ int reiserfs_prepare_for_journal(struct super_block *p_s_sb, - struct buffer_head *bh, int wait) { - PROC_INFO_INC( p_s_sb, journal.prepare ); - - if (test_set_buffer_locked(bh)) { - if (!wait) - return 0; - lock_buffer(bh); - } - set_buffer_journal_prepared (bh); - if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { - clear_buffer_journal_test (bh); - set_buffer_journal_restore_dirty (bh); - } - unlock_buffer(bh); - return 1; -} - -static void flush_old_journal_lists(struct super_block *s) { - struct reiserfs_journal *journal = SB_JOURNAL (s); - struct reiserfs_journal_list *jl; - struct list_head *entry; - time_t now = get_seconds(); - - while(!list_empty(&journal->j_journal_list)) { - entry = journal->j_journal_list.next; - jl = JOURNAL_LIST_ENTRY(entry); - /* this check should always be run, to send old lists to disk */ - if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { - flush_used_journal_lists(s, jl); - } else { - break; + struct buffer_head *bh, int wait) +{ + PROC_INFO_INC(p_s_sb, journal.prepare); + + if (test_set_buffer_locked(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_buffer_journal_prepared(bh); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_buffer_journal_test(bh); + set_buffer_journal_restore_dirty(bh); + } + unlock_buffer(bh); + return 1; +} + +static void flush_old_journal_lists(struct super_block *s) +{ + struct reiserfs_journal *journal = SB_JOURNAL(s); + struct reiserfs_journal_list *jl; + struct list_head *entry; + time_t now = get_seconds(); + + while (!list_empty(&journal->j_journal_list)) { + entry = journal->j_journal_list.next; + jl = JOURNAL_LIST_ENTRY(entry); + /* this check should always be run, to send old lists to disk */ + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { + flush_used_journal_lists(s, jl); + } else { + break; + } } - } } /* @@ -3501,374 +3840,390 @@ static void 
flush_old_journal_lists(struct super_block *s) { ** If the journal is aborted, we just clean up. Things like flushing ** journal lists, etc just won't happen. */ -static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks, - int flags) { - struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb); - struct reiserfs_journal_cnode *cn, *next, *jl_cn; - struct reiserfs_journal_cnode *last_cn = NULL; - struct reiserfs_journal_desc *desc ; - struct reiserfs_journal_commit *commit ; - struct buffer_head *c_bh ; /* commit bh */ - struct buffer_head *d_bh ; /* desc bh */ - int cur_write_start = 0 ; /* start index of current log write */ - int old_start ; - int i ; - int flush = flags & FLUSH_ALL ; - int wait_on_commit = flags & WAIT ; - struct reiserfs_journal_list *jl, *temp_jl; - struct list_head *entry, *safe; - unsigned long jindex; - unsigned long commit_trans_id; - int trans_half; - - BUG_ON (th->t_refcount > 1); - BUG_ON (!th->t_trans_id); - - current->journal_info = th->t_handle_save; - reiserfs_check_lock_depth(p_s_sb, "journal end"); - if (journal->j_len == 0) { - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - } - - lock_journal(p_s_sb) ; - if (journal->j_next_full_flush) { - flags |= FLUSH_ALL ; - flush = 1 ; - } - if (journal->j_next_async_flush) { - flags |= COMMIT_NOW | WAIT; - wait_on_commit = 1; - } - - /* check_journal_end locks the journal, and unlocks if it does not return 1 - ** it tells us if we should continue with the journal_end, or just return - */ - if (!check_journal_end(th, p_s_sb, nblocks, flags)) { - p_s_sb->s_dirt = 1; - wake_queued_writers(p_s_sb); - reiserfs_async_progress_wait(p_s_sb); - goto out ; - } - - /* check_journal_end might set these, check again */ - if (journal->j_next_full_flush) { - flush = 1 ; - } - - /* - ** j must wait means we have to flush the log blocks, and the real blocks for - ** 
this transaction - */ - if (journal->j_must_wait > 0) { - flush = 1 ; - } +static int do_journal_end(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb, unsigned long nblocks, + int flags) +{ + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_journal_cnode *cn, *next, *jl_cn; + struct reiserfs_journal_cnode *last_cn = NULL; + struct reiserfs_journal_desc *desc; + struct reiserfs_journal_commit *commit; + struct buffer_head *c_bh; /* commit bh */ + struct buffer_head *d_bh; /* desc bh */ + int cur_write_start = 0; /* start index of current log write */ + int old_start; + int i; + int flush = flags & FLUSH_ALL; + int wait_on_commit = flags & WAIT; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned long commit_trans_id; + int trans_half; + + BUG_ON(th->t_refcount > 1); + BUG_ON(!th->t_trans_id); + + put_fs_excl(); + current->journal_info = th->t_handle_save; + reiserfs_check_lock_depth(p_s_sb, "journal end"); + if (journal->j_len == 0) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), + 1); + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)); + } + lock_journal(p_s_sb); + if (journal->j_next_full_flush) { + flags |= FLUSH_ALL; + flush = 1; + } + if (journal->j_next_async_flush) { + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; + } + + /* check_journal_end locks the journal, and unlocks if it does not return 1 + ** it tells us if we should continue with the journal_end, or just return + */ + if (!check_journal_end(th, p_s_sb, nblocks, flags)) { + p_s_sb->s_dirt = 1; + wake_queued_writers(p_s_sb); + reiserfs_async_progress_wait(p_s_sb); + goto out; + } + + /* check_journal_end might set these, check again */ + if (journal->j_next_full_flush) { + flush = 1; + } + + /* + ** j must wait means we have to flush the log blocks, and the real blocks for + ** this transaction + */ + if (journal->j_must_wait > 0) { + flush = 1; + } #ifdef 
REISERFS_PREALLOCATE - /* quota ops might need to nest, setup the journal_info pointer for them */ - current->journal_info = th ; - reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into - * the transaction */ - current->journal_info = th->t_handle_save ; + /* quota ops might need to nest, setup the journal_info pointer for them */ + current->journal_info = th; + reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into + * the transaction */ + current->journal_info = th->t_handle_save; #endif - - /* setup description block */ - d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start) ; - set_buffer_uptodate(d_bh); - desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ; - memset(d_bh->b_data, 0, d_bh->b_size) ; - memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ; - set_desc_trans_id(desc, journal->j_trans_id) ; - - /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ - c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; - commit = (struct reiserfs_journal_commit *)c_bh->b_data ; - memset(c_bh->b_data, 0, c_bh->b_size) ; - set_commit_trans_id(commit, journal->j_trans_id) ; - set_buffer_uptodate(c_bh) ; - - /* init this journal list */ - jl = journal->j_current_jl; - - /* we lock the commit before doing anything because - * we want to make sure nobody tries to run flush_commit_list until - * the new transaction is fully setup, and we've already flushed the - * ordered bh list - */ - down(&jl->j_commit_lock); - - /* save the transaction id in case we need to commit it later */ - commit_trans_id = jl->j_trans_id; - - atomic_set(&jl->j_older_commits_done, 0) ; - jl->j_trans_id = journal->j_trans_id ; - jl->j_timestamp = journal->j_trans_start_time ; - jl->j_commit_bh = c_bh ; - jl->j_start = journal->j_start ; - jl->j_len = journal->j_len ; - 
atomic_set(&jl->j_nonzerolen, journal->j_len) ; - atomic_set(&jl->j_commit_left, journal->j_len + 2); - jl->j_realblock = NULL ; - - /* The ENTIRE FOR LOOP MUST not cause schedule to occur. - ** for each real block, add it to the journal list hash, - ** copy into real block index array in the commit or desc block - */ - trans_half = journal_trans_half(p_s_sb->s_blocksize); - for (i = 0, cn = journal->j_first ; cn ; cn = cn->next, i++) { - if (buffer_journaled (cn->bh)) { - jl_cn = get_cnode(p_s_sb) ; - if (!jl_cn) { - reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ; - } - if (i == 0) { - jl->j_realblock = jl_cn ; - } - jl_cn->prev = last_cn ; - jl_cn->next = NULL ; - if (last_cn) { - last_cn->next = jl_cn ; - } - last_cn = jl_cn ; - /* make sure the block we are trying to log is not a block - of journal or reserved area */ - - if (is_block_in_log_or_reserved_area(p_s_sb, cn->bh->b_blocknr)) { - reiserfs_panic(p_s_sb, "journal-2332: Trying to log block %lu, which is a log block\n", cn->bh->b_blocknr) ; - } - jl_cn->blocknr = cn->bh->b_blocknr ; - jl_cn->state = 0 ; - jl_cn->sb = p_s_sb; - jl_cn->bh = cn->bh ; - jl_cn->jlist = jl; - insert_journal_hash(journal->j_list_hash_table, jl_cn) ; - if (i < trans_half) { - desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ; - } else { - commit->j_realblock[i - trans_half] = cpu_to_le32(cn->bh->b_blocknr) ; - } - } else { - i-- ; - } - } - set_desc_trans_len(desc, journal->j_len) ; - set_desc_mount_id(desc, journal->j_mount_id) ; - set_desc_trans_id(desc, journal->j_trans_id) ; - set_commit_trans_len(commit, journal->j_len); - - /* special check in case all buffers in the journal were marked for not logging */ - if (journal->j_len == 0) { - BUG(); - } - - /* we're about to dirty all the log blocks, mark the description block - * dirty now too. 
Don't mark the commit block dirty until all the - * others are on disk - */ - mark_buffer_dirty(d_bh); - - /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ - cur_write_start = journal->j_start ; - cn = journal->j_first ; - jindex = 1 ; /* start at one so we don't get the desc again */ - while(cn) { - clear_buffer_journal_new (cn->bh); - /* copy all the real blocks into log area. dirty log blocks */ - if (buffer_journaled (cn->bh)) { - struct buffer_head *tmp_bh ; - char *addr; - struct page *page; - tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + - ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; - set_buffer_uptodate(tmp_bh); - page = cn->bh->b_page; - addr = kmap(page); - memcpy(tmp_bh->b_data, addr + offset_in_page(cn->bh->b_data), - cn->bh->b_size); - kunmap(page); - mark_buffer_dirty(tmp_bh); - jindex++ ; - set_buffer_journal_dirty (cn->bh); - clear_buffer_journaled (cn->bh); - } else { - /* JDirty cleared sometime during transaction. don't log this one */ - reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!") ; - brelse(cn->bh) ; - } - next = cn->next ; - free_cnode(p_s_sb, cn) ; - cn = next ; - cond_resched(); - } - - /* we are done with both the c_bh and d_bh, but - ** c_bh must be written after all other commit blocks, - ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
- */ - - journal->j_current_jl = alloc_journal_list(p_s_sb); - - /* now it is safe to insert this transaction on the main list */ - list_add_tail(&jl->j_list, &journal->j_journal_list); - list_add_tail(&jl->j_working_list, &journal->j_working_list); - journal->j_num_work_lists++; - - /* reset journal values for the next transaction */ - old_start = journal->j_start ; - journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); - atomic_set(&(journal->j_wcount), 0) ; - journal->j_bcount = 0 ; - journal->j_last = NULL ; - journal->j_first = NULL ; - journal->j_len = 0 ; - journal->j_trans_start_time = 0 ; - journal->j_trans_id++ ; - journal->j_current_jl->j_trans_id = journal->j_trans_id; - journal->j_must_wait = 0 ; - journal->j_len_alloc = 0 ; - journal->j_next_full_flush = 0 ; - journal->j_next_async_flush = 0 ; - init_journal_hash(p_s_sb) ; - - // make sure reiserfs_add_jh sees the new current_jl before we - // write out the tails - smp_mb(); - - /* tail conversion targets have to hit the disk before we end the - * transaction. Otherwise a later transaction might repack the tail - * before this transaction commits, leaving the data block unflushed and - * clean, if we crash before the later transaction commits, the data block - * is lost. - */ - if (!list_empty(&jl->j_tail_bh_list)) { - unlock_kernel(); - write_ordered_buffers(&journal->j_dirty_buffers_lock, - journal, jl, &jl->j_tail_bh_list); - lock_kernel(); - } - if (!list_empty(&jl->j_tail_bh_list)) - BUG(); - up(&jl->j_commit_lock); - - /* honor the flush wishes from the caller, simple commits can - ** be done outside the journal lock, they are done below - ** - ** if we don't flush the commit list right now, we put it into - ** the work queue so the people waiting on the async progress work - ** queue don't wait for this proc to flush journal lists and such. 
- */ - if (flush) { - flush_commit_list(p_s_sb, jl, 1) ; - flush_journal_list(p_s_sb, jl, 1) ; - } else if (!(jl->j_state & LIST_COMMIT_PENDING)) - queue_delayed_work(commit_wq, &journal->j_work, HZ/10); - - - /* if the next transaction has any chance of wrapping, flush - ** transactions that might get overwritten. If any journal lists are very - ** old flush them as well. - */ -first_jl: - list_for_each_safe(entry, safe, &journal->j_journal_list) { - temp_jl = JOURNAL_LIST_ENTRY(entry); - if (journal->j_start <= temp_jl->j_start) { - if ((journal->j_start + journal->j_trans_max + 1) >= - temp_jl->j_start) - { - flush_used_journal_lists(p_s_sb, temp_jl); - goto first_jl; - } else if ((journal->j_start + - journal->j_trans_max + 1) < - SB_ONDISK_JOURNAL_SIZE(p_s_sb)) - { - /* if we don't cross into the next transaction and we don't - * wrap, there is no way we can overlap any later transactions - * break now - */ - break; - } - } else if ((journal->j_start + - journal->j_trans_max + 1) > - SB_ONDISK_JOURNAL_SIZE(p_s_sb)) - { - if (((journal->j_start + journal->j_trans_max + 1) % - SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start) - { - flush_used_journal_lists(p_s_sb, temp_jl); - goto first_jl; - } else { - /* we don't overlap anything from out start to the end of the - * log, and our wrapped portion doesn't overlap anything at - * the start of the log. We can break - */ - break; - } - } - } - flush_old_journal_lists(p_s_sb); - - journal->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, journal->j_current_jl) ; - - if (!(journal->j_current_jl->j_list_bitmap)) { - reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ; - } - - atomic_set(&(journal->j_jlock), 0) ; - unlock_journal(p_s_sb) ; - /* wake up any body waiting to join. 
*/ - clear_bit(J_WRITERS_QUEUED, &journal->j_state); - wake_up(&(journal->j_join_wait)) ; - - if (!flush && wait_on_commit && - journal_list_still_alive(p_s_sb, commit_trans_id)) { - flush_commit_list(p_s_sb, jl, 1) ; - } -out: - reiserfs_check_lock_depth(p_s_sb, "journal end2"); - - memset (th, 0, sizeof (*th)); - /* Re-set th->t_super, so we can properly keep track of how many - * persistent transactions there are. We need to do this so if this - * call is part of a failed restart_transaction, we can free it later */ - th->t_super = p_s_sb; - - return journal->j_errno; -} - -static void -__reiserfs_journal_abort_hard (struct super_block *sb) -{ - struct reiserfs_journal *journal = SB_JOURNAL (sb); - if (test_bit (J_ABORTED, &journal->j_state)) - return; - - printk (KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", - reiserfs_bdevname (sb)); - - sb->s_flags |= MS_RDONLY; - set_bit (J_ABORTED, &journal->j_state); + + /* setup description block */ + d_bh = + journal_getblk(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + journal->j_start); + set_buffer_uptodate(d_bh); + desc = (struct reiserfs_journal_desc *)(d_bh)->b_data; + memset(d_bh->b_data, 0, d_bh->b_size); + memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8); + set_desc_trans_id(desc, journal->j_trans_id); + + /* setup commit block. 
Don't write (keep it clean too) this one until after everyone else is written */ + c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((journal->j_start + journal->j_len + + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + commit = (struct reiserfs_journal_commit *)c_bh->b_data; + memset(c_bh->b_data, 0, c_bh->b_size); + set_commit_trans_id(commit, journal->j_trans_id); + set_buffer_uptodate(c_bh); + + /* init this journal list */ + jl = journal->j_current_jl; + + /* we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + down(&jl->j_commit_lock); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0); + jl->j_trans_id = journal->j_trans_id; + jl->j_timestamp = journal->j_trans_start_time; + jl->j_commit_bh = c_bh; + jl->j_start = journal->j_start; + jl->j_len = journal->j_len; + atomic_set(&jl->j_nonzerolen, journal->j_len); + atomic_set(&jl->j_commit_left, journal->j_len + 2); + jl->j_realblock = NULL; + + /* The ENTIRE FOR LOOP MUST not cause schedule to occur. 
+ ** for each real block, add it to the journal list hash, + ** copy into real block index array in the commit or desc block + */ + trans_half = journal_trans_half(p_s_sb->s_blocksize); + for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) { + if (buffer_journaled(cn->bh)) { + jl_cn = get_cnode(p_s_sb); + if (!jl_cn) { + reiserfs_panic(p_s_sb, + "journal-1676, get_cnode returned NULL\n"); + } + if (i == 0) { + jl->j_realblock = jl_cn; + } + jl_cn->prev = last_cn; + jl_cn->next = NULL; + if (last_cn) { + last_cn->next = jl_cn; + } + last_cn = jl_cn; + /* make sure the block we are trying to log is not a block + of journal or reserved area */ + + if (is_block_in_log_or_reserved_area + (p_s_sb, cn->bh->b_blocknr)) { + reiserfs_panic(p_s_sb, + "journal-2332: Trying to log block %lu, which is a log block\n", + cn->bh->b_blocknr); + } + jl_cn->blocknr = cn->bh->b_blocknr; + jl_cn->state = 0; + jl_cn->sb = p_s_sb; + jl_cn->bh = cn->bh; + jl_cn->jlist = jl; + insert_journal_hash(journal->j_list_hash_table, jl_cn); + if (i < trans_half) { + desc->j_realblock[i] = + cpu_to_le32(cn->bh->b_blocknr); + } else { + commit->j_realblock[i - trans_half] = + cpu_to_le32(cn->bh->b_blocknr); + } + } else { + i--; + } + } + set_desc_trans_len(desc, journal->j_len); + set_desc_mount_id(desc, journal->j_mount_id); + set_desc_trans_id(desc, journal->j_trans_id); + set_commit_trans_len(commit, journal->j_len); + + /* special check in case all buffers in the journal were marked for not logging */ + if (journal->j_len == 0) { + BUG(); + } + + /* we're about to dirty all the log blocks, mark the description block + * dirty now too. 
Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ + cur_write_start = journal->j_start; + cn = journal->j_first; + jindex = 1; /* start at one so we don't get the desc again */ + while (cn) { + clear_buffer_journal_new(cn->bh); + /* copy all the real blocks into log area. dirty log blocks */ + if (buffer_journaled(cn->bh)) { + struct buffer_head *tmp_bh; + char *addr; + struct page *page; + tmp_bh = + journal_getblk(p_s_sb, + SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + + ((cur_write_start + + jindex) % + SB_ONDISK_JOURNAL_SIZE(p_s_sb))); + set_buffer_uptodate(tmp_bh); + page = cn->bh->b_page; + addr = kmap(page); + memcpy(tmp_bh->b_data, + addr + offset_in_page(cn->bh->b_data), + cn->bh->b_size); + kunmap(page); + mark_buffer_dirty(tmp_bh); + jindex++; + set_buffer_journal_dirty(cn->bh); + clear_buffer_journaled(cn->bh); + } else { + /* JDirty cleared sometime during transaction. don't log this one */ + reiserfs_warning(p_s_sb, + "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!"); + brelse(cn->bh); + } + next = cn->next; + free_cnode(p_s_sb, cn); + cn = next; + cond_resched(); + } + + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
+ */ + + journal->j_current_jl = alloc_journal_list(p_s_sb); + + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &journal->j_journal_list); + list_add_tail(&jl->j_working_list, &journal->j_working_list); + journal->j_num_work_lists++; + + /* reset journal values for the next transaction */ + old_start = journal->j_start; + journal->j_start = + (journal->j_start + journal->j_len + + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb); + atomic_set(&(journal->j_wcount), 0); + journal->j_bcount = 0; + journal->j_last = NULL; + journal->j_first = NULL; + journal->j_len = 0; + journal->j_trans_start_time = 0; + journal->j_trans_id++; + journal->j_current_jl->j_trans_id = journal->j_trans_id; + journal->j_must_wait = 0; + journal->j_len_alloc = 0; + journal->j_next_full_flush = 0; + journal->j_next_async_flush = 0; + init_journal_hash(p_s_sb); + + // make sure reiserfs_add_jh sees the new current_jl before we + // write out the tails + smp_mb(); + + /* tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed and + * clean, if we crash before the later transaction commits, the data block + * is lost. + */ + if (!list_empty(&jl->j_tail_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_tail_bh_list); + lock_kernel(); + } + if (!list_empty(&jl->j_tail_bh_list)) + BUG(); + up(&jl->j_commit_lock); + + /* honor the flush wishes from the caller, simple commits can + ** be done outside the journal lock, they are done below + ** + ** if we don't flush the commit list right now, we put it into + ** the work queue so the people waiting on the async progress work + ** queue don't wait for this proc to flush journal lists and such. 
+ */ + if (flush) { + flush_commit_list(p_s_sb, jl, 1); + flush_journal_list(p_s_sb, jl, 1); + } else if (!(jl->j_state & LIST_COMMIT_PENDING)) + queue_delayed_work(commit_wq, &journal->j_work, HZ / 10); + + /* if the next transaction has any chance of wrapping, flush + ** transactions that might get overwritten. If any journal lists are very + ** old flush them as well. + */ + first_jl: + list_for_each_safe(entry, safe, &journal->j_journal_list) { + temp_jl = JOURNAL_LIST_ENTRY(entry); + if (journal->j_start <= temp_jl->j_start) { + if ((journal->j_start + journal->j_trans_max + 1) >= + temp_jl->j_start) { + flush_used_journal_lists(p_s_sb, temp_jl); + goto first_jl; + } else if ((journal->j_start + + journal->j_trans_max + 1) < + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { + /* if we don't cross into the next transaction and we don't + * wrap, there is no way we can overlap any later transactions + * break now + */ + break; + } + } else if ((journal->j_start + + journal->j_trans_max + 1) > + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) { + if (((journal->j_start + journal->j_trans_max + 1) % + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= + temp_jl->j_start) { + flush_used_journal_lists(p_s_sb, temp_jl); + goto first_jl; + } else { + /* we don't overlap anything from out start to the end of the + * log, and our wrapped portion doesn't overlap anything at + * the start of the log. We can break + */ + break; + } + } + } + flush_old_journal_lists(p_s_sb); + + journal->j_current_jl->j_list_bitmap = + get_list_bitmap(p_s_sb, journal->j_current_jl); + + if (!(journal->j_current_jl->j_list_bitmap)) { + reiserfs_panic(p_s_sb, + "journal-1996: do_journal_end, could not get a list bitmap\n"); + } + + atomic_set(&(journal->j_jlock), 0); + unlock_journal(p_s_sb); + /* wake up any body waiting to join. 
*/ + clear_bit(J_WRITERS_QUEUED, &journal->j_state); + wake_up(&(journal->j_join_wait)); + + if (!flush && wait_on_commit && + journal_list_still_alive(p_s_sb, commit_trans_id)) { + flush_commit_list(p_s_sb, jl, 1); + } + out: + reiserfs_check_lock_depth(p_s_sb, "journal end2"); + + memset(th, 0, sizeof(*th)); + /* Re-set th->t_super, so we can properly keep track of how many + * persistent transactions there are. We need to do this so if this + * call is part of a failed restart_transaction, we can free it later */ + th->t_super = p_s_sb; + + return journal->j_errno; +} + +static void __reiserfs_journal_abort_hard(struct super_block *sb) +{ + struct reiserfs_journal *journal = SB_JOURNAL(sb); + if (test_bit(J_ABORTED, &journal->j_state)) + return; + + printk(KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n", + reiserfs_bdevname(sb)); + + sb->s_flags |= MS_RDONLY; + set_bit(J_ABORTED, &journal->j_state); #ifdef CONFIG_REISERFS_CHECK - dump_stack(); + dump_stack(); #endif } -static void -__reiserfs_journal_abort_soft (struct super_block *sb, int errno) +static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno) { - struct reiserfs_journal *journal = SB_JOURNAL (sb); - if (test_bit (J_ABORTED, &journal->j_state)) - return; + struct reiserfs_journal *journal = SB_JOURNAL(sb); + if (test_bit(J_ABORTED, &journal->j_state)) + return; - if (!journal->j_errno) - journal->j_errno = errno; + if (!journal->j_errno) + journal->j_errno = errno; - __reiserfs_journal_abort_hard (sb); + __reiserfs_journal_abort_hard(sb); } -void -reiserfs_journal_abort (struct super_block *sb, int errno) +void reiserfs_journal_abort(struct super_block *sb, int errno) { - return __reiserfs_journal_abort_soft (sb, errno); + return __reiserfs_journal_abort_soft(sb, errno); } diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 2406608fc5c..2533c1f64ab 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -21,648 +21,709 @@ leaf_paste_entries 
*/ - /* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ -static void leaf_copy_dir_entries (struct buffer_info * dest_bi, struct buffer_head * source, - int last_first, int item_num, int from, int copy_count) +static void leaf_copy_dir_entries(struct buffer_info *dest_bi, + struct buffer_head *source, int last_first, + int item_num, int from, int copy_count) { - struct buffer_head * dest = dest_bi->bi_bh; - int item_num_in_dest; /* either the number of target item, - or if we must create a new item, - the number of the item we will - create it next to */ - struct item_head * ih; - struct reiserfs_de_head * deh; - int copy_records_len; /* length of all records in item to be copied */ - char * records; - - ih = B_N_PITEM_HEAD (source, item_num); - - RFALSE( !is_direntry_le_ih (ih), "vs-10000: item must be directory item"); - - /* length of all record to be copied and first byte of the last of them */ - deh = B_I_DEH (source, ih); - if (copy_count) { - copy_records_len = (from ? deh_location( &(deh[from - 1]) ) : - ih_item_len(ih)) - deh_location( &(deh[from + copy_count - 1])); - records = source->b_data + ih_location(ih) + - deh_location( &(deh[from + copy_count - 1])); - } else { - copy_records_len = 0; - records = NULL; - } - - /* when copy last to first, dest buffer can contain 0 items */ - item_num_in_dest = (last_first == LAST_TO_FIRST) ? (( B_NR_ITEMS(dest) ) ? 
0 : -1) : (B_NR_ITEMS(dest) - 1); - - /* if there are no items in dest or the first/last item in dest is not item of the same directory */ - if ( (item_num_in_dest == - 1) || - (last_first == FIRST_TO_LAST && le_ih_k_offset (ih) == DOT_OFFSET) || - (last_first == LAST_TO_FIRST && comp_short_le_keys/*COMP_SHORT_KEYS*/ (&ih->ih_key, B_N_PKEY (dest, item_num_in_dest)))) { - /* create new item in dest */ - struct item_head new_ih; - - /* form item header */ - memcpy (&new_ih.ih_key, &ih->ih_key, KEY_SIZE); - put_ih_version( &new_ih, KEY_FORMAT_3_5 ); - /* calculate item len */ - put_ih_item_len( &new_ih, DEH_SIZE * copy_count + copy_records_len ); - put_ih_entry_count( &new_ih, 0 ); - - if (last_first == LAST_TO_FIRST) { - /* form key by the following way */ - if (from < I_ENTRY_COUNT(ih)) { - set_le_ih_k_offset( &new_ih, deh_offset( &(deh[from]) ) ); - /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE);*/ - } else { - /* no entries will be copied to this item in this function */ - set_le_ih_k_offset (&new_ih, U32_MAX); - /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ - } - set_le_key_k_type (KEY_FORMAT_3_5, &(new_ih.ih_key), TYPE_DIRENTRY); + struct buffer_head *dest = dest_bi->bi_bh; + int item_num_in_dest; /* either the number of target item, + or if we must create a new item, + the number of the item we will + create it next to */ + struct item_head *ih; + struct reiserfs_de_head *deh; + int copy_records_len; /* length of all records in item to be copied */ + char *records; + + ih = B_N_PITEM_HEAD(source, item_num); + + RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); + + /* length of all record to be copied and first byte of the last of them */ + deh = B_I_DEH(source, ih); + if (copy_count) { + copy_records_len = (from ? 
deh_location(&(deh[from - 1])) : + ih_item_len(ih)) - + deh_location(&(deh[from + copy_count - 1])); + records = + source->b_data + ih_location(ih) + + deh_location(&(deh[from + copy_count - 1])); + } else { + copy_records_len = 0; + records = NULL; + } + + /* when copy last to first, dest buffer can contain 0 items */ + item_num_in_dest = + (last_first == + LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest) + - 1); + + /* if there are no items in dest or the first/last item in dest is not item of the same directory */ + if ((item_num_in_dest == -1) || + (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || + (last_first == LAST_TO_FIRST + && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, + B_N_PKEY(dest, + item_num_in_dest)))) + { + /* create new item in dest */ + struct item_head new_ih; + + /* form item header */ + memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE); + put_ih_version(&new_ih, KEY_FORMAT_3_5); + /* calculate item len */ + put_ih_item_len(&new_ih, + DEH_SIZE * copy_count + copy_records_len); + put_ih_entry_count(&new_ih, 0); + + if (last_first == LAST_TO_FIRST) { + /* form key by the following way */ + if (from < I_ENTRY_COUNT(ih)) { + set_le_ih_k_offset(&new_ih, + deh_offset(&(deh[from]))); + /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */ + } else { + /* no entries will be copied to this item in this function */ + set_le_ih_k_offset(&new_ih, U32_MAX); + /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ + } + set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key), + TYPE_DIRENTRY); + } + + /* insert item into dest buffer */ + leaf_insert_into_buf(dest_bi, + (last_first == + LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), + &new_ih, NULL, 0); + } else { + /* prepare space for entries */ + leaf_paste_in_buffer(dest_bi, + (last_first == + FIRST_TO_LAST) ? 
(B_NR_ITEMS(dest) - + 1) : 0, MAX_US_INT, + DEH_SIZE * copy_count + copy_records_len, + records, 0); } - - /* insert item into dest buffer */ - leaf_insert_into_buf (dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0); - } else { - /* prepare space for entries */ - leaf_paste_in_buffer (dest_bi, (last_first==FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT, - DEH_SIZE * copy_count + copy_records_len, records, 0 - ); - } - - item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest)-1) : 0; - - leaf_paste_entries (dest_bi->bi_bh, item_num_in_dest, - (last_first == FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD (dest, item_num_in_dest)) : 0, - copy_count, deh + from, records, - DEH_SIZE * copy_count + copy_records_len - ); -} + item_num_in_dest = + (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; + + leaf_paste_entries(dest_bi->bi_bh, item_num_in_dest, + (last_first == + FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest, + item_num_in_dest)) + : 0, copy_count, deh + from, records, + DEH_SIZE * copy_count + copy_records_len); +} /* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or part of it or nothing (see the return 0 below) from SOURCE to the end (if last_first) or beginning (!last_first) of the DEST */ /* returns 1 if anything was copied, else 0 */ -static int leaf_copy_boundary_item (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, - int bytes_or_entries) +static int leaf_copy_boundary_item(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int bytes_or_entries) { - struct buffer_head * dest = dest_bi->bi_bh; - int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ - struct item_head * ih; - struct item_head * dih; - - dest_nr_item = B_NR_ITEMS(dest); - - if ( last_first == FIRST_TO_LAST ) { - /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of 
different objects - or of different types ) then there is no need to treat this item differently from the other items - that we copy, so we return */ - ih = B_N_PITEM_HEAD (src, 0); - dih = B_N_PITEM_HEAD (dest, dest_nr_item - 1); - if (!dest_nr_item || (!op_is_left_mergeable (&(ih->ih_key), src->b_size))) - /* there is nothing to merge */ - return 0; - - RFALSE( ! ih_item_len(ih), "vs-10010: item can not have empty length"); - - if ( is_direntry_le_ih (ih) ) { - if ( bytes_or_entries == -1 ) - /* copy all entries to dest */ - bytes_or_entries = ih_entry_count(ih); - leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries); - return 1; - } - - /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST - part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header - */ - if ( bytes_or_entries == -1 ) - bytes_or_entries = ih_item_len(ih); + struct buffer_head *dest = dest_bi->bi_bh; + int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ + struct item_head *ih; + struct item_head *dih; + + dest_nr_item = B_NR_ITEMS(dest); + + if (last_first == FIRST_TO_LAST) { + /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects + or of different types ) then there is no need to treat this item differently from the other items + that we copy, so we return */ + ih = B_N_PITEM_HEAD(src, 0); + dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1); + if (!dest_nr_item + || (!op_is_left_mergeable(&(ih->ih_key), src->b_size))) + /* there is nothing to merge */ + return 0; + + RFALSE(!ih_item_len(ih), + "vs-10010: item can not have empty length"); + + if (is_direntry_le_ih(ih)) { + if (bytes_or_entries == -1) + /* copy all entries to dest */ + bytes_or_entries = ih_entry_count(ih); + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0, + bytes_or_entries); + return 1; + } + + /* copy part of the 
body of the first item of SOURCE to the end of the body of the last item of the DEST + part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header + */ + if (bytes_or_entries == -1) + bytes_or_entries = ih_item_len(ih); #ifdef CONFIG_REISERFS_CHECK - else { - if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih)) - if (get_ih_free_space (ih)) - reiserfs_panic (NULL, "vs-10020: leaf_copy_boundary_item: " - "last unformatted node must be filled entirely (%h)", - ih); - } + else { + if (bytes_or_entries == ih_item_len(ih) + && is_indirect_le_ih(ih)) + if (get_ih_free_space(ih)) + reiserfs_panic(NULL, + "vs-10020: leaf_copy_boundary_item: " + "last unformatted node must be filled entirely (%h)", + ih); + } #endif - - /* merge first item (or its part) of src buffer with the last - item of dest buffer. Both are of the same file */ - leaf_paste_in_buffer (dest_bi, - dest_nr_item - 1, ih_item_len(dih), bytes_or_entries, B_I_PITEM(src,ih), 0 - ); - - if (is_indirect_le_ih (dih)) { - RFALSE( get_ih_free_space (dih), - "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", - ih); - if (bytes_or_entries == ih_item_len(ih)) - set_ih_free_space (dih, get_ih_free_space (ih)); - } - - return 1; - } - - - /* copy boundary item to right (last_first == LAST_TO_FIRST) */ - - /* ( DEST is empty or last item of SOURCE and first item of DEST - are the items of different object or of different types ) - */ - src_nr_item = B_NR_ITEMS (src); - ih = B_N_PITEM_HEAD (src, src_nr_item - 1); - dih = B_N_PITEM_HEAD (dest, 0); - - if (!dest_nr_item || !op_is_left_mergeable (&(dih->ih_key), src->b_size)) - return 0; - - if ( is_direntry_le_ih (ih)) { - if ( bytes_or_entries == -1 ) - /* bytes_or_entries = entries number in last item body of SOURCE */ - bytes_or_entries = ih_entry_count(ih); - - leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, ih_entry_count(ih) - 
bytes_or_entries, bytes_or_entries); - return 1; - } - - /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; - part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; - don't create new item header - */ - - RFALSE( is_indirect_le_ih(ih) && get_ih_free_space (ih), - "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", - ih); - - if ( bytes_or_entries == -1 ) { - /* bytes_or_entries = length of last item body of SOURCE */ - bytes_or_entries = ih_item_len(ih); - - RFALSE( le_ih_k_offset (dih) != - le_ih_k_offset (ih) + op_bytes_number (ih, src->b_size), - "vs-10050: items %h and %h do not match", ih, dih); - - /* change first item key of the DEST */ - set_le_ih_k_offset (dih, le_ih_k_offset (ih)); - - /* item becomes non-mergeable */ - /* or mergeable if left item was */ - set_le_ih_k_type (dih, le_ih_k_type (ih)); - } else { - /* merge to right only part of item */ - RFALSE( ih_item_len(ih) <= bytes_or_entries, - "vs-10060: no so much bytes %lu (needed %lu)", - ( unsigned long )ih_item_len(ih), ( unsigned long )bytes_or_entries); - - /* change first item key of the DEST */ - if ( is_direct_le_ih (dih) ) { - RFALSE( le_ih_k_offset (dih) <= (unsigned long)bytes_or_entries, - "vs-10070: dih %h, bytes_or_entries(%d)", dih, bytes_or_entries); - set_le_ih_k_offset (dih, le_ih_k_offset (dih) - bytes_or_entries); - } else { - RFALSE( le_ih_k_offset (dih) <= - (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, - "vs-10080: dih %h, bytes_or_entries(%d)", - dih, (bytes_or_entries/UNFM_P_SIZE)*dest->b_size); - set_le_ih_k_offset (dih, le_ih_k_offset (dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size)); - } - } - - leaf_paste_in_buffer (dest_bi, 0, 0, bytes_or_entries, B_I_PITEM(src,ih) + ih_item_len(ih) - bytes_or_entries, 0); - return 1; -} + /* merge first item (or its part) of src buffer with the last + item 
of dest buffer. Both are of the same file */ + leaf_paste_in_buffer(dest_bi, + dest_nr_item - 1, ih_item_len(dih), + bytes_or_entries, B_I_PITEM(src, ih), 0); + + if (is_indirect_le_ih(dih)) { + RFALSE(get_ih_free_space(dih), + "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", + ih); + if (bytes_or_entries == ih_item_len(ih)) + set_ih_free_space(dih, get_ih_free_space(ih)); + } + + return 1; + } + + /* copy boundary item to right (last_first == LAST_TO_FIRST) */ + + /* ( DEST is empty or last item of SOURCE and first item of DEST + are the items of different object or of different types ) + */ + src_nr_item = B_NR_ITEMS(src); + ih = B_N_PITEM_HEAD(src, src_nr_item - 1); + dih = B_N_PITEM_HEAD(dest, 0); + + if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size)) + return 0; + + if (is_direntry_le_ih(ih)) { + if (bytes_or_entries == -1) + /* bytes_or_entries = entries number in last item body of SOURCE */ + bytes_or_entries = ih_entry_count(ih); + + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + src_nr_item - 1, + ih_entry_count(ih) - bytes_or_entries, + bytes_or_entries); + return 1; + } + + /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; + part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; + don't create new item header + */ + + RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), + "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", + ih); + + if (bytes_or_entries == -1) { + /* bytes_or_entries = length of last item body of SOURCE */ + bytes_or_entries = ih_item_len(ih); + + RFALSE(le_ih_k_offset(dih) != + le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size), + "vs-10050: items %h and %h do not match", ih, dih); + + /* change first item key of the DEST */ + set_le_ih_k_offset(dih, le_ih_k_offset(ih)); + + /* 
item becomes non-mergeable */ + /* or mergeable if left item was */ + set_le_ih_k_type(dih, le_ih_k_type(ih)); + } else { + /* merge to right only part of item */ + RFALSE(ih_item_len(ih) <= bytes_or_entries, + "vs-10060: no so much bytes %lu (needed %lu)", + (unsigned long)ih_item_len(ih), + (unsigned long)bytes_or_entries); + + /* change first item key of the DEST */ + if (is_direct_le_ih(dih)) { + RFALSE(le_ih_k_offset(dih) <= + (unsigned long)bytes_or_entries, + "vs-10070: dih %h, bytes_or_entries(%d)", dih, + bytes_or_entries); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + bytes_or_entries); + } else { + RFALSE(le_ih_k_offset(dih) <= + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, + "vs-10080: dih %h, bytes_or_entries(%d)", + dih, + (bytes_or_entries / UNFM_P_SIZE) * dest->b_size); + set_le_ih_k_offset(dih, + le_ih_k_offset(dih) - + ((bytes_or_entries / UNFM_P_SIZE) * + dest->b_size)); + } + } + + leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, + B_I_PITEM(src, + ih) + ih_item_len(ih) - bytes_or_entries, + 0); + return 1; +} /* copy cpy_mun items from buffer src to buffer dest * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest */ -static void leaf_copy_items_entirely (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, - int first, int cpy_num) +static void leaf_copy_items_entirely(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int first, int cpy_num) { - struct buffer_head * dest; - int nr, free_space; - int dest_before; - int last_loc, last_inserted_loc, location; - int i, j; - struct block_head * blkh; - struct item_head * ih; - - RFALSE( last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, - "vs-10090: bad last_first parameter %d", last_first); - RFALSE( B_NR_ITEMS (src) - first < cpy_num, - "vs-10100: too few 
items in source %d, required %d from %d", - B_NR_ITEMS(src), cpy_num, first); - RFALSE( cpy_num < 0, "vs-10110: can not copy negative amount of items"); - RFALSE( ! dest_bi, "vs-10120: can not copy negative amount of items"); - - dest = dest_bi->bi_bh; - - RFALSE( ! dest, "vs-10130: can not copy negative amount of items"); - - if (cpy_num == 0) - return; - - blkh = B_BLK_HEAD(dest); - nr = blkh_nr_item( blkh ); - free_space = blkh_free_space(blkh); - - /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ - dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; - - /* location of head of first new item */ - ih = B_N_PITEM_HEAD (dest, dest_before); - - RFALSE( blkh_free_space(blkh) < cpy_num * IH_SIZE, - "vs-10140: not enough free space for headers %d (needed %d)", - B_FREE_SPACE (dest), cpy_num * IH_SIZE); - - /* prepare space for headers */ - memmove (ih + cpy_num, ih, (nr-dest_before) * IH_SIZE); - - /* copy item headers */ - memcpy (ih, B_N_PITEM_HEAD (src, first), cpy_num * IH_SIZE); - - free_space -= (IH_SIZE * cpy_num); - set_blkh_free_space( blkh, free_space ); - - /* location of unmovable item */ - j = location = (dest_before == 0) ? 
dest->b_size : ih_location(ih-1); - for (i = dest_before; i < nr + cpy_num; i ++) { - location -= ih_item_len( ih + i - dest_before ); - put_ih_location( ih + i - dest_before, location ); - } - - /* prepare space for items */ - last_loc = ih_location( &(ih[nr+cpy_num-1-dest_before]) ); - last_inserted_loc = ih_location( &(ih[cpy_num-1]) ); - - /* check free space */ - RFALSE( free_space < j - last_inserted_loc, - "vs-10150: not enough free space for items %d (needed %d)", - free_space, j - last_inserted_loc); - - memmove (dest->b_data + last_loc, - dest->b_data + last_loc + j - last_inserted_loc, - last_inserted_loc - last_loc); - - /* copy items */ - memcpy (dest->b_data + last_inserted_loc, B_N_PITEM(src,(first + cpy_num - 1)), - j - last_inserted_loc); - - /* sizes, item number */ - set_blkh_nr_item( blkh, nr + cpy_num ); - set_blkh_free_space( blkh, free_space - (j - last_inserted_loc) ); - - do_balance_mark_leaf_dirty (dest_bi->tb, dest, 0); - - if (dest_bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position); - RFALSE( dc_block_number(t_dc) != dest->b_blocknr, - "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", - ( long unsigned ) dest->b_blocknr, - ( long unsigned ) dc_block_number(t_dc)); - put_dc_size( t_dc, dc_size(t_dc) + (j - last_inserted_loc + IH_SIZE * cpy_num ) ); - - do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent, 0); - } -} + struct buffer_head *dest; + int nr, free_space; + int dest_before; + int last_loc, last_inserted_loc, location; + int i, j; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, + "vs-10090: bad last_first parameter %d", last_first); + RFALSE(B_NR_ITEMS(src) - first < cpy_num, + "vs-10100: too few items in source %d, required %d from %d", + B_NR_ITEMS(src), cpy_num, first); + RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items"); + 
RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items"); + + dest = dest_bi->bi_bh; + + RFALSE(!dest, "vs-10130: can not copy negative amount of items"); + + if (cpy_num == 0) + return; + + blkh = B_BLK_HEAD(dest); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ + dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; + + /* location of head of first new item */ + ih = B_N_PITEM_HEAD(dest, dest_before); + + RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, + "vs-10140: not enough free space for headers %d (needed %d)", + B_FREE_SPACE(dest), cpy_num * IH_SIZE); + + /* prepare space for headers */ + memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); + /* copy item headers */ + memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE); + + free_space -= (IH_SIZE * cpy_num); + set_blkh_free_space(blkh, free_space); + + /* location of unmovable item */ + j = location = (dest_before == 0) ? 
dest->b_size : ih_location(ih - 1); + for (i = dest_before; i < nr + cpy_num; i++) { + location -= ih_item_len(ih + i - dest_before); + put_ih_location(ih + i - dest_before, location); + } + + /* prepare space for items */ + last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before])); + last_inserted_loc = ih_location(&(ih[cpy_num - 1])); + + /* check free space */ + RFALSE(free_space < j - last_inserted_loc, + "vs-10150: not enough free space for items %d (needed %d)", + free_space, j - last_inserted_loc); + + memmove(dest->b_data + last_loc, + dest->b_data + last_loc + j - last_inserted_loc, + last_inserted_loc - last_loc); + + /* copy items */ + memcpy(dest->b_data + last_inserted_loc, + B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc); + + /* sizes, item number */ + set_blkh_nr_item(blkh, nr + cpy_num); + set_blkh_free_space(blkh, free_space - (j - last_inserted_loc)); + + do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); + RFALSE(dc_block_number(t_dc) != dest->b_blocknr, + "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", + (long unsigned)dest->b_blocknr, + (long unsigned)dc_block_number(t_dc)); + put_dc_size(t_dc, + dc_size(t_dc) + (j - last_inserted_loc + + IH_SIZE * cpy_num)); + + do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, + 0); + } +} /* This function splits the (liquid) item into two items (useful when shifting part of an item into another node.) 
*/ -static void leaf_item_bottle (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, - int item_num, int cpy_bytes) +static void leaf_item_bottle(struct buffer_info *dest_bi, + struct buffer_head *src, int last_first, + int item_num, int cpy_bytes) { - struct buffer_head * dest = dest_bi->bi_bh; - struct item_head * ih; - - RFALSE( cpy_bytes == -1, "vs-10170: bytes == - 1 means: do not split item"); - - if ( last_first == FIRST_TO_LAST ) { - /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(src,item_num))) - leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); - else { - struct item_head n_ih; - - /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST - part defined by 'cpy_bytes'; create new item header; change old item_header (????); - n_ih = new item_header; - */ - memcpy (&n_ih, ih, IH_SIZE); - put_ih_item_len( &n_ih, cpy_bytes ); - if (is_indirect_le_ih (ih)) { - RFALSE( cpy_bytes == ih_item_len(ih) && get_ih_free_space(ih), - "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", - ( long unsigned ) get_ih_free_space (ih)); - set_ih_free_space (&n_ih, 0); - } - - RFALSE( op_is_left_mergeable (&(ih->ih_key), src->b_size), - "vs-10190: bad mergeability of item %h", ih); - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ - leaf_insert_into_buf (dest_bi, B_NR_ITEMS(dest), &n_ih, B_N_PITEM (src, item_num), 0); + struct buffer_head *dest = dest_bi->bi_bh; + struct item_head *ih; + + RFALSE(cpy_bytes == -1, + "vs-10170: bytes == - 1 means: do not split item"); + + if (last_first == FIRST_TO_LAST) { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) + leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, + item_num, 0, cpy_bytes); + else { + struct item_head n_ih; + + /* 
copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST + part defined by 'cpy_bytes'; create new item header; change old item_header (????); + n_ih = new item_header; + */ + memcpy(&n_ih, ih, IH_SIZE); + put_ih_item_len(&n_ih, cpy_bytes); + if (is_indirect_le_ih(ih)) { + RFALSE(cpy_bytes == ih_item_len(ih) + && get_ih_free_space(ih), + "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", + (long unsigned)get_ih_free_space(ih)); + set_ih_free_space(&n_ih, 0); + } + + RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size), + "vs-10190: bad mergeability of item %h", ih); + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, + B_N_PITEM(src, item_num), 0); + } + } else { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num))) + leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, + item_num, + I_ENTRY_COUNT(ih) - cpy_bytes, + cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST + part defined by 'cpy_bytes'; create new item header; + n_ih = new item_header; + */ + memcpy(&n_ih, ih, SHORT_KEY_SIZE); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + if (is_direct_le_ih(ih)) { + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + ih_item_len(ih) - cpy_bytes); + set_le_ih_k_type(&n_ih, TYPE_DIRECT); + set_ih_free_space(&n_ih, MAX_US_INT); + } else { + /* indirect item */ + RFALSE(!cpy_bytes && get_ih_free_space(ih), + "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); + set_le_ih_k_offset(&n_ih, + le_ih_k_offset(ih) + + (ih_item_len(ih) - + cpy_bytes) / UNFM_P_SIZE * + dest->b_size); + set_le_ih_k_type(&n_ih, TYPE_INDIRECT); + set_ih_free_space(&n_ih, get_ih_free_space(ih)); + } + + /* set item length */ + 
put_ih_item_len(&n_ih, cpy_bytes); + + n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ + + leaf_insert_into_buf(dest_bi, 0, &n_ih, + B_N_PITEM(src, + item_num) + + ih_item_len(ih) - cpy_bytes, 0); + } } - } else { - /* if ( if item in position item_num in buffer SOURCE is directory item ) */ - if (is_direntry_le_ih(ih = B_N_PITEM_HEAD (src, item_num))) - leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, cpy_bytes); - else { - struct item_head n_ih; - - /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST - part defined by 'cpy_bytes'; create new item header; - n_ih = new item_header; - */ - memcpy (&n_ih, ih, SHORT_KEY_SIZE); - - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ - - if (is_direct_le_ih (ih)) { - set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + ih_item_len(ih) - cpy_bytes); - set_le_ih_k_type (&n_ih, TYPE_DIRECT); - set_ih_free_space (&n_ih, MAX_US_INT); - } else { - /* indirect item */ - RFALSE( !cpy_bytes && get_ih_free_space (ih), - "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); - set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + (ih_item_len(ih) - cpy_bytes) / UNFM_P_SIZE * dest->b_size); - set_le_ih_k_type (&n_ih, TYPE_INDIRECT); - set_ih_free_space (&n_ih, get_ih_free_space (ih)); - } - - /* set item length */ - put_ih_item_len( &n_ih, cpy_bytes ); - - n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ - - leaf_insert_into_buf (dest_bi, 0, &n_ih, B_N_PITEM(src,item_num) + ih_item_len(ih) - cpy_bytes, 0); - } - } } - /* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST. From last item copy cpy_num bytes for regular item and cpy_num directory entries for directory item. 
*/ -static int leaf_copy_items (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, int cpy_num, - int cpy_bytes) +static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, + int last_first, int cpy_num, int cpy_bytes) { - struct buffer_head * dest; - int pos, i, src_nr_item, bytes; - - dest = dest_bi->bi_bh; - RFALSE( !dest || !src, "vs-10210: !dest || !src"); - RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, - "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); - RFALSE( B_NR_ITEMS(src) < cpy_num, - "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), cpy_num); - RFALSE( cpy_num < 0,"vs-10240: cpy_num < 0 (%d)", cpy_num); - - if ( cpy_num == 0 ) - return 0; - - if ( last_first == FIRST_TO_LAST ) { - /* copy items to left */ - pos = 0; - if ( cpy_num == 1 ) - bytes = cpy_bytes; - else - bytes = -1; - - /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ - i = leaf_copy_boundary_item (dest_bi, src, FIRST_TO_LAST, bytes); - cpy_num -= i; - if ( cpy_num == 0 ) - return i; - pos += i; - if ( cpy_bytes == -1 ) - /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ - leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num); - else { - /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ - leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num-1); - - /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ - leaf_item_bottle (dest_bi, src, FIRST_TO_LAST, cpy_num+pos-1, cpy_bytes); - } - } else { - /* copy items to right */ - src_nr_item = B_NR_ITEMS (src); - if ( cpy_num == 1 ) - bytes = cpy_bytes; - else - bytes = -1; - - /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ - i = leaf_copy_boundary_item (dest_bi, 
src, LAST_TO_FIRST, bytes); - - cpy_num -= i; - if ( cpy_num == 0 ) - return i; - - pos = src_nr_item - cpy_num - i; - if ( cpy_bytes == -1 ) { - /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ - leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos, cpy_num); - } else { - /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ - leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos+1, cpy_num-1); - - /* copy part of the item which number is pos to the begin of the DEST */ - leaf_item_bottle (dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes); - } - } - return i; + struct buffer_head *dest; + int pos, i, src_nr_item, bytes; + + dest = dest_bi->bi_bh; + RFALSE(!dest || !src, "vs-10210: !dest || !src"); + RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, + "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); + RFALSE(B_NR_ITEMS(src) < cpy_num, + "vs-10230: No enough items: %d, req. 
%d", B_NR_ITEMS(src), + cpy_num); + RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num); + + if (cpy_num == 0) + return 0; + + if (last_first == FIRST_TO_LAST) { + /* copy items to left */ + pos = 0; + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ + i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); + cpy_num -= i; + if (cpy_num == 0) + return i; + pos += i; + if (cpy_bytes == -1) + /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num); + else { + /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */ + leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, + pos, cpy_num - 1); + + /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */ + leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, + cpy_num + pos - 1, cpy_bytes); + } + } else { + /* copy items to right */ + src_nr_item = B_NR_ITEMS(src); + if (cpy_num == 1) + bytes = cpy_bytes; + else + bytes = -1; + + /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ + i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); + + cpy_num -= i; + if (cpy_num == 0) + return i; + + pos = src_nr_item - cpy_num - i; + if (cpy_bytes == -1) { + /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos, cpy_num); + } else { + /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */ + leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, + pos + 1, cpy_num - 1); + + /* copy part of the item which number is pos to the begin of the DEST */ + leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, + 
cpy_bytes); + } + } + return i; } - /* there are types of coping: from S[0] to L[0], from S[0] to R[0], from R[0] to L[0]. for each of these we have to define parent and positions of destination and source buffers */ -static void leaf_define_dest_src_infos (int shift_mode, struct tree_balance * tb, struct buffer_info * dest_bi, - struct buffer_info * src_bi, int * first_last, - struct buffer_head * Snew) +static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, + struct buffer_info *dest_bi, + struct buffer_info *src_bi, + int *first_last, + struct buffer_head *Snew) { - memset (dest_bi, 0, sizeof (struct buffer_info)); - memset (src_bi, 0, sizeof (struct buffer_info)); - - /* define dest, src, dest parent, dest position */ - switch (shift_mode) { - case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ - src_bi->tb = tb; - src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); - src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); /* src->b_item_order */ - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[0]; - dest_bi->bi_parent = tb->FL[0]; - dest_bi->bi_position = get_left_neighbor_position (tb, 0); - *first_last = FIRST_TO_LAST; - break; - - case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ - src_bi->tb = tb; - src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); - src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[0]; - dest_bi->bi_parent = tb->FR[0]; - dest_bi->bi_position = get_right_neighbor_position (tb, 0); - *first_last = LAST_TO_FIRST; - break; - - case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ - src_bi->tb = tb; - src_bi->bi_bh = tb->R[0]; - src_bi->bi_parent = tb->FR[0]; - src_bi->bi_position = get_right_neighbor_position (tb, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->L[0]; - dest_bi->bi_parent = tb->FL[0]; - dest_bi->bi_position = 
get_left_neighbor_position (tb, 0); - *first_last = FIRST_TO_LAST; - break; - - case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ - src_bi->tb = tb; - src_bi->bi_bh = tb->L[0]; - src_bi->bi_parent = tb->FL[0]; - src_bi->bi_position = get_left_neighbor_position (tb, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = tb->R[0]; - dest_bi->bi_parent = tb->FR[0]; - dest_bi->bi_position = get_right_neighbor_position (tb, 0); - *first_last = LAST_TO_FIRST; - break; - - case LEAF_FROM_S_TO_SNEW: - src_bi->tb = tb; - src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); - src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); - src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); - dest_bi->tb = tb; - dest_bi->bi_bh = Snew; - dest_bi->bi_parent = NULL; - dest_bi->bi_position = 0; - *first_last = LAST_TO_FIRST; - break; - - default: - reiserfs_panic (NULL, "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", shift_mode); - } - RFALSE( src_bi->bi_bh == 0 || dest_bi->bi_bh == 0, - "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", - shift_mode, src_bi->bi_bh, dest_bi->bi_bh); + memset(dest_bi, 0, sizeof(struct buffer_info)); + memset(src_bi, 0, sizeof(struct buffer_info)); + + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); /* src->b_item_order */ + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = 
PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[0]; + src_bi->bi_parent = tb->FR[0]; + src_bi->bi_position = get_right_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position(tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->L[0]; + src_bi->bi_parent = tb->FL[0]; + src_bi->bi_position = get_left_neighbor_position(tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position(tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_S_TO_SNEW: + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = Snew; + dest_bi->bi_parent = NULL; + dest_bi->bi_position = 0; + *first_last = LAST_TO_FIRST; + break; + + default: + reiserfs_panic(NULL, + "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", + shift_mode); + } + RFALSE(src_bi->bi_bh == 0 || dest_bi->bi_bh == 0, + "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", + shift_mode, src_bi->bi_bh, dest_bi->bi_bh); } - - - /* copy mov_num items and mov_bytes of the (mov_num-1)th item to neighbor. 
Delete them from source */ -int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew) +int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, + int mov_bytes, struct buffer_head *Snew) { - int ret_value; - struct buffer_info dest_bi, src_bi; - int first_last; + int ret_value; + struct buffer_info dest_bi, src_bi; + int first_last; - leaf_define_dest_src_infos (shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew); + leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi, + &first_last, Snew); - ret_value = leaf_copy_items (&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes); + ret_value = + leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num, + mov_bytes); - leaf_delete_items (&src_bi, first_last, (first_last == FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) - mov_num), mov_num, mov_bytes); + leaf_delete_items(&src_bi, first_last, + (first_last == + FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) - + mov_num), mov_num, mov_bytes); - - return ret_value; + return ret_value; } - /* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) from S[0] to L[0] and replace the delimiting key */ -int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes) +int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) { - struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); - int i; + struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); + int i; - /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ - i = leaf_move_items (LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); + /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ + i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); - if ( shift_num ) { - if (B_NR_ITEMS (S0) == 0) { /* number of items in S[0] == 0 */ + if (shift_num) { + if (B_NR_ITEMS(S0) == 0) { /* number of items in 
S[0] == 0 */ - RFALSE( shift_bytes != -1, - "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", - shift_bytes); + RFALSE(shift_bytes != -1, + "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", + shift_bytes); #ifdef CONFIG_REISERFS_CHECK - if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { - print_cur_tb ("vs-10275"); - reiserfs_panic (tb->tb_sb, "vs-10275: leaf_shift_left: balance condition corrupted (%c)", tb->tb_mode); - } + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { + print_cur_tb("vs-10275"); + reiserfs_panic(tb->tb_sb, + "vs-10275: leaf_shift_left: balance condition corrupted (%c)", + tb->tb_mode); + } #endif - if (PATH_H_POSITION (tb->tb_path, 1) == 0) - replace_key (tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT (tb->tb_path, 0), 0); - - } else { - /* replace lkey in CFL[0] by 0-th key from S[0]; */ - replace_key (tb, tb->CFL[0], tb->lkey[0], S0, 0); - - RFALSE( (shift_bytes != -1 && - !(is_direntry_le_ih (B_N_PITEM_HEAD (S0, 0)) - && !I_ENTRY_COUNT (B_N_PITEM_HEAD (S0, 0)))) && - (!op_is_left_mergeable (B_N_PKEY (S0, 0), S0->b_size)), - "vs-10280: item must be mergeable"); - } - } - - return i; -} - - - + if (PATH_H_POSITION(tb->tb_path, 1) == 0) + replace_key(tb, tb->CFL[0], tb->lkey[0], + PATH_H_PPARENT(tb->tb_path, 0), 0); + + } else { + /* replace lkey in CFL[0] by 0-th key from S[0]; */ + replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); + + RFALSE((shift_bytes != -1 && + !(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0)) + && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) && + (!op_is_left_mergeable + (B_N_PKEY(S0, 0), S0->b_size)), + "vs-10280: item must be mergeable"); + } + } + return i; +} /* CLEANING STOPPED HERE */ - - - /* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ -int leaf_shift_right( - struct tree_balance * tb, - int shift_num, - int shift_bytes - ) +int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) { - // struct buffer_head * S0 = 
PATH_PLAST_BUFFER (tb->tb_path); - int ret_value; + // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); + int ret_value; - /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ - ret_value = leaf_move_items (LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); + /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ + ret_value = + leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); - /* replace rkey in CFR[0] by the 0-th key from R[0] */ - if (shift_num) { - replace_key (tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + /* replace rkey in CFR[0] by the 0-th key from R[0] */ + if (shift_num) { + replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); - } + } - return ret_value; + return ret_value; } - - -static void leaf_delete_items_entirely (struct buffer_info * bi, - int first, int del_num); +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num); /* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. If not. If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of @@ -670,287 +731,292 @@ static void leaf_delete_items_entirely (struct buffer_info * bi, If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of the last item . Part defined by del_bytes. Don't delete last item header. */ -void leaf_delete_items (struct buffer_info * cur_bi, int last_first, - int first, int del_num, int del_bytes) +void leaf_delete_items(struct buffer_info *cur_bi, int last_first, + int first, int del_num, int del_bytes) { - struct buffer_head * bh; - int item_amount = B_NR_ITEMS (bh = cur_bi->bi_bh); - - RFALSE( !bh, "10155: bh is not defined"); - RFALSE( del_num < 0, "10160: del_num can not be < 0. 
del_num==%d", del_num); - RFALSE( first < 0 || first + del_num > item_amount, - "10165: invalid number of first item to be deleted (%d) or " - "no so much items (%d) to delete (only %d)", - first, first + del_num, item_amount); - - if ( del_num == 0 ) - return; - - if ( first == 0 && del_num == item_amount && del_bytes == -1 ) { - make_empty_node (cur_bi); - do_balance_mark_leaf_dirty (cur_bi->tb, bh, 0); - return; - } - - if ( del_bytes == -1 ) - /* delete del_num items beginning from item in position first */ - leaf_delete_items_entirely (cur_bi, first, del_num); - else { - if ( last_first == FIRST_TO_LAST ) { - /* delete del_num-1 items beginning from item in position first */ - leaf_delete_items_entirely (cur_bi, first, del_num-1); - - /* delete the part of the first item of the bh - do not delete item header - */ - leaf_cut_from_buffer (cur_bi, 0, 0, del_bytes); - } else { - struct item_head * ih; - int len; - - /* delete del_num-1 items beginning from item in position first+1 */ - leaf_delete_items_entirely (cur_bi, first+1, del_num-1); - - if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh)-1))) /* the last item is directory */ - /* len = numbers of directory entries in this item */ - len = ih_entry_count(ih); - else - /* len = body len of item */ - len = ih_item_len(ih); - - /* delete the part of the last item of the bh - do not delete item header - */ - leaf_cut_from_buffer (cur_bi, B_NR_ITEMS(bh)-1, len - del_bytes, del_bytes); + struct buffer_head *bh; + int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh); + + RFALSE(!bh, "10155: bh is not defined"); + RFALSE(del_num < 0, "10160: del_num can not be < 0. 
del_num==%d", + del_num); + RFALSE(first < 0 + || first + del_num > item_amount, + "10165: invalid number of first item to be deleted (%d) or " + "no so much items (%d) to delete (only %d)", first, + first + del_num, item_amount); + + if (del_num == 0) + return; + + if (first == 0 && del_num == item_amount && del_bytes == -1) { + make_empty_node(cur_bi); + do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0); + return; } - } -} + if (del_bytes == -1) + /* delete del_num items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num); + else { + if (last_first == FIRST_TO_LAST) { + /* delete del_num-1 items beginning from item in position first */ + leaf_delete_items_entirely(cur_bi, first, del_num - 1); + + /* delete the part of the first item of the bh + do not delete item header + */ + leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); + } else { + struct item_head *ih; + int len; + + /* delete del_num-1 items beginning from item in position first+1 */ + leaf_delete_items_entirely(cur_bi, first + 1, + del_num - 1); + + if (is_direntry_le_ih + (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1))) + /* the last item is directory */ + /* len = numbers of directory entries in this item */ + len = ih_entry_count(ih); + else + /* len = body len of item */ + len = ih_item_len(ih); + + /* delete the part of the last item of the bh + do not delete item header + */ + leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, + len - del_bytes, del_bytes); + } + } +} /* insert item into the leaf node in position before */ -void leaf_insert_into_buf (struct buffer_info * bi, int before, - struct item_head * inserted_item_ih, - const char * inserted_item_body, - int zeros_number) +void leaf_insert_into_buf(struct buffer_info *bi, int before, + struct item_head *inserted_item_ih, + const char *inserted_item_body, int zeros_number) { - struct buffer_head * bh = bi->bi_bh; - int nr, free_space; - struct block_head * blkh; - struct item_head * ih; - int i; - int last_loc, 
unmoved_loc; - char * to; - - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - free_space = blkh_free_space( blkh ); - - /* check free space */ - RFALSE( free_space < ih_item_len(inserted_item_ih) + IH_SIZE, - "vs-10170: not enough free space in block %z, new item %h", - bh, inserted_item_ih); - RFALSE( zeros_number > ih_item_len(inserted_item_ih), - "vs-10172: zero number == %d, item length == %d", - zeros_number, ih_item_len(inserted_item_ih)); - - - /* get item new item must be inserted before */ - ih = B_N_PITEM_HEAD (bh, before); - - /* prepare space for the body of new item */ - last_loc = nr ? ih_location( &(ih[nr - before - 1]) ) : bh->b_size; - unmoved_loc = before ? ih_location( ih-1 ) : bh->b_size; - - - memmove (bh->b_data + last_loc - ih_item_len(inserted_item_ih), - bh->b_data + last_loc, unmoved_loc - last_loc); - - to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); - memset (to, 0, zeros_number); - to += zeros_number; - - /* copy body to prepared space */ - if (inserted_item_body) - memmove (to, inserted_item_body, ih_item_len(inserted_item_ih) - zeros_number); - else - memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); - - /* insert item header */ - memmove (ih + 1, ih, IH_SIZE * (nr - before)); - memmove (ih, inserted_item_ih, IH_SIZE); - - /* change locations */ - for (i = before; i < nr + 1; i ++) - { - unmoved_loc -= ih_item_len( &(ih[i-before])); - put_ih_location( &(ih[i-before]), unmoved_loc ); - } - - /* sizes, free space, item number */ - set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 ); - set_blkh_free_space( blkh, - free_space - (IH_SIZE + ih_item_len(inserted_item_ih ) ) ); - do_balance_mark_leaf_dirty (bi->tb, bh, 1); - - if (bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) + (IH_SIZE + ih_item_len(inserted_item_ih))); - do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); - } -} + struct buffer_head *bh = bi->bi_bh; + 
int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + char *to; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE, + "vs-10170: not enough free space in block %z, new item %h", + bh, inserted_item_ih); + RFALSE(zeros_number > ih_item_len(inserted_item_ih), + "vs-10172: zero number == %d, item length == %d", + zeros_number, ih_item_len(inserted_item_ih)); + + /* get item new item must be inserted before */ + ih = B_N_PITEM_HEAD(bh, before); + + /* prepare space for the body of new item */ + last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size; + unmoved_loc = before ? ih_location(ih - 1) : bh->b_size; + + memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), + bh->b_data + last_loc, unmoved_loc - last_loc); + + to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); + memset(to, 0, zeros_number); + to += zeros_number; + + /* copy body to prepared space */ + if (inserted_item_body) + memmove(to, inserted_item_body, + ih_item_len(inserted_item_ih) - zeros_number); + else + memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); + + /* insert item header */ + memmove(ih + 1, ih, IH_SIZE * (nr - before)); + memmove(ih, inserted_item_ih, IH_SIZE); + + /* change locations */ + for (i = before; i < nr + 1; i++) { + unmoved_loc -= ih_item_len(&(ih[i - before])); + put_ih_location(&(ih[i - before]), unmoved_loc); + } + /* sizes, free space, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); + set_blkh_free_space(blkh, + free_space - (IH_SIZE + + ih_item_len(inserted_item_ih))); + do_balance_mark_leaf_dirty(bi->tb, bh, 1); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) + (IH_SIZE + + ih_item_len(inserted_item_ih))); + 
do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} /* paste paste_size bytes to affected_item_num-th item. When item is a directory, this only prepare space for new entries */ -void leaf_paste_in_buffer (struct buffer_info * bi, int affected_item_num, - int pos_in_item, int paste_size, - const char * body, - int zeros_number) +void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, + int pos_in_item, int paste_size, + const char *body, int zeros_number) { - struct buffer_head * bh = bi->bi_bh; - int nr, free_space; - struct block_head * blkh; - struct item_head * ih; - int i; - int last_loc, unmoved_loc; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - free_space = blkh_free_space(blkh); - - - /* check free space */ - RFALSE( free_space < paste_size, - "vs-10175: not enough free space: needed %d, available %d", - paste_size, free_space); + struct buffer_head *bh = bi->bi_bh; + int nr, free_space; + struct block_head *blkh; + struct item_head *ih; + int i; + int last_loc, unmoved_loc; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + free_space = blkh_free_space(blkh); + + /* check free space */ + RFALSE(free_space < paste_size, + "vs-10175: not enough free space: needed %d, available %d", + paste_size, free_space); #ifdef CONFIG_REISERFS_CHECK - if (zeros_number > paste_size) { - print_cur_tb ("10177"); - reiserfs_panic ( NULL, "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d", - zeros_number, paste_size); - } -#endif /* CONFIG_REISERFS_CHECK */ - - - /* item to be appended */ - ih = B_N_PITEM_HEAD(bh, affected_item_num); - - last_loc = ih_location( &(ih[nr - affected_item_num - 1]) ); - unmoved_loc = affected_item_num ? 
ih_location( ih-1 ) : bh->b_size; - - /* prepare space */ - memmove (bh->b_data + last_loc - paste_size, bh->b_data + last_loc, - unmoved_loc - last_loc); - - - /* change locations */ - for (i = affected_item_num; i < nr; i ++) - put_ih_location( &(ih[i-affected_item_num]), - ih_location( &(ih[i-affected_item_num])) - paste_size ); - - if ( body ) { - if (!is_direntry_le_ih (ih)) { - if (!pos_in_item) { - /* shift data to right */ - memmove (bh->b_data + ih_location(ih) + paste_size, - bh->b_data + ih_location(ih), ih_item_len(ih)); - /* paste data in the head of item */ - memset (bh->b_data + ih_location(ih), 0, zeros_number); - memcpy (bh->b_data + ih_location(ih) + zeros_number, body, paste_size - zeros_number); - } else { - memset (bh->b_data + unmoved_loc - paste_size, 0, zeros_number); - memcpy (bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number); - } + if (zeros_number > paste_size) { + print_cur_tb("10177"); + reiserfs_panic(NULL, + "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d", + zeros_number, paste_size); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* item to be appended */ + ih = B_N_PITEM_HEAD(bh, affected_item_num); + + last_loc = ih_location(&(ih[nr - affected_item_num - 1])); + unmoved_loc = affected_item_num ? 
ih_location(ih - 1) : bh->b_size; + + /* prepare space */ + memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc, + unmoved_loc - last_loc); + + /* change locations */ + for (i = affected_item_num; i < nr; i++) + put_ih_location(&(ih[i - affected_item_num]), + ih_location(&(ih[i - affected_item_num])) - + paste_size); + + if (body) { + if (!is_direntry_le_ih(ih)) { + if (!pos_in_item) { + /* shift data to right */ + memmove(bh->b_data + ih_location(ih) + + paste_size, + bh->b_data + ih_location(ih), + ih_item_len(ih)); + /* paste data in the head of item */ + memset(bh->b_data + ih_location(ih), 0, + zeros_number); + memcpy(bh->b_data + ih_location(ih) + + zeros_number, body, + paste_size - zeros_number); + } else { + memset(bh->b_data + unmoved_loc - paste_size, 0, + zeros_number); + memcpy(bh->b_data + unmoved_loc - paste_size + + zeros_number, body, + paste_size - zeros_number); + } + } + } else + memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); + + put_ih_item_len(ih, ih_item_len(ih) + paste_size); + + /* change free space */ + set_blkh_free_space(blkh, free_space - paste_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) + paste_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); } - } - else - memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); - - put_ih_item_len( ih, ih_item_len(ih) + paste_size ); - - /* change free space */ - set_blkh_free_space( blkh, free_space - paste_size ); - - do_balance_mark_leaf_dirty (bi->tb, bh, 0); - - if (bi->bi_parent) { - struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) + paste_size ); - do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); - } } - /* cuts DEL_COUNT entries beginning from FROM-th entry. 
Directory item does not have free space, so it moves DEHs and remaining records as necessary. Return value is size of removed part of directory item in bytes. */ -static int leaf_cut_entries ( - struct buffer_head * bh, - struct item_head * ih, - int from, - int del_count - ) +static int leaf_cut_entries(struct buffer_head *bh, + struct item_head *ih, int from, int del_count) { - char * item; - struct reiserfs_de_head * deh; - int prev_record_offset; /* offset of record, that is (from-1)th */ - char * prev_record; /* */ - int cut_records_len; /* length of all removed records */ - int i; - - - /* make sure, that item is directory and there are enough entries to - remove */ - RFALSE( !is_direntry_le_ih (ih), "10180: item is not directory item"); - RFALSE( I_ENTRY_COUNT(ih) < from + del_count, - "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", - I_ENTRY_COUNT(ih), from, del_count); - - if (del_count == 0) - return 0; - - /* first byte of item */ - item = bh->b_data + ih_location(ih); - - /* entry head array */ - deh = B_I_DEH (bh, ih); - - /* first byte of remaining entries, those are BEFORE cut entries - (prev_record) and length of all removed records (cut_records_len) */ - prev_record_offset = (from ? 
deh_location( &(deh[from - 1])) : ih_item_len(ih)); - cut_records_len = prev_record_offset/*from_record*/ - - deh_location( &(deh[from + del_count - 1])); - prev_record = item + prev_record_offset; - - - /* adjust locations of remaining entries */ - for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i --) - put_deh_location( &(deh[i]), - deh_location( &deh[i] ) - (DEH_SIZE * del_count ) ); - - for (i = 0; i < from; i ++) - put_deh_location( &(deh[i]), - deh_location( &deh[i] ) - (DEH_SIZE * del_count + cut_records_len) ); - - put_ih_entry_count( ih, ih_entry_count(ih) - del_count ); - - /* shift entry head array and entries those are AFTER removed entries */ - memmove ((char *)(deh + from), - deh + from + del_count, - prev_record - cut_records_len - (char *)(deh + from + del_count)); - - /* shift records, those are BEFORE removed entries */ - memmove (prev_record - cut_records_len - DEH_SIZE * del_count, - prev_record, item + ih_item_len(ih) - prev_record); - - return DEH_SIZE * del_count + cut_records_len; + char *item; + struct reiserfs_de_head *deh; + int prev_record_offset; /* offset of record, that is (from-1)th */ + char *prev_record; /* */ + int cut_records_len; /* length of all removed records */ + int i; + + /* make sure, that item is directory and there are enough entries to + remove */ + RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); + RFALSE(I_ENTRY_COUNT(ih) < from + del_count, + "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", + I_ENTRY_COUNT(ih), from, del_count); + + if (del_count == 0) + return 0; + + /* first byte of item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* first byte of remaining entries, those are BEFORE cut entries + (prev_record) and length of all removed records (cut_records_len) */ + prev_record_offset = + (from ? 
deh_location(&(deh[from - 1])) : ih_item_len(ih)); + cut_records_len = prev_record_offset /*from_record */ - + deh_location(&(deh[from + del_count - 1])); + prev_record = item + prev_record_offset; + + /* adjust locations of remaining entries */ + for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--) + put_deh_location(&(deh[i]), + deh_location(&deh[i]) - + (DEH_SIZE * del_count)); + + for (i = 0; i < from; i++) + put_deh_location(&(deh[i]), + deh_location(&deh[i]) - (DEH_SIZE * del_count + + cut_records_len)); + + put_ih_entry_count(ih, ih_entry_count(ih) - del_count); + + /* shift entry head array and entries those are AFTER removed entries */ + memmove((char *)(deh + from), + deh + from + del_count, + prev_record - cut_records_len - (char *)(deh + from + + del_count)); + + /* shift records, those are BEFORE removed entries */ + memmove(prev_record - cut_records_len - DEH_SIZE * del_count, + prev_record, item + ih_item_len(ih) - prev_record); + + return DEH_SIZE * del_count + cut_records_len; } - /* when cut item is part of regular file pos_in_item - first byte that must be cut cut_size - number of bytes to be cut beginning from pos_in_item @@ -959,264 +1025,278 @@ static int leaf_cut_entries ( pos_in_item - number of first deleted entry cut_size - count of deleted entries */ -void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num, - int pos_in_item, int cut_size) +void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, + int pos_in_item, int cut_size) { - int nr; - struct buffer_head * bh = bi->bi_bh; - struct block_head * blkh; - struct item_head * ih; - int last_loc, unmoved_loc; - int i; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - - /* item head of truncated item */ - ih = B_N_PITEM_HEAD (bh, cut_item_num); - - if (is_direntry_le_ih (ih)) { - /* first cut entry ()*/ - cut_size = leaf_cut_entries (bh, ih, pos_in_item, cut_size); - if (pos_in_item == 0) { - /* change key */ - RFALSE( cut_item_num, - "when 0-th 
enrty of item is cut, that item must be first in the node, not %d-th", cut_item_num); - /* change item key by key of first entry in the item */ - set_le_ih_k_offset (ih, deh_offset(B_I_DEH (bh, ih))); - /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE);*/ - } - } else { - /* item is direct or indirect */ - RFALSE( is_statdata_le_ih (ih), "10195: item is stat data"); - RFALSE( pos_in_item && pos_in_item + cut_size != ih_item_len(ih), - "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", - ( long unsigned ) pos_in_item, ( long unsigned ) cut_size, - ( long unsigned ) ih_item_len (ih)); - - /* shift item body to left if cut is from the head of item */ - if (pos_in_item == 0) { - memmove( bh->b_data + ih_location(ih), - bh->b_data + ih_location(ih) + cut_size, - ih_item_len(ih) - cut_size); - - /* change key of item */ - if (is_direct_le_ih (ih)) - set_le_ih_k_offset (ih, le_ih_k_offset (ih) + cut_size); - else { - set_le_ih_k_offset (ih, le_ih_k_offset (ih) + (cut_size / UNFM_P_SIZE) * bh->b_size); - RFALSE( ih_item_len(ih) == cut_size && get_ih_free_space (ih), - "10205: invalid ih_free_space (%h)", ih); - } - } - } - - - /* location of the last item */ - last_loc = ih_location( &(ih[nr - cut_item_num - 1]) ); - - /* location of the item, which is remaining at the same place */ - unmoved_loc = cut_item_num ? 
ih_location(ih-1) : bh->b_size; - - - /* shift */ - memmove (bh->b_data + last_loc + cut_size, bh->b_data + last_loc, - unmoved_loc - last_loc - cut_size); - - /* change item length */ - put_ih_item_len( ih, ih_item_len(ih) - cut_size ); - - if (is_indirect_le_ih (ih)) { - if (pos_in_item) - set_ih_free_space (ih, 0); - } - - /* change locations */ - for (i = cut_item_num; i < nr; i ++) - put_ih_location( &(ih[i-cut_item_num]), ih_location( &ih[i-cut_item_num]) + cut_size ); - - /* size, free space */ - set_blkh_free_space( blkh, blkh_free_space(blkh) + cut_size ); - - do_balance_mark_leaf_dirty (bi->tb, bh, 0); - - if (bi->bi_parent) { - struct disk_child *t_dc; - t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) - cut_size ); - do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); - } -} + int nr; + struct buffer_head *bh = bi->bi_bh; + struct block_head *blkh; + struct item_head *ih; + int last_loc, unmoved_loc; + int i; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + + /* item head of truncated item */ + ih = B_N_PITEM_HEAD(bh, cut_item_num); + + if (is_direntry_le_ih(ih)) { + /* first cut entry () */ + cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size); + if (pos_in_item == 0) { + /* change key */ + RFALSE(cut_item_num, + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", + cut_item_num); + /* change item key by key of first entry in the item */ + set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); + /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */ + } + } else { + /* item is direct or indirect */ + RFALSE(is_statdata_le_ih(ih), "10195: item is stat data"); + RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih), + "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", + (long unsigned)pos_in_item, (long unsigned)cut_size, + (long unsigned)ih_item_len(ih)); + + /* shift item body to left if cut is from the head 
of item */ + if (pos_in_item == 0) { + memmove(bh->b_data + ih_location(ih), + bh->b_data + ih_location(ih) + cut_size, + ih_item_len(ih) - cut_size); + + /* change key of item */ + if (is_direct_le_ih(ih)) + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + cut_size); + else { + set_le_ih_k_offset(ih, + le_ih_k_offset(ih) + + (cut_size / UNFM_P_SIZE) * + bh->b_size); + RFALSE(ih_item_len(ih) == cut_size + && get_ih_free_space(ih), + "10205: invalid ih_free_space (%h)", ih); + } + } + } + + /* location of the last item */ + last_loc = ih_location(&(ih[nr - cut_item_num - 1])); + + /* location of the item, which is remaining at the same place */ + unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size; + + /* shift */ + memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc, + unmoved_loc - last_loc - cut_size); + + /* change item length */ + put_ih_item_len(ih, ih_item_len(ih) - cut_size); + if (is_indirect_le_ih(ih)) { + if (pos_in_item) + set_ih_free_space(ih, 0); + } + + /* change locations */ + for (i = cut_item_num; i < nr; i++) + put_ih_location(&(ih[i - cut_item_num]), + ih_location(&ih[i - cut_item_num]) + cut_size); + + /* size, free space */ + set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc; + t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, dc_size(t_dc) - cut_size); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} /* delete del_num items from buffer starting from the first'th item */ -static void leaf_delete_items_entirely (struct buffer_info * bi, - int first, int del_num) +static void leaf_delete_items_entirely(struct buffer_info *bi, + int first, int del_num) { - struct buffer_head * bh = bi->bi_bh; - int nr; - int i, j; - int last_loc, last_removed_loc; - struct block_head * blkh; - struct item_head * ih; - - RFALSE( bh == NULL, "10210: buffer is 0"); - RFALSE( del_num < 0, "10215: 
del_num less than 0 (%d)", del_num); - - if (del_num == 0) - return; - - blkh = B_BLK_HEAD(bh); - nr = blkh_nr_item(blkh); - - RFALSE( first < 0 || first + del_num > nr, - "10220: first=%d, number=%d, there is %d items", first, del_num, nr); - - if (first == 0 && del_num == nr) { - /* this does not work */ - make_empty_node (bi); - - do_balance_mark_leaf_dirty (bi->tb, bh, 0); - return; - } - - ih = B_N_PITEM_HEAD (bh, first); - - /* location of unmovable item */ - j = (first == 0) ? bh->b_size : ih_location(ih-1); - - /* delete items */ - last_loc = ih_location( &(ih[nr-1-first]) ); - last_removed_loc = ih_location( &(ih[del_num-1]) ); - - memmove (bh->b_data + last_loc + j - last_removed_loc, - bh->b_data + last_loc, last_removed_loc - last_loc); - - /* delete item headers */ - memmove (ih, ih + del_num, (nr - first - del_num) * IH_SIZE); - - /* change item location */ - for (i = first; i < nr - del_num; i ++) - put_ih_location( &(ih[i-first]), ih_location( &(ih[i-first]) ) + (j - last_removed_loc) ); - - /* sizes, item number */ - set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num ); - set_blkh_free_space( blkh, blkh_free_space(blkh) + (j - last_removed_loc + IH_SIZE * del_num) ); - - do_balance_mark_leaf_dirty (bi->tb, bh, 0); - - if (bi->bi_parent) { - struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position); - put_dc_size( t_dc, dc_size(t_dc) - - (j - last_removed_loc + IH_SIZE * del_num)); - do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); - } -} + struct buffer_head *bh = bi->bi_bh; + int nr; + int i, j; + int last_loc, last_removed_loc; + struct block_head *blkh; + struct item_head *ih; + + RFALSE(bh == NULL, "10210: buffer is 0"); + RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num); + + if (del_num == 0) + return; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + RFALSE(first < 0 || first + del_num > nr, + "10220: first=%d, number=%d, there is %d items", first, del_num, + nr); + + if (first == 0 && del_num == 
nr) { + /* this does not work */ + make_empty_node(bi); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + return; + } + ih = B_N_PITEM_HEAD(bh, first); + /* location of unmovable item */ + j = (first == 0) ? bh->b_size : ih_location(ih - 1); + /* delete items */ + last_loc = ih_location(&(ih[nr - 1 - first])); + last_removed_loc = ih_location(&(ih[del_num - 1])); + + memmove(bh->b_data + last_loc + j - last_removed_loc, + bh->b_data + last_loc, last_removed_loc - last_loc); + + /* delete item headers */ + memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE); + + /* change item location */ + for (i = first; i < nr - del_num; i++) + put_ih_location(&(ih[i - first]), + ih_location(&(ih[i - first])) + (j - + last_removed_loc)); + + /* sizes, item number */ + set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); + set_blkh_free_space(blkh, + blkh_free_space(blkh) + (j - last_removed_loc + + IH_SIZE * del_num)); + + do_balance_mark_leaf_dirty(bi->tb, bh, 0); + + if (bi->bi_parent) { + struct disk_child *t_dc = + B_N_CHILD(bi->bi_parent, bi->bi_position); + put_dc_size(t_dc, + dc_size(t_dc) - (j - last_removed_loc + + IH_SIZE * del_num)); + do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); + } +} /* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ -void leaf_paste_entries ( - struct buffer_head * bh, +void leaf_paste_entries(struct buffer_head *bh, int item_num, int before, int new_entry_count, - struct reiserfs_de_head * new_dehs, - const char * records, - int paste_size - ) + struct reiserfs_de_head *new_dehs, + const char *records, int paste_size) { - struct item_head * ih; - char * item; - struct reiserfs_de_head * deh; - char * insert_point; - int i, old_entry_num; - - if (new_entry_count == 0) - return; - - ih = B_N_PITEM_HEAD(bh, item_num); - - /* make sure, that item is directory, and there are enough records in it */ - RFALSE( !is_direntry_le_ih (ih), "10225: item is not directory item"); - RFALSE( 
I_ENTRY_COUNT (ih) < before, - "10230: there are no entry we paste entries before. entry_count = %d, before = %d", - I_ENTRY_COUNT (ih), before); - - - /* first byte of dest item */ - item = bh->b_data + ih_location(ih); - - /* entry head array */ - deh = B_I_DEH (bh, ih); - - /* new records will be pasted at this point */ - insert_point = item + (before ? deh_location( &(deh[before - 1])) : (ih_item_len(ih) - paste_size)); - - /* adjust locations of records that will be AFTER new records */ - for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i --) - put_deh_location( &(deh[i]), - deh_location(&(deh[i])) + (DEH_SIZE * new_entry_count )); - - /* adjust locations of records that will be BEFORE new records */ - for (i = 0; i < before; i ++) - put_deh_location( &(deh[i]), deh_location(&(deh[i])) + paste_size ); - - old_entry_num = I_ENTRY_COUNT(ih); - put_ih_entry_count( ih, ih_entry_count(ih) + new_entry_count ); - - /* prepare space for pasted records */ - memmove (insert_point + paste_size, insert_point, item + (ih_item_len(ih) - paste_size) - insert_point); - - /* copy new records */ - memcpy (insert_point + DEH_SIZE * new_entry_count, records, - paste_size - DEH_SIZE * new_entry_count); - - /* prepare space for new entry heads */ - deh += before; - memmove ((char *)(deh + new_entry_count), deh, insert_point - (char *)deh); - - /* copy new entry heads */ - deh = (struct reiserfs_de_head *)((char *)deh); - memcpy (deh, new_dehs, DEH_SIZE * new_entry_count); - - /* set locations of new records */ - for (i = 0; i < new_entry_count; i ++) - { - put_deh_location( &(deh[i]), - deh_location( &(deh[i] )) + - (- deh_location( &(new_dehs[new_entry_count - 1])) + - insert_point + DEH_SIZE * new_entry_count - item)); - } - - - /* change item key if necessary (when we paste before 0-th entry */ - if (!before) - { - set_le_ih_k_offset (ih, deh_offset(new_dehs)); + struct item_head *ih; + char *item; + struct reiserfs_de_head *deh; + char *insert_point; + int i, old_entry_num; + + if 
(new_entry_count == 0) + return; + + ih = B_N_PITEM_HEAD(bh, item_num); + + /* make sure, that item is directory, and there are enough records in it */ + RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); + RFALSE(I_ENTRY_COUNT(ih) < before, + "10230: there are no entry we paste entries before. entry_count = %d, before = %d", + I_ENTRY_COUNT(ih), before); + + /* first byte of dest item */ + item = bh->b_data + ih_location(ih); + + /* entry head array */ + deh = B_I_DEH(bh, ih); + + /* new records will be pasted at this point */ + insert_point = + item + + (before ? deh_location(&(deh[before - 1])) + : (ih_item_len(ih) - paste_size)); + + /* adjust locations of records that will be AFTER new records */ + for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--) + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + + (DEH_SIZE * new_entry_count)); + + /* adjust locations of records that will be BEFORE new records */ + for (i = 0; i < before; i++) + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + paste_size); + + old_entry_num = I_ENTRY_COUNT(ih); + put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); + + /* prepare space for pasted records */ + memmove(insert_point + paste_size, insert_point, + item + (ih_item_len(ih) - paste_size) - insert_point); + + /* copy new records */ + memcpy(insert_point + DEH_SIZE * new_entry_count, records, + paste_size - DEH_SIZE * new_entry_count); + + /* prepare space for new entry heads */ + deh += before; + memmove((char *)(deh + new_entry_count), deh, + insert_point - (char *)deh); + + /* copy new entry heads */ + deh = (struct reiserfs_de_head *)((char *)deh); + memcpy(deh, new_dehs, DEH_SIZE * new_entry_count); + + /* set locations of new records */ + for (i = 0; i < new_entry_count; i++) { + put_deh_location(&(deh[i]), + deh_location(&(deh[i])) + + (-deh_location + (&(new_dehs[new_entry_count - 1])) + + insert_point + DEH_SIZE * new_entry_count - + item)); + } + + /* change item key if necessary 
(when we paste before 0-th entry */ + if (!before) { + set_le_ih_k_offset(ih, deh_offset(new_dehs)); /* memcpy (&ih->ih_key.k_offset, &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ - } - + } #ifdef CONFIG_REISERFS_CHECK - { - int prev, next; - /* check record locations */ - deh = B_I_DEH (bh, ih); - for (i = 0; i < I_ENTRY_COUNT(ih); i ++) { - next = (i < I_ENTRY_COUNT(ih) - 1) ? deh_location( &(deh[i + 1])) : 0; - prev = (i != 0) ? deh_location( &(deh[i - 1]) ) : 0; - - if (prev && prev <= deh_location( &(deh[i]))) - reiserfs_warning (NULL, "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)", - ih, deh + i - 1, i, deh + i); - if (next && next >= deh_location( &(deh[i]))) - reiserfs_warning (NULL, "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)", - ih, i, deh + i, deh + i + 1); - } - } + { + int prev, next; + /* check record locations */ + deh = B_I_DEH(bh, ih); + for (i = 0; i < I_ENTRY_COUNT(ih); i++) { + next = + (i < + I_ENTRY_COUNT(ih) - + 1) ? deh_location(&(deh[i + 1])) : 0; + prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0; + + if (prev && prev <= deh_location(&(deh[i]))) + reiserfs_warning(NULL, + "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)", + ih, deh + i - 1, i, deh + i); + if (next && next >= deh_location(&(deh[i]))) + reiserfs_warning(NULL, + "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)", + ih, i, deh + i, deh + i + 1); + } + } #endif } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 7d4dc5f5aa8..3549067c42d 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -25,86 +25,85 @@ // directory item contains array of entry headers. 
This performs // binary search through that array -static int bin_search_in_dir_item (struct reiserfs_dir_entry * de, loff_t off) +static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off) { - struct item_head * ih = de->de_ih; - struct reiserfs_de_head * deh = de->de_deh; - int rbound, lbound, j; - - lbound = 0; - rbound = I_ENTRY_COUNT (ih) - 1; - - for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) { - if (off < deh_offset (deh + j)) { - rbound = j - 1; - continue; + struct item_head *ih = de->de_ih; + struct reiserfs_de_head *deh = de->de_deh; + int rbound, lbound, j; + + lbound = 0; + rbound = I_ENTRY_COUNT(ih) - 1; + + for (j = (rbound + lbound) / 2; lbound <= rbound; + j = (rbound + lbound) / 2) { + if (off < deh_offset(deh + j)) { + rbound = j - 1; + continue; + } + if (off > deh_offset(deh + j)) { + lbound = j + 1; + continue; + } + // this is not name found, but matched third key component + de->de_entry_num = j; + return NAME_FOUND; } - if (off > deh_offset (deh + j)) { - lbound = j + 1; - continue; - } - // this is not name found, but matched third key component - de->de_entry_num = j; - return NAME_FOUND; - } - de->de_entry_num = lbound; - return NAME_NOT_FOUND; + de->de_entry_num = lbound; + return NAME_NOT_FOUND; } - // comment? maybe something like set de to point to what the path points to? 
-static inline void set_de_item_location (struct reiserfs_dir_entry * de, struct path * path) +static inline void set_de_item_location(struct reiserfs_dir_entry *de, + struct path *path) { - de->de_bh = get_last_bh (path); - de->de_ih = get_ih (path); - de->de_deh = B_I_DEH (de->de_bh, de->de_ih); - de->de_item_num = PATH_LAST_POSITION (path); -} - + de->de_bh = get_last_bh(path); + de->de_ih = get_ih(path); + de->de_deh = B_I_DEH(de->de_bh, de->de_ih); + de->de_item_num = PATH_LAST_POSITION(path); +} // de_bh, de_ih, de_deh (points to first element of array), de_item_num is set -inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de) +inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) { - struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; - if (de->de_entry_num >= ih_entry_count (de->de_ih)) - BUG (); + if (de->de_entry_num >= ih_entry_count(de->de_ih)) + BUG(); - de->de_entrylen = entry_length (de->de_bh, de->de_ih, de->de_entry_num); - de->de_namelen = de->de_entrylen - (de_with_sd (deh) ? SD_SIZE : 0); - de->de_name = B_I_PITEM (de->de_bh, de->de_ih) + deh_location(deh); - if (de->de_name[de->de_namelen - 1] == 0) - de->de_namelen = strlen (de->de_name); + de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); + de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? 
SD_SIZE : 0); + de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh); + if (de->de_name[de->de_namelen - 1] == 0) + de->de_namelen = strlen(de->de_name); } - // what entry points to -static inline void set_de_object_key (struct reiserfs_dir_entry * de) +static inline void set_de_object_key(struct reiserfs_dir_entry *de) { - if (de->de_entry_num >= ih_entry_count (de->de_ih)) - BUG (); - de->de_dir_id = deh_dir_id( &(de->de_deh[de->de_entry_num])); - de->de_objectid = deh_objectid( &(de->de_deh[de->de_entry_num])); + if (de->de_entry_num >= ih_entry_count(de->de_ih)) + BUG(); + de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num])); + de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num])); } - -static inline void store_de_entry_key (struct reiserfs_dir_entry * de) +static inline void store_de_entry_key(struct reiserfs_dir_entry *de) { - struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; - - if (de->de_entry_num >= ih_entry_count (de->de_ih)) - BUG (); - - /* store key of the found entry */ - de->de_entry_key.version = KEY_FORMAT_3_5; - de->de_entry_key.on_disk_key.k_dir_id = le32_to_cpu (de->de_ih->ih_key.k_dir_id); - de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu (de->de_ih->ih_key.k_objectid); - set_cpu_key_k_offset (&(de->de_entry_key), deh_offset (deh)); - set_cpu_key_k_type (&(de->de_entry_key), TYPE_DIRENTRY); + struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; + + if (de->de_entry_num >= ih_entry_count(de->de_ih)) + BUG(); + + /* store key of the found entry */ + de->de_entry_key.version = KEY_FORMAT_3_5; + de->de_entry_key.on_disk_key.k_dir_id = + le32_to_cpu(de->de_ih->ih_key.k_dir_id); + de->de_entry_key.on_disk_key.k_objectid = + le32_to_cpu(de->de_ih->ih_key.k_objectid); + set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh)); + set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY); } - /* We assign a key to each directory item, and place multiple entries in a single directory item. 
A directory item has a key equal to the key of the first directory entry in it. @@ -117,58 +116,60 @@ entry position in the item */ /* The function is NOT SCHEDULE-SAFE! */ -int search_by_entry_key (struct super_block * sb, const struct cpu_key * key, - struct path * path, struct reiserfs_dir_entry * de) +int search_by_entry_key(struct super_block *sb, const struct cpu_key *key, + struct path *path, struct reiserfs_dir_entry *de) { - int retval; - - retval = search_item (sb, key, path); - switch (retval) { - case ITEM_NOT_FOUND: - if (!PATH_LAST_POSITION (path)) { - reiserfs_warning (sb, "vs-7000: search_by_entry_key: search_by_key returned item position == 0"); - pathrelse(path) ; - return IO_ERROR ; + int retval; + + retval = search_item(sb, key, path); + switch (retval) { + case ITEM_NOT_FOUND: + if (!PATH_LAST_POSITION(path)) { + reiserfs_warning(sb, + "vs-7000: search_by_entry_key: search_by_key returned item position == 0"); + pathrelse(path); + return IO_ERROR; + } + PATH_LAST_POSITION(path)--; + + case ITEM_FOUND: + break; + + case IO_ERROR: + return retval; + + default: + pathrelse(path); + reiserfs_warning(sb, + "vs-7002: search_by_entry_key: no path to here"); + return IO_ERROR; } - PATH_LAST_POSITION (path) --; - - case ITEM_FOUND: - break; - - case IO_ERROR: - return retval; - - default: - pathrelse (path); - reiserfs_warning (sb, "vs-7002: search_by_entry_key: no path to here"); - return IO_ERROR; - } - set_de_item_location (de, path); + set_de_item_location(de, path); #ifdef CONFIG_REISERFS_CHECK - if (!is_direntry_le_ih (de->de_ih) || - COMP_SHORT_KEYS (&(de->de_ih->ih_key), key)) { - print_block (de->de_bh, 0, -1, -1); - reiserfs_panic (sb, "vs-7005: search_by_entry_key: found item %h is not directory item or " - "does not belong to the same directory as key %K", de->de_ih, key); - } -#endif /* CONFIG_REISERFS_CHECK */ - - /* binary search in directory item by third componen t of the - key. 
sets de->de_entry_num of de */ - retval = bin_search_in_dir_item (de, cpu_key_k_offset (key)); - path->pos_in_item = de->de_entry_num; - if (retval != NAME_NOT_FOUND) { - // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set - set_de_name_and_namelen (de); - set_de_object_key (de); - } - return retval; + if (!is_direntry_le_ih(de->de_ih) || + COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) { + print_block(de->de_bh, 0, -1, -1); + reiserfs_panic(sb, + "vs-7005: search_by_entry_key: found item %h is not directory item or " + "does not belong to the same directory as key %K", + de->de_ih, key); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* binary search in directory item by third componen t of the + key. sets de->de_entry_num of de */ + retval = bin_search_in_dir_item(de, cpu_key_k_offset(key)); + path->pos_in_item = de->de_entry_num; + if (retval != NAME_NOT_FOUND) { + // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set + set_de_name_and_namelen(de); + set_de_object_key(de); + } + return retval; } - - /* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ /* The third component is hashed, and you can choose from more than @@ -176,197 +177,210 @@ int search_by_entry_key (struct super_block * sb, const struct cpu_key * key, but are thought about. This function should be moved to hashes.c Jedi, please do so. -Hans */ -static __u32 get_third_component (struct super_block * s, - const char * name, int len) +static __u32 get_third_component(struct super_block *s, + const char *name, int len) { - __u32 res; - - if (!len || (len == 1 && name[0] == '.')) - return DOT_OFFSET; - if (len == 2 && name[0] == '.' && name[1] == '.') - return DOT_DOT_OFFSET; - - res = REISERFS_SB(s)->s_hash_function (name, len); - - // take bits from 7-th to 30-th including both bounds - res = GET_HASH_VALUE(res); - if (res == 0) - // needed to have no names before "." and ".." 
those have hash - // value == 0 and generation conters 1 and 2 accordingly - res = 128; - return res + MAX_GENERATION_NUMBER; + __u32 res; + + if (!len || (len == 1 && name[0] == '.')) + return DOT_OFFSET; + if (len == 2 && name[0] == '.' && name[1] == '.') + return DOT_DOT_OFFSET; + + res = REISERFS_SB(s)->s_hash_function(name, len); + + // take bits from 7-th to 30-th including both bounds + res = GET_HASH_VALUE(res); + if (res == 0) + // needed to have no names before "." and ".." those have hash + // value == 0 and generation conters 1 and 2 accordingly + res = 128; + return res + MAX_GENERATION_NUMBER; } - -static int reiserfs_match (struct reiserfs_dir_entry * de, - const char * name, int namelen) +static int reiserfs_match(struct reiserfs_dir_entry *de, + const char *name, int namelen) { - int retval = NAME_NOT_FOUND; + int retval = NAME_NOT_FOUND; - if ((namelen == de->de_namelen) && - !memcmp(de->de_name, name, de->de_namelen)) - retval = (de_visible (de->de_deh + de->de_entry_num) ? NAME_FOUND : NAME_FOUND_INVISIBLE); + if ((namelen == de->de_namelen) && + !memcmp(de->de_name, name, de->de_namelen)) + retval = + (de_visible(de->de_deh + de->de_entry_num) ? 
NAME_FOUND : + NAME_FOUND_INVISIBLE); - return retval; + return retval; } - /* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ /* used when hash collisions exist */ - -static int linear_search_in_dir_item (struct cpu_key * key, struct reiserfs_dir_entry * de, - const char * name, int namelen) +static int linear_search_in_dir_item(struct cpu_key *key, + struct reiserfs_dir_entry *de, + const char *name, int namelen) { - struct reiserfs_de_head * deh = de->de_deh; - int retval; - int i; + struct reiserfs_de_head *deh = de->de_deh; + int retval; + int i; - i = de->de_entry_num; + i = de->de_entry_num; - if (i == I_ENTRY_COUNT (de->de_ih) || - GET_HASH_VALUE (deh_offset (deh + i)) != GET_HASH_VALUE (cpu_key_k_offset (key))) { - i --; - } + if (i == I_ENTRY_COUNT(de->de_ih) || + GET_HASH_VALUE(deh_offset(deh + i)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + i--; + } - RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih), - "vs-7010: array of entry headers not found"); + RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih), + "vs-7010: array of entry headers not found"); - deh += i; + deh += i; - for (; i >= 0; i --, deh --) { - if (GET_HASH_VALUE (deh_offset (deh)) != - GET_HASH_VALUE (cpu_key_k_offset (key))) { - // hash value does not match, no need to check whole name - return NAME_NOT_FOUND; - } - - /* mark, that this generation number is used */ - if (de->de_gen_number_bit_string) - set_bit (GET_GENERATION_NUMBER (deh_offset (deh)), (unsigned long *)de->de_gen_number_bit_string); + for (; i >= 0; i--, deh--) { + if (GET_HASH_VALUE(deh_offset(deh)) != + GET_HASH_VALUE(cpu_key_k_offset(key))) { + // hash value does not match, no need to check whole name + return NAME_NOT_FOUND; + } + + /* mark, that this generation number is used */ + if (de->de_gen_number_bit_string) + set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), + (unsigned long *)de->de_gen_number_bit_string); - // calculate pointer to name and namelen - de->de_entry_num = i; - 
set_de_name_and_namelen (de); + // calculate pointer to name and namelen + de->de_entry_num = i; + set_de_name_and_namelen(de); - if ((retval = reiserfs_match (de, name, namelen)) != NAME_NOT_FOUND) { - // de's de_name, de_namelen, de_recordlen are set. Fill the rest: + if ((retval = + reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) { + // de's de_name, de_namelen, de_recordlen are set. Fill the rest: - // key of pointed object - set_de_object_key (de); + // key of pointed object + set_de_object_key(de); - store_de_entry_key (de); + store_de_entry_key(de); - // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE - return retval; + // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE + return retval; + } } - } - - if (GET_GENERATION_NUMBER (le_ih_k_offset (de->de_ih)) == 0) - /* we have reached left most entry in the node. In common we - have to go to the left neighbor, but if generation counter - is 0 already, we know for sure, that there is no name with - the same hash value */ - // FIXME: this work correctly only because hash value can not - // be 0. Btw, in case of Yura's hash it is probably possible, - // so, this is a bug - return NAME_NOT_FOUND; - RFALSE( de->de_item_num, - "vs-7015: two diritems of the same directory in one node?"); + if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0) + /* we have reached left most entry in the node. In common we + have to go to the left neighbor, but if generation counter + is 0 already, we know for sure, that there is no name with + the same hash value */ + // FIXME: this work correctly only because hash value can not + // be 0. 
Btw, in case of Yura's hash it is probably possible, + // so, this is a bug + return NAME_NOT_FOUND; - return GOTO_PREVIOUS_ITEM; -} + RFALSE(de->de_item_num, + "vs-7015: two diritems of the same directory in one node?"); + return GOTO_PREVIOUS_ITEM; +} // may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND // FIXME: should add something like IOERROR -static int reiserfs_find_entry (struct inode * dir, const char * name, int namelen, - struct path * path_to_entry, struct reiserfs_dir_entry * de) +static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, + struct path *path_to_entry, + struct reiserfs_dir_entry *de) { - struct cpu_key key_to_search; - int retval; - - - if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize)) - return NAME_NOT_FOUND; - - /* we will search for this key in the tree */ - make_cpu_key (&key_to_search, dir, - get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); - - while (1) { - retval = search_by_entry_key (dir->i_sb, &key_to_search, path_to_entry, de); - if (retval == IO_ERROR) { - reiserfs_warning (dir->i_sb, "zam-7001: io error in %s", - __FUNCTION__); - return IO_ERROR; - } - - /* compare names for all entries having given hash value */ - retval = linear_search_in_dir_item (&key_to_search, de, name, namelen); - if (retval != GOTO_PREVIOUS_ITEM) { - /* there is no need to scan directory anymore. 
Given entry found or does not exist */ - path_to_entry->pos_in_item = de->de_entry_num; - return retval; - } - - /* there is left neighboring item of this directory and given entry can be there */ - set_cpu_key_k_offset (&key_to_search, le_ih_k_offset (de->de_ih) - 1); - pathrelse (path_to_entry); - - } /* while (1) */ + struct cpu_key key_to_search; + int retval; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return NAME_NOT_FOUND; + + /* we will search for this key in the tree */ + make_cpu_key(&key_to_search, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + while (1) { + retval = + search_by_entry_key(dir->i_sb, &key_to_search, + path_to_entry, de); + if (retval == IO_ERROR) { + reiserfs_warning(dir->i_sb, "zam-7001: io error in %s", + __FUNCTION__); + return IO_ERROR; + } + + /* compare names for all entries having given hash value */ + retval = + linear_search_in_dir_item(&key_to_search, de, name, + namelen); + if (retval != GOTO_PREVIOUS_ITEM) { + /* there is no need to scan directory anymore. 
Given entry found or does not exist */ + path_to_entry->pos_in_item = de->de_entry_num; + return retval; + } + + /* there is left neighboring item of this directory and given entry can be there */ + set_cpu_key_k_offset(&key_to_search, + le_ih_k_offset(de->de_ih) - 1); + pathrelse(path_to_entry); + + } /* while (1) */ } - -static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) { - int retval; - struct inode * inode = NULL; - struct reiserfs_dir_entry de; - INITIALIZE_PATH (path_to_entry); - - if (REISERFS_MAX_NAME (dir->i_sb->s_blocksize) < dentry->d_name.len) - return ERR_PTR(-ENAMETOOLONG); - - reiserfs_write_lock(dir->i_sb); - de.de_gen_number_bit_string = NULL; - retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de); - pathrelse (&path_to_entry); - if (retval == NAME_FOUND) { - /* Hide the .reiserfs_priv directory */ - if (reiserfs_xattrs (dir->i_sb) && - !old_format_only(dir->i_sb) && - REISERFS_SB(dir->i_sb)->priv_root && - REISERFS_SB(dir->i_sb)->priv_root->d_inode && - de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->k_objectid)) { - reiserfs_write_unlock (dir->i_sb); - return ERR_PTR (-EACCES); + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + + if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) + return ERR_PTR(-ENAMETOOLONG); + + reiserfs_write_lock(dir->i_sb); + de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval == NAME_FOUND) { + /* Hide the .reiserfs_priv directory */ + if (reiserfs_xattrs(dir->i_sb) && + !old_format_only(dir->i_sb) && + REISERFS_SB(dir->i_sb)->priv_root && + 
REISERFS_SB(dir->i_sb)->priv_root->d_inode && + de.de_objectid == + le32_to_cpu(INODE_PKEY + (REISERFS_SB(dir->i_sb)->priv_root->d_inode)-> + k_objectid)) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-EACCES); + } + + inode = + reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + if (!inode || IS_ERR(inode)) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-EACCES); + } + + /* Propogate the priv_object flag so we know we're in the priv tree */ + if (is_reiserfs_priv_object(dir)) + reiserfs_mark_inode_private(inode); + } + reiserfs_write_unlock(dir->i_sb); + if (retval == IO_ERROR) { + return ERR_PTR(-EIO); } - inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); - if (!inode || IS_ERR(inode)) { - reiserfs_write_unlock(dir->i_sb); - return ERR_PTR(-EACCES); - } - - /* Propogate the priv_object flag so we know we're in the priv tree */ - if (is_reiserfs_priv_object (dir)) - reiserfs_mark_inode_private (inode); - } - reiserfs_write_unlock(dir->i_sb); - if ( retval == IO_ERROR ) { - return ERR_PTR(-EIO); - } - - if (inode) - return d_splice_alias(inode, dentry); - - d_add(dentry, inode); - return NULL; -} + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + return NULL; +} /* ** looks up the dentry of the parent directory for child. 
@@ -374,40 +388,38 @@ static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dent */ struct dentry *reiserfs_get_parent(struct dentry *child) { - int retval; - struct inode * inode = NULL; - struct reiserfs_dir_entry de; - INITIALIZE_PATH (path_to_entry); - struct dentry *parent; - struct inode *dir = child->d_inode ; - - - if (dir->i_nlink == 0) { - return ERR_PTR(-ENOENT); - } - de.de_gen_number_bit_string = NULL; - - reiserfs_write_lock(dir->i_sb); - retval = reiserfs_find_entry (dir, "..", 2, &path_to_entry, &de); - pathrelse (&path_to_entry); - if (retval != NAME_FOUND) { + int retval; + struct inode *inode = NULL; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path_to_entry); + struct dentry *parent; + struct inode *dir = child->d_inode; + + if (dir->i_nlink == 0) { + return ERR_PTR(-ENOENT); + } + de.de_gen_number_bit_string = NULL; + + reiserfs_write_lock(dir->i_sb); + retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de); + pathrelse(&path_to_entry); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(dir->i_sb); + return ERR_PTR(-ENOENT); + } + inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); reiserfs_write_unlock(dir->i_sb); - return ERR_PTR(-ENOENT); - } - inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); - reiserfs_write_unlock(dir->i_sb); - - if (!inode || IS_ERR(inode)) { - return ERR_PTR(-EACCES); - } - parent = d_alloc_anon(inode); - if (!parent) { - iput(inode); - parent = ERR_PTR(-ENOMEM); - } - return parent; -} + if (!inode || IS_ERR(inode)) { + return ERR_PTR(-EACCES); + } + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + return parent; +} /* add entry to the directory (entry can be hidden). 
@@ -415,132 +427,143 @@ insert definition of when hidden directories are used here -Hans Does not mark dir inode dirty, do it after successesfull call to it */ -static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct inode * dir, - const char * name, int namelen, struct inode * inode, - int visible) +static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, + struct inode *dir, const char *name, int namelen, + struct inode *inode, int visible) { - struct cpu_key entry_key; - struct reiserfs_de_head * deh; - INITIALIZE_PATH (path); - struct reiserfs_dir_entry de; - int bit_string [MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; - int gen_number; - char small_buf[32+DEH_SIZE] ; /* 48 bytes now and we avoid kmalloc - if we create file with short name */ - char * buffer; - int buflen, paste_size; - int retval; - - BUG_ON (!th->t_trans_id); - - /* cannot allow items to be added into a busy deleted directory */ - if (!namelen) - return -EINVAL; - - if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize)) - return -ENAMETOOLONG; - - /* each entry has unique key. compose it */ - make_cpu_key (&entry_key, dir, - get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); - - /* get memory for composing the entry */ - buflen = DEH_SIZE + ROUND_UP (namelen); - if (buflen > sizeof (small_buf)) { - buffer = reiserfs_kmalloc (buflen, GFP_NOFS, dir->i_sb); - if (buffer == 0) - return -ENOMEM; - } else - buffer = small_buf; - - paste_size = (get_inode_sd_version (dir) == STAT_DATA_V1) ? 
(DEH_SIZE + namelen) : buflen; - - /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ - deh = (struct reiserfs_de_head *)buffer; - deh->deh_location = 0; /* JDM Endian safe if 0 */ - put_deh_offset( deh, cpu_key_k_offset( &entry_key ) ); - deh->deh_state = 0; /* JDM Endian safe if 0 */ - /* put key (ino analog) to de */ - deh->deh_dir_id = INODE_PKEY (inode)->k_dir_id; /* safe: k_dir_id is le */ - deh->deh_objectid = INODE_PKEY (inode)->k_objectid; /* safe: k_objectid is le */ - - /* copy name */ - memcpy ((char *)(deh + 1), name, namelen); - /* padd by 0s to the 4 byte boundary */ - padd_item ((char *)(deh + 1), ROUND_UP (namelen), namelen); - - /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ - mark_de_without_sd (deh); - visible ? mark_de_visible (deh) : mark_de_hidden (deh); - - /* find the proper place for the new entry */ - memset (bit_string, 0, sizeof (bit_string)); - de.de_gen_number_bit_string = (char *)bit_string; - retval = reiserfs_find_entry (dir, name, namelen, &path, &de); - if( retval != NAME_NOT_FOUND ) { - if (buffer != small_buf) - reiserfs_kfree (buffer, buflen, dir->i_sb); - pathrelse (&path); + struct cpu_key entry_key; + struct reiserfs_de_head *deh; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + int bit_string[MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; + int gen_number; + char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc + if we create file with short name */ + char *buffer; + int buflen, paste_size; + int retval; + + BUG_ON(!th->t_trans_id); + + /* cannot allow items to be added into a busy deleted directory */ + if (!namelen) + return -EINVAL; + + if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize)) + return -ENAMETOOLONG; + + /* each entry has unique key. 
compose it */ + make_cpu_key(&entry_key, dir, + get_third_component(dir->i_sb, name, namelen), + TYPE_DIRENTRY, 3); + + /* get memory for composing the entry */ + buflen = DEH_SIZE + ROUND_UP(namelen); + if (buflen > sizeof(small_buf)) { + buffer = reiserfs_kmalloc(buflen, GFP_NOFS, dir->i_sb); + if (buffer == 0) + return -ENOMEM; + } else + buffer = small_buf; + + paste_size = + (get_inode_sd_version(dir) == + STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen; + + /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ + deh = (struct reiserfs_de_head *)buffer; + deh->deh_location = 0; /* JDM Endian safe if 0 */ + put_deh_offset(deh, cpu_key_k_offset(&entry_key)); + deh->deh_state = 0; /* JDM Endian safe if 0 */ + /* put key (ino analog) to de */ + deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; /* safe: k_dir_id is le */ + deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* safe: k_objectid is le */ + + /* copy name */ + memcpy((char *)(deh + 1), name, namelen); + /* padd by 0s to the 4 byte boundary */ + padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen); + + /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ + mark_de_without_sd(deh); + visible ? 
mark_de_visible(deh) : mark_de_hidden(deh); + + /* find the proper place for the new entry */ + memset(bit_string, 0, sizeof(bit_string)); + de.de_gen_number_bit_string = (char *)bit_string; + retval = reiserfs_find_entry(dir, name, namelen, &path, &de); + if (retval != NAME_NOT_FOUND) { + if (buffer != small_buf) + reiserfs_kfree(buffer, buflen, dir->i_sb); + pathrelse(&path); + + if (retval == IO_ERROR) { + return -EIO; + } + + if (retval != NAME_FOUND) { + reiserfs_warning(dir->i_sb, + "zam-7002:%s: \"reiserfs_find_entry\" " + "has returned unexpected value (%d)", + __FUNCTION__, retval); + } + + return -EEXIST; + } - if ( retval == IO_ERROR ) { - return -EIO; + gen_number = + find_first_zero_bit((unsigned long *)bit_string, + MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning(dir->i_sb, + "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); + if (buffer != small_buf) + reiserfs_kfree(buffer, buflen, dir->i_sb); + pathrelse(&path); + return -EBUSY; + } + /* adjust offset of directory enrty */ + put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); + set_cpu_key_k_offset(&entry_key, deh_offset(deh)); + + /* update max-hash-collisions counter in reiserfs_sb_info */ + PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number); + + if (gen_number != 0) { /* we need to re-search for the insertion point */ + if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) != + NAME_NOT_FOUND) { + reiserfs_warning(dir->i_sb, + "vs-7032: reiserfs_add_entry: " + "entry with this key (%K) already exists", + &entry_key); + + if (buffer != small_buf) + reiserfs_kfree(buffer, buflen, dir->i_sb); + pathrelse(&path); + return -EBUSY; + } } - if (retval != NAME_FOUND) { - reiserfs_warning (dir->i_sb, "zam-7002:%s: \"reiserfs_find_entry\" " - "has returned unexpected value (%d)", - __FUNCTION__, retval); - } - - return -EEXIST; - } - - gen_number = 
find_first_zero_bit ((unsigned long *)bit_string, MAX_GENERATION_NUMBER + 1); - if (gen_number > MAX_GENERATION_NUMBER) { - /* there is no free generation number */ - reiserfs_warning (dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); - if (buffer != small_buf) - reiserfs_kfree (buffer, buflen, dir->i_sb); - pathrelse (&path); - return -EBUSY; - } - /* adjust offset of directory enrty */ - put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number)); - set_cpu_key_k_offset (&entry_key, deh_offset(deh)); - - /* update max-hash-collisions counter in reiserfs_sb_info */ - PROC_INFO_MAX( th -> t_super, max_hash_collisions, gen_number ); - - if (gen_number != 0) { /* we need to re-search for the insertion point */ - if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) { - reiserfs_warning (dir->i_sb, "vs-7032: reiserfs_add_entry: " - "entry with this key (%K) already exists", - &entry_key); - - if (buffer != small_buf) - reiserfs_kfree (buffer, buflen, dir->i_sb); - pathrelse (&path); - return -EBUSY; + /* perform the insertion of the entry that we have prepared */ + retval = + reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, + paste_size); + if (buffer != small_buf) + reiserfs_kfree(buffer, buflen, dir->i_sb); + if (retval) { + reiserfs_check_path(&path); + return retval; } - } - - /* perform the insertion of the entry that we have prepared */ - retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size); - if (buffer != small_buf) - reiserfs_kfree (buffer, buflen, dir->i_sb); - if (retval) { - reiserfs_check_path(&path) ; - return retval; - } - dir->i_size += paste_size; - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; - if (!S_ISDIR (inode->i_mode) && visible) - // reiserfs_mkdir or reiserfs_rename will do that by itself - reiserfs_update_sd (th, dir); + dir->i_size += paste_size; + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if 
(!S_ISDIR(inode->i_mode) && visible) + // reiserfs_mkdir or reiserfs_rename will do that by itself + reiserfs_update_sd(th, dir); - reiserfs_check_path(&path) ; - return 0; + reiserfs_check_path(&path); + return 0; } /* quota utility function, call if you've had to abort after calling @@ -548,12 +571,13 @@ static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct in ** This should only be called on inodes that do not have stat data ** inserted into the tree yet. */ -static int drop_new_inode(struct inode *inode) { - DQUOT_DROP(inode); - make_bad_inode(inode) ; - inode->i_flags |= S_NOQUOTA; - iput(inode) ; - return 0 ; +static int drop_new_inode(struct inode *inode) +{ + DQUOT_DROP(inode); + make_bad_inode(inode); + inode->i_flags |= S_NOQUOTA; + iput(inode); + return 0; } /* utility function that does setup for reiserfs_new_inode. @@ -561,902 +585,971 @@ static int drop_new_inode(struct inode *inode) { ** outside of a transaction, so we had to pull some bits of ** reiserfs_new_inode out into this func. 
*/ -static int new_inode_init(struct inode *inode, struct inode *dir, int mode) { - - /* the quota init calls have to know who to charge the quota to, so - ** we have to set uid and gid here - */ - inode->i_uid = current->fsuid; - inode->i_mode = mode; - - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } else { - inode->i_gid = current->fsgid; - } - DQUOT_INIT(inode); - return 0 ; +static int new_inode_init(struct inode *inode, struct inode *dir, int mode) +{ + + /* the quota init calls have to know who to charge the quota to, so + ** we have to set uid and gid here + */ + inode->i_uid = current->fsuid; + inode->i_mode = mode; + /* Make inode invalid - just in case we are going to drop it before + * the initialization happens */ + INODE_PKEY(inode)->k_objectid = 0; + + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } else { + inode->i_gid = current->fsgid; + } + DQUOT_INIT(inode); + return 0; } -static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { - int retval; - struct inode * inode; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); - struct reiserfs_transaction_handle th ; - int locked; - - if (!(inode = new_inode(dir->i_sb))) { - return -ENOMEM ; - } - new_inode_init(inode, dir, mode); - - locked = reiserfs_cache_default_acl (dir); - - reiserfs_write_lock(dir->i_sb); - - if (locked) - reiserfs_write_lock_xattrs (dir->i_sb); - - retval = journal_begin(&th, dir->i_sb, jbegin_count); - if (retval) { - drop_new_inode (inode); - goto out_failed; - } - - retval = reiserfs_new_inode (&th, dir, mode, NULL, 0/*i_size*/, 
dentry, inode); - if (retval) - goto out_failed; - - if (locked) { - reiserfs_write_unlock_xattrs (dir->i_sb); - locked = 0; - } - - inode->i_op = &reiserfs_file_inode_operations; - inode->i_fop = &reiserfs_file_operations; - inode->i_mapping->a_ops = &reiserfs_address_space_operations ; - - retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, - inode, 1/*visible*/); - if (retval) { - int err; - inode->i_nlink--; - reiserfs_update_sd (&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count) ; - if (err) - retval = err; - iput (inode); - goto out_failed; - } - reiserfs_update_inode_transaction(inode) ; - reiserfs_update_inode_transaction(dir) ; - - d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count) ; - -out_failed: - if (locked) - reiserfs_write_unlock_xattrs (dir->i_sb); - reiserfs_write_unlock(dir->i_sb); - return retval; -} + int retval; + struct inode *inode; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + struct reiserfs_transaction_handle th; + int locked; + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + locked = reiserfs_cache_default_acl(dir); -static int reiserfs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev) -{ - int retval; - struct inode * inode; - struct reiserfs_transaction_handle th ; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); - int locked; + reiserfs_write_lock(dir->i_sb); - if (!new_valid_dev(rdev)) - return -EINVAL; + if (locked) + reiserfs_write_lock_xattrs(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, 
jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode); + if (retval) + goto out_failed; + + if (locked) { + reiserfs_write_unlock_xattrs(dir->i_sb); + locked = 0; + } + + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + iput(inode); + goto out_failed; + } + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); - if (!(inode = new_inode(dir->i_sb))) { - return -ENOMEM ; - } - new_inode_init(inode, dir, mode); + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); - locked = reiserfs_cache_default_acl (dir); + out_failed: + if (locked) + reiserfs_write_unlock_xattrs(dir->i_sb); + reiserfs_write_unlock(dir->i_sb); + return retval; +} - reiserfs_write_lock(dir->i_sb); +static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) +{ + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + int locked; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); - if (locked) - reiserfs_write_lock_xattrs (dir->i_sb); + locked = reiserfs_cache_default_acl(dir); - retval = journal_begin(&th, dir->i_sb, jbegin_count) ; - if (retval) { - 
drop_new_inode (inode); - goto out_failed; - } + reiserfs_write_lock(dir->i_sb); - retval = reiserfs_new_inode (&th, dir, mode, NULL, 0/*i_size*/, dentry, inode); - if (retval) { - goto out_failed; - } + if (locked) + reiserfs_write_lock_xattrs(dir->i_sb); - if (locked) { - reiserfs_write_unlock_xattrs (dir->i_sb); - locked = 0; - } + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } + retval = + reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry, + inode); + if (retval) { + goto out_failed; + } - inode->i_op = &reiserfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev) ; + if (locked) { + reiserfs_write_unlock_xattrs(dir->i_sb); + locked = 0; + } - //FIXME: needed for block and char devices only - reiserfs_update_sd (&th, inode); + inode->i_op = &reiserfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + + //FIXME: needed for block and char devices only + reiserfs_update_sd(&th, inode); + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + iput(inode); + goto out_failed; + } - reiserfs_update_inode_transaction(inode) ; - reiserfs_update_inode_transaction(dir) ; + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); - retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, - inode, 1/*visible*/); - if (retval) { - int err; - inode->i_nlink--; - reiserfs_update_sd (&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count) ; - if (err) - retval = err; - iput (inode); - goto out_failed; - } - - d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count) 
; - -out_failed: - if (locked) - reiserfs_write_unlock_xattrs (dir->i_sb); - reiserfs_write_unlock(dir->i_sb); - return retval; + out_failed: + if (locked) + reiserfs_write_unlock_xattrs(dir->i_sb); + reiserfs_write_unlock(dir->i_sb); + return retval; } - -static int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode) +static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int retval; - struct inode * inode; - struct reiserfs_transaction_handle th ; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); - int locked; + int retval; + struct inode *inode; + struct reiserfs_transaction_handle th; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb)); + int locked; #ifdef DISPLACE_NEW_PACKING_LOCALITIES - /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ - REISERFS_I(dir)->new_packing_locality = 1; + /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ + REISERFS_I(dir)->new_packing_locality = 1; #endif - mode = S_IFDIR | mode; - if (!(inode = new_inode(dir->i_sb))) { - return -ENOMEM ; - } - new_inode_init(inode, dir, mode); - - locked = reiserfs_cache_default_acl (dir); - - reiserfs_write_lock(dir->i_sb); - if (locked) - reiserfs_write_lock_xattrs (dir->i_sb); - - retval = journal_begin(&th, dir->i_sb, jbegin_count) ; - if (retval) { - drop_new_inode (inode); - goto out_failed; - } - - - /* inc the link count now, so another writer doesn't overflow it while - ** we sleep later on. 
- */ - INC_DIR_INODE_NLINK(dir) - - retval = reiserfs_new_inode (&th, dir, mode, NULL/*symlink*/, - old_format_only (dir->i_sb) ? - EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, - dentry, inode); - if (retval) { - dir->i_nlink-- ; - goto out_failed; - } - - if (locked) { - reiserfs_write_unlock_xattrs (dir->i_sb); - locked = 0; - } - - reiserfs_update_inode_transaction(inode) ; - reiserfs_update_inode_transaction(dir) ; - - inode->i_op = &reiserfs_dir_inode_operations; - inode->i_fop = &reiserfs_dir_operations; - - // note, _this_ add_entry will not update dir's stat data - retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, - inode, 1/*visible*/); - if (retval) { - int err; - inode->i_nlink = 0; - DEC_DIR_INODE_NLINK(dir); - reiserfs_update_sd (&th, inode); - err = journal_end(&th, dir->i_sb, jbegin_count) ; - if (err) - retval = err; - iput (inode); - goto out_failed; - } - - // the above add_entry did not update dir's stat data - reiserfs_update_sd (&th, dir); - - d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count) ; -out_failed: - if (locked) - reiserfs_write_unlock_xattrs (dir->i_sb); - reiserfs_write_unlock(dir->i_sb); - return retval; -} + mode = S_IFDIR | mode; + if (!(inode = new_inode(dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, dir, mode); + + locked = reiserfs_cache_default_acl(dir); + + reiserfs_write_lock(dir->i_sb); + if (locked) + reiserfs_write_lock_xattrs(dir->i_sb); + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + goto out_failed; + } -static inline int reiserfs_empty_dir(struct inode *inode) { - /* we can cheat because an old format dir cannot have - ** EMPTY_DIR_SIZE, and a new format dir cannot have - ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, - ** regardless of disk format version, the directory is empty. 
- */ - if (inode->i_size != EMPTY_DIR_SIZE && - inode->i_size != EMPTY_DIR_SIZE_V1) { - return 0 ; - } - return 1 ; + /* inc the link count now, so another writer doesn't overflow it while + ** we sleep later on. + */ + INC_DIR_INODE_NLINK(dir) + + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode); + if (retval) { + dir->i_nlink--; + goto out_failed; + } + + if (locked) { + reiserfs_write_unlock_xattrs(dir->i_sb); + locked = 0; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + + // note, _this_ add_entry will not update dir's stat data + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink = 0; + DEC_DIR_INODE_NLINK(dir); + reiserfs_update_sd(&th, inode); + err = journal_end(&th, dir->i_sb, jbegin_count); + if (err) + retval = err; + iput(inode); + goto out_failed; + } + // the above add_entry did not update dir's stat data + reiserfs_update_sd(&th, dir); + + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + out_failed: + if (locked) + reiserfs_write_unlock_xattrs(dir->i_sb); + reiserfs_write_unlock(dir->i_sb); + return retval; } -static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry) +static inline int reiserfs_empty_dir(struct inode *inode) { - int retval, err; - struct inode * inode; - struct reiserfs_transaction_handle th ; - int jbegin_count; - INITIALIZE_PATH (path); - struct reiserfs_dir_entry de; - - - /* we will be doing 2 balancings and update 2 stat data, we change quotas - * of the owner of the directory and of the owner of the parent directory */ - jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); - - 
reiserfs_write_lock(dir->i_sb); - retval = journal_begin(&th, dir->i_sb, jbegin_count) ; - if (retval) - goto out_rmdir; - - de.de_gen_number_bit_string = NULL; - if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { - retval = -ENOENT; - goto end_rmdir; - } else if ( retval == IO_ERROR) { - retval = -EIO; - goto end_rmdir; - } - - inode = dentry->d_inode; - - reiserfs_update_inode_transaction(inode) ; - reiserfs_update_inode_transaction(dir) ; - - if (de.de_objectid != inode->i_ino) { - // FIXME: compare key of an object and a key found in the - // entry - retval = -EIO; - goto end_rmdir; - } - if (!reiserfs_empty_dir(inode)) { - retval = -ENOTEMPTY; - goto end_rmdir; - } - - /* cut entry from dir directory */ - retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, - NULL, /* page */ - 0/*new file size - not used here*/); - if (retval < 0) - goto end_rmdir; - - if ( inode->i_nlink != 2 && inode->i_nlink != 1 ) - reiserfs_warning (inode->i_sb, "%s: empty directory has nlink " - "!= 2 (%d)", __FUNCTION__, inode->i_nlink); - - inode->i_nlink = 0; - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - reiserfs_update_sd (&th, inode); - - DEC_DIR_INODE_NLINK(dir) - dir->i_size -= (DEH_SIZE + de.de_entrylen); - reiserfs_update_sd (&th, dir); - - /* prevent empty directory from getting lost */ - add_save_link (&th, inode, 0/* not truncate */); - - retval = journal_end(&th, dir->i_sb, jbegin_count) ; - reiserfs_check_path(&path) ; -out_rmdir: - reiserfs_write_unlock(dir->i_sb); - return retval; - - end_rmdir: - /* we must release path, because we did not call - reiserfs_cut_from_item, or reiserfs_cut_from_item does not - release path if operation was not complete */ - pathrelse (&path); - err = journal_end(&th, dir->i_sb, jbegin_count) ; - reiserfs_write_unlock(dir->i_sb); - return err ? 
err : retval; + /* we can cheat because an old format dir cannot have + ** EMPTY_DIR_SIZE, and a new format dir cannot have + ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, + ** regardless of disk format version, the directory is empty. + */ + if (inode->i_size != EMPTY_DIR_SIZE && + inode->i_size != EMPTY_DIR_SIZE_V1) { + return 0; + } + return 1; } -static int reiserfs_unlink (struct inode * dir, struct dentry *dentry) +static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) { - int retval, err; - struct inode * inode; - struct reiserfs_dir_entry de; - INITIALIZE_PATH (path); - struct reiserfs_transaction_handle th ; - int jbegin_count; - unsigned long savelink; - - inode = dentry->d_inode; - - /* in this transaction we can be doing at max two balancings and update - two stat datas, we change quotas of the owner of the directory and of - the owner of the parent directory */ - jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); - - reiserfs_write_lock(dir->i_sb); - retval = journal_begin(&th, dir->i_sb, jbegin_count) ; - if (retval) - goto out_unlink; - - de.de_gen_number_bit_string = NULL; - if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { - retval = -ENOENT; - goto end_unlink; - } else if (retval == IO_ERROR) { - retval = -EIO; - goto end_unlink; - } - - reiserfs_update_inode_transaction(inode) ; - reiserfs_update_inode_transaction(dir) ; - - if (de.de_objectid != inode->i_ino) { - // FIXME: compare key of an object and a key found in the - // entry - retval = -EIO; - goto end_unlink; - } - - if (!inode->i_nlink) { - reiserfs_warning (inode->i_sb, "%s: deleting nonexistent file " - "(%s:%lu), %d", __FUNCTION__, - reiserfs_bdevname (inode->i_sb), inode->i_ino, - inode->i_nlink); - inode->i_nlink = 1; - } - - inode->i_nlink--; - - /* - * we schedule before doing the add_save_link call, save the link - * count so we don't 
race - */ - savelink = inode->i_nlink; - - - retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, NULL, 0); - if (retval < 0) { - inode->i_nlink++; - goto end_unlink; - } - inode->i_ctime = CURRENT_TIME_SEC; - reiserfs_update_sd (&th, inode); - - dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; - reiserfs_update_sd (&th, dir); - - if (!savelink) - /* prevent file from getting lost */ - add_save_link (&th, inode, 0/* not truncate */); - - retval = journal_end(&th, dir->i_sb, jbegin_count) ; - reiserfs_check_path(&path) ; - reiserfs_write_unlock(dir->i_sb); - return retval; - - end_unlink: - pathrelse (&path); - err = journal_end(&th, dir->i_sb, jbegin_count) ; - reiserfs_check_path(&path) ; - if (err) - retval = err; -out_unlink: - reiserfs_write_unlock(dir->i_sb); - return retval; + int retval, err; + struct inode *inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + + /* we will be doing 2 balancings and update 2 stat data, we change quotas + * of the owner of the directory and of the owner of the parent directory. 
+ * The quota structure is possibly deleted only on last iput => outside + * of this transaction */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_rmdir; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_rmdir; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_rmdir; + } + + inode = dentry->d_inode; + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_rmdir; + } + if (!reiserfs_empty_dir(inode)) { + retval = -ENOTEMPTY; + goto end_rmdir; + } + + /* cut entry from dir directory */ + retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, /* page */ + 0 /*new file size - not used here */ ); + if (retval < 0) + goto end_rmdir; + + if (inode->i_nlink != 2 && inode->i_nlink != 1) + reiserfs_warning(inode->i_sb, "%s: empty directory has nlink " + "!= 2 (%d)", __FUNCTION__, inode->i_nlink); + + inode->i_nlink = 0; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + DEC_DIR_INODE_NLINK(dir) + dir->i_size -= (DEH_SIZE + de.de_entrylen); + reiserfs_update_sd(&th, dir); + + /* prevent empty directory from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + out_rmdir: + reiserfs_write_unlock(dir->i_sb); + return retval; + + end_rmdir: + /* we must release path, because we did not call + reiserfs_cut_from_item, or reiserfs_cut_from_item does not + release path if operation was not complete */ + pathrelse(&path); + 
err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; } -static int reiserfs_symlink (struct inode * parent_dir, - struct dentry * dentry, const char * symname) +static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) { - int retval; - struct inode * inode; - char * name; - int item_len; - struct reiserfs_transaction_handle th ; - int mode = S_IFLNK | S_IRWXUGO; - /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS); - - if (!(inode = new_inode(parent_dir->i_sb))) { - return -ENOMEM ; - } - new_inode_init(inode, parent_dir, mode); - - reiserfs_write_lock(parent_dir->i_sb); - item_len = ROUND_UP (strlen (symname)); - if (item_len > MAX_DIRECT_ITEM_LEN (parent_dir->i_sb->s_blocksize)) { - retval = -ENAMETOOLONG; - drop_new_inode(inode); - goto out_failed; - } - - name = reiserfs_kmalloc (item_len, GFP_NOFS, parent_dir->i_sb); - if (!name) { - drop_new_inode(inode); - retval = -ENOMEM; - goto out_failed; - } - memcpy (name, symname, strlen (symname)); - padd_item (name, item_len, strlen (symname)); - - /* We would inherit the default ACL here, but symlinks don't get ACLs */ - - retval = journal_begin(&th, parent_dir->i_sb, jbegin_count) ; - if (retval) { - drop_new_inode (inode); - reiserfs_kfree (name, item_len, parent_dir->i_sb); - goto out_failed; - } - - retval = reiserfs_new_inode (&th, parent_dir, mode, name, strlen (symname), - dentry, inode); - reiserfs_kfree (name, item_len, parent_dir->i_sb); - if (retval) { /* reiserfs_new_inode iputs for us */ - goto out_failed; - } - - reiserfs_update_inode_transaction(inode) ; - reiserfs_update_inode_transaction(parent_dir) ; - - inode->i_op = &reiserfs_symlink_inode_operations; - inode->i_mapping->a_ops = &reiserfs_address_space_operations; - - // must be sure this inode is 
written with this transaction - // - //reiserfs_update_sd (&th, inode, READ_BLOCKS); - - retval = reiserfs_add_entry (&th, parent_dir, dentry->d_name.name, - dentry->d_name.len, inode, 1/*visible*/); - if (retval) { - int err; + int retval, err; + struct inode *inode; + struct reiserfs_dir_entry de; + INITIALIZE_PATH(path); + struct reiserfs_transaction_handle th; + int jbegin_count; + unsigned long savelink; + + inode = dentry->d_inode; + + /* in this transaction we can be doing at max two balancings and update + * two stat datas, we change quotas of the owner of the directory and of + * the owner of the parent directory. The quota structure is possibly + * deleted only on iput => outside of this transaction */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 2 + 2 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + reiserfs_write_lock(dir->i_sb); + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) + goto out_unlink; + + de.de_gen_number_bit_string = NULL; + if ((retval = + reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, + &path, &de)) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_unlink; + } else if (retval == IO_ERROR) { + retval = -EIO; + goto end_unlink; + } + + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(dir); + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_unlink; + } + + if (!inode->i_nlink) { + reiserfs_warning(inode->i_sb, "%s: deleting nonexistent file " + "(%s:%lu), %d", __FUNCTION__, + reiserfs_bdevname(inode->i_sb), inode->i_ino, + inode->i_nlink); + inode->i_nlink = 1; + } + inode->i_nlink--; - reiserfs_update_sd (&th, inode); - err = journal_end(&th, parent_dir->i_sb, jbegin_count) ; + + /* + * we schedule before doing the add_save_link call, save the link + * count so we don't race + */ + savelink = inode->i_nlink; + + retval = + reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), 
dir, NULL, + 0); + if (retval < 0) { + inode->i_nlink++; + goto end_unlink; + } + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + dir->i_size -= (de.de_entrylen + DEH_SIZE); + dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, dir); + + if (!savelink) + /* prevent file from getting lost */ + add_save_link(&th, inode, 0 /* not truncate */ ); + + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); + reiserfs_write_unlock(dir->i_sb); + return retval; + + end_unlink: + pathrelse(&path); + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_check_path(&path); if (err) - retval = err; - iput (inode); - goto out_failed; - } - - d_instantiate(dentry, inode); - retval = journal_end(&th, parent_dir->i_sb, jbegin_count) ; -out_failed: - reiserfs_write_unlock(parent_dir->i_sb); - return retval; + retval = err; + out_unlink: + reiserfs_write_unlock(dir->i_sb); + return retval; } -static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) +static int reiserfs_symlink(struct inode *parent_dir, + struct dentry *dentry, const char *symname) { - int retval; - struct inode *inode = old_dentry->d_inode; - struct reiserfs_transaction_handle th ; - /* We need blocks for transaction + update of quotas for the owners of the directory */ - int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS; - - reiserfs_write_lock(dir->i_sb); - if (inode->i_nlink >= REISERFS_LINK_MAX) { - //FIXME: sd_nlink is 32 bit for new files - reiserfs_write_unlock(dir->i_sb); - return -EMLINK; - } - if (inode->i_nlink == 0) { - reiserfs_write_unlock(dir->i_sb); - return -ENOENT; - } - - /* inc before scheduling so reiserfs_unlink knows we are here */ - inode->i_nlink++; - - retval = journal_begin(&th, dir->i_sb, jbegin_count) ; - if (retval) { - inode->i_nlink--; - reiserfs_write_unlock (dir->i_sb); - return retval; - } - - /* create new entry */ - retval = 
reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, - inode, 1/*visible*/); - - reiserfs_update_inode_transaction(inode) ; - reiserfs_update_inode_transaction(dir) ; - - if (retval) { - int err; - inode->i_nlink--; - err = journal_end(&th, dir->i_sb, jbegin_count) ; - reiserfs_write_unlock(dir->i_sb); - return err ? err : retval; - } + int retval; + struct inode *inode; + char *name; + int item_len; + struct reiserfs_transaction_handle th; + int mode = S_IFLNK | S_IRWXUGO; + /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) + + REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb)); + + if (!(inode = new_inode(parent_dir->i_sb))) { + return -ENOMEM; + } + new_inode_init(inode, parent_dir, mode); + + reiserfs_write_lock(parent_dir->i_sb); + item_len = ROUND_UP(strlen(symname)); + if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) { + retval = -ENAMETOOLONG; + drop_new_inode(inode); + goto out_failed; + } + + name = reiserfs_kmalloc(item_len, GFP_NOFS, parent_dir->i_sb); + if (!name) { + drop_new_inode(inode); + retval = -ENOMEM; + goto out_failed; + } + memcpy(name, symname, strlen(symname)); + padd_item(name, item_len, strlen(symname)); + + /* We would inherit the default ACL here, but symlinks don't get ACLs */ + + retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); + if (retval) { + drop_new_inode(inode); + reiserfs_kfree(name, item_len, parent_dir->i_sb); + goto out_failed; + } + + retval = + reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), + dentry, inode); + reiserfs_kfree(name, item_len, parent_dir->i_sb); + if (retval) { /* reiserfs_new_inode iputs for us */ + goto out_failed; + } - inode->i_ctime = CURRENT_TIME_SEC; - reiserfs_update_sd (&th, inode); + reiserfs_update_inode_transaction(inode); + reiserfs_update_inode_transaction(parent_dir); 
+ + inode->i_op = &reiserfs_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + // must be sure this inode is written with this transaction + // + //reiserfs_update_sd (&th, inode, READ_BLOCKS); + + retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + if (retval) { + int err; + inode->i_nlink--; + reiserfs_update_sd(&th, inode); + err = journal_end(&th, parent_dir->i_sb, jbegin_count); + if (err) + retval = err; + iput(inode); + goto out_failed; + } - atomic_inc(&inode->i_count) ; - d_instantiate(dentry, inode); - retval = journal_end(&th, dir->i_sb, jbegin_count) ; - reiserfs_write_unlock(dir->i_sb); - return retval; + d_instantiate(dentry, inode); + retval = journal_end(&th, parent_dir->i_sb, jbegin_count); + out_failed: + reiserfs_write_unlock(parent_dir->i_sb); + return retval; } +static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + int retval; + struct inode *inode = old_dentry->d_inode; + struct reiserfs_transaction_handle th; + /* We need blocks for transaction + update of quotas for the owners of the directory */ + int jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); + + reiserfs_write_lock(dir->i_sb); + if (inode->i_nlink >= REISERFS_LINK_MAX) { + //FIXME: sd_nlink is 32 bit for new files + reiserfs_write_unlock(dir->i_sb); + return -EMLINK; + } + if (inode->i_nlink == 0) { + reiserfs_write_unlock(dir->i_sb); + return -ENOENT; + } + + /* inc before scheduling so reiserfs_unlink knows we are here */ + inode->i_nlink++; + + retval = journal_begin(&th, dir->i_sb, jbegin_count); + if (retval) { + inode->i_nlink--; + reiserfs_write_unlock(dir->i_sb); + return retval; + } + + /* create new entry */ + retval = + reiserfs_add_entry(&th, dir, dentry->d_name.name, + dentry->d_name.len, inode, 1 /*visible */ ); + + reiserfs_update_inode_transaction(inode); + 
reiserfs_update_inode_transaction(dir); + + if (retval) { + int err; + inode->i_nlink--; + err = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return err ? err : retval; + } + + inode->i_ctime = CURRENT_TIME_SEC; + reiserfs_update_sd(&th, inode); + + atomic_inc(&inode->i_count); + d_instantiate(dentry, inode); + retval = journal_end(&th, dir->i_sb, jbegin_count); + reiserfs_write_unlock(dir->i_sb); + return retval; +} // de contains information pointing to an entry which -static int de_still_valid (const char * name, int len, struct reiserfs_dir_entry * de) +static int de_still_valid(const char *name, int len, + struct reiserfs_dir_entry *de) { - struct reiserfs_dir_entry tmp = *de; - - // recalculate pointer to name and name length - set_de_name_and_namelen (&tmp); - // FIXME: could check more - if (tmp.de_namelen != len || memcmp (name, de->de_name, len)) - return 0; - return 1; + struct reiserfs_dir_entry tmp = *de; + + // recalculate pointer to name and name length + set_de_name_and_namelen(&tmp); + // FIXME: could check more + if (tmp.de_namelen != len || memcmp(name, de->de_name, len)) + return 0; + return 1; } - -static int entry_points_to_object (const char * name, int len, struct reiserfs_dir_entry * de, struct inode * inode) +static int entry_points_to_object(const char *name, int len, + struct reiserfs_dir_entry *de, + struct inode *inode) { - if (!de_still_valid (name, len, de)) - return 0; - - if (inode) { - if (!de_visible (de->de_deh + de->de_entry_num)) - reiserfs_panic (NULL, "vs-7042: entry_points_to_object: entry must be visible"); - return (de->de_objectid == inode->i_ino) ? 1 : 0; - } + if (!de_still_valid(name, len, de)) + return 0; + + if (inode) { + if (!de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(NULL, + "vs-7042: entry_points_to_object: entry must be visible"); + return (de->de_objectid == inode->i_ino) ? 
1 : 0; + } - /* this must be added hidden entry */ - if (de_visible (de->de_deh + de->de_entry_num)) - reiserfs_panic (NULL, "vs-7043: entry_points_to_object: entry must be visible"); + /* this must be added hidden entry */ + if (de_visible(de->de_deh + de->de_entry_num)) + reiserfs_panic(NULL, + "vs-7043: entry_points_to_object: entry must be visible"); - return 1; + return 1; } - /* sets key of objectid the entry has to point to */ -static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct reiserfs_key * key) +static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, + struct reiserfs_key *key) { - /* JDM These operations are endian safe - both are le */ - de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; - de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; + /* JDM These operations are endian safe - both are le */ + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; } - /* * process, that is going to call fix_nodes/do_balance must hold only * one path. 
If it holds 2 or more, it can get into endless waiting in * get_empty_nodes or its clones */ -static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir, struct dentry *new_dentry) +static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { - int retval; - INITIALIZE_PATH (old_entry_path); - INITIALIZE_PATH (new_entry_path); - INITIALIZE_PATH (dot_dot_entry_path); - struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ; - struct reiserfs_dir_entry old_de, new_de, dot_dot_de; - struct inode * old_inode, * new_dentry_inode; - struct reiserfs_transaction_handle th ; - int jbegin_count ; - umode_t old_inode_mode; - unsigned long savelink = 1; - struct timespec ctime; - - /* three balancings: (1) old name removal, (2) new name insertion - and (3) maybe "save" link insertion - stat data updates: (1) old directory, - (2) new directory and (3) maybe old object stat data (when it is - directory) and (4) maybe stat data of object to which new entry - pointed initially and (5) maybe block containing ".." 
of - renamed directory - quota updates: two parent directories */ - jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS; - - old_inode = old_dentry->d_inode; - new_dentry_inode = new_dentry->d_inode; - - // make sure, that oldname still exists and points to an object we - // are going to rename - old_de.de_gen_number_bit_string = NULL; - reiserfs_write_lock(old_dir->i_sb); - retval = reiserfs_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len, - &old_entry_path, &old_de); - pathrelse (&old_entry_path); - if (retval == IO_ERROR) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - - if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { - reiserfs_write_unlock(old_dir->i_sb); - return -ENOENT; - } - - old_inode_mode = old_inode->i_mode; - if (S_ISDIR(old_inode_mode)) { - // make sure, that directory being renamed has correct ".." - // and that its new parent directory has not too many links - // already - - if (new_dentry_inode) { - if (!reiserfs_empty_dir(new_dentry_inode)) { + int retval; + INITIALIZE_PATH(old_entry_path); + INITIALIZE_PATH(new_entry_path); + INITIALIZE_PATH(dot_dot_entry_path); + struct item_head new_entry_ih, old_entry_ih, dot_dot_ih; + struct reiserfs_dir_entry old_de, new_de, dot_dot_de; + struct inode *old_inode, *new_dentry_inode; + struct reiserfs_transaction_handle th; + int jbegin_count; + umode_t old_inode_mode; + unsigned long savelink = 1; + struct timespec ctime; + + /* three balancings: (1) old name removal, (2) new name insertion + and (3) maybe "save" link insertion + stat data updates: (1) old directory, + (2) new directory and (3) maybe old object stat data (when it is + directory) and (4) maybe stat data of object to which new entry + pointed initially and (5) maybe block containing ".." 
of + renamed directory + quota updates: two parent directories */ + jbegin_count = + JOURNAL_PER_BALANCE_CNT * 3 + 5 + + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb); + + old_inode = old_dentry->d_inode; + new_dentry_inode = new_dentry->d_inode; + + // make sure, that oldname still exists and points to an object we + // are going to rename + old_de.de_gen_number_bit_string = NULL; + reiserfs_write_lock(old_dir->i_sb); + retval = + reiserfs_find_entry(old_dir, old_dentry->d_name.name, + old_dentry->d_name.len, &old_entry_path, + &old_de); + pathrelse(&old_entry_path); + if (retval == IO_ERROR) { reiserfs_write_unlock(old_dir->i_sb); - return -ENOTEMPTY; - } + return -EIO; } - - /* directory is renamed, its parent directory will be changed, - ** so find ".." entry - */ - dot_dot_de.de_gen_number_bit_string = NULL; - retval = reiserfs_find_entry (old_inode, "..", 2, &dot_dot_entry_path, &dot_dot_de); - pathrelse (&dot_dot_entry_path); - if (retval != NAME_FOUND) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; + + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOENT; } - /* inode number of .. must equal old_dir->i_ino */ - if (dot_dot_de.de_objectid != old_dir->i_ino) { - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; + old_inode_mode = old_inode->i_mode; + if (S_ISDIR(old_inode_mode)) { + // make sure, that directory being renamed has correct ".." + // and that its new parent directory has not too many links + // already + + if (new_dentry_inode) { + if (!reiserfs_empty_dir(new_dentry_inode)) { + reiserfs_write_unlock(old_dir->i_sb); + return -ENOTEMPTY; + } + } + + /* directory is renamed, its parent directory will be changed, + ** so find ".." 
entry + */ + dot_dot_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path, + &dot_dot_de); + pathrelse(&dot_dot_entry_path); + if (retval != NAME_FOUND) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + /* inode number of .. must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) { + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } } - } - - retval = journal_begin(&th, old_dir->i_sb, jbegin_count) ; - if (retval) { - reiserfs_write_unlock (old_dir->i_sb); - return retval; - } - - /* add new entry (or find the existing one) */ - retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, - old_inode, 0); - if (retval == -EEXIST) { - if (!new_dentry_inode) { - reiserfs_panic (old_dir->i_sb, - "vs-7050: new entry is found, new inode == 0\n"); + + retval = journal_begin(&th, old_dir->i_sb, jbegin_count); + if (retval) { + reiserfs_write_unlock(old_dir->i_sb); + return retval; } - } else if (retval) { - int err = journal_end(&th, old_dir->i_sb, jbegin_count) ; - reiserfs_write_unlock(old_dir->i_sb); - return err ? 
err : retval; - } - - reiserfs_update_inode_transaction(old_dir) ; - reiserfs_update_inode_transaction(new_dir) ; - - /* this makes it so an fsync on an open fd for the old name will - ** commit the rename operation - */ - reiserfs_update_inode_transaction(old_inode) ; - - if (new_dentry_inode) - reiserfs_update_inode_transaction(new_dentry_inode) ; - - while (1) { - // look for old name using corresponding entry key (found by reiserfs_find_entry) - if ((retval = search_by_entry_key (new_dir->i_sb, &old_de.de_entry_key, - &old_entry_path, &old_de)) != NAME_FOUND) { - pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; + + /* add new entry (or find the existing one) */ + retval = + reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, old_inode, 0); + if (retval == -EEXIST) { + if (!new_dentry_inode) { + reiserfs_panic(old_dir->i_sb, + "vs-7050: new entry is found, new inode == 0\n"); + } + } else if (retval) { + int err = journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return err ? err : retval; } - copy_item_head(&old_entry_ih, get_ih(&old_entry_path)) ; - - reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1) ; - - // look for new name by reiserfs_find_entry - new_de.de_gen_number_bit_string = NULL; - retval = reiserfs_find_entry (new_dir, new_dentry->d_name.name, new_dentry->d_name.len, - &new_entry_path, &new_de); - // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from - // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. 
- if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { - pathrelse(&new_entry_path); - pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; + reiserfs_update_inode_transaction(old_dir); + reiserfs_update_inode_transaction(new_dir); + + /* this makes it so an fsync on an open fd for the old name will + ** commit the rename operation + */ + reiserfs_update_inode_transaction(old_inode); + + if (new_dentry_inode) + reiserfs_update_inode_transaction(new_dentry_inode); + + while (1) { + // look for old name using corresponding entry key (found by reiserfs_find_entry) + if ((retval = + search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key, + &old_entry_path, + &old_de)) != NAME_FOUND) { + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&old_entry_ih, get_ih(&old_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1); + + // look for new name by reiserfs_find_entry + new_de.de_gen_number_bit_string = NULL; + retval = + reiserfs_find_entry(new_dir, new_dentry->d_name.name, + new_dentry->d_name.len, &new_entry_path, + &new_de); + // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from + // reiserfs_add_entry above, and we'll catch any i/o errors before we get here. 
+ if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) { + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + + copy_item_head(&new_entry_ih, get_ih(&new_entry_path)); + + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1); + + if (S_ISDIR(old_inode->i_mode)) { + if ((retval = + search_by_entry_key(new_dir->i_sb, + &dot_dot_de.de_entry_key, + &dot_dot_entry_path, + &dot_dot_de)) != NAME_FOUND) { + pathrelse(&dot_dot_entry_path); + pathrelse(&new_entry_path); + pathrelse(&old_entry_path); + journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return -EIO; + } + copy_item_head(&dot_dot_ih, + get_ih(&dot_dot_entry_path)); + // node containing ".." gets into transaction + reiserfs_prepare_for_journal(old_inode->i_sb, + dot_dot_de.de_bh, 1); + } + /* we should check seals here, not do + this stuff, yes? Then, having + gathered everything into RAM we + should lock the buffers, yes? -Hans */ + /* probably. our rename needs to hold more + ** than one path at once. The seals would + ** have to be written to deal with multi-path + ** issues -chris + */ + /* sanity checking before doing the rename - avoid races many + ** of the above checks could have scheduled. We have to be + ** sure our items haven't been shifted by another process. + */ + if (item_moved(&new_entry_ih, &new_entry_path) || + !entry_points_to_object(new_dentry->d_name.name, + new_dentry->d_name.len, + &new_de, new_dentry_inode) || + item_moved(&old_entry_ih, &old_entry_path) || + !entry_points_to_object(old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de, old_inode)) { + reiserfs_restore_prepared_buffer(old_inode->i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode->i_sb, + old_de.de_bh); + if (S_ISDIR(old_inode_mode)) + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. 
+ de_bh); + continue; + } + if (S_ISDIR(old_inode_mode)) { + if (item_moved(&dot_dot_ih, &dot_dot_entry_path) || + !entry_points_to_object("..", 2, &dot_dot_de, + old_dir)) { + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + old_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + new_de.de_bh); + reiserfs_restore_prepared_buffer(old_inode-> + i_sb, + dot_dot_de. + de_bh); + continue; + } + } + + RFALSE(S_ISDIR(old_inode_mode) && + !buffer_journal_prepared(dot_dot_de.de_bh), ""); + + break; } - copy_item_head(&new_entry_ih, get_ih(&new_entry_path)) ; + /* ok, all the changes can be done in one fell swoop when we + have claimed all the buffers needed. */ - reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1) ; + mark_de_visible(new_de.de_deh + new_de.de_entry_num); + set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode)); + journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh); - if (S_ISDIR(old_inode->i_mode)) { - if ((retval = search_by_entry_key (new_dir->i_sb, &dot_dot_de.de_entry_key, - &dot_dot_entry_path, &dot_dot_de)) != NAME_FOUND) { - pathrelse(&dot_dot_entry_path); - pathrelse(&new_entry_path); - pathrelse(&old_entry_path); - journal_end(&th, old_dir->i_sb, jbegin_count); - reiserfs_write_unlock(old_dir->i_sb); - return -EIO; - } - copy_item_head(&dot_dot_ih, get_ih(&dot_dot_entry_path)) ; - // node containing ".." gets into transaction - reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1) ; - } - /* we should check seals here, not do - this stuff, yes? Then, having - gathered everything into RAM we - should lock the buffers, yes? -Hans */ - /* probably. our rename needs to hold more - ** than one path at once. The seals would - ** have to be written to deal with multi-path - ** issues -chris - */ - /* sanity checking before doing the rename - avoid races many - ** of the above checks could have scheduled. We have to be - ** sure our items haven't been shifted by another process. 
- */ - if (item_moved(&new_entry_ih, &new_entry_path) || - !entry_points_to_object(new_dentry->d_name.name, - new_dentry->d_name.len, - &new_de, new_dentry_inode) || - item_moved(&old_entry_ih, &old_entry_path) || - !entry_points_to_object (old_dentry->d_name.name, - old_dentry->d_name.len, - &old_de, old_inode)) { - reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh); - reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh); - if (S_ISDIR(old_inode_mode)) - reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh); - continue; + mark_de_hidden(old_de.de_deh + old_de.de_entry_num); + journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh); + ctime = CURRENT_TIME_SEC; + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of + renamed object */ + old_inode->i_ctime = ctime; + + if (new_dentry_inode) { + // adjust link number of the victim + if (S_ISDIR(new_dentry_inode->i_mode)) { + new_dentry_inode->i_nlink = 0; + } else { + new_dentry_inode->i_nlink--; + } + new_dentry_inode->i_ctime = ctime; + savelink = new_dentry_inode->i_nlink; } + if (S_ISDIR(old_inode_mode)) { - if ( item_moved(&dot_dot_ih, &dot_dot_entry_path) || - !entry_points_to_object ( "..", 2, &dot_dot_de, old_dir) ) { - reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh); - reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh); - reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh); - continue; - } + // adjust ".." of renamed directory + set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir)); + journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh); + + if (!new_dentry_inode) + /* there (in new_dir) was no directory, so it got new link + (".." of renamed directory) */ + INC_DIR_INODE_NLINK(new_dir); + + /* old directory lost one link - ".. 
" of renamed directory */ + DEC_DIR_INODE_NLINK(old_dir); } + // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse + pathrelse(&new_entry_path); + pathrelse(&dot_dot_entry_path); - RFALSE( S_ISDIR(old_inode_mode) && - !buffer_journal_prepared(dot_dot_de.de_bh), "" ); - - break; - } - - /* ok, all the changes can be done in one fell swoop when we - have claimed all the buffers needed.*/ - - mark_de_visible (new_de.de_deh + new_de.de_entry_num); - set_ino_in_dir_entry (&new_de, INODE_PKEY (old_inode)); - journal_mark_dirty (&th, old_dir->i_sb, new_de.de_bh); - - mark_de_hidden (old_de.de_deh + old_de.de_entry_num); - journal_mark_dirty (&th, old_dir->i_sb, old_de.de_bh); - ctime = CURRENT_TIME_SEC; - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; - /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of - renamed object */ - old_inode->i_ctime = ctime; - - if (new_dentry_inode) { - // adjust link number of the victim - if (S_ISDIR(new_dentry_inode->i_mode)) { - new_dentry_inode->i_nlink = 0; - } else { - new_dentry_inode->i_nlink--; + // FIXME: this reiserfs_cut_from_item's return value may screw up + // anybody, but it will panic if will not be able to find the + // entry. This needs one more clean up + if (reiserfs_cut_from_item + (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, + 0) < 0) + reiserfs_warning(old_dir->i_sb, + "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?"); + + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; + + reiserfs_update_sd(&th, old_dir); + reiserfs_update_sd(&th, new_dir); + reiserfs_update_sd(&th, old_inode); + + if (new_dentry_inode) { + if (savelink == 0) + add_save_link(&th, new_dentry_inode, + 0 /* not truncate */ ); + reiserfs_update_sd(&th, new_dentry_inode); } - new_dentry_inode->i_ctime = ctime; - savelink = new_dentry_inode->i_nlink; - } - - if (S_ISDIR(old_inode_mode)) { - // adjust ".." 
of renamed directory - set_ino_in_dir_entry (&dot_dot_de, INODE_PKEY (new_dir)); - journal_mark_dirty (&th, new_dir->i_sb, dot_dot_de.de_bh); - - if (!new_dentry_inode) - /* there (in new_dir) was no directory, so it got new link - (".." of renamed directory) */ - INC_DIR_INODE_NLINK(new_dir); - - /* old directory lost one link - ".. " of renamed directory */ - DEC_DIR_INODE_NLINK(old_dir); - } - - // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse - pathrelse (&new_entry_path); - pathrelse (&dot_dot_entry_path); - - // FIXME: this reiserfs_cut_from_item's return value may screw up - // anybody, but it will panic if will not be able to find the - // entry. This needs one more clean up - if (reiserfs_cut_from_item (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 0) < 0) - reiserfs_warning (old_dir->i_sb, "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?"); - - old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; - - reiserfs_update_sd (&th, old_dir); - reiserfs_update_sd (&th, new_dir); - reiserfs_update_sd (&th, old_inode); - - if (new_dentry_inode) { - if (savelink == 0) - add_save_link (&th, new_dentry_inode, 0/* not truncate */); - reiserfs_update_sd (&th, new_dentry_inode); - } - - retval = journal_end(&th, old_dir->i_sb, jbegin_count) ; - reiserfs_write_unlock(old_dir->i_sb); - return retval; + + retval = journal_end(&th, old_dir->i_sb, jbegin_count); + reiserfs_write_unlock(old_dir->i_sb); + return retval; } /* * directories can handle most operations... 
*/ struct inode_operations reiserfs_dir_inode_operations = { - //&reiserfs_dir_operations, /* default_file_ops */ - .create = reiserfs_create, - .lookup = reiserfs_lookup, - .link = reiserfs_link, - .unlink = reiserfs_unlink, - .symlink = reiserfs_symlink, - .mkdir = reiserfs_mkdir, - .rmdir = reiserfs_rmdir, - .mknod = reiserfs_mknod, - .rename = reiserfs_rename, - .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, - .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, - .permission = reiserfs_permission, + //&reiserfs_dir_operations, /* default_file_ops */ + .create = reiserfs_create, + .lookup = reiserfs_lookup, + .link = reiserfs_link, + .unlink = reiserfs_unlink, + .symlink = reiserfs_symlink, + .mkdir = reiserfs_mkdir, + .rmdir = reiserfs_rmdir, + .mknod = reiserfs_mknod, + .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; /* @@ -1464,28 +1557,27 @@ struct inode_operations reiserfs_dir_inode_operations = { * stuff added */ struct inode_operations reiserfs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, - .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, - .permission = reiserfs_permission, + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; - /* * special file operations.. 
just xattr/acl stuff */ struct inode_operations reiserfs_special_inode_operations = { - .setattr = reiserfs_setattr, - .setxattr = reiserfs_setxattr, - .getxattr = reiserfs_getxattr, - .listxattr = reiserfs_listxattr, - .removexattr = reiserfs_removexattr, - .permission = reiserfs_permission, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index bfe8e25ef29..f62590aa9c9 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -14,24 +14,24 @@ (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\ (__le32 *)((rs) + 1)) - #ifdef CONFIG_REISERFS_CHECK -static void check_objectid_map (struct super_block * s, __le32 * map) +static void check_objectid_map(struct super_block *s, __le32 * map) { - if (le32_to_cpu (map[0]) != 1) - reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted: %lx", - ( long unsigned int ) le32_to_cpu (map[0])); + if (le32_to_cpu(map[0]) != 1) + reiserfs_panic(s, + "vs-15010: check_objectid_map: map corrupted: %lx", + (long unsigned int)le32_to_cpu(map[0])); - // FIXME: add something else here + // FIXME: add something else here } #else -static void check_objectid_map (struct super_block * s, __le32 * map) -{;} +static void check_objectid_map(struct super_block *s, __le32 * map) +{; +} #endif - /* When we allocate objectids we allocate the first unused objectid. Each sequence of objectids in use (the odd sequences) is followed by a sequence of objectids not in use (the even sequences). We @@ -46,161 +46,162 @@ static void check_objectid_map (struct super_block * s, __le32 * map) interesting optimizations of layout could result from complicating objectid assignment, but we have deferred making them for now. 
*/ - /* get unique object identifier */ -__u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th) +__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) { - struct super_block * s = th->t_super; - struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); - __le32 * map = objectid_map (s, rs); - __u32 unused_objectid; - - BUG_ON (!th->t_trans_id); + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + __u32 unused_objectid; + + BUG_ON(!th->t_trans_id); + + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + /* comment needed -Hans */ + unused_objectid = le32_to_cpu(map[1]); + if (unused_objectid == U32_MAX) { + reiserfs_warning(s, "%s: no more object ids", __FUNCTION__); + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); + return 0; + } - check_objectid_map (s, map); + /* This incrementation allocates the first unused objectid. That + is to say, the first entry on the objectid map is the first + unused objectid, and by incrementing it we use it. See below + where we check to see if we eliminated a sequence of unused + objectids.... */ + map[1] = cpu_to_le32(unused_objectid + 1); + + /* Now we check to see if we eliminated the last remaining member of + the first even sequence (and can eliminate the sequence by + eliminating its last objectid from oids), and can collapse the + first two odd sequences into one sequence. If so, then the net + result is to eliminate a pair of objectids from oids. We do this + by shifting the entire map to the left. 
*/ + if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { + memmove(map + 1, map + 3, + (sb_oid_cursize(rs) - 3) * sizeof(__u32)); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + } - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; - /* comment needed -Hans */ - unused_objectid = le32_to_cpu (map[1]); - if (unused_objectid == U32_MAX) { - reiserfs_warning (s, "%s: no more object ids", __FUNCTION__); - reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)) ; - return 0; - } - - /* This incrementation allocates the first unused objectid. That - is to say, the first entry on the objectid map is the first - unused objectid, and by incrementing it we use it. See below - where we check to see if we eliminated a sequence of unused - objectids.... */ - map[1] = cpu_to_le32 (unused_objectid + 1); - - /* Now we check to see if we eliminated the last remaining member of - the first even sequence (and can eliminate the sequence by - eliminating its last objectid from oids), and can collapse the - first two odd sequences into one sequence. If so, then the net - result is to eliminate a pair of objectids from oids. We do this - by shifting the entire map to the left. 
*/ - if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) { - memmove (map + 1, map + 3, (sb_oid_cursize(rs) - 3) * sizeof(__u32)); - set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 ); - } - - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - return unused_objectid; + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + return unused_objectid; } - /* makes object identifier unused */ -void reiserfs_release_objectid (struct reiserfs_transaction_handle *th, - __u32 objectid_to_release) +void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, + __u32 objectid_to_release) { - struct super_block * s = th->t_super; - struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); - __le32 * map = objectid_map (s, rs); - int i = 0; - - BUG_ON (!th->t_trans_id); - //return; - check_objectid_map (s, map); - - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; - journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - - /* start at the beginning of the objectid map (i = 0) and go to - the end of it (i = disk_sb->s_oid_cursize). Linear search is - what we use, though it is possible that binary search would be - more efficient after performing lots of deletions (which is - when oids is large.) We only check even i's. */ - while (i < sb_oid_cursize(rs)) { - if (objectid_to_release == le32_to_cpu (map[i])) { - /* This incrementation unallocates the objectid. */ - //map[i]++; - map[i] = cpu_to_le32 (le32_to_cpu (map[i]) + 1); - - /* Did we unallocate the last member of an odd sequence, and can shrink oids? 
*/ - if (map[i] == map[i+1]) { - /* shrink objectid map */ - memmove (map + i, map + i + 2, - (sb_oid_cursize(rs) - i - 2) * sizeof (__u32)); - //disk_sb->s_oid_cursize -= 2; - set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 ); - - RFALSE( sb_oid_cursize(rs) < 2 || - sb_oid_cursize(rs) > sb_oid_maxsize(rs), - "vs-15005: objectid map corrupted cur_size == %d (max == %d)", - sb_oid_cursize(rs), sb_oid_maxsize(rs)); - } - return; + struct super_block *s = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + __le32 *map = objectid_map(s, rs); + int i = 0; + + BUG_ON(!th->t_trans_id); + //return; + check_objectid_map(s, map); + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s)); + + /* start at the beginning of the objectid map (i = 0) and go to + the end of it (i = disk_sb->s_oid_cursize). Linear search is + what we use, though it is possible that binary search would be + more efficient after performing lots of deletions (which is + when oids is large.) We only check even i's. */ + while (i < sb_oid_cursize(rs)) { + if (objectid_to_release == le32_to_cpu(map[i])) { + /* This incrementation unallocates the objectid. */ + //map[i]++; + map[i] = cpu_to_le32(le32_to_cpu(map[i]) + 1); + + /* Did we unallocate the last member of an odd sequence, and can shrink oids? 
*/ + if (map[i] == map[i + 1]) { + /* shrink objectid map */ + memmove(map + i, map + i + 2, + (sb_oid_cursize(rs) - i - + 2) * sizeof(__u32)); + //disk_sb->s_oid_cursize -= 2; + set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2); + + RFALSE(sb_oid_cursize(rs) < 2 || + sb_oid_cursize(rs) > sb_oid_maxsize(rs), + "vs-15005: objectid map corrupted cur_size == %d (max == %d)", + sb_oid_cursize(rs), sb_oid_maxsize(rs)); + } + return; + } + + if (objectid_to_release > le32_to_cpu(map[i]) && + objectid_to_release < le32_to_cpu(map[i + 1])) { + /* size of objectid map is not changed */ + if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { + //objectid_map[i+1]--; + map[i + 1] = + cpu_to_le32(le32_to_cpu(map[i + 1]) - 1); + return; + } + + /* JDM comparing two little-endian values for equality -- safe */ + if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { + /* objectid map must be expanded, but there is no space */ + PROC_INFO_INC(s, leaked_oid); + return; + } + + /* expand the objectid map */ + memmove(map + i + 3, map + i + 1, + (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); + map[i + 1] = cpu_to_le32(objectid_to_release); + map[i + 2] = cpu_to_le32(objectid_to_release + 1); + set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2); + return; + } + i += 2; } - if (objectid_to_release > le32_to_cpu (map[i]) && - objectid_to_release < le32_to_cpu (map[i + 1])) { - /* size of objectid map is not changed */ - if (objectid_to_release + 1 == le32_to_cpu (map[i + 1])) { - //objectid_map[i+1]--; - map[i + 1] = cpu_to_le32 (le32_to_cpu (map[i + 1]) - 1); - return; - } - - /* JDM comparing two little-endian values for equality -- safe */ - if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) { - /* objectid map must be expanded, but there is no space */ - PROC_INFO_INC( s, leaked_oid ); - return; - } + reiserfs_warning(s, + "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)", + (long unsigned)objectid_to_release); +} - /* expand the objectid map*/ - memmove (map + i + 3, map 
+ i + 1, - (sb_oid_cursize(rs) - i - 1) * sizeof(__u32)); - map[i + 1] = cpu_to_le32 (objectid_to_release); - map[i + 2] = cpu_to_le32 (objectid_to_release + 1); - set_sb_oid_cursize( rs, sb_oid_cursize(rs) + 2 ); - return; +int reiserfs_convert_objectid_map_v1(struct super_block *s) +{ + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s); + int cur_size = sb_oid_cursize(disk_sb); + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; + int old_max = sb_oid_maxsize(disk_sb); + struct reiserfs_super_block_v1 *disk_sb_v1; + __le32 *objectid_map, *new_objectid_map; + int i; + + disk_sb_v1 = + (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); + objectid_map = (__le32 *) (disk_sb_v1 + 1); + new_objectid_map = (__le32 *) (disk_sb + 1); + + if (cur_size > new_size) { + /* mark everyone used that was listed as free at the end of the objectid + ** map + */ + objectid_map[new_size - 1] = objectid_map[cur_size - 1]; + set_sb_oid_cursize(disk_sb, new_size); + } + /* move the smaller objectid map past the end of the new super */ + for (i = new_size - 1; i >= 0; i--) { + objectid_map[i + (old_max - new_size)] = objectid_map[i]; } - i += 2; - } - reiserfs_warning (s, "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)", - ( long unsigned ) objectid_to_release); -} + /* set the max size so we don't overflow later */ + set_sb_oid_maxsize(disk_sb, new_size); + /* Zero out label and generate random UUID */ + memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)); + generate_random_uuid(disk_sb->s_uuid); -int reiserfs_convert_objectid_map_v1(struct super_block *s) { - struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK (s); - int cur_size = sb_oid_cursize(disk_sb); - int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2 ; - int old_max = sb_oid_maxsize(disk_sb); - struct reiserfs_super_block_v1 *disk_sb_v1 ; - __le32 *objectid_map, *new_objectid_map ; - int i ; - - disk_sb_v1=(struct 
reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); - objectid_map = (__le32 *)(disk_sb_v1 + 1) ; - new_objectid_map = (__le32 *)(disk_sb + 1) ; - - if (cur_size > new_size) { - /* mark everyone used that was listed as free at the end of the objectid - ** map - */ - objectid_map[new_size - 1] = objectid_map[cur_size - 1] ; - set_sb_oid_cursize(disk_sb,new_size) ; - } - /* move the smaller objectid map past the end of the new super */ - for (i = new_size - 1 ; i >= 0 ; i--) { - objectid_map[i + (old_max - new_size)] = objectid_map[i] ; - } - - - /* set the max size so we don't overflow later */ - set_sb_oid_maxsize(disk_sb,new_size) ; - - /* Zero out label and generate random UUID */ - memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)) ; - generate_random_uuid(disk_sb->s_uuid); - - /* finally, zero out the unused chunk of the new super */ - memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)) ; - return 0 ; + /* finally, zero out the unused chunk of the new super */ + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)); + return 0; } - diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 16fdca1d4bd..d55e164bd5c 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -15,168 +15,166 @@ static char error_buf[1024]; static char fmt_buf[1024]; static char off_buf[80]; - -static char * reiserfs_cpu_offset (struct cpu_key * key) +static char *reiserfs_cpu_offset(struct cpu_key *key) { - if (cpu_key_k_type(key) == TYPE_DIRENTRY) - sprintf (off_buf, "%Lu(%Lu)", - (unsigned long long)GET_HASH_VALUE (cpu_key_k_offset (key)), - (unsigned long long)GET_GENERATION_NUMBER (cpu_key_k_offset (key))); - else - sprintf (off_buf, "0x%Lx", (unsigned long long)cpu_key_k_offset (key)); - return off_buf; + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + sprintf(off_buf, "%Lu(%Lu)", + (unsigned long long) + GET_HASH_VALUE(cpu_key_k_offset(key)), + (unsigned long long) + GET_GENERATION_NUMBER(cpu_key_k_offset(key))); + else + sprintf(off_buf, "0x%Lx", + 
(unsigned long long)cpu_key_k_offset(key)); + return off_buf; } - -static char * le_offset (struct reiserfs_key * key) +static char *le_offset(struct reiserfs_key *key) { - int version; + int version; - version = le_key_version (key); - if (le_key_k_type (version, key) == TYPE_DIRENTRY) - sprintf (off_buf, "%Lu(%Lu)", - (unsigned long long)GET_HASH_VALUE (le_key_k_offset (version, key)), - (unsigned long long)GET_GENERATION_NUMBER (le_key_k_offset (version, key))); - else - sprintf (off_buf, "0x%Lx", (unsigned long long)le_key_k_offset (version, key)); - return off_buf; + version = le_key_version(key); + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + sprintf(off_buf, "%Lu(%Lu)", + (unsigned long long) + GET_HASH_VALUE(le_key_k_offset(version, key)), + (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset(version, key))); + else + sprintf(off_buf, "0x%Lx", + (unsigned long long)le_key_k_offset(version, key)); + return off_buf; } - -static char * cpu_type (struct cpu_key * key) +static char *cpu_type(struct cpu_key *key) { - if (cpu_key_k_type (key) == TYPE_STAT_DATA) - return "SD"; - if (cpu_key_k_type (key) == TYPE_DIRENTRY) - return "DIR"; - if (cpu_key_k_type (key) == TYPE_DIRECT) - return "DIRECT"; - if (cpu_key_k_type (key) == TYPE_INDIRECT) - return "IND"; - return "UNKNOWN"; + if (cpu_key_k_type(key) == TYPE_STAT_DATA) + return "SD"; + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + return "DIR"; + if (cpu_key_k_type(key) == TYPE_DIRECT) + return "DIRECT"; + if (cpu_key_k_type(key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; } - -static char * le_type (struct reiserfs_key * key) +static char *le_type(struct reiserfs_key *key) { - int version; - - version = le_key_version (key); + int version; - if (le_key_k_type (version, key) == TYPE_STAT_DATA) - return "SD"; - if (le_key_k_type (version, key) == TYPE_DIRENTRY) - return "DIR"; - if (le_key_k_type (version, key) == TYPE_DIRECT) - return "DIRECT"; - if (le_key_k_type (version, key) == 
TYPE_INDIRECT) - return "IND"; - return "UNKNOWN"; -} + version = le_key_version(key); + if (le_key_k_type(version, key) == TYPE_STAT_DATA) + return "SD"; + if (le_key_k_type(version, key) == TYPE_DIRENTRY) + return "DIR"; + if (le_key_k_type(version, key) == TYPE_DIRECT) + return "DIRECT"; + if (le_key_k_type(version, key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} /* %k */ -static void sprintf_le_key (char * buf, struct reiserfs_key * key) +static void sprintf_le_key(char *buf, struct reiserfs_key *key) { - if (key) - sprintf (buf, "[%d %d %s %s]", le32_to_cpu (key->k_dir_id), - le32_to_cpu (key->k_objectid), le_offset (key), le_type (key)); - else - sprintf (buf, "[NULL]"); + if (key) + sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id), + le32_to_cpu(key->k_objectid), le_offset(key), + le_type(key)); + else + sprintf(buf, "[NULL]"); } - /* %K */ -static void sprintf_cpu_key (char * buf, struct cpu_key * key) +static void sprintf_cpu_key(char *buf, struct cpu_key *key) { - if (key) - sprintf (buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, - key->on_disk_key.k_objectid, reiserfs_cpu_offset (key), - cpu_type (key)); - else - sprintf (buf, "[NULL]"); + if (key) + sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, reiserfs_cpu_offset(key), + cpu_type(key)); + else + sprintf(buf, "[NULL]"); } -static void sprintf_de_head( char *buf, struct reiserfs_de_head *deh ) +static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh) { - if( deh ) - sprintf( buf, "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", deh_offset(deh), deh_dir_id(deh), - deh_objectid(deh), deh_location(deh), deh_state(deh) ); - else - sprintf( buf, "[NULL]" ); + if (deh) + sprintf(buf, + "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", + deh_offset(deh), deh_dir_id(deh), deh_objectid(deh), + deh_location(deh), deh_state(deh)); + else + sprintf(buf, "[NULL]"); } -static void sprintf_item_head (char * buf, struct 
item_head * ih) +static void sprintf_item_head(char *buf, struct item_head *ih) { - if (ih) { - strcpy (buf, (ih_version (ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); - sprintf_le_key (buf + strlen (buf), &(ih->ih_key)); - sprintf (buf + strlen (buf), ", item_len %d, item_location %d, " - "free_space(entry_count) %d", - ih_item_len(ih), ih_location(ih), ih_free_space (ih)); - } else - sprintf (buf, "[NULL]"); + if (ih) { + strcpy(buf, + (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*"); + sprintf_le_key(buf + strlen(buf), &(ih->ih_key)); + sprintf(buf + strlen(buf), ", item_len %d, item_location %d, " + "free_space(entry_count) %d", + ih_item_len(ih), ih_location(ih), ih_free_space(ih)); + } else + sprintf(buf, "[NULL]"); } - -static void sprintf_direntry (char * buf, struct reiserfs_dir_entry * de) +static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de) { - char name[20]; + char name[20]; - memcpy (name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); - name [de->de_namelen > 19 ? 19 : de->de_namelen] = 0; - sprintf (buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); + memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); + name[de->de_namelen > 19 ? 
19 : de->de_namelen] = 0; + sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); } - -static void sprintf_block_head (char * buf, struct buffer_head * bh) +static void sprintf_block_head(char *buf, struct buffer_head *bh) { - sprintf (buf, "level=%d, nr_items=%d, free_space=%d rdkey ", - B_LEVEL (bh), B_NR_ITEMS (bh), B_FREE_SPACE (bh)); + sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh)); } - -static void sprintf_buffer_head (char * buf, struct buffer_head * bh) +static void sprintf_buffer_head(char *buf, struct buffer_head *bh) { - char b[BDEVNAME_SIZE]; + char b[BDEVNAME_SIZE]; - sprintf (buf, "dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", - bdevname (bh->b_bdev, b), bh->b_size, - (unsigned long long)bh->b_blocknr, - atomic_read (&(bh->b_count)), - bh->b_state, bh->b_page, - buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE", - buffer_dirty (bh) ? "DIRTY" : "CLEAN", - buffer_locked (bh) ? "LOCKED" : "UNLOCKED"); + sprintf(buf, + "dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bdevname(bh->b_bdev, b), bh->b_size, + (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), + bh->b_state, bh->b_page, + buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty(bh) ? "DIRTY" : "CLEAN", + buffer_locked(bh) ? 
"LOCKED" : "UNLOCKED"); } - -static void sprintf_disk_child (char * buf, struct disk_child * dc) +static void sprintf_disk_child(char *buf, struct disk_child *dc) { - sprintf (buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), dc_size(dc)); + sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), + dc_size(dc)); } - -static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip) +static char *is_there_reiserfs_struct(char *fmt, int *what, int *skip) { - char * k = fmt; + char *k = fmt; - *skip = 0; - - while ((k = strchr (k, '%')) != NULL) - { - if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || - k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a' ) { - *what = k[1]; - break; - } - (*skip) ++; - k ++; - } - return k; -} + *skip = 0; + while ((k = strchr(k, '%')) != NULL) { + if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || + k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') { + *what = k[1]; + break; + } + (*skip)++; + k++; + } + return k; +} /* debugging reiserfs we used to print out a lot of different variables, like keys, item headers, buffer heads etc. 
Values of @@ -191,61 +189,64 @@ static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip) key->k_offset, key->k_uniqueness); */ - -static void -prepare_error_buf( const char *fmt, va_list args ) -{ - char * fmt1 = fmt_buf; - char * k; - char * p = error_buf; - int i, j, what, skip; - - strcpy (fmt1, fmt); - - while( (k = is_there_reiserfs_struct( fmt1, &what, &skip )) != NULL ) - { - *k = 0; - - p += vsprintf (p, fmt1, args); - - for (i = 0; i < skip; i ++) - j = va_arg (args, int); - - switch (what) { - case 'k': - sprintf_le_key (p, va_arg(args, struct reiserfs_key *)); - break; - case 'K': - sprintf_cpu_key (p, va_arg(args, struct cpu_key *)); - break; - case 'h': - sprintf_item_head (p, va_arg(args, struct item_head *)); - break; - case 't': - sprintf_direntry (p, va_arg(args, struct reiserfs_dir_entry *)); - break; - case 'y': - sprintf_disk_child (p, va_arg(args, struct disk_child *)); - break; - case 'z': - sprintf_block_head (p, va_arg(args, struct buffer_head *)); - break; - case 'b': - sprintf_buffer_head (p, va_arg(args, struct buffer_head *)); - break; - case 'a': - sprintf_de_head (p, va_arg(args, struct reiserfs_de_head *)); - break; - } - - p += strlen (p); - fmt1 = k + 2; - } - vsprintf (p, fmt1, args); +static void prepare_error_buf(const char *fmt, va_list args) +{ + char *fmt1 = fmt_buf; + char *k; + char *p = error_buf; + int i, j, what, skip; + + strcpy(fmt1, fmt); + + while ((k = is_there_reiserfs_struct(fmt1, &what, &skip)) != NULL) { + *k = 0; + + p += vsprintf(p, fmt1, args); + + for (i = 0; i < skip; i++) + j = va_arg(args, int); + + switch (what) { + case 'k': + sprintf_le_key(p, va_arg(args, struct reiserfs_key *)); + break; + case 'K': + sprintf_cpu_key(p, va_arg(args, struct cpu_key *)); + break; + case 'h': + sprintf_item_head(p, va_arg(args, struct item_head *)); + break; + case 't': + sprintf_direntry(p, + va_arg(args, + struct reiserfs_dir_entry *)); + break; + case 'y': + sprintf_disk_child(p, + va_arg(args, 
struct disk_child *)); + break; + case 'z': + sprintf_block_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'b': + sprintf_buffer_head(p, + va_arg(args, struct buffer_head *)); + break; + case 'a': + sprintf_de_head(p, + va_arg(args, + struct reiserfs_de_head *)); + break; + } + + p += strlen(p); + fmt1 = k + 2; + } + vsprintf(p, fmt1, args); } - /* in addition to usual conversion specifiers this accepts reiserfs specific conversion specifiers: %k to print little endian key, @@ -264,43 +265,43 @@ prepare_error_buf( const char *fmt, va_list args ) va_end( args );\ } -void reiserfs_warning (struct super_block *sb, const char * fmt, ...) +void reiserfs_warning(struct super_block *sb, const char *fmt, ...) { - do_reiserfs_warning(fmt); - if (sb) - printk (KERN_WARNING "ReiserFS: %s: warning: %s\n", - reiserfs_bdevname (sb), error_buf); - else - printk (KERN_WARNING "ReiserFS: warning: %s\n", error_buf); + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_WARNING "ReiserFS: %s: warning: %s\n", + reiserfs_bdevname(sb), error_buf); + else + printk(KERN_WARNING "ReiserFS: warning: %s\n", error_buf); } /* No newline.. reiserfs_info calls can be followed by printk's */ -void reiserfs_info (struct super_block *sb, const char * fmt, ...) +void reiserfs_info(struct super_block *sb, const char *fmt, ...) { - do_reiserfs_warning(fmt); - if (sb) - printk (KERN_NOTICE "ReiserFS: %s: %s", - reiserfs_bdevname (sb), error_buf); - else - printk (KERN_NOTICE "ReiserFS: %s", error_buf); + do_reiserfs_warning(fmt); + if (sb) + printk(KERN_NOTICE "ReiserFS: %s: %s", + reiserfs_bdevname(sb), error_buf); + else + printk(KERN_NOTICE "ReiserFS: %s", error_buf); } /* No newline.. reiserfs_printk calls can be followed by printk's */ -static void reiserfs_printk (const char * fmt, ...) +static void reiserfs_printk(const char *fmt, ...) 
{ - do_reiserfs_warning(fmt); - printk (error_buf); + do_reiserfs_warning(fmt); + printk(error_buf); } -void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...) +void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) { #ifdef CONFIG_REISERFS_CHECK - do_reiserfs_warning(fmt); - if (s) - printk (KERN_DEBUG "ReiserFS: %s: %s\n", - reiserfs_bdevname (s), error_buf); - else - printk (KERN_DEBUG "ReiserFS: %s\n", error_buf); + do_reiserfs_warning(fmt); + if (s) + printk(KERN_DEBUG "ReiserFS: %s: %s\n", + reiserfs_bdevname(s), error_buf); + else + printk(KERN_DEBUG "ReiserFS: %s\n", error_buf); #endif } @@ -349,379 +350,403 @@ void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...) . */ - #ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance * cur_tb; +extern struct tree_balance *cur_tb; #endif -void reiserfs_panic (struct super_block * sb, const char * fmt, ...) +void reiserfs_panic(struct super_block *sb, const char *fmt, ...) { - do_reiserfs_warning(fmt); - printk (KERN_EMERG "REISERFS: panic (device %s): %s\n", - reiserfs_bdevname (sb), error_buf); - BUG (); + do_reiserfs_warning(fmt); + printk(KERN_EMERG "REISERFS: panic (device %s): %s\n", + reiserfs_bdevname(sb), error_buf); + BUG(); - /* this is not actually called, but makes reiserfs_panic() "noreturn" */ - panic ("REISERFS: panic (device %s): %s\n", - reiserfs_bdevname (sb), error_buf); + /* this is not actually called, but makes reiserfs_panic() "noreturn" */ + panic("REISERFS: panic (device %s): %s\n", + reiserfs_bdevname(sb), error_buf); } -void -reiserfs_abort (struct super_block *sb, int errno, const char *fmt, ...) +void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) 
{ - do_reiserfs_warning (fmt); + do_reiserfs_warning(fmt); - if (reiserfs_error_panic (sb)) { - panic (KERN_CRIT "REISERFS: panic (device %s): %s\n", - reiserfs_bdevname (sb), error_buf); - } + if (reiserfs_error_panic(sb)) { + panic(KERN_CRIT "REISERFS: panic (device %s): %s\n", + reiserfs_bdevname(sb), error_buf); + } - if (sb->s_flags & MS_RDONLY) - return; + if (sb->s_flags & MS_RDONLY) + return; - printk (KERN_CRIT "REISERFS: abort (device %s): %s\n", - reiserfs_bdevname (sb), error_buf); + printk(KERN_CRIT "REISERFS: abort (device %s): %s\n", + reiserfs_bdevname(sb), error_buf); - sb->s_flags |= MS_RDONLY; - reiserfs_journal_abort (sb, errno); + sb->s_flags |= MS_RDONLY; + reiserfs_journal_abort(sb, errno); } /* this prints internal nodes (4 keys/items in line) (dc_number, dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, dc_size)...*/ -static int print_internal (struct buffer_head * bh, int first, int last) +static int print_internal(struct buffer_head *bh, int first, int last) { - struct reiserfs_key * key; - struct disk_child * dc; - int i; - int from, to; - - if (!B_IS_KEYS_LEVEL (bh)) - return 1; - - check_internal (bh); - - if (first == -1) { - from = 0; - to = B_NR_ITEMS (bh); - } else { - from = first; - to = last < B_NR_ITEMS (bh) ? last : B_NR_ITEMS (bh); - } - - reiserfs_printk ("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); - - dc = B_N_CHILD (bh, from); - reiserfs_printk ("PTR %d: %y ", from, dc); - - for (i = from, key = B_N_PDELIM_KEY (bh, from), dc ++; i < to; i ++, key ++, dc ++) { - reiserfs_printk ("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); - if (i && i % 4 == 0) - printk ("\n"); - } - printk ("\n"); - return 0; -} + struct reiserfs_key *key; + struct disk_child *dc; + int i; + int from, to; + if (!B_IS_KEYS_LEVEL(bh)) + return 1; + check_internal(bh); + if (first == -1) { + from = 0; + to = B_NR_ITEMS(bh); + } else { + from = first; + to = last < B_NR_ITEMS(bh) ? 
last : B_NR_ITEMS(bh); + } + reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); -static int print_leaf (struct buffer_head * bh, int print_mode, int first, int last) -{ - struct block_head * blkh; - struct item_head * ih; - int i, nr; - int from, to; + dc = B_N_CHILD(bh, from); + reiserfs_printk("PTR %d: %y ", from, dc); - if (!B_IS_ITEMS_LEVEL (bh)) - return 1; + for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to; + i++, key++, dc++) { + reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); + if (i && i % 4 == 0) + printk("\n"); + } + printk("\n"); + return 0; +} - check_leaf (bh); +static int print_leaf(struct buffer_head *bh, int print_mode, int first, + int last) +{ + struct block_head *blkh; + struct item_head *ih; + int i, nr; + int from, to; - blkh = B_BLK_HEAD (bh); - ih = B_N_PITEM_HEAD (bh,0); - nr = blkh_nr_item(blkh); + if (!B_IS_ITEMS_LEVEL(bh)) + return 1; - printk ("\n===================================================================\n"); - reiserfs_printk ("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); + check_leaf(bh); - if (!(print_mode & PRINT_LEAF_ITEMS)) { - reiserfs_printk ("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", - &(ih->ih_key), &((ih + nr - 1)->ih_key)); - return 0; - } + blkh = B_BLK_HEAD(bh); + ih = B_N_PITEM_HEAD(bh, 0); + nr = blkh_nr_item(blkh); - if (first < 0 || first > nr - 1) - from = 0; - else - from = first; + printk + ("\n===================================================================\n"); + reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); - if (last < 0 || last > nr ) - to = nr; - else - to = last; + if (!(print_mode & PRINT_LEAF_ITEMS)) { + reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", + &(ih->ih_key), &((ih + nr - 1)->ih_key)); + return 0; + } - ih += from; - printk ("-------------------------------------------------------------------------------\n"); - printk ("|##| type | key | ilen | free_space | version | loc |\n"); - for (i = from; i < 
to; i++, ih ++) { - printk ("-------------------------------------------------------------------------------\n"); - reiserfs_printk ("|%2d| %h |\n", i, ih); - if (print_mode & PRINT_LEAF_ITEMS) - op_print_item (ih, B_I_PITEM (bh, ih)); - } + if (first < 0 || first > nr - 1) + from = 0; + else + from = first; + + if (last < 0 || last > nr) + to = nr; + else + to = last; + + ih += from; + printk + ("-------------------------------------------------------------------------------\n"); + printk + ("|##| type | key | ilen | free_space | version | loc |\n"); + for (i = from; i < to; i++, ih++) { + printk + ("-------------------------------------------------------------------------------\n"); + reiserfs_printk("|%2d| %h |\n", i, ih); + if (print_mode & PRINT_LEAF_ITEMS) + op_print_item(ih, B_I_PITEM(bh, ih)); + } - printk ("===================================================================\n"); + printk + ("===================================================================\n"); - return 0; + return 0; } -char * reiserfs_hashname(int code) +char *reiserfs_hashname(int code) { - if ( code == YURA_HASH) - return "rupasov"; - if ( code == TEA_HASH) - return "tea"; - if ( code == R5_HASH) - return "r5"; + if (code == YURA_HASH) + return "rupasov"; + if (code == TEA_HASH) + return "tea"; + if (code == R5_HASH) + return "r5"; - return "unknown"; + return "unknown"; } /* return 1 if this is not super block */ -static int print_super_block (struct buffer_head * bh) -{ - struct reiserfs_super_block * rs = (struct reiserfs_super_block *)(bh->b_data); - int skipped, data_blocks; - char *version; - char b[BDEVNAME_SIZE]; - - if (is_reiserfs_3_5(rs)) { - version = "3.5"; - } else if (is_reiserfs_3_6(rs)) { - version = "3.6"; - } else if (is_reiserfs_jr(rs)) { - version = ((sb_version(rs) == REISERFS_VERSION_2) ? 
- "3.6" : "3.5"); - } else { - return 1; - } - - printk ("%s\'s super block is in block %llu\n", bdevname (bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); - printk ("Reiserfs version %s\n", version ); - printk ("Block count %u\n", sb_block_count(rs)); - printk ("Blocksize %d\n", sb_blocksize(rs)); - printk ("Free blocks %u\n", sb_free_blocks(rs)); - // FIXME: this would be confusing if - // someone stores reiserfs super block in some data block ;) +static int print_super_block(struct buffer_head *bh) +{ + struct reiserfs_super_block *rs = + (struct reiserfs_super_block *)(bh->b_data); + int skipped, data_blocks; + char *version; + char b[BDEVNAME_SIZE]; + + if (is_reiserfs_3_5(rs)) { + version = "3.5"; + } else if (is_reiserfs_3_6(rs)) { + version = "3.6"; + } else if (is_reiserfs_jr(rs)) { + version = ((sb_version(rs) == REISERFS_VERSION_2) ? + "3.6" : "3.5"); + } else { + return 1; + } + + printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); + printk("Reiserfs version %s\n", version); + printk("Block count %u\n", sb_block_count(rs)); + printk("Blocksize %d\n", sb_blocksize(rs)); + printk("Free blocks %u\n", sb_free_blocks(rs)); + // FIXME: this would be confusing if + // someone stores reiserfs super block in some data block ;) // skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs); - skipped = bh->b_blocknr; - data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - - (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + 1 : sb_reserved_for_journal(rs)) - - sb_free_blocks(rs); - printk ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" - "1 super block, %d data blocks\n", - skipped, sb_bmap_nr(rs), (!is_reiserfs_jr(rs) ? 
(sb_jp_journal_size(rs) + 1) : - sb_reserved_for_journal(rs)) , data_blocks); - printk ("Root block %u\n", sb_root_block(rs)); - printk ("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); - printk ("Journal dev %d\n", sb_jp_journal_dev(rs)); - printk ("Journal orig size %d\n", sb_jp_journal_size(rs)); - printk ("FS state %d\n", sb_fs_state(rs)); - printk ("Hash function \"%s\"\n", - reiserfs_hashname(sb_hash_function_code(rs))); - - printk ("Tree height %d\n", sb_tree_height(rs)); - return 0; + skipped = bh->b_blocknr; + data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) - + (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + + 1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs); + printk + ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n" + "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs), + (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) : + sb_reserved_for_journal(rs)), data_blocks); + printk("Root block %u\n", sb_root_block(rs)); + printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs)); + printk("Journal dev %d\n", sb_jp_journal_dev(rs)); + printk("Journal orig size %d\n", sb_jp_journal_size(rs)); + printk("FS state %d\n", sb_fs_state(rs)); + printk("Hash function \"%s\"\n", + reiserfs_hashname(sb_hash_function_code(rs))); + + printk("Tree height %d\n", sb_tree_height(rs)); + return 0; } -static int print_desc_block (struct buffer_head * bh) +static int print_desc_block(struct buffer_head *bh) { - struct reiserfs_journal_desc * desc; + struct reiserfs_journal_desc *desc; - if (memcmp(get_journal_desc_magic (bh), JOURNAL_DESC_MAGIC, 8)) - return 1; + if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8)) + return 1; - desc = (struct reiserfs_journal_desc *)(bh->b_data); - printk ("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", - (unsigned long long)bh->b_blocknr, get_desc_trans_id (desc), get_desc_mount_id (desc), - get_desc_trans_len (desc)); + desc = (struct 
reiserfs_journal_desc *)(bh->b_data); + printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)", + (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc), + get_desc_mount_id(desc), get_desc_trans_len(desc)); - return 0; + return 0; } - -void print_block (struct buffer_head * bh, ...)//int print_mode, int first, int last) +void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int last) { - va_list args; - int mode, first, last; + va_list args; + int mode, first, last; - va_start (args, bh); + va_start(args, bh); - if ( ! bh ) { - printk("print_block: buffer is NULL\n"); - return; - } + if (!bh) { + printk("print_block: buffer is NULL\n"); + return; + } - mode = va_arg (args, int); - first = va_arg (args, int); - last = va_arg (args, int); - if (print_leaf (bh, mode, first, last)) - if (print_internal (bh, first, last)) - if (print_super_block (bh)) - if (print_desc_block (bh)) - printk ("Block %llu contains unformatted data\n", (unsigned long long)bh->b_blocknr); + mode = va_arg(args, int); + first = va_arg(args, int); + last = va_arg(args, int); + if (print_leaf(bh, mode, first, last)) + if (print_internal(bh, first, last)) + if (print_super_block(bh)) + if (print_desc_block(bh)) + printk + ("Block %llu contains unformatted data\n", + (unsigned long long)bh->b_blocknr); } - - static char print_tb_buf[2048]; /* this stores initial state of tree balance in the print_tb_buf */ -void store_print_tb (struct tree_balance * tb) -{ - int h = 0; - int i; - struct buffer_head * tbSh, * tbFh; - - if (!tb) - return; - - sprintf (print_tb_buf, "\n" - "BALANCING %d\n" - "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" - "=====================================================================\n" - "* h * S * L * R * F * FL * FR * CFL * CFR *\n", - REISERFS_SB(tb->tb_sb)->s_do_balance, - tb->tb_mode, PATH_LAST_POSITION (tb->tb_path), tb->tb_path->pos_in_item); - - for (h = 0; h < sizeof(tb->insert_size) / sizeof (tb->insert_size[0]); h ++) { - if 
(PATH_H_PATH_OFFSET (tb->tb_path, h) <= tb->tb_path->path_length && - PATH_H_PATH_OFFSET (tb->tb_path, h) > ILLEGAL_PATH_ELEMENT_OFFSET) { - tbSh = PATH_H_PBUFFER (tb->tb_path, h); - tbFh = PATH_H_PPARENT (tb->tb_path, h); - } else { - tbSh = NULL; - tbFh = NULL; +void store_print_tb(struct tree_balance *tb) +{ + int h = 0; + int i; + struct buffer_head *tbSh, *tbFh; + + if (!tb) + return; + + sprintf(print_tb_buf, "\n" + "BALANCING %d\n" + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" + "=====================================================================\n" + "* h * S * L * R * F * FL * FR * CFL * CFR *\n", + REISERFS_SB(tb->tb_sb)->s_do_balance, + tb->tb_mode, PATH_LAST_POSITION(tb->tb_path), + tb->tb_path->pos_in_item); + + for (h = 0; h < sizeof(tb->insert_size) / sizeof(tb->insert_size[0]); + h++) { + if (PATH_H_PATH_OFFSET(tb->tb_path, h) <= + tb->tb_path->path_length + && PATH_H_PATH_OFFSET(tb->tb_path, + h) > ILLEGAL_PATH_ELEMENT_OFFSET) { + tbSh = PATH_H_PBUFFER(tb->tb_path, h); + tbFh = PATH_H_PPARENT(tb->tb_path, h); + } else { + tbSh = NULL; + tbFh = NULL; + } + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", + h, + (tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL), + (tbSh) ? atomic_read(&(tbSh->b_count)) : -1, + (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL), + (tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1, + (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL), + (tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1, + (tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL), + (tb->FL[h]) ? (long long)(tb->FL[h]-> + b_blocknr) : (-1LL), + (tb->FR[h]) ? (long long)(tb->FR[h]-> + b_blocknr) : (-1LL), + (tb->CFL[h]) ? (long long)(tb->CFL[h]-> + b_blocknr) : (-1LL), + (tb->CFR[h]) ? 
(long long)(tb->CFR[h]-> + b_blocknr) : (-1LL)); } - sprintf (print_tb_buf + strlen (print_tb_buf), - "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n", - h, - (tbSh) ? (long long)(tbSh->b_blocknr):(-1LL), - (tbSh) ? atomic_read (&(tbSh->b_count)) : -1, - (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr):(-1LL), - (tb->L[h]) ? atomic_read (&(tb->L[h]->b_count)) : -1, - (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr):(-1LL), - (tb->R[h]) ? atomic_read (&(tb->R[h]->b_count)) : -1, - (tbFh) ? (long long)(tbFh->b_blocknr):(-1LL), - (tb->FL[h]) ? (long long)(tb->FL[h]->b_blocknr):(-1LL), - (tb->FR[h]) ? (long long)(tb->FR[h]->b_blocknr):(-1LL), - (tb->CFL[h]) ? (long long)(tb->CFL[h]->b_blocknr):(-1LL), - (tb->CFR[h]) ? (long long)(tb->CFR[h]->b_blocknr):(-1LL)); - } - - sprintf (print_tb_buf + strlen (print_tb_buf), - "=====================================================================\n" - "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" - "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", - tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],tb->rbytes, tb->blknum[0], - tb->s0num, tb->s1num,tb->s1bytes, tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], tb->rkey[0]); - - /* this prints balance parameters for non-leaf levels */ - h = 0; - do { - h++; - sprintf (print_tb_buf + strlen (print_tb_buf), - "* %d * %4d * %2d * * %2d * * %2d *\n", - h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], tb->blknum[h]); - } while (tb->insert_size[h]); - - sprintf (print_tb_buf + strlen (print_tb_buf), - "=====================================================================\n" - "FEB list: "); - - /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ - h = 0; - for (i = 0; i < sizeof (tb->FEB) / sizeof (tb->FEB[0]); i ++) - sprintf (print_tb_buf + strlen (print_tb_buf), - "%p (%llu %d)%s", tb->FEB[i], 
tb->FEB[i] ? (unsigned long long)tb->FEB[i]->b_blocknr : 0ULL, - tb->FEB[i] ? atomic_read (&(tb->FEB[i]->b_count)) : 0, - (i == sizeof (tb->FEB) / sizeof (tb->FEB[0]) - 1) ? "\n" : ", "); - - sprintf (print_tb_buf + strlen (print_tb_buf), - "======================== the end ====================================\n"); -} - -void print_cur_tb (char * mes) -{ - printk ("%s\n%s", mes, print_tb_buf); -} - -static void check_leaf_block_head (struct buffer_head * bh) -{ - struct block_head * blkh; - int nr; - - blkh = B_BLK_HEAD (bh); - nr = blkh_nr_item(blkh); - if ( nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) - reiserfs_panic (NULL, "vs-6010: check_leaf_block_head: invalid item number %z", bh); - if ( blkh_free_space(blkh) > - bh->b_size - BLKH_SIZE - IH_SIZE * nr ) - reiserfs_panic (NULL, "vs-6020: check_leaf_block_head: invalid free space %z", bh); - -} -static void check_internal_block_head (struct buffer_head * bh) -{ - struct block_head * blkh; - - blkh = B_BLK_HEAD (bh); - if (!(B_LEVEL (bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL (bh) <= MAX_HEIGHT)) - reiserfs_panic (NULL, "vs-6025: check_internal_block_head: invalid level %z", bh); + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0], + tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes, + tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], + tb->rkey[0]); + + /* this prints balance parameters for non-leaf levels */ + h = 0; + do { + h++; + sprintf(print_tb_buf + strlen(print_tb_buf), + "* %d * %4d * %2d * * %2d * * %2d *\n", + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], + tb->blknum[h]); + } while (tb->insert_size[h]); - if (B_NR_ITEMS (bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) - reiserfs_panic 
(NULL, "vs-6030: check_internal_block_head: invalid item number %z", bh); + sprintf(print_tb_buf + strlen(print_tb_buf), + "=====================================================================\n" + "FEB list: "); - if (B_FREE_SPACE (bh) != - bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS (bh) - DC_SIZE * (B_NR_ITEMS (bh) + 1)) - reiserfs_panic (NULL, "vs-6040: check_internal_block_head: invalid free space %z", bh); + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ + h = 0; + for (i = 0; i < sizeof(tb->FEB) / sizeof(tb->FEB[0]); i++) + sprintf(print_tb_buf + strlen(print_tb_buf), + "%p (%llu %d)%s", tb->FEB[i], + tb->FEB[i] ? (unsigned long long)tb->FEB[i]-> + b_blocknr : 0ULL, + tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0, + (i == + sizeof(tb->FEB) / sizeof(tb->FEB[0]) - + 1) ? "\n" : ", "); + sprintf(print_tb_buf + strlen(print_tb_buf), + "======================== the end ====================================\n"); } +void print_cur_tb(char *mes) +{ + printk("%s\n%s", mes, print_tb_buf); +} -void check_leaf (struct buffer_head * bh) +static void check_leaf_block_head(struct buffer_head *bh) { - int i; - struct item_head * ih; + struct block_head *blkh; + int nr; + + blkh = B_BLK_HEAD(bh); + nr = blkh_nr_item(blkh); + if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, + "vs-6010: check_leaf_block_head: invalid item number %z", + bh); + if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr) + reiserfs_panic(NULL, + "vs-6020: check_leaf_block_head: invalid free space %z", + bh); - if (!bh) - return; - check_leaf_block_head (bh); - for (i = 0, ih = B_N_PITEM_HEAD (bh, 0); i < B_NR_ITEMS (bh); i ++, ih ++) - op_check_item (ih, B_I_PITEM (bh, ih)); } +static void check_internal_block_head(struct buffer_head *bh) +{ + struct block_head *blkh; + + blkh = B_BLK_HEAD(bh); + if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) + reiserfs_panic(NULL, + "vs-6025: 
check_internal_block_head: invalid level %z", + bh); + + if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic(NULL, + "vs-6030: check_internal_block_head: invalid item number %z", + bh); + + if (B_FREE_SPACE(bh) != + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) - + DC_SIZE * (B_NR_ITEMS(bh) + 1)) + reiserfs_panic(NULL, + "vs-6040: check_internal_block_head: invalid free space %z", + bh); + +} -void check_internal (struct buffer_head * bh) +void check_leaf(struct buffer_head *bh) { - if (!bh) - return; - check_internal_block_head (bh); + int i; + struct item_head *ih; + + if (!bh) + return; + check_leaf_block_head(bh); + for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++) + op_check_item(ih, B_I_PITEM(bh, ih)); } +void check_internal(struct buffer_head *bh) +{ + if (!bh) + return; + check_internal_block_head(bh); +} -void print_statistics (struct super_block * s) +void print_statistics(struct super_block *s) { - /* - printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ -bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", - REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, - REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, - REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); - */ + /* + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \ + bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", + REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes, + REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search, + REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct); + */ } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index e242ebc7f6f..fc2f43c75df 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -33,28 +33,27 @@ static int show_version(struct seq_file *m, struct super_block *sb) { char *format; - - if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6) ) { + + 
if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { format = "3.6"; - } else if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5) ) { + } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) { format = "3.5"; } else { format = "unknown"; } - seq_printf(m, "%s format\twith checks %s\n", - format, + seq_printf(m, "%s format\twith checks %s\n", format, #if defined( CONFIG_REISERFS_CHECK ) - "on" + "on" #else - "off" + "off" #endif - ); + ); return 0; } -int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset, - int count, int *eof, void *data ) +int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset, + int count, int *eof, void *data) { *start = buffer; *eof = 1; @@ -79,87 +78,68 @@ int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset, #define DJF( x ) le32_to_cpu( rs -> x ) #define DJV( x ) le32_to_cpu( s_v1 -> x ) -#define DJP( x ) le32_to_cpu( jp -> x ) +#define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) static int show_super(struct seq_file *m, struct super_block *sb) { struct reiserfs_sb_info *r = REISERFS_SB(sb); - - seq_printf(m, "state: \t%s\n" - "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" - "gen. 
counter: \t%i\n" - "s_kmallocs: \t%i\n" - "s_disk_reads: \t%i\n" - "s_disk_writes: \t%i\n" - "s_fix_nodes: \t%i\n" - "s_do_balance: \t%i\n" - "s_unneeded_left_neighbor: \t%i\n" - "s_good_search_by_key_reada: \t%i\n" - "s_bmaps: \t%i\n" - "s_bmaps_without_search: \t%i\n" - "s_direct2indirect: \t%i\n" - "s_indirect2direct: \t%i\n" - "\n" - "max_hash_collisions: \t%i\n" - - "breads: \t%lu\n" - "bread_misses: \t%lu\n" - - "search_by_key: \t%lu\n" - "search_by_key_fs_changed: \t%lu\n" - "search_by_key_restarted: \t%lu\n" - - "insert_item_restarted: \t%lu\n" - "paste_into_item_restarted: \t%lu\n" - "cut_from_item_restarted: \t%lu\n" - "delete_solid_item_restarted: \t%lu\n" - "delete_item_restarted: \t%lu\n" - - "leaked_oid: \t%lu\n" - "leaves_removable: \t%lu\n", - - SF( s_mount_state ) == REISERFS_VALID_FS ? - "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", - reiserfs_r5_hash( sb ) ? "FORCE_R5 " : "", - reiserfs_rupasov_hash( sb ) ? "FORCE_RUPASOV " : "", - reiserfs_tea_hash( sb ) ? "FORCE_TEA " : "", - reiserfs_hash_detect( sb ) ? "DETECT_HASH " : "", - reiserfs_no_border( sb ) ? "NO_BORDER " : "BORDER ", - reiserfs_no_unhashed_relocation( sb ) ? "NO_UNHASHED_RELOCATION " : "", - reiserfs_hashed_relocation( sb ) ? "UNHASHED_RELOCATION " : "", - reiserfs_test4( sb ) ? "TEST4 " : "", - have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ", - replay_only( sb ) ? "REPLAY_ONLY " : "", - convert_reiserfs( sb ) ? 
"CONV " : "", - - atomic_read( &r -> s_generation_counter ), - SF( s_kmallocs ), - SF( s_disk_reads ), - SF( s_disk_writes ), - SF( s_fix_nodes ), - SF( s_do_balance ), - SF( s_unneeded_left_neighbor ), - SF( s_good_search_by_key_reada ), - SF( s_bmaps ), - SF( s_bmaps_without_search ), - SF( s_direct2indirect ), - SF( s_indirect2direct ), - SFP( max_hash_collisions ), - SFP( breads ), - SFP( bread_miss ), - SFP( search_by_key ), - SFP( search_by_key_fs_changed ), - SFP( search_by_key_restarted ), - - SFP( insert_item_restarted ), - SFP( paste_into_item_restarted ), - SFP( cut_from_item_restarted ), - SFP( delete_solid_item_restarted ), - SFP( delete_item_restarted ), - - SFP( leaked_oid ), - SFP( leaves_removable ) ); + + seq_printf(m, "state: \t%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" + "gen. counter: \t%i\n" + "s_kmallocs: \t%i\n" + "s_disk_reads: \t%i\n" + "s_disk_writes: \t%i\n" + "s_fix_nodes: \t%i\n" + "s_do_balance: \t%i\n" + "s_unneeded_left_neighbor: \t%i\n" + "s_good_search_by_key_reada: \t%i\n" + "s_bmaps: \t%i\n" + "s_bmaps_without_search: \t%i\n" + "s_direct2indirect: \t%i\n" + "s_indirect2direct: \t%i\n" + "\n" + "max_hash_collisions: \t%i\n" + "breads: \t%lu\n" + "bread_misses: \t%lu\n" + "search_by_key: \t%lu\n" + "search_by_key_fs_changed: \t%lu\n" + "search_by_key_restarted: \t%lu\n" + "insert_item_restarted: \t%lu\n" + "paste_into_item_restarted: \t%lu\n" + "cut_from_item_restarted: \t%lu\n" + "delete_solid_item_restarted: \t%lu\n" + "delete_item_restarted: \t%lu\n" + "leaked_oid: \t%lu\n" + "leaves_removable: \t%lu\n", + SF(s_mount_state) == REISERFS_VALID_FS ? + "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", + reiserfs_r5_hash(sb) ? "FORCE_R5 " : "", + reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "", + reiserfs_tea_hash(sb) ? "FORCE_TEA " : "", + reiserfs_hash_detect(sb) ? "DETECT_HASH " : "", + reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ", + reiserfs_no_unhashed_relocation(sb) ? 
+ "NO_UNHASHED_RELOCATION " : "", + reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "", + reiserfs_test4(sb) ? "TEST4 " : "", + have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ? + "SMALL_TAILS " : "NO_TAILS ", + replay_only(sb) ? "REPLAY_ONLY " : "", + convert_reiserfs(sb) ? "CONV " : "", + atomic_read(&r->s_generation_counter), SF(s_kmallocs), + SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), + SF(s_do_balance), SF(s_unneeded_left_neighbor), + SF(s_good_search_by_key_reada), SF(s_bmaps), + SF(s_bmaps_without_search), SF(s_direct2indirect), + SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads), + SFP(bread_miss), SFP(search_by_key), + SFP(search_by_key_fs_changed), SFP(search_by_key_restarted), + SFP(insert_item_restarted), SFP(paste_into_item_restarted), + SFP(cut_from_item_restarted), + SFP(delete_solid_item_restarted), SFP(delete_item_restarted), + SFP(leaked_oid), SFP(leaves_removable)); return 0; } @@ -169,61 +149,55 @@ static int show_per_level(struct seq_file *m, struct super_block *sb) struct reiserfs_sb_info *r = REISERFS_SB(sb); int level; - seq_printf(m, "level\t" - " balances" - " [sbk: reads" - " fs_changed" - " restarted]" - " free space" - " items" - " can_remove" - " lnum" - " rnum" - " lbytes" - " rbytes" - " get_neig" - " get_neig_res" - " need_l_neig" - " need_r_neig" - "\n" - - ); - - for( level = 0 ; level < MAX_HEIGHT ; ++ level ) { - seq_printf(m, "%i\t" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - " %12li" - " %12li" - " %12li" - " %12li" - " %12lu" - " %12lu" - " %12lu" - " %12lu" - "\n", - level, - SFPL( balance_at ), - SFPL( sbk_read_at ), - SFPL( sbk_fs_changed ), - SFPL( sbk_restarted ), - SFPL( free_at ), - SFPL( items_at ), - SFPL( can_node_be_removed ), - SFPL( lnum ), - SFPL( rnum ), - SFPL( lbytes ), - SFPL( rbytes ), - SFPL( get_neighbors ), - SFPL( get_neighbors_restart ), - SFPL( need_l_neighbor ), - SFPL( need_r_neighbor ) - ); + seq_printf(m, "level\t" + " 
balances" + " [sbk: reads" + " fs_changed" + " restarted]" + " free space" + " items" + " can_remove" + " lnum" + " rnum" + " lbytes" + " rbytes" + " get_neig" + " get_neig_res" " need_l_neig" " need_r_neig" "\n"); + + for (level = 0; level < MAX_HEIGHT; ++level) { + seq_printf(m, "%i\t" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + " %12li" + " %12li" + " %12li" + " %12li" + " %12lu" + " %12lu" + " %12lu" + " %12lu" + "\n", + level, + SFPL(balance_at), + SFPL(sbk_read_at), + SFPL(sbk_fs_changed), + SFPL(sbk_restarted), + SFPL(free_at), + SFPL(items_at), + SFPL(can_node_be_removed), + SFPL(lnum), + SFPL(rnum), + SFPL(lbytes), + SFPL(rbytes), + SFPL(get_neighbors), + SFPL(get_neighbors_restart), + SFPL(need_l_neighbor), SFPL(need_r_neighbor) + ); } return 0; } @@ -232,31 +206,30 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb) { struct reiserfs_sb_info *r = REISERFS_SB(sb); - seq_printf(m, "free_block: %lu\n" - " scan_bitmap:" - " wait" - " bmap" - " retry" - " stolen" - " journal_hint" - "journal_nohint" - "\n" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - " %14lu" - "\n", - SFP( free_block ), - SFPF( call ), - SFPF( wait ), - SFPF( bmap ), - SFPF( retry ), - SFPF( stolen ), - SFPF( in_journal_hint ), - SFPF( in_journal_nohint ) ); + seq_printf(m, "free_block: %lu\n" + " scan_bitmap:" + " wait" + " bmap" + " retry" + " stolen" + " journal_hint" + "journal_nohint" + "\n" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + " %14lu" + "\n", + SFP(free_block), + SFPF(call), + SFPF(wait), + SFPF(bmap), + SFPF(retry), + SFPF(stolen), + SFPF(in_journal_hint), SFPF(in_journal_nohint)); return 0; } @@ -264,46 +237,42 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb) static int show_on_disk_super(struct seq_file *m, struct super_block *sb) { struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); - struct reiserfs_super_block *rs = sb_info -> s_rs; - int hash_code = 
DFL( s_hash_function_code ); - __u32 flags = DJF( s_flags ); - - seq_printf(m, "block_count: \t%i\n" - "free_blocks: \t%i\n" - "root_block: \t%i\n" - "blocksize: \t%i\n" - "oid_maxsize: \t%i\n" - "oid_cursize: \t%i\n" - "umount_state: \t%i\n" - "magic: \t%10.10s\n" - "fs_state: \t%i\n" - "hash: \t%s\n" - "tree_height: \t%i\n" - "bmap_nr: \t%i\n" - "version: \t%i\n" - "flags: \t%x[%s]\n" - "reserved_for_journal: \t%i\n", - - DFL( s_block_count ), - DFL( s_free_blocks ), - DFL( s_root_block ), - DF( s_blocksize ), - DF( s_oid_maxsize ), - DF( s_oid_cursize ), - DF( s_umount_state ), - rs -> s_v1.s_magic, - DF( s_fs_state ), - hash_code == TEA_HASH ? "tea" : - ( hash_code == YURA_HASH ) ? "rupasov" : - ( hash_code == R5_HASH ) ? "r5" : - ( hash_code == UNSET_HASH ) ? "unset" : "unknown", - DF( s_tree_height ), - DF( s_bmap_nr ), - DF( s_version ), - flags, - ( flags & reiserfs_attrs_cleared ) - ? "attrs_cleared" : "", - DF (s_reserved_for_journal)); + struct reiserfs_super_block *rs = sb_info->s_rs; + int hash_code = DFL(s_hash_function_code); + __u32 flags = DJF(s_flags); + + seq_printf(m, "block_count: \t%i\n" + "free_blocks: \t%i\n" + "root_block: \t%i\n" + "blocksize: \t%i\n" + "oid_maxsize: \t%i\n" + "oid_cursize: \t%i\n" + "umount_state: \t%i\n" + "magic: \t%10.10s\n" + "fs_state: \t%i\n" + "hash: \t%s\n" + "tree_height: \t%i\n" + "bmap_nr: \t%i\n" + "version: \t%i\n" + "flags: \t%x[%s]\n" + "reserved_for_journal: \t%i\n", + DFL(s_block_count), + DFL(s_free_blocks), + DFL(s_root_block), + DF(s_blocksize), + DF(s_oid_maxsize), + DF(s_oid_cursize), + DF(s_umount_state), + rs->s_v1.s_magic, + DF(s_fs_state), + hash_code == TEA_HASH ? "tea" : + (hash_code == YURA_HASH) ? "rupasov" : + (hash_code == R5_HASH) ? "r5" : + (hash_code == UNSET_HASH) ? "unset" : "unknown", + DF(s_tree_height), + DF(s_bmap_nr), + DF(s_version), flags, (flags & reiserfs_attrs_cleared) + ? 
"attrs_cleared" : "", DF(s_reserved_for_journal)); return 0; } @@ -311,131 +280,122 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb) static int show_oidmap(struct seq_file *m, struct super_block *sb) { struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); - struct reiserfs_super_block *rs = sb_info -> s_rs; - unsigned int mapsize = le16_to_cpu( rs -> s_v1.s_oid_cursize ); + struct reiserfs_super_block *rs = sb_info->s_rs; + unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); unsigned long total_used = 0; int i; - for( i = 0 ; i < mapsize ; ++i ) { + for (i = 0; i < mapsize; ++i) { __u32 right; - right = ( i == mapsize - 1 ) ? MAX_KEY_OBJECTID : MAP( i + 1 ); + right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1); seq_printf(m, "%s: [ %x .. %x )\n", - ( i & 1 ) ? "free" : "used", MAP( i ), right ); - if( ! ( i & 1 ) ) { - total_used += right - MAP( i ); + (i & 1) ? "free" : "used", MAP(i), right); + if (!(i & 1)) { + total_used += right - MAP(i); } } #if defined( REISERFS_USE_OIDMAPF ) - if( sb_info -> oidmap.use_file && ( sb_info -> oidmap.mapf != NULL ) ) { + if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { loff_t size = sb_info->oidmap.mapf->f_dentry->d_inode->i_size; - total_used += size / sizeof( reiserfs_oidinterval_d_t ); + total_used += size / sizeof(reiserfs_oidinterval_d_t); } #endif - seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", - mapsize, - mapsize, le16_to_cpu( rs -> s_v1.s_oid_maxsize ), - total_used); + seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", + mapsize, + mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used); return 0; } static int show_journal(struct seq_file *m, struct super_block *sb) { struct reiserfs_sb_info *r = REISERFS_SB(sb); - struct reiserfs_super_block *rs = r -> s_rs; + struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; char b[BDEVNAME_SIZE]; - - - seq_printf(m, /* on-disk fields */ - "jp_journal_1st_block: 
\t%i\n" - "jp_journal_dev: \t%s[%x]\n" - "jp_journal_size: \t%i\n" - "jp_journal_trans_max: \t%i\n" - "jp_journal_magic: \t%i\n" - "jp_journal_max_batch: \t%i\n" - "jp_journal_max_commit_age: \t%i\n" - "jp_journal_max_trans_age: \t%i\n" - /* incore fields */ - "j_1st_reserved_block: \t%i\n" - "j_state: \t%li\n" - "j_trans_id: \t%lu\n" - "j_mount_id: \t%lu\n" - "j_start: \t%lu\n" - "j_len: \t%lu\n" - "j_len_alloc: \t%lu\n" - "j_wcount: \t%i\n" - "j_bcount: \t%lu\n" - "j_first_unflushed_offset: \t%lu\n" - "j_last_flush_trans_id: \t%lu\n" - "j_trans_start_time: \t%li\n" - "j_list_bitmap_index: \t%i\n" - "j_must_wait: \t%i\n" - "j_next_full_flush: \t%i\n" - "j_next_async_flush: \t%i\n" - "j_cnode_used: \t%i\n" - "j_cnode_free: \t%i\n" - "\n" - /* reiserfs_proc_info_data_t.journal fields */ - "in_journal: \t%12lu\n" - "in_journal_bitmap: \t%12lu\n" - "in_journal_reusable: \t%12lu\n" - "lock_journal: \t%12lu\n" - "lock_journal_wait: \t%12lu\n" - "journal_begin: \t%12lu\n" - "journal_relock_writers: \t%12lu\n" - "journal_relock_wcount: \t%12lu\n" - "mark_dirty: \t%12lu\n" - "mark_dirty_already: \t%12lu\n" - "mark_dirty_notjournal: \t%12lu\n" - "restore_prepared: \t%12lu\n" - "prepare: \t%12lu\n" - "prepare_retry: \t%12lu\n", - - DJP( jp_journal_1st_block ), - bdevname(SB_JOURNAL(sb)->j_dev_bd, b), - DJP( jp_journal_dev ), - DJP( jp_journal_size ), - DJP( jp_journal_trans_max ), - DJP( jp_journal_magic ), - DJP( jp_journal_max_batch ), - SB_JOURNAL(sb)->j_max_commit_age, - DJP( jp_journal_max_trans_age ), - - JF( j_1st_reserved_block ), - JF( j_state ), - JF( j_trans_id ), - JF( j_mount_id ), - JF( j_start ), - JF( j_len ), - JF( j_len_alloc ), - atomic_read( & r -> s_journal -> j_wcount ), - JF( j_bcount ), - JF( j_first_unflushed_offset ), - JF( j_last_flush_trans_id ), - JF( j_trans_start_time ), - JF( j_list_bitmap_index ), - JF( j_must_wait ), - JF( j_next_full_flush ), - JF( j_next_async_flush ), - JF( j_cnode_used ), - JF( j_cnode_free ), - - SFPJ( in_journal ), - 
SFPJ( in_journal_bitmap ), - SFPJ( in_journal_reusable ), - SFPJ( lock_journal ), - SFPJ( lock_journal_wait ), - SFPJ( journal_being ), - SFPJ( journal_relock_writers ), - SFPJ( journal_relock_wcount ), - SFPJ( mark_dirty ), - SFPJ( mark_dirty_already ), - SFPJ( mark_dirty_notjournal ), - SFPJ( restore_prepared ), - SFPJ( prepare ), - SFPJ( prepare_retry ) - ); + + seq_printf(m, /* on-disk fields */ + "jp_journal_1st_block: \t%i\n" + "jp_journal_dev: \t%s[%x]\n" + "jp_journal_size: \t%i\n" + "jp_journal_trans_max: \t%i\n" + "jp_journal_magic: \t%i\n" + "jp_journal_max_batch: \t%i\n" + "jp_journal_max_commit_age: \t%i\n" + "jp_journal_max_trans_age: \t%i\n" + /* incore fields */ + "j_1st_reserved_block: \t%i\n" + "j_state: \t%li\n" + "j_trans_id: \t%lu\n" + "j_mount_id: \t%lu\n" + "j_start: \t%lu\n" + "j_len: \t%lu\n" + "j_len_alloc: \t%lu\n" + "j_wcount: \t%i\n" + "j_bcount: \t%lu\n" + "j_first_unflushed_offset: \t%lu\n" + "j_last_flush_trans_id: \t%lu\n" + "j_trans_start_time: \t%li\n" + "j_list_bitmap_index: \t%i\n" + "j_must_wait: \t%i\n" + "j_next_full_flush: \t%i\n" + "j_next_async_flush: \t%i\n" + "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n" + /* reiserfs_proc_info_data_t.journal fields */ + "in_journal: \t%12lu\n" + "in_journal_bitmap: \t%12lu\n" + "in_journal_reusable: \t%12lu\n" + "lock_journal: \t%12lu\n" + "lock_journal_wait: \t%12lu\n" + "journal_begin: \t%12lu\n" + "journal_relock_writers: \t%12lu\n" + "journal_relock_wcount: \t%12lu\n" + "mark_dirty: \t%12lu\n" + "mark_dirty_already: \t%12lu\n" + "mark_dirty_notjournal: \t%12lu\n" + "restore_prepared: \t%12lu\n" + "prepare: \t%12lu\n" + "prepare_retry: \t%12lu\n", + DJP(jp_journal_1st_block), + bdevname(SB_JOURNAL(sb)->j_dev_bd, b), + DJP(jp_journal_dev), + DJP(jp_journal_size), + DJP(jp_journal_trans_max), + DJP(jp_journal_magic), + DJP(jp_journal_max_batch), + SB_JOURNAL(sb)->j_max_commit_age, + DJP(jp_journal_max_trans_age), + JF(j_1st_reserved_block), + JF(j_state), + JF(j_trans_id), + 
JF(j_mount_id), + JF(j_start), + JF(j_len), + JF(j_len_alloc), + atomic_read(&r->s_journal->j_wcount), + JF(j_bcount), + JF(j_first_unflushed_offset), + JF(j_last_flush_trans_id), + JF(j_trans_start_time), + JF(j_list_bitmap_index), + JF(j_must_wait), + JF(j_next_full_flush), + JF(j_next_async_flush), + JF(j_cnode_used), + JF(j_cnode_free), + SFPJ(in_journal), + SFPJ(in_journal_bitmap), + SFPJ(in_journal_reusable), + SFPJ(lock_journal), + SFPJ(lock_journal_wait), + SFPJ(journal_being), + SFPJ(journal_relock_writers), + SFPJ(journal_relock_wcount), + SFPJ(mark_dirty), + SFPJ(mark_dirty_already), + SFPJ(mark_dirty_notjournal), + SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry) + ); return 0; } @@ -450,7 +410,7 @@ static int set_sb(struct super_block *sb, void *data) return -ENOENT; } -static void *r_start(struct seq_file *m, loff_t *pos) +static void *r_start(struct seq_file *m, loff_t * pos) { struct proc_dir_entry *de = m->private; struct super_block *s = de->parent->data; @@ -472,7 +432,7 @@ static void *r_start(struct seq_file *m, loff_t *pos) return s; } -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static void *r_next(struct seq_file *m, void *v, loff_t * pos) { ++*pos; if (v) @@ -489,7 +449,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { struct proc_dir_entry *de = m->private; - int (*show)(struct seq_file *, struct super_block *) = de->data; + int (*show) (struct seq_file *, struct super_block *) = de->data; return show(m, v); } @@ -512,17 +472,17 @@ static int r_open(struct inode *inode, struct file *file) } static struct file_operations r_file_operations = { - .open = r_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, + .open = r_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct 
super_block *sb, char *name, - int (*func)(struct seq_file *, struct super_block *)) + int (*func) (struct seq_file *, struct super_block *)) { struct proc_dir_entry *de; de = create_proc_entry(name, 0, REISERFS_SB(sb)->procdir); @@ -532,11 +492,12 @@ static void add_file(struct super_block *sb, char *name, } } -int reiserfs_proc_info_init( struct super_block *sb ) +int reiserfs_proc_info_init(struct super_block *sb) { - spin_lock_init( & __PINFO( sb ).lock ); - REISERFS_SB(sb)->procdir = proc_mkdir(reiserfs_bdevname (sb), proc_info_root); - if( REISERFS_SB(sb)->procdir ) { + spin_lock_init(&__PINFO(sb).lock); + REISERFS_SB(sb)->procdir = + proc_mkdir(reiserfs_bdevname(sb), proc_info_root); + if (REISERFS_SB(sb)->procdir) { REISERFS_SB(sb)->procdir->owner = THIS_MODULE; REISERFS_SB(sb)->procdir->data = sb; add_file(sb, "version", show_version); @@ -549,11 +510,11 @@ int reiserfs_proc_info_init( struct super_block *sb ) return 0; } reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s", - proc_info_root_name, reiserfs_bdevname (sb) ); + proc_info_root_name, reiserfs_bdevname(sb)); return 1; } -int reiserfs_proc_info_done( struct super_block *sb ) +int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; if (de) { @@ -565,48 +526,48 @@ int reiserfs_proc_info_done( struct super_block *sb ) remove_proc_entry("super", de); remove_proc_entry("version", de); } - spin_lock( & __PINFO( sb ).lock ); - __PINFO( sb ).exiting = 1; - spin_unlock( & __PINFO( sb ).lock ); - if ( proc_info_root ) { - remove_proc_entry( reiserfs_bdevname (sb), proc_info_root ); + spin_lock(&__PINFO(sb).lock); + __PINFO(sb).exiting = 1; + spin_unlock(&__PINFO(sb).lock); + if (proc_info_root) { + remove_proc_entry(reiserfs_bdevname(sb), proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; } -struct proc_dir_entry *reiserfs_proc_register_global( char *name, - read_proc_t *func ) +struct proc_dir_entry *reiserfs_proc_register_global(char 
*name, + read_proc_t * func) { - return ( proc_info_root ) ? create_proc_read_entry( name, 0, - proc_info_root, - func, NULL ) : NULL; + return (proc_info_root) ? create_proc_read_entry(name, 0, + proc_info_root, + func, NULL) : NULL; } -void reiserfs_proc_unregister_global( const char *name ) +void reiserfs_proc_unregister_global(const char *name) { - remove_proc_entry( name, proc_info_root ); + remove_proc_entry(name, proc_info_root); } -int reiserfs_proc_info_global_init( void ) +int reiserfs_proc_info_global_init(void) { - if( proc_info_root == NULL ) { + if (proc_info_root == NULL) { proc_info_root = proc_mkdir(proc_info_root_name, NULL); - if( proc_info_root ) { - proc_info_root -> owner = THIS_MODULE; + if (proc_info_root) { + proc_info_root->owner = THIS_MODULE; } else { - reiserfs_warning (NULL, - "reiserfs: cannot create /proc/%s", - proc_info_root_name ); + reiserfs_warning(NULL, + "reiserfs: cannot create /proc/%s", + proc_info_root_name); return 1; } } return 0; } -int reiserfs_proc_info_global_done( void ) +int reiserfs_proc_info_global_done(void) { - if ( proc_info_root != NULL ) { + if (proc_info_root != NULL) { proc_info_root = NULL; remove_proc_entry(proc_info_root_name, NULL); } @@ -616,22 +577,40 @@ int reiserfs_proc_info_global_done( void ) /* REISERFS_PROC_INFO */ #else -int reiserfs_proc_info_init( struct super_block *sb ) { return 0; } -int reiserfs_proc_info_done( struct super_block *sb ) { return 0; } +int reiserfs_proc_info_init(struct super_block *sb) +{ + return 0; +} +int reiserfs_proc_info_done(struct super_block *sb) +{ + return 0; +} -struct proc_dir_entry *reiserfs_proc_register_global( char *name, - read_proc_t *func ) -{ return NULL; } +struct proc_dir_entry *reiserfs_proc_register_global(char *name, + read_proc_t * func) +{ + return NULL; +} -void reiserfs_proc_unregister_global( const char *name ) {;} +void reiserfs_proc_unregister_global(const char *name) +{; +} -int reiserfs_proc_info_global_init( void ) { return 0; } -int 
reiserfs_proc_info_global_done( void ) { return 0; } +int reiserfs_proc_info_global_init(void) +{ + return 0; +} +int reiserfs_proc_info_global_done(void) +{ + return 0; +} -int reiserfs_global_version_in_proc( char *buffer, char **start, - off_t offset, - int count, int *eof, void *data ) -{ return 0; } +int reiserfs_global_version_in_proc(char *buffer, char **start, + off_t offset, + int count, int *eof, void *data) +{ + return 0; +} /* REISERFS_PROC_INFO */ #endif diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 170012078b7..39cc7f47f5d 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -1,7 +1,7 @@ /* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ - + /* * Written by Alexander Zarochentcev. * @@ -17,23 +17,23 @@ #include <linux/reiserfs_fs_sb.h> #include <linux/buffer_head.h> -int reiserfs_resize (struct super_block * s, unsigned long block_count_new) +int reiserfs_resize(struct super_block *s, unsigned long block_count_new) { - int err = 0; - struct reiserfs_super_block * sb; - struct reiserfs_bitmap_info *bitmap; + int err = 0; + struct reiserfs_super_block *sb; + struct reiserfs_bitmap_info *bitmap; struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s); - struct buffer_head * bh; + struct buffer_head *bh; struct reiserfs_transaction_handle th; unsigned int bmap_nr_new, bmap_nr; unsigned int block_r_new, block_r; - - struct reiserfs_list_bitmap * jb; + + struct reiserfs_list_bitmap *jb; struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; - + unsigned long int block_count, free_blocks; int i; - int copy_size ; + int copy_size; sb = SB_DISK_SUPER_BLOCK(s); @@ -47,136 +47,145 @@ int reiserfs_resize (struct super_block * s, unsigned long block_count_new) if (!bh) { printk("reiserfs_resize: can\'t read last block\n"); return -EINVAL; - } + } bforget(bh); /* old disk layout detection; those partitions can be mounted, but * cannot be resized */ - if (SB_BUFFER_WITH_SB(s)->b_blocknr * 
SB_BUFFER_WITH_SB(s)->b_size - != REISERFS_DISK_OFFSET_IN_BYTES ) { - printk("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size + != REISERFS_DISK_OFFSET_IN_BYTES) { + printk + ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); return -ENOTSUPP; } - + /* count used bits in last bitmap block */ - block_r = SB_BLOCK_COUNT(s) - - (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8; - + block_r = SB_BLOCK_COUNT(s) - (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8; + /* count bitmap blocks in new fs */ - bmap_nr_new = block_count_new / ( s->s_blocksize * 8 ); + bmap_nr_new = block_count_new / (s->s_blocksize * 8); block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; - if (block_r_new) + if (block_r_new) bmap_nr_new++; else block_r_new = s->s_blocksize * 8; /* save old values */ block_count = SB_BLOCK_COUNT(s); - bmap_nr = SB_BMAP_NR(s); + bmap_nr = SB_BMAP_NR(s); /* resizing of reiserfs bitmaps (journal and real), if needed */ - if (bmap_nr_new > bmap_nr) { - /* reallocate journal bitmaps */ - if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { - printk("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); - unlock_super(s) ; - return -ENOMEM ; - } - /* the new journal bitmaps are zero filled, now we copy in the bitmap - ** node pointers from the old journal bitmap structs, and then - ** transfer the new data structures into the journal struct. - ** - ** using the copy_size var below allows this code to work for - ** both shrinking and expanding the FS. - */ - copy_size = bmap_nr_new < bmap_nr ? 
bmap_nr_new : bmap_nr ; - copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *) ; - for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { - struct reiserfs_bitmap_node **node_tmp ; - jb = SB_JOURNAL(s)->j_list_bitmap + i ; - memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size) ; - - /* just in case vfree schedules on us, copy the new - ** pointer into the journal struct before freeing the - ** old one - */ - node_tmp = jb->bitmaps ; - jb->bitmaps = jbitmap[i].bitmaps ; - vfree(node_tmp) ; - } - - /* allocate additional bitmap blocks, reallocate array of bitmap - * block pointers */ - bitmap = vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); - if (!bitmap) { - /* Journal bitmaps are still supersized, but the memory isn't - * leaked, so I guess it's ok */ - printk("reiserfs_resize: unable to allocate memory.\n"); - return -ENOMEM; - } - memset (bitmap, 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); - for (i = 0; i < bmap_nr; i++) - bitmap[i] = old_bitmap[i]; - - /* This doesn't go through the journal, but it doesn't have to. - * The changes are still atomic: We're synced up when the journal - * transaction begins, and the new bitmaps don't matter if the - * transaction fails. 
*/ - for (i = bmap_nr; i < bmap_nr_new; i++) { - bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8); - memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb)); - reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data); - - set_buffer_uptodate(bitmap[i].bh); - mark_buffer_dirty(bitmap[i].bh) ; - sync_dirty_buffer(bitmap[i].bh); - // update bitmap_info stuff - bitmap[i].first_zero_hint=1; - bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; - } - /* free old bitmap blocks array */ - SB_AP_BITMAP(s) = bitmap; - vfree (old_bitmap); + if (bmap_nr_new > bmap_nr) { + /* reallocate journal bitmaps */ + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { + printk + ("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); + unlock_super(s); + return -ENOMEM; + } + /* the new journal bitmaps are zero filled, now we copy in the bitmap + ** node pointers from the old journal bitmap structs, and then + ** transfer the new data structures into the journal struct. + ** + ** using the copy_size var below allows this code to work for + ** both shrinking and expanding the FS. + */ + copy_size = bmap_nr_new < bmap_nr ? 
bmap_nr_new : bmap_nr; + copy_size = + copy_size * sizeof(struct reiserfs_list_bitmap_node *); + for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) { + struct reiserfs_bitmap_node **node_tmp; + jb = SB_JOURNAL(s)->j_list_bitmap + i; + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size); + + /* just in case vfree schedules on us, copy the new + ** pointer into the journal struct before freeing the + ** old one + */ + node_tmp = jb->bitmaps; + jb->bitmaps = jbitmap[i].bitmaps; + vfree(node_tmp); + } + + /* allocate additional bitmap blocks, reallocate array of bitmap + * block pointers */ + bitmap = + vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new); + if (!bitmap) { + /* Journal bitmaps are still supersized, but the memory isn't + * leaked, so I guess it's ok */ + printk("reiserfs_resize: unable to allocate memory.\n"); + return -ENOMEM; + } + memset(bitmap, 0, + sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); + for (i = 0; i < bmap_nr; i++) + bitmap[i] = old_bitmap[i]; + + /* This doesn't go through the journal, but it doesn't have to. + * The changes are still atomic: We're synced up when the journal + * transaction begins, and the new bitmaps don't matter if the + * transaction fails. */ + for (i = bmap_nr; i < bmap_nr_new; i++) { + bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8); + memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb)); + reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data); + + set_buffer_uptodate(bitmap[i].bh); + mark_buffer_dirty(bitmap[i].bh); + sync_dirty_buffer(bitmap[i].bh); + // update bitmap_info stuff + bitmap[i].first_zero_hint = 1; + bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; + } + /* free old bitmap blocks array */ + SB_AP_BITMAP(s) = bitmap; + vfree(old_bitmap); } - + /* begin transaction, if there was an error, it's fine. Yes, we have * incorrect bitmaps now, but none of it is ever going to touch the * disk anyway. 
*/ err = journal_begin(&th, s, 10); if (err) - return err; + return err; /* correct last bitmap blocks in old and new disk layout */ reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr - 1].bh, 1); for (i = block_r; i < s->s_blocksize * 8; i++) - reiserfs_test_and_clear_le_bit(i, - SB_AP_BITMAP(s)[bmap_nr - 1].bh->b_data); + reiserfs_test_and_clear_le_bit(i, + SB_AP_BITMAP(s)[bmap_nr - + 1].bh->b_data); SB_AP_BITMAP(s)[bmap_nr - 1].free_count += s->s_blocksize * 8 - block_r; - if ( !SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint) - SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r; + if (!SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint) + SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r; journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr - 1].bh); reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh, 1); for (i = block_r_new; i < s->s_blocksize * 8; i++) - reiserfs_test_and_set_le_bit(i, - SB_AP_BITMAP(s)[bmap_nr_new - 1].bh->b_data); + reiserfs_test_and_set_le_bit(i, + SB_AP_BITMAP(s)[bmap_nr_new - + 1].bh->b_data); journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh); - - SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -= s->s_blocksize * 8 - block_r_new; + + SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -= + s->s_blocksize * 8 - block_r_new; /* Extreme case where last bitmap is the only valid block in itself. 
*/ - if ( !SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count ) - SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0; - /* update super */ - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + if (!SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count) + SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0; + /* update super */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); free_blocks = SB_FREE_BLOCKS(s); - PUT_SB_FREE_BLOCKS(s, free_blocks + (block_count_new - block_count - (bmap_nr_new - bmap_nr))); + PUT_SB_FREE_BLOCKS(s, + free_blocks + (block_count_new - block_count - + (bmap_nr_new - bmap_nr))); PUT_SB_BLOCK_COUNT(s, block_count_new); PUT_SB_BMAP_NR(s, bmap_nr_new); s->s_dirt = 1; journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); - + SB_JOURNAL(s)->j_must_wait = 1; return journal_end(&th, s, 10); } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index c47f8fd31a2..e2d08d7bcff 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -59,46 +59,45 @@ #include <linux/quotaops.h> /* Does the buffer contain a disk block which is in the tree. */ -inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh) +inline int B_IS_IN_TREE(const struct buffer_head *p_s_bh) { - RFALSE( B_LEVEL (p_s_bh) > MAX_HEIGHT, - "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh); + RFALSE(B_LEVEL(p_s_bh) > MAX_HEIGHT, + "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh); - return ( B_LEVEL (p_s_bh) != FREE_LEVEL ); + return (B_LEVEL(p_s_bh) != FREE_LEVEL); } // // to gets item head in le form // -inline void copy_item_head(struct item_head * p_v_to, - const struct item_head * p_v_from) +inline void copy_item_head(struct item_head *p_v_to, + const struct item_head *p_v_from) { - memcpy (p_v_to, p_v_from, IH_SIZE); + memcpy(p_v_to, p_v_from, IH_SIZE); } - /* k1 is pointer to on-disk structure which is stored in little-endian form. k2 is pointer to cpu variable. For key of items of the same object this returns 0. 
Returns: -1 if key1 < key2 0 if key1 == key2 1 if key1 > key2 */ -inline int comp_short_keys (const struct reiserfs_key * le_key, - const struct cpu_key * cpu_key) +inline int comp_short_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) { - __u32 n; - n = le32_to_cpu(le_key->k_dir_id); - if (n < cpu_key->on_disk_key.k_dir_id) - return -1; - if (n > cpu_key->on_disk_key.k_dir_id) - return 1; - n = le32_to_cpu(le_key->k_objectid); - if (n < cpu_key->on_disk_key.k_objectid) - return -1; - if (n > cpu_key->on_disk_key.k_objectid) - return 1; - return 0; + __u32 n; + n = le32_to_cpu(le_key->k_dir_id); + if (n < cpu_key->on_disk_key.k_dir_id) + return -1; + if (n > cpu_key->on_disk_key.k_dir_id) + return 1; + n = le32_to_cpu(le_key->k_objectid); + if (n < cpu_key->on_disk_key.k_objectid) + return -1; + if (n > cpu_key->on_disk_key.k_objectid) + return 1; + return 0; } /* k1 is pointer to on-disk structure which is stored in little-endian @@ -106,68 +105,72 @@ inline int comp_short_keys (const struct reiserfs_key * le_key, Compare keys using all 4 key fields. 
Returns: -1 if key1 < key2 0 if key1 = key2 1 if key1 > key2 */ -static inline int comp_keys (const struct reiserfs_key * le_key, const struct cpu_key * cpu_key) +static inline int comp_keys(const struct reiserfs_key *le_key, + const struct cpu_key *cpu_key) { - int retval; - - retval = comp_short_keys (le_key, cpu_key); - if (retval) - return retval; - if (le_key_k_offset (le_key_version(le_key), le_key) < cpu_key_k_offset (cpu_key)) - return -1; - if (le_key_k_offset (le_key_version(le_key), le_key) > cpu_key_k_offset (cpu_key)) - return 1; - - if (cpu_key->key_length == 3) - return 0; - - /* this part is needed only when tail conversion is in progress */ - if (le_key_k_type (le_key_version(le_key), le_key) < cpu_key_k_type (cpu_key)) - return -1; + int retval; + + retval = comp_short_keys(le_key, cpu_key); + if (retval) + return retval; + if (le_key_k_offset(le_key_version(le_key), le_key) < + cpu_key_k_offset(cpu_key)) + return -1; + if (le_key_k_offset(le_key_version(le_key), le_key) > + cpu_key_k_offset(cpu_key)) + return 1; + + if (cpu_key->key_length == 3) + return 0; + + /* this part is needed only when tail conversion is in progress */ + if (le_key_k_type(le_key_version(le_key), le_key) < + cpu_key_k_type(cpu_key)) + return -1; + + if (le_key_k_type(le_key_version(le_key), le_key) > + cpu_key_k_type(cpu_key)) + return 1; - if (le_key_k_type (le_key_version(le_key), le_key) > cpu_key_k_type (cpu_key)) - return 1; - - return 0; + return 0; } - -inline int comp_short_le_keys (const struct reiserfs_key * key1, const struct reiserfs_key * key2) +inline int comp_short_le_keys(const struct reiserfs_key *key1, + const struct reiserfs_key *key2) { - __u32 * p_s_1_u32, * p_s_2_u32; - int n_key_length = REISERFS_SHORT_KEY_LEN; - - p_s_1_u32 = (__u32 *)key1; - p_s_2_u32 = (__u32 *)key2; - for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) { - if ( le32_to_cpu (*p_s_1_u32) < le32_to_cpu (*p_s_2_u32) ) - return -1; - if ( le32_to_cpu (*p_s_1_u32) > le32_to_cpu 
(*p_s_2_u32) ) - return 1; - } - return 0; + __u32 *p_s_1_u32, *p_s_2_u32; + int n_key_length = REISERFS_SHORT_KEY_LEN; + + p_s_1_u32 = (__u32 *) key1; + p_s_2_u32 = (__u32 *) key2; + for (; n_key_length--; ++p_s_1_u32, ++p_s_2_u32) { + if (le32_to_cpu(*p_s_1_u32) < le32_to_cpu(*p_s_2_u32)) + return -1; + if (le32_to_cpu(*p_s_1_u32) > le32_to_cpu(*p_s_2_u32)) + return 1; + } + return 0; } -inline void le_key2cpu_key (struct cpu_key * to, const struct reiserfs_key * from) +inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from) { - int version; - to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id); - to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid); - - // find out version of the key - version = le_key_version (from); - to->version = version; - to->on_disk_key.k_offset = le_key_k_offset(version, from); - to->on_disk_key.k_type = le_key_k_type(version, from); + int version; + to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id); + to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid); + + // find out version of the key + version = le_key_version(from); + to->version = version; + to->on_disk_key.k_offset = le_key_k_offset(version, from); + to->on_disk_key.k_type = le_key_k_type(version, from); } - - // this does not say which one is bigger, it only returns 1 if keys // are not equal, 0 otherwise -inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_key * k2) +inline int comp_le_keys(const struct reiserfs_key *k1, + const struct reiserfs_key *k2) { - return memcmp (k1, k2, sizeof (struct reiserfs_key)); + return memcmp(k1, k2, sizeof(struct reiserfs_key)); } /************************************************************************** @@ -184,373 +187,396 @@ inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_k there are no possible items, and we have not found it. 
With each examination we cut the number of possible items it could be by one more than half rounded down, or we find it. */ -static inline int bin_search ( - const void * p_v_key, /* Key to search for. */ - const void * p_v_base,/* First item in the array. */ - int p_n_num, /* Number of items in the array. */ - int p_n_width, /* Item size in the array. - searched. Lest the reader be - confused, note that this is crafted - as a general function, and when it - is applied specifically to the array - of item headers in a node, p_n_width - is actually the item header size not - the item size. */ - int * p_n_pos /* Number of the searched for element. */ - ) { - int n_rbound, n_lbound, n_j; - - for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 ) - switch( comp_keys((struct reiserfs_key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) ) { - case -1: n_lbound = n_j + 1; continue; - case 1: n_rbound = n_j - 1; continue; - case 0: *p_n_pos = n_j; return ITEM_FOUND; /* Key found in the array. */ - } - - /* bin_search did not find given key, it returns position of key, - that is minimal and greater than the given one. */ - *p_n_pos = n_lbound; - return ITEM_NOT_FOUND; +static inline int bin_search(const void *p_v_key, /* Key to search for. */ + const void *p_v_base, /* First item in the array. */ + int p_n_num, /* Number of items in the array. */ + int p_n_width, /* Item size in the array. + searched. Lest the reader be + confused, note that this is crafted + as a general function, and when it + is applied specifically to the array + of item headers in a node, p_n_width + is actually the item header size not + the item size. */ + int *p_n_pos /* Number of the searched for element. 
*/ + ) +{ + int n_rbound, n_lbound, n_j; + + for (n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0)) / 2; + n_lbound <= n_rbound; n_j = (n_rbound + n_lbound) / 2) + switch (comp_keys + ((struct reiserfs_key *)((char *)p_v_base + + n_j * p_n_width), + (struct cpu_key *)p_v_key)) { + case -1: + n_lbound = n_j + 1; + continue; + case 1: + n_rbound = n_j - 1; + continue; + case 0: + *p_n_pos = n_j; + return ITEM_FOUND; /* Key found in the array. */ + } + + /* bin_search did not find given key, it returns position of key, + that is minimal and greater than the given one. */ + *p_n_pos = n_lbound; + return ITEM_NOT_FOUND; } #ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance * cur_tb; +extern struct tree_balance *cur_tb; #endif - - /* Minimal possible key. It is never in the tree. */ -const struct reiserfs_key MIN_KEY = {0, 0, {{0, 0},}}; +const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; /* Maximal possible key. It is never in the tree. */ -const struct reiserfs_key MAX_KEY = { +static const struct reiserfs_key MAX_KEY = { __constant_cpu_to_le32(0xffffffff), __constant_cpu_to_le32(0xffffffff), {{__constant_cpu_to_le32(0xffffffff), - __constant_cpu_to_le32(0xffffffff)},} + __constant_cpu_to_le32(0xffffffff)},} }; - /* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom of the path, and going upwards. We must check the path's validity at each step. If the key is not in the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this case we return a special key, either MIN_KEY or MAX_KEY. */ -static inline const struct reiserfs_key * get_lkey ( - const struct path * p_s_chk_path, - const struct super_block * p_s_sb - ) { - int n_position, n_path_offset = p_s_chk_path->path_length; - struct buffer_head * p_s_parent; - - RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET, - "PAP-5010: invalid offset in the path"); - - /* While not higher in path than first element. 
*/ - while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { - - RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), - "PAP-5020: parent is not uptodate"); - - /* Parent at the path is not in the tree now. */ - if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) - return &MAX_KEY; - /* Check whether position in the parent is correct. */ - if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) - return &MAX_KEY; - /* Check whether parent at the path really points to the child. */ - if ( B_N_CHILD_NUM(p_s_parent, n_position) != - PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) - return &MAX_KEY; - /* Return delimiting key if position in the parent is not equal to zero. */ - if ( n_position ) - return B_N_PDELIM_KEY(p_s_parent, n_position - 1); - } - /* Return MIN_KEY if we are in the root of the buffer tree. */ - if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == - SB_ROOT_BLOCK (p_s_sb) ) - return &MIN_KEY; - return &MAX_KEY; +static inline const struct reiserfs_key *get_lkey(const struct path + *p_s_chk_path, + const struct super_block + *p_s_sb) +{ + int n_position, n_path_offset = p_s_chk_path->path_length; + struct buffer_head *p_s_parent; + + RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5010: invalid offset in the path"); + + /* While not higher in path than first element. */ + while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), + "PAP-5020: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (p_s_parent = + PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset))) + return &MAX_KEY; + /* Check whether position in the parent is correct. 
*/ + if ((n_position = + PATH_OFFSET_POSITION(p_s_chk_path, + n_path_offset)) > + B_NR_ITEMS(p_s_parent)) + return &MAX_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_chk_path, + n_path_offset + 1)->b_blocknr) + return &MAX_KEY; + /* Return delimiting key if position in the parent is not equal to zero. */ + if (n_position) + return B_N_PDELIM_KEY(p_s_parent, n_position - 1); + } + /* Return MIN_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(p_s_sb)) + return &MIN_KEY; + return &MAX_KEY; } - /* Get delimiting key of the buffer at the path and its right neighbor. */ -inline const struct reiserfs_key * get_rkey ( - const struct path * p_s_chk_path, - const struct super_block * p_s_sb - ) { - int n_position, - n_path_offset = p_s_chk_path->path_length; - struct buffer_head * p_s_parent; - - RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET, - "PAP-5030: invalid offset in the path"); - - while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { - - RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), - "PAP-5040: parent is not uptodate"); - - /* Parent at the path is not in the tree now. */ - if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) - return &MIN_KEY; - /* Check whether position in the parent is correct. */ - if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) - return &MIN_KEY; - /* Check whether parent at the path really points to the child. */ - if ( B_N_CHILD_NUM(p_s_parent, n_position) != - PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) - return &MIN_KEY; - /* Return delimiting key if position in the parent is not the last one. 
*/ - if ( n_position != B_NR_ITEMS(p_s_parent) ) - return B_N_PDELIM_KEY(p_s_parent, n_position); - } - /* Return MAX_KEY if we are in the root of the buffer tree. */ - if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == - SB_ROOT_BLOCK (p_s_sb) ) - return &MAX_KEY; - return &MIN_KEY; +inline const struct reiserfs_key *get_rkey(const struct path *p_s_chk_path, + const struct super_block *p_s_sb) +{ + int n_position, n_path_offset = p_s_chk_path->path_length; + struct buffer_head *p_s_parent; + + RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET, + "PAP-5030: invalid offset in the path"); + + while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) { + + RFALSE(!buffer_uptodate + (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)), + "PAP-5040: parent is not uptodate"); + + /* Parent at the path is not in the tree now. */ + if (!B_IS_IN_TREE + (p_s_parent = + PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset))) + return &MIN_KEY; + /* Check whether position in the parent is correct. */ + if ((n_position = + PATH_OFFSET_POSITION(p_s_chk_path, + n_path_offset)) > + B_NR_ITEMS(p_s_parent)) + return &MIN_KEY; + /* Check whether parent at the path really points to the child. */ + if (B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_chk_path, + n_path_offset + 1)->b_blocknr) + return &MIN_KEY; + /* Return delimiting key if position in the parent is not the last one. */ + if (n_position != B_NR_ITEMS(p_s_parent)) + return B_N_PDELIM_KEY(p_s_parent, n_position); + } + /* Return MAX_KEY if we are in the root of the buffer tree. */ + if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)-> + b_blocknr == SB_ROOT_BLOCK(p_s_sb)) + return &MAX_KEY; + return &MIN_KEY; } - /* Check whether a key is contained in the tree rooted from a buffer at a path. */ /* This works by looking at the left and right delimiting keys for the buffer in the last path_element in the path. 
These delimiting keys are stored at least one level above that buffer in the tree. If the buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ -static inline int key_in_buffer ( - struct path * p_s_chk_path, /* Path which should be checked. */ - const struct cpu_key * p_s_key, /* Key which should be checked. */ - struct super_block * p_s_sb /* Super block pointer. */ - ) { - - RFALSE( ! p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET || - p_s_chk_path->path_length > MAX_HEIGHT, - "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", - p_s_key, p_s_chk_path->path_length); - RFALSE( !PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev, - "PAP-5060: device must not be NODEV"); - - if ( comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1 ) - /* left delimiting key is bigger, that the key we look for */ - return 0; - // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 ) - if ( comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1 ) - /* p_s_key must be less than right delimitiing key */ - return 0; - return 1; -} - +static inline int key_in_buffer(struct path *p_s_chk_path, /* Path which should be checked. */ + const struct cpu_key *p_s_key, /* Key which should be checked. */ + struct super_block *p_s_sb /* Super block pointer. 
*/ + ) +{ -inline void decrement_bcount( - struct buffer_head * p_s_bh - ) { - if ( p_s_bh ) { - if ( atomic_read (&(p_s_bh->b_count)) ) { - put_bh(p_s_bh) ; - return; - } - reiserfs_panic(NULL, "PAP-5070: decrement_bcount: trying to free free buffer %b", p_s_bh); - } + RFALSE(!p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET + || p_s_chk_path->path_length > MAX_HEIGHT, + "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)", + p_s_key, p_s_chk_path->path_length); + RFALSE(!PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev, + "PAP-5060: device must not be NODEV"); + + if (comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1) + /* left delimiting key is bigger, that the key we look for */ + return 0; + // if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 ) + if (comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1) + /* p_s_key must be less than right delimitiing key */ + return 0; + return 1; } +inline void decrement_bcount(struct buffer_head *p_s_bh) +{ + if (p_s_bh) { + if (atomic_read(&(p_s_bh->b_count))) { + put_bh(p_s_bh); + return; + } + reiserfs_panic(NULL, + "PAP-5070: decrement_bcount: trying to free free buffer %b", + p_s_bh); + } +} /* Decrement b_count field of the all buffers in the path. 
*/ -void decrement_counters_in_path ( - struct path * p_s_search_path - ) { - int n_path_offset = p_s_search_path->path_length; - - RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET || - n_path_offset > EXTENDED_MAX_HEIGHT - 1, - "PAP-5080: invalid path offset of %d", n_path_offset); - - while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { - struct buffer_head * bh; - - bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--); - decrement_bcount (bh); - } - p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; -} +void decrement_counters_in_path(struct path *p_s_search_path) +{ + int n_path_offset = p_s_search_path->path_length; + + RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET || + n_path_offset > EXTENDED_MAX_HEIGHT - 1, + "PAP-5080: invalid path offset of %d", n_path_offset); + while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { + struct buffer_head *bh; -int reiserfs_check_path(struct path *p) { - RFALSE( p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, - "path not properly relsed") ; - return 0 ; + bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--); + decrement_bcount(bh); + } + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; } +int reiserfs_check_path(struct path *p) +{ + RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET, + "path not properly relsed"); + return 0; +} /* Release all buffers in the path. 
Restore dirty bits clean ** when preparing the buffer for the log ** ** only called from fix_nodes() */ -void pathrelse_and_restore ( - struct super_block *s, - struct path * p_s_search_path - ) { - int n_path_offset = p_s_search_path->path_length; - - RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, - "clm-4000: invalid path offset"); - - while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { - reiserfs_restore_prepared_buffer(s, PATH_OFFSET_PBUFFER(p_s_search_path, - n_path_offset)); - brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); - } - p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +void pathrelse_and_restore(struct super_block *s, struct path *p_s_search_path) +{ + int n_path_offset = p_s_search_path->path_length; + + RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "clm-4000: invalid path offset"); + + while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) { + reiserfs_restore_prepared_buffer(s, + PATH_OFFSET_PBUFFER + (p_s_search_path, + n_path_offset)); + brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); + } + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; } /* Release all buffers in the path. 
*/ -void pathrelse ( - struct path * p_s_search_path - ) { - int n_path_offset = p_s_search_path->path_length; - - RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, - "PAP-5090: invalid path offset"); - - while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) - brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); - - p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; -} +void pathrelse(struct path *p_s_search_path) +{ + int n_path_offset = p_s_search_path->path_length; + RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, + "PAP-5090: invalid path offset"); + while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) + brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); -static int is_leaf (char * buf, int blocksize, struct buffer_head * bh) -{ - struct block_head * blkh; - struct item_head * ih; - int used_space; - int prev_location; - int i; - int nr; - - blkh = (struct block_head *)buf; - if ( blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { - reiserfs_warning (NULL, "is_leaf: this should be caught earlier"); - return 0; - } + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} - nr = blkh_nr_item(blkh); - if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { - /* item number is too big or too small */ - reiserfs_warning (NULL, "is_leaf: nr_item seems wrong: %z", bh); - return 0; - } - ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; - used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location (ih)); - if (used_space != blocksize - blkh_free_space(blkh)) { - /* free space does not match to calculated amount of use space */ - reiserfs_warning (NULL, "is_leaf: free space seems wrong: %z", bh); - return 0; - } - - // FIXME: it is_leaf will hit performance too much - we may have - // return 1 here - - /* check tables of item heads */ - ih = (struct item_head *)(buf + BLKH_SIZE); - prev_location = blocksize; - for (i = 0; i < nr; i ++, ih ++) { - if ( le_ih_k_type(ih) == TYPE_ANY) { - reiserfs_warning 
(NULL, "is_leaf: wrong item type for item %h",ih); - return 0; +static int is_leaf(char *buf, int blocksize, struct buffer_head *bh) +{ + struct block_head *blkh; + struct item_head *ih; + int used_space; + int prev_location; + int i; + int nr; + + blkh = (struct block_head *)buf; + if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) { + reiserfs_warning(NULL, + "is_leaf: this should be caught earlier"); + return 0; } - if (ih_location (ih) >= blocksize || ih_location (ih) < IH_SIZE * nr) { - reiserfs_warning (NULL, "is_leaf: item location seems wrong: %h", ih); - return 0; + + nr = blkh_nr_item(blkh); + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { + /* item number is too big or too small */ + reiserfs_warning(NULL, "is_leaf: nr_item seems wrong: %z", bh); + return 0; } - if (ih_item_len (ih) < 1 || ih_item_len (ih) > MAX_ITEM_LEN (blocksize)) { - reiserfs_warning (NULL, "is_leaf: item length seems wrong: %h", ih); - return 0; + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih)); + if (used_space != blocksize - blkh_free_space(blkh)) { + /* free space does not match to calculated amount of use space */ + reiserfs_warning(NULL, "is_leaf: free space seems wrong: %z", + bh); + return 0; } - if (prev_location - ih_location (ih) != ih_item_len (ih)) { - reiserfs_warning (NULL, "is_leaf: item location seems wrong (second one): %h", ih); - return 0; + // FIXME: it is_leaf will hit performance too much - we may have + // return 1 here + + /* check tables of item heads */ + ih = (struct item_head *)(buf + BLKH_SIZE); + prev_location = blocksize; + for (i = 0; i < nr; i++, ih++) { + if (le_ih_k_type(ih) == TYPE_ANY) { + reiserfs_warning(NULL, + "is_leaf: wrong item type for item %h", + ih); + return 0; + } + if (ih_location(ih) >= blocksize + || ih_location(ih) < IH_SIZE * nr) { + reiserfs_warning(NULL, + "is_leaf: item location seems wrong: %h", + ih); + return 0; + } + if 
(ih_item_len(ih) < 1 + || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) { + reiserfs_warning(NULL, + "is_leaf: item length seems wrong: %h", + ih); + return 0; + } + if (prev_location - ih_location(ih) != ih_item_len(ih)) { + reiserfs_warning(NULL, + "is_leaf: item location seems wrong (second one): %h", + ih); + return 0; + } + prev_location = ih_location(ih); } - prev_location = ih_location (ih); - } - // one may imagine much more checks - return 1; + // one may imagine much more checks + return 1; } - /* returns 1 if buf looks like an internal node, 0 otherwise */ -static int is_internal (char * buf, int blocksize, struct buffer_head * bh) +static int is_internal(char *buf, int blocksize, struct buffer_head *bh) { - struct block_head * blkh; - int nr; - int used_space; - - blkh = (struct block_head *)buf; - nr = blkh_level(blkh); - if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { - /* this level is not possible for internal nodes */ - reiserfs_warning (NULL, "is_internal: this should be caught earlier"); - return 0; - } - - nr = blkh_nr_item(blkh); - if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { - /* for internal which is not root we might check min number of keys */ - reiserfs_warning (NULL, "is_internal: number of key seems wrong: %z", bh); - return 0; - } + struct block_head *blkh; + int nr; + int used_space; + + blkh = (struct block_head *)buf; + nr = blkh_level(blkh); + if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) { + /* this level is not possible for internal nodes */ + reiserfs_warning(NULL, + "is_internal: this should be caught earlier"); + return 0; + } - used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); - if (used_space != blocksize - blkh_free_space(blkh)) { - reiserfs_warning (NULL, "is_internal: free space seems wrong: %z", bh); - return 0; - } + nr = blkh_nr_item(blkh); + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { + /* for internal which is not root we might check min number of keys 
*/ + reiserfs_warning(NULL, + "is_internal: number of key seems wrong: %z", + bh); + return 0; + } - // one may imagine much more checks - return 1; + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); + if (used_space != blocksize - blkh_free_space(blkh)) { + reiserfs_warning(NULL, + "is_internal: free space seems wrong: %z", bh); + return 0; + } + // one may imagine much more checks + return 1; } - // make sure that bh contains formatted node of reiserfs tree of // 'level'-th level -static int is_tree_node (struct buffer_head * bh, int level) +static int is_tree_node(struct buffer_head *bh, int level) { - if (B_LEVEL (bh) != level) { - reiserfs_warning (NULL, "is_tree_node: node level %d does not match to the expected one %d", - B_LEVEL (bh), level); - return 0; - } - if (level == DISK_LEAF_NODE_LEVEL) - return is_leaf (bh->b_data, bh->b_size, bh); + if (B_LEVEL(bh) != level) { + reiserfs_warning(NULL, + "is_tree_node: node level %d does not match to the expected one %d", + B_LEVEL(bh), level); + return 0; + } + if (level == DISK_LEAF_NODE_LEVEL) + return is_leaf(bh->b_data, bh->b_size, bh); - return is_internal (bh->b_data, bh->b_size, bh); + return is_internal(bh->b_data, bh->b_size, bh); } - - #define SEARCH_BY_KEY_READA 16 /* The function is NOT SCHEDULE-SAFE! 
*/ -static void search_by_key_reada (struct super_block * s, - struct buffer_head **bh, - unsigned long *b, int num) +static void search_by_key_reada(struct super_block *s, + struct buffer_head **bh, + unsigned long *b, int num) { - int i,j; - - for (i = 0 ; i < num ; i++) { - bh[i] = sb_getblk (s, b[i]); - } - for (j = 0 ; j < i ; j++) { - /* - * note, this needs attention if we are getting rid of the BKL - * you have to make sure the prepared bit isn't set on this buffer - */ - if (!buffer_uptodate(bh[j])) - ll_rw_block(READA, 1, bh + j); - brelse(bh[j]); - } + int i, j; + + for (i = 0; i < num; i++) { + bh[i] = sb_getblk(s, b[i]); + } + for (j = 0; j < i; j++) { + /* + * note, this needs attention if we are getting rid of the BKL + * you have to make sure the prepared bit isn't set on this buffer + */ + if (!buffer_uptodate(bh[j])) + ll_rw_block(READA, 1, bh + j); + brelse(bh[j]); + } } /************************************************************************** @@ -576,194 +602,200 @@ static void search_by_key_reada (struct super_block * s, correctness of the top of the path but need not be checked for the correctness of the bottom of the path */ /* The function is NOT SCHEDULE-SAFE! */ -int search_by_key (struct super_block * p_s_sb, - const struct cpu_key * p_s_key, /* Key to search. */ - struct path * p_s_search_path, /* This structure was - allocated and initialized - by the calling - function. It is filled up - by this function. */ - int n_stop_level /* How far down the tree to search. 
To - stop at leaf level - set to - DISK_LEAF_NODE_LEVEL */ - ) { - int n_block_number; - int expected_level; - struct buffer_head * p_s_bh; - struct path_element * p_s_last_element; - int n_node_level, n_retval; - int right_neighbor_of_leaf_node; - int fs_gen; - struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; - unsigned long reada_blocks[SEARCH_BY_KEY_READA]; - int reada_count = 0; +int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key, /* Key to search. */ + struct path *p_s_search_path, /* This structure was + allocated and initialized + by the calling + function. It is filled up + by this function. */ + int n_stop_level /* How far down the tree to search. To + stop at leaf level - set to + DISK_LEAF_NODE_LEVEL */ + ) +{ + int n_block_number; + int expected_level; + struct buffer_head *p_s_bh; + struct path_element *p_s_last_element; + int n_node_level, n_retval; + int right_neighbor_of_leaf_node; + int fs_gen; + struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; + unsigned long reada_blocks[SEARCH_BY_KEY_READA]; + int reada_count = 0; #ifdef CONFIG_REISERFS_CHECK - int n_repeat_counter = 0; + int n_repeat_counter = 0; #endif - - PROC_INFO_INC( p_s_sb, search_by_key ); - - /* As we add each node to a path we increase its count. This means that - we must be careful to release all nodes in a path before we either - discard the path struct or re-use the path struct, as we do here. */ - decrement_counters_in_path(p_s_search_path); + PROC_INFO_INC(p_s_sb, search_by_key); + + /* As we add each node to a path we increase its count. This means that + we must be careful to release all nodes in a path before we either + discard the path struct or re-use the path struct, as we do here. */ - right_neighbor_of_leaf_node = 0; + decrement_counters_in_path(p_s_search_path); - /* With each iteration of this loop we search through the items in the - current node, and calculate the next current node(next path element) - for the next iteration of this loop.. 
*/ - n_block_number = SB_ROOT_BLOCK (p_s_sb); - expected_level = -1; - while ( 1 ) { + right_neighbor_of_leaf_node = 0; + + /* With each iteration of this loop we search through the items in the + current node, and calculate the next current node(next path element) + for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK(p_s_sb); + expected_level = -1; + while (1) { #ifdef CONFIG_REISERFS_CHECK - if ( !(++n_repeat_counter % 50000) ) - reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:" - "there were %d iterations of while loop " - "looking for key %K", - current->comm, n_repeat_counter, p_s_key); + if (!(++n_repeat_counter % 50000)) + reiserfs_warning(p_s_sb, "PAP-5100: search_by_key: %s:" + "there were %d iterations of while loop " + "looking for key %K", + current->comm, n_repeat_counter, + p_s_key); #endif - /* prep path to have another element added to it. */ - p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length); - fs_gen = get_generation (p_s_sb); - - /* Read the next tree node, and set the last element in the path to - have a pointer to it. */ - if ((p_s_bh = p_s_last_element->pe_buffer = - sb_getblk(p_s_sb, n_block_number)) ) { - if (!buffer_uptodate(p_s_bh) && reada_count > 1) { - search_by_key_reada (p_s_sb, reada_bh, - reada_blocks, reada_count); - } - ll_rw_block(READ, 1, &p_s_bh); - wait_on_buffer(p_s_bh); - if (!buffer_uptodate(p_s_bh)) - goto io_error; - } else { -io_error: - p_s_search_path->path_length --; - pathrelse(p_s_search_path); - return IO_ERROR; - } - reada_count = 0; - if (expected_level == -1) - expected_level = SB_TREE_HEIGHT (p_s_sb); - expected_level --; - - /* It is possible that schedule occurred. We must check whether the key - to search is still in the tree rooted from the current buffer. If - not then repeat search from the root. 
*/ - if ( fs_changed (fs_gen, p_s_sb) && - (!B_IS_IN_TREE (p_s_bh) || - B_LEVEL(p_s_bh) != expected_level || - !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) { - PROC_INFO_INC( p_s_sb, search_by_key_fs_changed ); - PROC_INFO_INC( p_s_sb, search_by_key_restarted ); - PROC_INFO_INC( p_s_sb, sbk_restarted[ expected_level - 1 ] ); - decrement_counters_in_path(p_s_search_path); - - /* Get the root block number so that we can repeat the search - starting from the root. */ - n_block_number = SB_ROOT_BLOCK (p_s_sb); - expected_level = -1; - right_neighbor_of_leaf_node = 0; - - /* repeat search from the root */ - continue; - } + /* prep path to have another element added to it. */ + p_s_last_element = + PATH_OFFSET_PELEMENT(p_s_search_path, + ++p_s_search_path->path_length); + fs_gen = get_generation(p_s_sb); + + /* Read the next tree node, and set the last element in the path to + have a pointer to it. */ + if ((p_s_bh = p_s_last_element->pe_buffer = + sb_getblk(p_s_sb, n_block_number))) { + if (!buffer_uptodate(p_s_bh) && reada_count > 1) { + search_by_key_reada(p_s_sb, reada_bh, + reada_blocks, reada_count); + } + ll_rw_block(READ, 1, &p_s_bh); + wait_on_buffer(p_s_bh); + if (!buffer_uptodate(p_s_bh)) + goto io_error; + } else { + io_error: + p_s_search_path->path_length--; + pathrelse(p_s_search_path); + return IO_ERROR; + } + reada_count = 0; + if (expected_level == -1) + expected_level = SB_TREE_HEIGHT(p_s_sb); + expected_level--; + + /* It is possible that schedule occurred. We must check whether the key + to search is still in the tree rooted from the current buffer. If + not then repeat search from the root. 
*/ + if (fs_changed(fs_gen, p_s_sb) && + (!B_IS_IN_TREE(p_s_bh) || + B_LEVEL(p_s_bh) != expected_level || + !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) { + PROC_INFO_INC(p_s_sb, search_by_key_fs_changed); + PROC_INFO_INC(p_s_sb, search_by_key_restarted); + PROC_INFO_INC(p_s_sb, + sbk_restarted[expected_level - 1]); + decrement_counters_in_path(p_s_search_path); + + /* Get the root block number so that we can repeat the search + starting from the root. */ + n_block_number = SB_ROOT_BLOCK(p_s_sb); + expected_level = -1; + right_neighbor_of_leaf_node = 0; + + /* repeat search from the root */ + continue; + } - /* only check that the key is in the buffer if p_s_key is not - equal to the MAX_KEY. Latter case is only possible in - "finish_unfinished()" processing during mount. */ - RFALSE( comp_keys( &MAX_KEY, p_s_key ) && - ! key_in_buffer(p_s_search_path, p_s_key, p_s_sb), - "PAP-5130: key is not in the buffer"); + /* only check that the key is in the buffer if p_s_key is not + equal to the MAX_KEY. Latter case is only possible in + "finish_unfinished()" processing during mount. */ + RFALSE(comp_keys(&MAX_KEY, p_s_key) && + !key_in_buffer(p_s_search_path, p_s_key, p_s_sb), + "PAP-5130: key is not in the buffer"); #ifdef CONFIG_REISERFS_CHECK - if ( cur_tb ) { - print_cur_tb ("5140"); - reiserfs_panic(p_s_sb, "PAP-5140: search_by_key: schedule occurred in do_balance!"); - } + if (cur_tb) { + print_cur_tb("5140"); + reiserfs_panic(p_s_sb, + "PAP-5140: search_by_key: schedule occurred in do_balance!"); + } #endif - // make sure, that the node contents look like a node of - // certain level - if (!is_tree_node (p_s_bh, expected_level)) { - reiserfs_warning (p_s_sb, "vs-5150: search_by_key: " - "invalid format found in block %ld. 
Fsck?", - p_s_bh->b_blocknr); - pathrelse (p_s_search_path); - return IO_ERROR; - } - - /* ok, we have acquired next formatted node in the tree */ - n_node_level = B_LEVEL (p_s_bh); - - PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 ); - - RFALSE( n_node_level < n_stop_level, - "vs-5152: tree level (%d) is less than stop level (%d)", - n_node_level, n_stop_level); - - n_retval = bin_search( p_s_key, B_N_PITEM_HEAD(p_s_bh, 0), - B_NR_ITEMS(p_s_bh), - ( n_node_level == DISK_LEAF_NODE_LEVEL ) ? IH_SIZE : KEY_SIZE, - &(p_s_last_element->pe_position)); - if (n_node_level == n_stop_level) { - return n_retval; - } + // make sure, that the node contents look like a node of + // certain level + if (!is_tree_node(p_s_bh, expected_level)) { + reiserfs_warning(p_s_sb, "vs-5150: search_by_key: " + "invalid format found in block %ld. Fsck?", + p_s_bh->b_blocknr); + pathrelse(p_s_search_path); + return IO_ERROR; + } - /* we are not in the stop level */ - if (n_retval == ITEM_FOUND) - /* item has been found, so we choose the pointer which is to the right of the found one */ - p_s_last_element->pe_position++; + /* ok, we have acquired next formatted node in the tree */ + n_node_level = B_LEVEL(p_s_bh); - /* if item was not found we choose the position which is to - the left of the found item. This requires no code, - bin_search did it already.*/ + PROC_INFO_BH_STAT(p_s_sb, p_s_bh, n_node_level - 1); - /* So we have chosen a position in the current node which is - an internal node. Now we calculate child block number by - position in the node. 
*/ - n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position); + RFALSE(n_node_level < n_stop_level, + "vs-5152: tree level (%d) is less than stop level (%d)", + n_node_level, n_stop_level); - /* if we are going to read leaf nodes, try for read ahead as well */ - if ((p_s_search_path->reada & PATH_READA) && - n_node_level == DISK_LEAF_NODE_LEVEL + 1) - { - int pos = p_s_last_element->pe_position; - int limit = B_NR_ITEMS(p_s_bh); - struct reiserfs_key *le_key; - - if (p_s_search_path->reada & PATH_READA_BACK) - limit = 0; - while(reada_count < SEARCH_BY_KEY_READA) { - if (pos == limit) - break; - reada_blocks[reada_count++] = B_N_CHILD_NUM(p_s_bh, pos); - if (p_s_search_path->reada & PATH_READA_BACK) - pos--; - else - pos++; + n_retval = bin_search(p_s_key, B_N_PITEM_HEAD(p_s_bh, 0), + B_NR_ITEMS(p_s_bh), + (n_node_level == + DISK_LEAF_NODE_LEVEL) ? IH_SIZE : + KEY_SIZE, + &(p_s_last_element->pe_position)); + if (n_node_level == n_stop_level) { + return n_retval; + } - /* - * check to make sure we're in the same object - */ - le_key = B_N_PDELIM_KEY(p_s_bh, pos); - if (le32_to_cpu(le_key->k_objectid) != - p_s_key->on_disk_key.k_objectid) - { - break; + /* we are not in the stop level */ + if (n_retval == ITEM_FOUND) + /* item has been found, so we choose the pointer which is to the right of the found one */ + p_s_last_element->pe_position++; + + /* if item was not found we choose the position which is to + the left of the found item. This requires no code, + bin_search did it already. */ + + /* So we have chosen a position in the current node which is + an internal node. Now we calculate child block number by + position in the node. 
*/ + n_block_number = + B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position); + + /* if we are going to read leaf nodes, try for read ahead as well */ + if ((p_s_search_path->reada & PATH_READA) && + n_node_level == DISK_LEAF_NODE_LEVEL + 1) { + int pos = p_s_last_element->pe_position; + int limit = B_NR_ITEMS(p_s_bh); + struct reiserfs_key *le_key; + + if (p_s_search_path->reada & PATH_READA_BACK) + limit = 0; + while (reada_count < SEARCH_BY_KEY_READA) { + if (pos == limit) + break; + reada_blocks[reada_count++] = + B_N_CHILD_NUM(p_s_bh, pos); + if (p_s_search_path->reada & PATH_READA_BACK) + pos--; + else + pos++; + + /* + * check to make sure we're in the same object + */ + le_key = B_N_PDELIM_KEY(p_s_bh, pos); + if (le32_to_cpu(le_key->k_objectid) != + p_s_key->on_disk_key.k_objectid) { + break; + } + } } - } - } - } + } } - /* Form the path to an item and position in this item which contains file byte defined by p_s_key. If there is no such item corresponding to the key, we point the path to the item with @@ -780,94 +812,97 @@ io_error: units of directory entries. */ /* The function is NOT SCHEDULE-SAFE! */ -int search_for_position_by_key (struct super_block * p_s_sb, /* Pointer to the super block. */ - const struct cpu_key * p_cpu_key, /* Key to search (cpu variable) */ - struct path * p_s_search_path /* Filled up by this function. */ - ) { - struct item_head * p_le_ih; /* pointer to on-disk structure */ - int n_blk_size; - loff_t item_offset, offset; - struct reiserfs_dir_entry de; - int retval; - - /* If searching for directory entry. */ - if ( is_direntry_cpu_key (p_cpu_key) ) - return search_by_entry_key (p_s_sb, p_cpu_key, p_s_search_path, &de); - - /* If not searching for directory entry. */ - - /* If item is found. */ - retval = search_item (p_s_sb, p_cpu_key, p_s_search_path); - if (retval == IO_ERROR) - return retval; - if ( retval == ITEM_FOUND ) { - - RFALSE( ! 
ih_item_len( - B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), - PATH_LAST_POSITION(p_s_search_path))), - "PAP-5165: item length equals zero"); +int search_for_position_by_key(struct super_block *p_s_sb, /* Pointer to the super block. */ + const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */ + struct path *p_s_search_path /* Filled up by this function. */ + ) +{ + struct item_head *p_le_ih; /* pointer to on-disk structure */ + int n_blk_size; + loff_t item_offset, offset; + struct reiserfs_dir_entry de; + int retval; + + /* If searching for directory entry. */ + if (is_direntry_cpu_key(p_cpu_key)) + return search_by_entry_key(p_s_sb, p_cpu_key, p_s_search_path, + &de); + + /* If not searching for directory entry. */ + + /* If item is found. */ + retval = search_item(p_s_sb, p_cpu_key, p_s_search_path); + if (retval == IO_ERROR) + return retval; + if (retval == ITEM_FOUND) { - pos_in_item(p_s_search_path) = 0; - return POSITION_FOUND; - } + RFALSE(!ih_item_len + (B_N_PITEM_HEAD + (PATH_PLAST_BUFFER(p_s_search_path), + PATH_LAST_POSITION(p_s_search_path))), + "PAP-5165: item length equals zero"); - RFALSE( ! PATH_LAST_POSITION(p_s_search_path), - "PAP-5170: position equals zero"); + pos_in_item(p_s_search_path) = 0; + return POSITION_FOUND; + } - /* Item is not found. Set path to the previous item. */ - p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path)); - n_blk_size = p_s_sb->s_blocksize; + RFALSE(!PATH_LAST_POSITION(p_s_search_path), + "PAP-5170: position equals zero"); - if (comp_short_keys (&(p_le_ih->ih_key), p_cpu_key)) { - return FILE_NOT_FOUND; - } + /* Item is not found. Set path to the previous item. 
*/ + p_le_ih = + B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), + --PATH_LAST_POSITION(p_s_search_path)); + n_blk_size = p_s_sb->s_blocksize; - // FIXME: quite ugly this far + if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) { + return FILE_NOT_FOUND; + } + // FIXME: quite ugly this far - item_offset = le_ih_k_offset (p_le_ih); - offset = cpu_key_k_offset (p_cpu_key); + item_offset = le_ih_k_offset(p_le_ih); + offset = cpu_key_k_offset(p_cpu_key); - /* Needed byte is contained in the item pointed to by the path.*/ - if (item_offset <= offset && - item_offset + op_bytes_number (p_le_ih, n_blk_size) > offset) { - pos_in_item (p_s_search_path) = offset - item_offset; - if ( is_indirect_le_ih(p_le_ih) ) { - pos_in_item (p_s_search_path) /= n_blk_size; + /* Needed byte is contained in the item pointed to by the path. */ + if (item_offset <= offset && + item_offset + op_bytes_number(p_le_ih, n_blk_size) > offset) { + pos_in_item(p_s_search_path) = offset - item_offset; + if (is_indirect_le_ih(p_le_ih)) { + pos_in_item(p_s_search_path) /= n_blk_size; + } + return POSITION_FOUND; } - return POSITION_FOUND; - } - - /* Needed byte is not contained in the item pointed to by the - path. Set pos_in_item out of the item. */ - if ( is_indirect_le_ih (p_le_ih) ) - pos_in_item (p_s_search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE; - else - pos_in_item (p_s_search_path) = ih_item_len( p_le_ih ); - - return POSITION_NOT_FOUND; -} + /* Needed byte is not contained in the item pointed to by the + path. Set pos_in_item out of the item. */ + if (is_indirect_le_ih(p_le_ih)) + pos_in_item(p_s_search_path) = + ih_item_len(p_le_ih) / UNFM_P_SIZE; + else + pos_in_item(p_s_search_path) = ih_item_len(p_le_ih); + + return POSITION_NOT_FOUND; +} /* Compare given item and item pointed to by the path. 
*/ -int comp_items (const struct item_head * stored_ih, const struct path * p_s_path) +int comp_items(const struct item_head *stored_ih, const struct path *p_s_path) { - struct buffer_head * p_s_bh; - struct item_head * ih; + struct buffer_head *p_s_bh; + struct item_head *ih; - /* Last buffer at the path is not in the tree. */ - if ( ! B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)) ) - return 1; + /* Last buffer at the path is not in the tree. */ + if (!B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path))) + return 1; - /* Last path position is invalid. */ - if ( PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh) ) - return 1; + /* Last path position is invalid. */ + if (PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh)) + return 1; - /* we need only to know, whether it is the same item */ - ih = get_ih (p_s_path); - return memcmp (stored_ih, ih, IH_SIZE); + /* we need only to know, whether it is the same item */ + ih = get_ih(p_s_path); + return memcmp(stored_ih, ih, IH_SIZE); } - /* unformatted nodes are not logged anymore, ever. 
This is safe ** now */ @@ -876,461 +911,466 @@ int comp_items (const struct item_head * stored_ih, const struct path * p_s_path // block can not be forgotten as it is in I/O or held by someone #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) - - // prepare for delete or cut of direct item -static inline int prepare_for_direct_item (struct path * path, - struct item_head * le_ih, - struct inode * inode, - loff_t new_file_length, - int * cut_size) +static inline int prepare_for_direct_item(struct path *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, int *cut_size) { - loff_t round_len; - - - if ( new_file_length == max_reiserfs_offset (inode) ) { - /* item has to be deleted */ - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; - } - - // new file gets truncated - if (get_inode_item_key_version (inode) == KEY_FORMAT_3_6) { - // - round_len = ROUND_UP (new_file_length); - /* this was n_new_file_length < le_ih ... */ - if ( round_len < le_ih_k_offset (le_ih) ) { - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; /* Delete this item. */ + loff_t round_len; + + if (new_file_length == max_reiserfs_offset(inode)) { + /* item has to be deleted */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + // new file gets truncated + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) { + // + round_len = ROUND_UP(new_file_length); + /* this was n_new_file_length < le_ih ... */ + if (round_len < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1); + *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); + + return M_CUT; /* Cut from this item. 
*/ + } + + // old file: items may have any length + + if (new_file_length < le_ih_k_offset(le_ih)) { + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete this item. */ } /* Calculate first position and size for cutting from item. */ - pos_in_item (path) = round_len - (le_ih_k_offset (le_ih) - 1); - *cut_size = -(ih_item_len(le_ih) - pos_in_item(path)); - - return M_CUT; /* Cut from this item. */ - } - - - // old file: items may have any length - - if ( new_file_length < le_ih_k_offset (le_ih) ) { - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; /* Delete this item. */ - } - /* Calculate first position and size for cutting from item. */ - *cut_size = -(ih_item_len(le_ih) - - (pos_in_item (path) = new_file_length + 1 - le_ih_k_offset (le_ih))); - return M_CUT; /* Cut from this item. */ + *cut_size = -(ih_item_len(le_ih) - + (pos_in_item(path) = + new_file_length + 1 - le_ih_k_offset(le_ih))); + return M_CUT; /* Cut from this item. */ } - -static inline int prepare_for_direntry_item (struct path * path, - struct item_head * le_ih, - struct inode * inode, - loff_t new_file_length, - int * cut_size) +static inline int prepare_for_direntry_item(struct path *path, + struct item_head *le_ih, + struct inode *inode, + loff_t new_file_length, + int *cut_size) { - if (le_ih_k_offset (le_ih) == DOT_OFFSET && - new_file_length == max_reiserfs_offset (inode)) { - RFALSE( ih_entry_count (le_ih) != 2, - "PAP-5220: incorrect empty directory item (%h)", le_ih); - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ - } - - if ( ih_entry_count (le_ih) == 1 ) { - /* Delete the directory item such as there is one record only - in this item*/ - *cut_size = -(IH_SIZE + ih_item_len(le_ih)); - return M_DELETE; - } - - /* Cut one record from the directory item. 
*/ - *cut_size = -(DEH_SIZE + entry_length (get_last_bh (path), le_ih, pos_in_item (path))); - return M_CUT; -} + if (le_ih_k_offset(le_ih) == DOT_OFFSET && + new_file_length == max_reiserfs_offset(inode)) { + RFALSE(ih_entry_count(le_ih) != 2, + "PAP-5220: incorrect empty directory item (%h)", le_ih); + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ + } + if (ih_entry_count(le_ih) == 1) { + /* Delete the directory item such as there is one record only + in this item */ + *cut_size = -(IH_SIZE + ih_item_len(le_ih)); + return M_DELETE; + } + + /* Cut one record from the directory item. */ + *cut_size = + -(DEH_SIZE + + entry_length(get_last_bh(path), le_ih, pos_in_item(path))); + return M_CUT; +} /* If the path points to a directory or direct item, calculate mode and the size cut, for balance. If the path points to an indirect item, remove some number of its unformatted nodes. In case of file truncate calculate whether this item must be deleted/truncated or last unformatted node of this item will be converted to a direct item. This function returns a determination of what balance mode the calling function should employ. */ -static char prepare_for_delete_or_cut( - struct reiserfs_transaction_handle *th, - struct inode * inode, - struct path * p_s_path, - const struct cpu_key * p_s_item_key, - int * p_n_removed, /* Number of unformatted nodes which were removed - from end of the file. */ - int * p_n_cut_size, - unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ - ) { - struct super_block * p_s_sb = inode->i_sb; - struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_path); - struct buffer_head * p_s_bh = PATH_PLAST_BUFFER(p_s_path); - - BUG_ON (!th->t_trans_id); - - /* Stat_data item. 
*/ - if ( is_statdata_le_ih (p_le_ih) ) { - - RFALSE( n_new_file_length != max_reiserfs_offset (inode), - "PAP-5210: mode must be M_DELETE"); - - *p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); - return M_DELETE; - } - - - /* Directory item. */ - if ( is_direntry_le_ih (p_le_ih) ) - return prepare_for_direntry_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); - - /* Direct item. */ - if ( is_direct_le_ih (p_le_ih) ) - return prepare_for_direct_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); - - - /* Case of an indirect item. */ - { - int n_unfm_number, /* Number of the item unformatted nodes. */ - n_counter, - n_blk_size; - __le32 * p_n_unfm_pointer; /* Pointer to the unformatted node number. */ - __u32 tmp; - struct item_head s_ih; /* Item header. */ - char c_mode; /* Returned mode of the balance. */ - int need_research; +static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct path *p_s_path, const struct cpu_key *p_s_item_key, int *p_n_removed, /* Number of unformatted nodes which were removed + from end of the file. */ + int *p_n_cut_size, unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ + ) +{ + struct super_block *p_s_sb = inode->i_sb; + struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_path); + struct buffer_head *p_s_bh = PATH_PLAST_BUFFER(p_s_path); + BUG_ON(!th->t_trans_id); - n_blk_size = p_s_sb->s_blocksize; + /* Stat_data item. */ + if (is_statdata_le_ih(p_le_ih)) { - /* Search for the needed object indirect item until there are no unformatted nodes to be removed. */ - do { - need_research = 0; - p_s_bh = PATH_PLAST_BUFFER(p_s_path); - /* Copy indirect item header to a temp variable. */ - copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); - /* Calculate number of unformatted nodes in this item. */ - n_unfm_number = I_UNFM_NUM(&s_ih); - - RFALSE( ! is_indirect_le_ih(&s_ih) || ! 
n_unfm_number || - pos_in_item (p_s_path) + 1 != n_unfm_number, - "PAP-5240: invalid item %h " - "n_unfm_number = %d *p_n_pos_in_item = %d", - &s_ih, n_unfm_number, pos_in_item (p_s_path)); - - /* Calculate balance mode and position in the item to remove unformatted nodes. */ - if ( n_new_file_length == max_reiserfs_offset (inode) ) {/* Case of delete. */ - pos_in_item (p_s_path) = 0; - *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih)); - c_mode = M_DELETE; - } - else { /* Case of truncate. */ - if ( n_new_file_length < le_ih_k_offset (&s_ih) ) { - pos_in_item (p_s_path) = 0; - *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih)); - c_mode = M_DELETE; /* Delete this item. */ - } - else { - /* indirect item must be truncated starting from *p_n_pos_in_item-th position */ - pos_in_item (p_s_path) = (n_new_file_length + n_blk_size - le_ih_k_offset (&s_ih) ) >> p_s_sb->s_blocksize_bits; - - RFALSE( pos_in_item (p_s_path) > n_unfm_number, - "PAP-5250: invalid position in the item"); - - /* Either convert last unformatted node of indirect item to direct item or increase - its free space. */ - if ( pos_in_item (p_s_path) == n_unfm_number ) { - *p_n_cut_size = 0; /* Nothing to cut. */ - return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */ - } - /* Calculate size to cut. */ - *p_n_cut_size = -(ih_item_len(&s_ih) - pos_in_item(p_s_path) * UNFM_P_SIZE); - - c_mode = M_CUT; /* Cut from this indirect item. */ - } - } - - RFALSE( n_unfm_number <= pos_in_item (p_s_path), - "PAP-5260: invalid position in the indirect item"); - - /* pointers to be cut */ - n_unfm_number -= pos_in_item (p_s_path); - /* Set pointer to the last unformatted node pointer that is to be cut. */ - p_n_unfm_pointer = (__le32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1 - *p_n_removed; - - - /* We go through the unformatted nodes pointers of the indirect - item and look for the unformatted nodes in the cache. 
If we - found some of them we free it, zero corresponding indirect item - entry and log buffer containing that indirect item. For this we - need to prepare last path element for logging. If some - unformatted node has b_count > 1 we must not free this - unformatted node since it is in use. */ - reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1); - // note: path could be changed, first line in for loop takes care - // of it - - for (n_counter = *p_n_removed; - n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { - - cond_resched(); - if (item_moved (&s_ih, p_s_path)) { - need_research = 1 ; - break; - } - RFALSE( p_n_unfm_pointer < (__le32 *)B_I_PITEM(p_s_bh, &s_ih) || - p_n_unfm_pointer > (__le32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1, - "vs-5265: pointer out of range"); + RFALSE(n_new_file_length != max_reiserfs_offset(inode), + "PAP-5210: mode must be M_DELETE"); - /* Hole, nothing to remove. */ - if ( ! get_block_num(p_n_unfm_pointer,0) ) { - (*p_n_removed)++; - continue; - } + *p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih)); + return M_DELETE; + } - (*p_n_removed)++; + /* Directory item. */ + if (is_direntry_le_ih(p_le_ih)) + return prepare_for_direntry_item(p_s_path, p_le_ih, inode, + n_new_file_length, + p_n_cut_size); - tmp = get_block_num(p_n_unfm_pointer,0); - put_block_num(p_n_unfm_pointer, 0, 0); - journal_mark_dirty (th, p_s_sb, p_s_bh); - reiserfs_free_block(th, inode, tmp, 1); - if ( item_moved (&s_ih, p_s_path) ) { - need_research = 1; - break ; - } - } - - /* a trick. If the buffer has been logged, this - ** will do nothing. If we've broken the loop without - ** logging it, it will restore the buffer - ** - */ - reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh); - - /* This loop can be optimized. 
*/ - } while ( (*p_n_removed < n_unfm_number || need_research) && - search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND ); - - RFALSE( *p_n_removed < n_unfm_number, - "PAP-5310: indirect item is not found"); - RFALSE( item_moved (&s_ih, p_s_path), - "after while, comp failed, retry") ; - - if (c_mode == M_CUT) - pos_in_item (p_s_path) *= UNFM_P_SIZE; - return c_mode; - } + /* Direct item. */ + if (is_direct_le_ih(p_le_ih)) + return prepare_for_direct_item(p_s_path, p_le_ih, inode, + n_new_file_length, p_n_cut_size); + + /* Case of an indirect item. */ + { + int n_unfm_number, /* Number of the item unformatted nodes. */ + n_counter, n_blk_size; + __le32 *p_n_unfm_pointer; /* Pointer to the unformatted node number. */ + __u32 tmp; + struct item_head s_ih; /* Item header. */ + char c_mode; /* Returned mode of the balance. */ + int need_research; + + n_blk_size = p_s_sb->s_blocksize; + + /* Search for the needed object indirect item until there are no unformatted nodes to be removed. */ + do { + need_research = 0; + p_s_bh = PATH_PLAST_BUFFER(p_s_path); + /* Copy indirect item header to a temp variable. */ + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + /* Calculate number of unformatted nodes in this item. */ + n_unfm_number = I_UNFM_NUM(&s_ih); + + RFALSE(!is_indirect_le_ih(&s_ih) || !n_unfm_number || + pos_in_item(p_s_path) + 1 != n_unfm_number, + "PAP-5240: invalid item %h " + "n_unfm_number = %d *p_n_pos_in_item = %d", + &s_ih, n_unfm_number, pos_in_item(p_s_path)); + + /* Calculate balance mode and position in the item to remove unformatted nodes. */ + if (n_new_file_length == max_reiserfs_offset(inode)) { /* Case of delete. */ + pos_in_item(p_s_path) = 0; + *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih)); + c_mode = M_DELETE; + } else { /* Case of truncate. */ + if (n_new_file_length < le_ih_k_offset(&s_ih)) { + pos_in_item(p_s_path) = 0; + *p_n_cut_size = + -(IH_SIZE + ih_item_len(&s_ih)); + c_mode = M_DELETE; /* Delete this item. 
*/ + } else { + /* indirect item must be truncated starting from *p_n_pos_in_item-th position */ + pos_in_item(p_s_path) = + (n_new_file_length + n_blk_size - + le_ih_k_offset(&s_ih)) >> p_s_sb-> + s_blocksize_bits; + + RFALSE(pos_in_item(p_s_path) > + n_unfm_number, + "PAP-5250: invalid position in the item"); + + /* Either convert last unformatted node of indirect item to direct item or increase + its free space. */ + if (pos_in_item(p_s_path) == + n_unfm_number) { + *p_n_cut_size = 0; /* Nothing to cut. */ + return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */ + } + /* Calculate size to cut. */ + *p_n_cut_size = + -(ih_item_len(&s_ih) - + pos_in_item(p_s_path) * + UNFM_P_SIZE); + + c_mode = M_CUT; /* Cut from this indirect item. */ + } + } + + RFALSE(n_unfm_number <= pos_in_item(p_s_path), + "PAP-5260: invalid position in the indirect item"); + + /* pointers to be cut */ + n_unfm_number -= pos_in_item(p_s_path); + /* Set pointer to the last unformatted node pointer that is to be cut. */ + p_n_unfm_pointer = + (__le32 *) B_I_PITEM(p_s_bh, + &s_ih) + I_UNFM_NUM(&s_ih) - + 1 - *p_n_removed; + + /* We go through the unformatted nodes pointers of the indirect + item and look for the unformatted nodes in the cache. If we + found some of them we free it, zero corresponding indirect item + entry and log buffer containing that indirect item. For this we + need to prepare last path element for logging. If some + unformatted node has b_count > 1 we must not free this + unformatted node since it is in use. 
*/ + reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1); + // note: path could be changed, first line in for loop takes care + // of it + + for (n_counter = *p_n_removed; + n_counter < n_unfm_number; + n_counter++, p_n_unfm_pointer--) { + + cond_resched(); + if (item_moved(&s_ih, p_s_path)) { + need_research = 1; + break; + } + RFALSE(p_n_unfm_pointer < + (__le32 *) B_I_PITEM(p_s_bh, &s_ih) + || p_n_unfm_pointer > + (__le32 *) B_I_PITEM(p_s_bh, + &s_ih) + + I_UNFM_NUM(&s_ih) - 1, + "vs-5265: pointer out of range"); + + /* Hole, nothing to remove. */ + if (!get_block_num(p_n_unfm_pointer, 0)) { + (*p_n_removed)++; + continue; + } + + (*p_n_removed)++; + + tmp = get_block_num(p_n_unfm_pointer, 0); + put_block_num(p_n_unfm_pointer, 0, 0); + journal_mark_dirty(th, p_s_sb, p_s_bh); + reiserfs_free_block(th, inode, tmp, 1); + if (item_moved(&s_ih, p_s_path)) { + need_research = 1; + break; + } + } + + /* a trick. If the buffer has been logged, this + ** will do nothing. If we've broken the loop without + ** logging it, it will restore the buffer + ** + */ + reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh); + + /* This loop can be optimized. 
*/ + } while ((*p_n_removed < n_unfm_number || need_research) && + search_for_position_by_key(p_s_sb, p_s_item_key, + p_s_path) == + POSITION_FOUND); + + RFALSE(*p_n_removed < n_unfm_number, + "PAP-5310: indirect item is not found"); + RFALSE(item_moved(&s_ih, p_s_path), + "after while, comp failed, retry"); + + if (c_mode == M_CUT) + pos_in_item(p_s_path) *= UNFM_P_SIZE; + return c_mode; + } } /* Calculate number of bytes which will be deleted or cut during balance */ -static int calc_deleted_bytes_number( - struct tree_balance * p_s_tb, - char c_mode - ) { - int n_del_size; - struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path); - - if ( is_statdata_le_ih (p_le_ih) ) - return 0; +static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode) +{ + int n_del_size; + struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path); + + if (is_statdata_le_ih(p_le_ih)) + return 0; + + n_del_size = + (c_mode == + M_DELETE) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; + if (is_direntry_le_ih(p_le_ih)) { + // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */ + // we can't use EMPTY_DIR_SIZE, as old format dirs have a different + // empty size. ick. FIXME, is this right? + // + return n_del_size; + } - n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; - if ( is_direntry_le_ih (p_le_ih) ) { - // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */ - // we can't use EMPTY_DIR_SIZE, as old format dirs have a different - // empty size. ick. FIXME, is this right? 
- // - return n_del_size ; - } - - if ( is_indirect_le_ih (p_le_ih) ) - n_del_size = (n_del_size/UNFM_P_SIZE)* - (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);// - get_ih_free_space (p_le_ih); - return n_del_size; + if (is_indirect_le_ih(p_le_ih)) + n_del_size = (n_del_size / UNFM_P_SIZE) * (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size); // - get_ih_free_space (p_le_ih); + return n_del_size; } -static void init_tb_struct( - struct reiserfs_transaction_handle *th, - struct tree_balance * p_s_tb, - struct super_block * p_s_sb, - struct path * p_s_path, - int n_size - ) { - - BUG_ON (!th->t_trans_id); - - memset (p_s_tb,'\0',sizeof(struct tree_balance)); - p_s_tb->transaction_handle = th ; - p_s_tb->tb_sb = p_s_sb; - p_s_tb->tb_path = p_s_path; - PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; - PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; - p_s_tb->insert_size[0] = n_size; -} +static void init_tb_struct(struct reiserfs_transaction_handle *th, + struct tree_balance *p_s_tb, + struct super_block *p_s_sb, + struct path *p_s_path, int n_size) +{ + BUG_ON(!th->t_trans_id); + memset(p_s_tb, '\0', sizeof(struct tree_balance)); + p_s_tb->transaction_handle = th; + p_s_tb->tb_sb = p_s_sb; + p_s_tb->tb_path = p_s_path; + PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + p_s_tb->insert_size[0] = n_size; +} -void padd_item (char * item, int total_length, int length) +void padd_item(char *item, int total_length, int length) { - int i; + int i; - for (i = total_length; i > length; ) - item [--i] = 0; + for (i = total_length; i > length;) + item[--i] = 0; } #ifdef REISERQUOTA_DEBUG char key2type(struct reiserfs_key *ih) { - if (is_direntry_le_key(2, ih)) - return 'd'; - if (is_direct_le_key(2, ih)) - return 'D'; - if (is_indirect_le_key(2, ih)) - return 'i'; - if (is_statdata_le_key(2, ih)) - return 's'; - return 'u'; + if (is_direntry_le_key(2, ih)) + return 
'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; } char head2type(struct item_head *ih) { - if (is_direntry_le_ih(ih)) - return 'd'; - if (is_direct_le_ih(ih)) - return 'D'; - if (is_indirect_le_ih(ih)) - return 'i'; - if (is_statdata_le_ih(ih)) - return 's'; - return 'u'; + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; } #endif /* Delete object item. */ -int reiserfs_delete_item (struct reiserfs_transaction_handle *th, - struct path * p_s_path, /* Path to the deleted item. */ - const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. */ - struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */ - struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */ -{ - struct super_block * p_s_sb = p_s_inode->i_sb; - struct tree_balance s_del_balance; - struct item_head s_ih; - struct item_head *q_ih; - int quota_cut_bytes; - int n_ret_value, - n_del_size, - n_removed; +int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct path *p_s_path, /* Path to the deleted item. */ + const struct cpu_key *p_s_item_key, /* Key to search for the deleted item. */ + struct inode *p_s_inode, /* inode is here just to update i_blocks and quotas */ + struct buffer_head *p_s_un_bh) +{ /* NULL or unformatted node pointer. 
*/ + struct super_block *p_s_sb = p_s_inode->i_sb; + struct tree_balance s_del_balance; + struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; + int n_ret_value, n_del_size, n_removed; #ifdef CONFIG_REISERFS_CHECK - char c_mode; - int n_iter = 0; + char c_mode; + int n_iter = 0; #endif - BUG_ON (!th->t_trans_id); + BUG_ON(!th->t_trans_id); - init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 0/*size is unknown*/); + init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, + 0 /*size is unknown */ ); - while ( 1 ) { - n_removed = 0; + while (1) { + n_removed = 0; #ifdef CONFIG_REISERFS_CHECK - n_iter++; - c_mode = + n_iter++; + c_mode = #endif - prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, max_reiserfs_offset (p_s_inode)); - - RFALSE( c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); - - copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); - s_del_balance.insert_size[0] = n_del_size; - - n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); - if ( n_ret_value != REPEAT_SEARCH ) - break; - - PROC_INFO_INC( p_s_sb, delete_item_restarted ); + prepare_for_delete_or_cut(th, p_s_inode, p_s_path, + p_s_item_key, &n_removed, + &n_del_size, + max_reiserfs_offset(p_s_inode)); + + RFALSE(c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE"); + + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + s_del_balance.insert_size[0] = n_del_size; + + n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL); + if (n_ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(p_s_sb, delete_item_restarted); + + // file system changed, repeat search + n_ret_value = + search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + if (n_ret_value == IO_ERROR) + break; + if (n_ret_value == FILE_NOT_FOUND) { + reiserfs_warning(p_s_sb, + "vs-5340: reiserfs_delete_item: " + "no items of the file %K found", + p_s_item_key); + break; + } + } /* while (1) */ - // file system changed, repeat search - n_ret_value = 
search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); - if (n_ret_value == IO_ERROR) - break; - if (n_ret_value == FILE_NOT_FOUND) { - reiserfs_warning (p_s_sb, "vs-5340: reiserfs_delete_item: " - "no items of the file %K found", p_s_item_key); - break; + if (n_ret_value != CARRY_ON) { + unfix_nodes(&s_del_balance); + return 0; + } + // reiserfs_delete_item returns item length when success + n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = get_ih(p_s_path); + quota_cut_bytes = ih_item_len(q_ih); + + /* hack so the quota code doesn't have to guess if the file + ** has a tail. On tail insert, we allocate quota for 1 unformatted node. + ** We test the offset because the tail might have been + ** split into multiple items, and we only want to decrement for + ** the unfm node once + */ + if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } } - } /* while (1) */ - if ( n_ret_value != CARRY_ON ) { - unfix_nodes(&s_del_balance); - return 0; - } - - // reiserfs_delete_item returns item length when success - n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); - q_ih = get_ih(p_s_path) ; - quota_cut_bytes = ih_item_len(q_ih) ; - - /* hack so the quota code doesn't have to guess if the file - ** has a tail. On tail insert, we allocate quota for 1 unformatted node. 
- ** We test the offset because the tail might have been - ** split into multiple items, and we only want to decrement for - ** the unfm node once - */ - if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { - if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) { - quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; - } else { - quota_cut_bytes = 0 ; + if (p_s_un_bh) { + int off; + char *data; + + /* We are in direct2indirect conversion, so move tail contents + to the unformatted node */ + /* note, we do the copy before preparing the buffer because we + ** don't care about the contents of the unformatted node yet. + ** the only thing we really care about is the direct item's data + ** is in the unformatted node. + ** + ** Otherwise, we would have to call reiserfs_prepare_for_journal on + ** the unformatted node, which might schedule, meaning we'd have to + ** loop all the way back up to the start of the while loop. + ** + ** The unformatted node must be dirtied later on. We can't be + ** sure here if the entire tail has been deleted yet. + ** + ** p_s_un_bh is from the page cache (all unformatted nodes are + ** from the page cache) and might be a highmem page. So, we + ** can't use p_s_un_bh->b_data. + ** -clm + */ + + data = kmap_atomic(p_s_un_bh->b_page, KM_USER0); + off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + memcpy(data + off, + B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), + n_ret_value); + kunmap_atomic(data, KM_USER0); } - } - - if ( p_s_un_bh ) { - int off; - char *data ; - - /* We are in direct2indirect conversion, so move tail contents - to the unformatted node */ - /* note, we do the copy before preparing the buffer because we - ** don't care about the contents of the unformatted node yet. - ** the only thing we really care about is the direct item's data - ** is in the unformatted node. 
- ** - ** Otherwise, we would have to call reiserfs_prepare_for_journal on - ** the unformatted node, which might schedule, meaning we'd have to - ** loop all the way back up to the start of the while loop. - ** - ** The unformatted node must be dirtied later on. We can't be - ** sure here if the entire tail has been deleted yet. - ** - ** p_s_un_bh is from the page cache (all unformatted nodes are - ** from the page cache) and might be a highmem page. So, we - ** can't use p_s_un_bh->b_data. - ** -clm - */ - - data = kmap_atomic(p_s_un_bh->b_page, KM_USER0); - off = ((le_ih_k_offset (&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); - memcpy(data + off, - B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value); - kunmap_atomic(data, KM_USER0); - } - /* Perform balancing after all resources have been collected at once. */ - do_balance(&s_del_balance, NULL, NULL, M_DELETE); + /* Perform balancing after all resources have been collected at once. */ + do_balance(&s_del_balance, NULL, NULL, M_DELETE); #ifdef REISERQUOTA_DEBUG - reiserfs_debug (p_s_sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "reiserquota delete_item(): freeing %u, id=%u type=%c", + quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); #endif - DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); - /* Return deleted body length */ - return n_ret_value; + /* Return deleted body length */ + return n_ret_value; } - /* Summary Of Mechanisms For Handling Collisions Between Processes: deletion of the body of the object is performed by iput(), with the @@ -1347,727 +1387,804 @@ int reiserfs_delete_item (struct reiserfs_transaction_handle *th, - Hans */ - /* this deletes item which never gets split */ -void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th, - struct inode *inode, - struct reiserfs_key * key) 
+void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th, + struct inode *inode, struct reiserfs_key *key) { - struct tree_balance tb; - INITIALIZE_PATH (path); - int item_len = 0; - int tb_init = 0 ; - struct cpu_key cpu_key; - int retval; - int quota_cut_bytes = 0; - - BUG_ON (!th->t_trans_id); - - le_key2cpu_key (&cpu_key, key); - - while (1) { - retval = search_item (th->t_super, &cpu_key, &path); - if (retval == IO_ERROR) { - reiserfs_warning (th->t_super, - "vs-5350: reiserfs_delete_solid_item: " - "i/o failure occurred trying to delete %K", - &cpu_key); - break; - } - if (retval != ITEM_FOUND) { - pathrelse (&path); - // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir - if ( !( (unsigned long long) GET_HASH_VALUE (le_key_k_offset (le_key_version (key), key)) == 0 && \ - (unsigned long long) GET_GENERATION_NUMBER (le_key_k_offset (le_key_version (key), key)) == 1 ) ) - reiserfs_warning (th->t_super, "vs-5355: reiserfs_delete_solid_item: %k not found", key); - break; - } - if (!tb_init) { - tb_init = 1 ; - item_len = ih_item_len( PATH_PITEM_HEAD(&path) ); - init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len)); - } - quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ; + struct tree_balance tb; + INITIALIZE_PATH(path); + int item_len = 0; + int tb_init = 0; + struct cpu_key cpu_key; + int retval; + int quota_cut_bytes = 0; + + BUG_ON(!th->t_trans_id); + + le_key2cpu_key(&cpu_key, key); + + while (1) { + retval = search_item(th->t_super, &cpu_key, &path); + if (retval == IO_ERROR) { + reiserfs_warning(th->t_super, + "vs-5350: reiserfs_delete_solid_item: " + "i/o failure occurred trying to delete %K", + &cpu_key); + break; + } + if (retval != ITEM_FOUND) { + pathrelse(&path); + // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir + if (! 
+ ((unsigned long long) + GET_HASH_VALUE(le_key_k_offset + (le_key_version(key), key)) == 0 + && (unsigned long long) + GET_GENERATION_NUMBER(le_key_k_offset + (le_key_version(key), + key)) == 1)) + reiserfs_warning(th->t_super, + "vs-5355: reiserfs_delete_solid_item: %k not found", + key); + break; + } + if (!tb_init) { + tb_init = 1; + item_len = ih_item_len(PATH_PITEM_HEAD(&path)); + init_tb_struct(th, &tb, th->t_super, &path, + -(IH_SIZE + item_len)); + } + quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)); - retval = fix_nodes (M_DELETE, &tb, NULL, NULL); - if (retval == REPEAT_SEARCH) { - PROC_INFO_INC( th -> t_super, delete_solid_item_restarted ); - continue; - } + retval = fix_nodes(M_DELETE, &tb, NULL, NULL); + if (retval == REPEAT_SEARCH) { + PROC_INFO_INC(th->t_super, delete_solid_item_restarted); + continue; + } - if (retval == CARRY_ON) { - do_balance (&tb, NULL, NULL, M_DELETE); - if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ + if (retval == CARRY_ON) { + do_balance(&tb, NULL, NULL, M_DELETE); + if (inode) { /* Should we count quota for item? 
(we don't count quotas for save-links) */ #ifdef REISERQUOTA_DEBUG - reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key)); + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota delete_solid_item(): freeing %u id=%u type=%c", + quota_cut_bytes, inode->i_uid, + key2type(key)); #endif - DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes); - } - break; + DQUOT_FREE_SPACE_NODIRTY(inode, + quota_cut_bytes); + } + break; + } + // IO_ERROR, NO_DISK_SPACE, etc + reiserfs_warning(th->t_super, + "vs-5360: reiserfs_delete_solid_item: " + "could not delete %K due to fix_nodes failure", + &cpu_key); + unfix_nodes(&tb); + break; } - // IO_ERROR, NO_DISK_SPACE, etc - reiserfs_warning (th->t_super, "vs-5360: reiserfs_delete_solid_item: " - "could not delete %K due to fix_nodes failure", &cpu_key); - unfix_nodes (&tb); - break; - } - - reiserfs_check_path(&path) ; + reiserfs_check_path(&path); } - -int reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * inode) +int reiserfs_delete_object(struct reiserfs_transaction_handle *th, + struct inode *inode) { - int err; - inode->i_size = 0; - BUG_ON (!th->t_trans_id); - - /* for directory this deletes item containing "." and ".." */ - err = reiserfs_do_truncate (th, inode, NULL, 0/*no timestamp updates*/); - if (err) - return err; - + int err; + inode->i_size = 0; + BUG_ON(!th->t_trans_id); + + /* for directory this deletes item containing "." and ".." 
*/ + err = + reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ ); + if (err) + return err; + #if defined( USE_INODE_GENERATION_COUNTER ) - if( !old_format_only ( th -> t_super ) ) - { - __le32 *inode_generation; - - inode_generation = - &REISERFS_SB(th -> t_super) -> s_rs -> s_inode_generation; - *inode_generation = cpu_to_le32( le32_to_cpu( *inode_generation ) + 1 ); - } + if (!old_format_only(th->t_super)) { + __le32 *inode_generation; + + inode_generation = + &REISERFS_SB(th->t_super)->s_rs->s_inode_generation; + *inode_generation = + cpu_to_le32(le32_to_cpu(*inode_generation) + 1); + } /* USE_INODE_GENERATION_COUNTER */ #endif - reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); - return err; + return err; } -static void -unmap_buffers(struct page *page, loff_t pos) { - struct buffer_head *bh ; - struct buffer_head *head ; - struct buffer_head *next ; - unsigned long tail_index ; - unsigned long cur_index ; - - if (page) { - if (page_has_buffers(page)) { - tail_index = pos & (PAGE_CACHE_SIZE - 1) ; - cur_index = 0 ; - head = page_buffers(page) ; - bh = head ; - do { - next = bh->b_this_page ; - - /* we want to unmap the buffers that contain the tail, and - ** all the buffers after it (since the tail must be at the - ** end of the file). 
We don't want to unmap file data - ** before the tail, since it might be dirty and waiting to - ** reach disk - */ - cur_index += bh->b_size ; - if (cur_index > tail_index) { - reiserfs_unmap_buffer(bh) ; +static void unmap_buffers(struct page *page, loff_t pos) +{ + struct buffer_head *bh; + struct buffer_head *head; + struct buffer_head *next; + unsigned long tail_index; + unsigned long cur_index; + + if (page) { + if (page_has_buffers(page)) { + tail_index = pos & (PAGE_CACHE_SIZE - 1); + cur_index = 0; + head = page_buffers(page); + bh = head; + do { + next = bh->b_this_page; + + /* we want to unmap the buffers that contain the tail, and + ** all the buffers after it (since the tail must be at the + ** end of the file). We don't want to unmap file data + ** before the tail, since it might be dirty and waiting to + ** reach disk + */ + cur_index += bh->b_size; + if (cur_index > tail_index) { + reiserfs_unmap_buffer(bh); + } + bh = next; + } while (bh != head); + if (PAGE_SIZE == bh->b_size) { + clear_page_dirty(page); + } } - bh = next ; - } while (bh != head) ; - if ( PAGE_SIZE == bh->b_size ) { - clear_page_dirty(page); - } } - } } -static int maybe_indirect_to_direct (struct reiserfs_transaction_handle *th, - struct inode * p_s_inode, - struct page *page, - struct path * p_s_path, - const struct cpu_key * p_s_item_key, - loff_t n_new_file_size, - char * p_c_mode - ) { - struct super_block * p_s_sb = p_s_inode->i_sb; - int n_block_size = p_s_sb->s_blocksize; - int cut_bytes; - BUG_ON (!th->t_trans_id); - - if (n_new_file_size != p_s_inode->i_size) - BUG (); - - /* the page being sent in could be NULL if there was an i/o error - ** reading in the last block. 
The user will hit problems trying to - ** read the file, but for now we just skip the indirect2direct - */ - if (atomic_read(&p_s_inode->i_count) > 1 || - !tail_has_to_be_packed (p_s_inode) || - !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) { - // leave tail in an unformatted node - *p_c_mode = M_SKIP_BALANCING; - cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1)); - pathrelse(p_s_path); - return cut_bytes; - } - /* Permorm the conversion to a direct_item. */ - /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);*/ - return indirect2direct (th, p_s_inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); -} +static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, + struct inode *p_s_inode, + struct page *page, + struct path *p_s_path, + const struct cpu_key *p_s_item_key, + loff_t n_new_file_size, char *p_c_mode) +{ + struct super_block *p_s_sb = p_s_inode->i_sb; + int n_block_size = p_s_sb->s_blocksize; + int cut_bytes; + BUG_ON(!th->t_trans_id); + + if (n_new_file_size != p_s_inode->i_size) + BUG(); + /* the page being sent in could be NULL if there was an i/o error + ** reading in the last block. The user will hit problems trying to + ** read the file, but for now we just skip the indirect2direct + */ + if (atomic_read(&p_s_inode->i_count) > 1 || + !tail_has_to_be_packed(p_s_inode) || + !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) { + // leave tail in an unformatted node + *p_c_mode = M_SKIP_BALANCING; + cut_bytes = + n_block_size - (n_new_file_size & (n_block_size - 1)); + pathrelse(p_s_path); + return cut_bytes; + } + /* Permorm the conversion to a direct_item. */ + /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); */ + return indirect2direct(th, p_s_inode, page, p_s_path, p_s_item_key, + n_new_file_size, p_c_mode); +} /* we did indirect_to_direct conversion. 
And we have inserted direct item successesfully, but there were no disk space to cut unfm pointer being converted. Therefore we have to delete inserted direct item(s) */ -static void indirect_to_direct_roll_back (struct reiserfs_transaction_handle *th, struct inode * inode, struct path * path) +static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th, + struct inode *inode, struct path *path) { - struct cpu_key tail_key; - int tail_len; - int removed; - BUG_ON (!th->t_trans_id); - - make_cpu_key (&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);// !!!! - tail_key.key_length = 4; - - tail_len = (cpu_key_k_offset (&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; - while (tail_len) { - /* look for the last byte of the tail */ - if (search_for_position_by_key (inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND) - reiserfs_panic (inode->i_sb, "vs-5615: indirect_to_direct_roll_back: found invalid item"); - RFALSE( path->pos_in_item != ih_item_len(PATH_PITEM_HEAD (path)) - 1, - "vs-5616: appended bytes found"); - PATH_LAST_POSITION (path) --; - - removed = reiserfs_delete_item (th, path, &tail_key, inode, NULL/*unbh not needed*/); - RFALSE( removed <= 0 || removed > tail_len, - "vs-5617: there was tail %d bytes, removed item length %d bytes", - tail_len, removed); - tail_len -= removed; - set_cpu_key_k_offset (&tail_key, cpu_key_k_offset (&tail_key) - removed); - } - reiserfs_warning (inode->i_sb, "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space"); - //mark_file_without_tail (inode); - mark_inode_dirty (inode); + struct cpu_key tail_key; + int tail_len; + int removed; + BUG_ON(!th->t_trans_id); + + make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); // !!!! 
+ tail_key.key_length = 4; + + tail_len = + (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; + while (tail_len) { + /* look for the last byte of the tail */ + if (search_for_position_by_key(inode->i_sb, &tail_key, path) == + POSITION_NOT_FOUND) + reiserfs_panic(inode->i_sb, + "vs-5615: indirect_to_direct_roll_back: found invalid item"); + RFALSE(path->pos_in_item != + ih_item_len(PATH_PITEM_HEAD(path)) - 1, + "vs-5616: appended bytes found"); + PATH_LAST_POSITION(path)--; + + removed = + reiserfs_delete_item(th, path, &tail_key, inode, + NULL /*unbh not needed */ ); + RFALSE(removed <= 0 + || removed > tail_len, + "vs-5617: there was tail %d bytes, removed item length %d bytes", + tail_len, removed); + tail_len -= removed; + set_cpu_key_k_offset(&tail_key, + cpu_key_k_offset(&tail_key) - removed); + } + reiserfs_warning(inode->i_sb, + "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space"); + //mark_file_without_tail (inode); + mark_inode_dirty(inode); } - /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ -int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th, - struct path * p_s_path, - struct cpu_key * p_s_item_key, - struct inode * p_s_inode, - struct page *page, - loff_t n_new_file_size) +int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th, + struct path *p_s_path, + struct cpu_key *p_s_item_key, + struct inode *p_s_inode, + struct page *page, loff_t n_new_file_size) { - struct super_block * p_s_sb = p_s_inode->i_sb; - /* Every function which is going to call do_balance must first - create a tree_balance structure. Then it must fill up this - structure by using the init_tb_struct and fix_nodes functions. - After that we can make tree balancing. */ - struct tree_balance s_cut_balance; - struct item_head *p_le_ih; - int n_cut_size = 0, /* Amount to be cut. 
*/ - n_ret_value = CARRY_ON, - n_removed = 0, /* Number of the removed unformatted nodes. */ - n_is_inode_locked = 0; - char c_mode; /* Mode of the balance. */ - int retval2 = -1; - int quota_cut_bytes; - loff_t tail_pos = 0; - - BUG_ON (!th->t_trans_id); - - init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size); - - - /* Repeat this loop until we either cut the item without needing - to balance, or we fix_nodes without schedule occurring */ - while ( 1 ) { - /* Determine the balance mode, position of the first byte to - be cut, and size to be cut. In case of the indirect item - free unformatted nodes which are pointed to by the cut - pointers. */ - - c_mode = prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, - &n_cut_size, n_new_file_size); - if ( c_mode == M_CONVERT ) { - /* convert last unformatted node to direct item or leave - tail in the unformatted node */ - RFALSE( n_ret_value != CARRY_ON, "PAP-5570: can not convert twice"); - - n_ret_value = maybe_indirect_to_direct (th, p_s_inode, page, p_s_path, p_s_item_key, - n_new_file_size, &c_mode); - if ( c_mode == M_SKIP_BALANCING ) - /* tail has been left in the unformatted node */ - return n_ret_value; - - n_is_inode_locked = 1; - - /* removing of last unformatted node will change value we - have to return to truncate. Save it */ - retval2 = n_ret_value; - /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1));*/ - - /* So, we have performed the first part of the conversion: - inserting the new direct item. Now we are removing the - last unformatted node pointer. Set key to search for - it. 
*/ - set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT); - p_s_item_key->key_length = 4; - n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1)); - tail_pos = n_new_file_size; - set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1); - if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){ - print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1); - reiserfs_panic(p_s_sb, "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", p_s_item_key); - } - continue; - } - if (n_cut_size == 0) { - pathrelse (p_s_path); - return 0; - } + struct super_block *p_s_sb = p_s_inode->i_sb; + /* Every function which is going to call do_balance must first + create a tree_balance structure. Then it must fill up this + structure by using the init_tb_struct and fix_nodes functions. + After that we can make tree balancing. */ + struct tree_balance s_cut_balance; + struct item_head *p_le_ih; + int n_cut_size = 0, /* Amount to be cut. */ + n_ret_value = CARRY_ON, n_removed = 0, /* Number of the removed unformatted nodes. */ + n_is_inode_locked = 0; + char c_mode; /* Mode of the balance. */ + int retval2 = -1; + int quota_cut_bytes; + loff_t tail_pos = 0; + + BUG_ON(!th->t_trans_id); + + init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, + n_cut_size); + + /* Repeat this loop until we either cut the item without needing + to balance, or we fix_nodes without schedule occurring */ + while (1) { + /* Determine the balance mode, position of the first byte to + be cut, and size to be cut. In case of the indirect item + free unformatted nodes which are pointed to by the cut + pointers. 
*/ + + c_mode = + prepare_for_delete_or_cut(th, p_s_inode, p_s_path, + p_s_item_key, &n_removed, + &n_cut_size, n_new_file_size); + if (c_mode == M_CONVERT) { + /* convert last unformatted node to direct item or leave + tail in the unformatted node */ + RFALSE(n_ret_value != CARRY_ON, + "PAP-5570: can not convert twice"); + + n_ret_value = + maybe_indirect_to_direct(th, p_s_inode, page, + p_s_path, p_s_item_key, + n_new_file_size, &c_mode); + if (c_mode == M_SKIP_BALANCING) + /* tail has been left in the unformatted node */ + return n_ret_value; + + n_is_inode_locked = 1; + + /* removing of last unformatted node will change value we + have to return to truncate. Save it */ + retval2 = n_ret_value; + /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1)); */ + + /* So, we have performed the first part of the conversion: + inserting the new direct item. Now we are removing the + last unformatted node pointer. Set key to search for + it. */ + set_cpu_key_k_type(p_s_item_key, TYPE_INDIRECT); + p_s_item_key->key_length = 4; + n_new_file_size -= + (n_new_file_size & (p_s_sb->s_blocksize - 1)); + tail_pos = n_new_file_size; + set_cpu_key_k_offset(p_s_item_key, n_new_file_size + 1); + if (search_for_position_by_key + (p_s_sb, p_s_item_key, + p_s_path) == POSITION_NOT_FOUND) { + print_block(PATH_PLAST_BUFFER(p_s_path), 3, + PATH_LAST_POSITION(p_s_path) - 1, + PATH_LAST_POSITION(p_s_path) + 1); + reiserfs_panic(p_s_sb, + "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", + p_s_item_key); + } + continue; + } + if (n_cut_size == 0) { + pathrelse(p_s_path); + return 0; + } + + s_cut_balance.insert_size[0] = n_cut_size; + + n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL); + if (n_ret_value != REPEAT_SEARCH) + break; + + PROC_INFO_INC(p_s_sb, cut_from_item_restarted); + + n_ret_value = + search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + if (n_ret_value == POSITION_FOUND) + continue; - 
s_cut_balance.insert_size[0] = n_cut_size; - - n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL); - if ( n_ret_value != REPEAT_SEARCH ) - break; - - PROC_INFO_INC( p_s_sb, cut_from_item_restarted ); - - n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); - if (n_ret_value == POSITION_FOUND) - continue; - - reiserfs_warning (p_s_sb, "PAP-5610: reiserfs_cut_from_item: item %K not found", p_s_item_key); - unfix_nodes (&s_cut_balance); - return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT; - } /* while */ - - // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) - if ( n_ret_value != CARRY_ON ) { - if ( n_is_inode_locked ) { - // FIXME: this seems to be not needed: we are always able - // to cut item - indirect_to_direct_roll_back (th, p_s_inode, p_s_path); + reiserfs_warning(p_s_sb, + "PAP-5610: reiserfs_cut_from_item: item %K not found", + p_s_item_key); + unfix_nodes(&s_cut_balance); + return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT; + } /* while */ + + // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) + if (n_ret_value != CARRY_ON) { + if (n_is_inode_locked) { + // FIXME: this seems to be not needed: we are always able + // to cut item + indirect_to_direct_roll_back(th, p_s_inode, p_s_path); + } + if (n_ret_value == NO_DISK_SPACE) + reiserfs_warning(p_s_sb, "NO_DISK_SPACE"); + unfix_nodes(&s_cut_balance); + return -EIO; } - if (n_ret_value == NO_DISK_SPACE) - reiserfs_warning (p_s_sb, "NO_DISK_SPACE"); - unfix_nodes (&s_cut_balance); - return -EIO; - } - - /* go ahead and perform balancing */ - - RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode"); - - /* Calculate number of bytes that need to be cut from the item. */ - quota_cut_bytes = ( c_mode == M_DELETE ) ? 
ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0]; - if (retval2 == -1) - n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); - else - n_ret_value = retval2; - - - /* For direct items, we only change the quota when deleting the last - ** item. - */ - p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); - if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { - if (c_mode == M_DELETE && - (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) { - // FIXME: this is to keep 3.5 happy - REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; - quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ; - } else { - quota_cut_bytes = 0 ; + + /* go ahead and perform balancing */ + + RFALSE(c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode"); + + /* Calculate number of bytes that need to be cut from the item. */ + quota_cut_bytes = + (c_mode == + M_DELETE) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance. + insert_size[0]; + if (retval2 == -1) + n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); + else + n_ret_value = retval2; + + /* For direct items, we only change the quota when deleting the last + ** item. + */ + p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path); + if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (c_mode == M_DELETE && + (le_ih_k_offset(p_le_ih) & (p_s_sb->s_blocksize - 1)) == + 1) { + // FIXME: this is to keep 3.5 happy + REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0; + } } - } #ifdef CONFIG_REISERFS_CHECK - if (n_is_inode_locked) { - struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); - /* we are going to complete indirect2direct conversion. 
Make - sure, that we exactly remove last unformatted node pointer - of the item */ - if (!is_indirect_le_ih (le_ih)) - reiserfs_panic (p_s_sb, "vs-5652: reiserfs_cut_from_item: " - "item must be indirect %h", le_ih); - - if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) - reiserfs_panic (p_s_sb, "vs-5653: reiserfs_cut_from_item: " - "completing indirect2direct conversion indirect item %h " - "being deleted must be of 4 byte long", le_ih); - - if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { - reiserfs_panic (p_s_sb, "vs-5654: reiserfs_cut_from_item: " - "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)", - le_ih, s_cut_balance.insert_size[0]); + if (n_is_inode_locked) { + struct item_head *le_ih = + PATH_PITEM_HEAD(s_cut_balance.tb_path); + /* we are going to complete indirect2direct conversion. Make + sure, that we exactly remove last unformatted node pointer + of the item */ + if (!is_indirect_le_ih(le_ih)) + reiserfs_panic(p_s_sb, + "vs-5652: reiserfs_cut_from_item: " + "item must be indirect %h", le_ih); + + if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE) + reiserfs_panic(p_s_sb, + "vs-5653: reiserfs_cut_from_item: " + "completing indirect2direct conversion indirect item %h " + "being deleted must be of 4 byte long", + le_ih); + + if (c_mode == M_CUT + && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { + reiserfs_panic(p_s_sb, + "vs-5654: reiserfs_cut_from_item: " + "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)", + le_ih, s_cut_balance.insert_size[0]); + } + /* it would be useful to make sure, that right neighboring + item is direct item of this file */ } - /* it would be useful to make sure, that right neighboring - item is direct item of this file */ - } #endif - - do_balance(&s_cut_balance, NULL, NULL, c_mode); - if ( n_is_inode_locked ) { - /* we've done an indirect->direct conversion. 
when the data block - ** was freed, it was removed from the list of blocks that must - ** be flushed before the transaction commits, make sure to - ** unmap and invalidate it - */ - unmap_buffers(page, tail_pos); - REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ; - } + + do_balance(&s_cut_balance, NULL, NULL, c_mode); + if (n_is_inode_locked) { + /* we've done an indirect->direct conversion. when the data block + ** was freed, it was removed from the list of blocks that must + ** be flushed before the transaction commits, make sure to + ** unmap and invalidate it + */ + unmap_buffers(page, tail_pos); + REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask; + } #ifdef REISERQUOTA_DEBUG - reiserfs_debug (p_s_inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, '?'); + reiserfs_debug(p_s_inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota cut_from_item(): freeing %u id=%u type=%c", + quota_cut_bytes, p_s_inode->i_uid, '?'); #endif - DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); - return n_ret_value; + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + return n_ret_value; } -static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode) +static void truncate_directory(struct reiserfs_transaction_handle *th, + struct inode *inode) { - BUG_ON (!th->t_trans_id); - if (inode->i_nlink) - reiserfs_warning (inode->i_sb, - "vs-5655: truncate_directory: link count != 0"); - - set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET); - set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY); - reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); - reiserfs_update_sd(th, inode) ; - set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET); - set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA); + BUG_ON(!th->t_trans_id); + if (inode->i_nlink) + reiserfs_warning(inode->i_sb, + "vs-5655: 
truncate_directory: link count != 0"); + + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY); + reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode)); + reiserfs_update_sd(th, inode); + set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET); + set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA); } +/* Truncate file to the new size. Note, this must be called with a transaction + already started */ +int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, /* ->i_size contains new + size */ + struct page *page, /* up to date for last block */ + int update_timestamps /* when it is called by + file_release to convert + the tail - no timestamps + should be updated */ + ) +{ + INITIALIZE_PATH(s_search_path); /* Path to the current object item. */ + struct item_head *p_le_ih; /* Pointer to an item header. */ + struct cpu_key s_item_key; /* Key to search for a previous file item. */ + loff_t n_file_size, /* Old file size. */ + n_new_file_size; /* New file size. */ + int n_deleted; /* Number of deleted or truncated bytes. */ + int retval; + int err = 0; + + BUG_ON(!th->t_trans_id); + if (! + (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) + || S_ISLNK(p_s_inode->i_mode))) + return 0; + + if (S_ISDIR(p_s_inode->i_mode)) { + // deletion of directory - no need to update timestamps + truncate_directory(th, p_s_inode); + return 0; + } + /* Get new file size. */ + n_new_file_size = p_s_inode->i_size; + // FIXME: note, that key type is unimportant here + make_cpu_key(&s_item_key, p_s_inode, max_reiserfs_offset(p_s_inode), + TYPE_DIRECT, 3); -/* Truncate file to the new size. 
Note, this must be called with a transaction - already started */ -int reiserfs_do_truncate (struct reiserfs_transaction_handle *th, - struct inode * p_s_inode, /* ->i_size contains new - size */ - struct page *page, /* up to date for last block */ - int update_timestamps /* when it is called by - file_release to convert - the tail - no timestamps - should be updated */ - ) { - INITIALIZE_PATH (s_search_path); /* Path to the current object item. */ - struct item_head * p_le_ih; /* Pointer to an item header. */ - struct cpu_key s_item_key; /* Key to search for a previous file item. */ - loff_t n_file_size, /* Old file size. */ - n_new_file_size;/* New file size. */ - int n_deleted; /* Number of deleted or truncated bytes. */ - int retval; - int err = 0; - - BUG_ON (!th->t_trans_id); - if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) ) - return 0; + retval = + search_for_position_by_key(p_s_inode->i_sb, &s_item_key, + &s_search_path); + if (retval == IO_ERROR) { + reiserfs_warning(p_s_inode->i_sb, + "vs-5657: reiserfs_do_truncate: " + "i/o failure occurred trying to truncate %K", + &s_item_key); + err = -EIO; + goto out; + } + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + reiserfs_warning(p_s_inode->i_sb, + "PAP-5660: reiserfs_do_truncate: " + "wrong result %d of search for %K", retval, + &s_item_key); + + err = -EIO; + goto out; + } - if (S_ISDIR(p_s_inode->i_mode)) { - // deletion of directory - no need to update timestamps - truncate_directory (th, p_s_inode); - return 0; - } - - /* Get new file size. 
*/ - n_new_file_size = p_s_inode->i_size; - - // FIXME: note, that key type is unimportant here - make_cpu_key (&s_item_key, p_s_inode, max_reiserfs_offset (p_s_inode), TYPE_DIRECT, 3); - - retval = search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path); - if (retval == IO_ERROR) { - reiserfs_warning (p_s_inode->i_sb, "vs-5657: reiserfs_do_truncate: " - "i/o failure occurred trying to truncate %K", &s_item_key); - err = -EIO; - goto out; - } - if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { - reiserfs_warning (p_s_inode->i_sb, "PAP-5660: reiserfs_do_truncate: " - "wrong result %d of search for %K", retval, &s_item_key); - - err = -EIO; - goto out; - } - - s_search_path.pos_in_item --; - - /* Get real file size (total length of all file items) */ - p_le_ih = PATH_PITEM_HEAD(&s_search_path); - if ( is_statdata_le_ih (p_le_ih) ) - n_file_size = 0; - else { - loff_t offset = le_ih_k_offset (p_le_ih); - int bytes = op_bytes_number (p_le_ih,p_s_inode->i_sb->s_blocksize); - - /* this may mismatch with real file size: if last direct item - had no padding zeros and last unformatted node had no free - space, this file would have this file size */ - n_file_size = offset + bytes - 1; - } - /* - * are we doing a full truncate or delete, if so - * kick in the reada code - */ - if (n_new_file_size == 0) - s_search_path.reada = PATH_READA | PATH_READA_BACK; - - if ( n_file_size == 0 || n_file_size < n_new_file_size ) { - goto update_and_out ; - } - - /* Update key to search for the last file item. */ - set_cpu_key_k_offset (&s_item_key, n_file_size); - - do { - /* Cut or delete file item. 
*/ - n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode, page, n_new_file_size); - if (n_deleted < 0) { - reiserfs_warning (p_s_inode->i_sb, "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed"); - reiserfs_check_path(&s_search_path) ; - return 0; + s_search_path.pos_in_item--; + + /* Get real file size (total length of all file items) */ + p_le_ih = PATH_PITEM_HEAD(&s_search_path); + if (is_statdata_le_ih(p_le_ih)) + n_file_size = 0; + else { + loff_t offset = le_ih_k_offset(p_le_ih); + int bytes = + op_bytes_number(p_le_ih, p_s_inode->i_sb->s_blocksize); + + /* this may mismatch with real file size: if last direct item + had no padding zeros and last unformatted node had no free + space, this file would have this file size */ + n_file_size = offset + bytes - 1; + } + /* + * are we doing a full truncate or delete, if so + * kick in the reada code + */ + if (n_new_file_size == 0) + s_search_path.reada = PATH_READA | PATH_READA_BACK; + + if (n_file_size == 0 || n_file_size < n_new_file_size) { + goto update_and_out; } - RFALSE( n_deleted > n_file_size, - "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", - n_deleted, n_file_size, &s_item_key); + /* Update key to search for the last file item. */ + set_cpu_key_k_offset(&s_item_key, n_file_size); + + do { + /* Cut or delete file item. */ + n_deleted = + reiserfs_cut_from_item(th, &s_search_path, &s_item_key, + p_s_inode, page, n_new_file_size); + if (n_deleted < 0) { + reiserfs_warning(p_s_inode->i_sb, + "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed"); + reiserfs_check_path(&s_search_path); + return 0; + } - /* Change key to search the last file item. 
*/ - n_file_size -= n_deleted; + RFALSE(n_deleted > n_file_size, + "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K", + n_deleted, n_file_size, &s_item_key); - set_cpu_key_k_offset (&s_item_key, n_file_size); + /* Change key to search the last file item. */ + n_file_size -= n_deleted; - /* While there are bytes to truncate and previous file item is presented in the tree. */ + set_cpu_key_k_offset(&s_item_key, n_file_size); - /* - ** This loop could take a really long time, and could log - ** many more blocks than a transaction can hold. So, we do a polite - ** journal end here, and if the transaction needs ending, we make - ** sure the file is consistent before ending the current trans - ** and starting a new one - */ - if (journal_transaction_should_end(th, th->t_blocks_allocated)) { - int orig_len_alloc = th->t_blocks_allocated ; - decrement_counters_in_path(&s_search_path) ; - - if (update_timestamps) { - p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; - } - reiserfs_update_sd(th, p_s_inode) ; - - err = journal_end(th, p_s_inode->i_sb, orig_len_alloc) ; - if (err) - goto out; - err = journal_begin (th, p_s_inode->i_sb, - JOURNAL_PER_BALANCE_CNT * 6); - if (err) - goto out; - reiserfs_update_inode_transaction(p_s_inode) ; + /* While there are bytes to truncate and previous file item is presented in the tree. */ + + /* + ** This loop could take a really long time, and could log + ** many more blocks than a transaction can hold. 
So, we do a polite + ** journal end here, and if the transaction needs ending, we make + ** sure the file is consistent before ending the current trans + ** and starting a new one + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + int orig_len_alloc = th->t_blocks_allocated; + decrement_counters_in_path(&s_search_path); + + if (update_timestamps) { + p_s_inode->i_mtime = p_s_inode->i_ctime = + CURRENT_TIME_SEC; + } + reiserfs_update_sd(th, p_s_inode); + + err = journal_end(th, p_s_inode->i_sb, orig_len_alloc); + if (err) + goto out; + err = journal_begin(th, p_s_inode->i_sb, + JOURNAL_PER_BALANCE_CNT * 6); + if (err) + goto out; + reiserfs_update_inode_transaction(p_s_inode); + } + } while (n_file_size > ROUND_UP(n_new_file_size) && + search_for_position_by_key(p_s_inode->i_sb, &s_item_key, + &s_search_path) == POSITION_FOUND); + + RFALSE(n_file_size > ROUND_UP(n_new_file_size), + "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", + n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid); + + update_and_out: + if (update_timestamps) { + // this is truncate, not file closing + p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; } - } while ( n_file_size > ROUND_UP (n_new_file_size) && - search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND ) ; - - RFALSE( n_file_size > ROUND_UP (n_new_file_size), - "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", - n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid); - -update_and_out: - if (update_timestamps) { - // this is truncate, not file closing - p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC; - } - reiserfs_update_sd (th, p_s_inode); - -out: - pathrelse(&s_search_path) ; - return err; -} + reiserfs_update_sd(th, p_s_inode); + out: + pathrelse(&s_search_path); + return err; +} #ifdef CONFIG_REISERFS_CHECK // this makes sure, that we __append__, not overwrite or add holes 
-static void check_research_for_paste (struct path * path, - const struct cpu_key * p_s_key) +static void check_research_for_paste(struct path *path, + const struct cpu_key *p_s_key) { - struct item_head * found_ih = get_ih (path); - - if (is_direct_le_ih (found_ih)) { - if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) != - cpu_key_k_offset (p_s_key) || - op_bytes_number (found_ih, get_last_bh (path)->b_size) != pos_in_item (path)) - reiserfs_panic (NULL, "PAP-5720: check_research_for_paste: " - "found direct item %h or position (%d) does not match to key %K", - found_ih, pos_in_item (path), p_s_key); - } - if (is_indirect_le_ih (found_ih)) { - if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) != cpu_key_k_offset (p_s_key) || - I_UNFM_NUM (found_ih) != pos_in_item (path) || - get_ih_free_space (found_ih) != 0) - reiserfs_panic (NULL, "PAP-5730: check_research_for_paste: " - "found indirect item (%h) or position (%d) does not match to key (%K)", - found_ih, pos_in_item (path), p_s_key); - } + struct item_head *found_ih = get_ih(path); + + if (is_direct_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(p_s_key) + || op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + pos_in_item(path)) + reiserfs_panic(NULL, + "PAP-5720: check_research_for_paste: " + "found direct item %h or position (%d) does not match to key %K", + found_ih, pos_in_item(path), p_s_key); + } + if (is_indirect_le_ih(found_ih)) { + if (le_ih_k_offset(found_ih) + + op_bytes_number(found_ih, + get_last_bh(path)->b_size) != + cpu_key_k_offset(p_s_key) + || I_UNFM_NUM(found_ih) != pos_in_item(path) + || get_ih_free_space(found_ih) != 0) + reiserfs_panic(NULL, + "PAP-5730: check_research_for_paste: " + "found indirect item (%h) or position (%d) does not match to key (%K)", + found_ih, pos_in_item(path), p_s_key); + } } -#endif /* config reiserfs 
check */ - +#endif /* config reiserfs check */ /* Paste bytes to the existing item. Returns bytes number pasted into the item. */ -int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, - struct path * p_s_search_path, /* Path to the pasted item. */ - const struct cpu_key * p_s_key, /* Key to search for the needed item.*/ - struct inode * inode, /* Inode item belongs to */ - const char * p_c_body, /* Pointer to the bytes to paste. */ - int n_pasted_size) /* Size of pasted bytes. */ -{ - struct tree_balance s_paste_balance; - int retval; - int fs_gen; +int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct path *p_s_search_path, /* Path to the pasted item. */ + const struct cpu_key *p_s_key, /* Key to search for the needed item. */ + struct inode *inode, /* Inode item belongs to */ + const char *p_c_body, /* Pointer to the bytes to paste. */ + int n_pasted_size) +{ /* Size of pasted bytes. */ + struct tree_balance s_paste_balance; + int retval; + int fs_gen; + + BUG_ON(!th->t_trans_id); - BUG_ON (!th->t_trans_id); - - fs_gen = get_generation(inode->i_sb) ; + fs_gen = get_generation(inode->i_sb); #ifdef REISERQUOTA_DEBUG - reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): allocating %u id=%u type=%c", + n_pasted_size, inode->i_uid, + key2type(&(p_s_key->on_disk_key))); #endif - if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { - pathrelse(p_s_search_path); - return -EDQUOT; - } - init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size); + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { + pathrelse(p_s_search_path); + return -EDQUOT; + } + init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, + n_pasted_size); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - s_paste_balance.key = 
p_s_key->on_disk_key; + s_paste_balance.key = p_s_key->on_disk_key; #endif - /* DQUOT_* can schedule, must check before the fix_nodes */ - if (fs_changed(fs_gen, inode->i_sb)) { - goto search_again; - } - - while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == -REPEAT_SEARCH ) { -search_again: - /* file system changed while we were in the fix_nodes */ - PROC_INFO_INC( th -> t_super, paste_into_item_restarted ); - retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path); - if (retval == IO_ERROR) { - retval = -EIO ; - goto error_out ; + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; } - if (retval == POSITION_FOUND) { - reiserfs_warning (inode->i_sb, "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", p_s_key); - retval = -EEXIST ; - goto error_out ; - } - + + while ((retval = + fix_nodes(M_PASTE, &s_paste_balance, NULL, + p_c_body)) == REPEAT_SEARCH) { + search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, paste_into_item_restarted); + retval = + search_for_position_by_key(th->t_super, p_s_key, + p_s_search_path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == POSITION_FOUND) { + reiserfs_warning(inode->i_sb, + "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", + p_s_key); + retval = -EEXIST; + goto error_out; + } #ifdef CONFIG_REISERFS_CHECK - check_research_for_paste (p_s_search_path, p_s_key); + check_research_for_paste(p_s_search_path, p_s_key); #endif - } + } - /* Perform balancing after all resources are collected by fix_nodes, and - accessing them will not risk triggering schedule. */ - if ( retval == CARRY_ON ) { - do_balance(&s_paste_balance, NULL/*ih*/, p_c_body, M_PASTE); - return 0; - } - retval = (retval == NO_DISK_SPACE) ? 
-ENOSPC : -EIO; -error_out: - /* this also releases the path */ - unfix_nodes(&s_paste_balance); + /* Perform balancing after all resources are collected by fix_nodes, and + accessing them will not risk triggering schedule. */ + if (retval == CARRY_ON) { + do_balance(&s_paste_balance, NULL /*ih */ , p_c_body, M_PASTE); + return 0; + } + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; + error_out: + /* this also releases the path */ + unfix_nodes(&s_paste_balance); #ifdef REISERQUOTA_DEBUG - reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota paste_into_item(): freeing %u id=%u type=%c", + n_pasted_size, inode->i_uid, + key2type(&(p_s_key->on_disk_key))); #endif - DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); - return retval ; + DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); + return retval; } - /* Insert new item into the buffer at the path. */ -int reiserfs_insert_item(struct reiserfs_transaction_handle *th, - struct path * p_s_path, /* Path to the inserteded item. */ - const struct cpu_key * key, - struct item_head * p_s_ih, /* Pointer to the item header to insert.*/ - struct inode * inode, - const char * p_c_body) /* Pointer to the bytes to insert. */ -{ - struct tree_balance s_ins_balance; - int retval; - int fs_gen = 0 ; - int quota_bytes = 0 ; - - BUG_ON (!th->t_trans_id); - - if (inode) { /* Do we count quotas for item? 
*/ - fs_gen = get_generation(inode->i_sb); - quota_bytes = ih_item_len(p_s_ih); - - /* hack so the quota code doesn't have to guess if the file has - ** a tail, links are always tails, so there's no guessing needed - */ - if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) { - quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ; - } +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct path *p_s_path, /* Path to the inserteded item. */ + const struct cpu_key *key, struct item_head *p_s_ih, /* Pointer to the item header to insert. */ + struct inode *inode, const char *p_c_body) +{ /* Pointer to the bytes to insert. */ + struct tree_balance s_ins_balance; + int retval; + int fs_gen = 0; + int quota_bytes = 0; + + BUG_ON(!th->t_trans_id); + + if (inode) { /* Do we count quotas for item? */ + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(p_s_ih); + + /* hack so the quota code doesn't have to guess if the file has + ** a tail, links are always tails, so there's no guessing needed + */ + if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_s_ih)) { + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE; + } #ifdef REISERQUOTA_DEBUG - reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih)); + reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): allocating %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(p_s_ih)); #endif - /* We can't dirty inode here. It would be immediately written but - * appropriate stat item isn't inserted yet... */ - if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { - pathrelse(p_s_path); - return -EDQUOT; + /* We can't dirty inode here. It would be immediately written but + * appropriate stat item isn't inserted yet... 
*/ + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { + pathrelse(p_s_path); + return -EDQUOT; + } } - } - init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih)); + init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, + IH_SIZE + ih_item_len(p_s_ih)); #ifdef DISPLACE_NEW_PACKING_LOCALITIES - s_ins_balance.key = key->on_disk_key; + s_ins_balance.key = key->on_disk_key; #endif - /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ - if (inode && fs_changed(fs_gen, inode->i_sb)) { - goto search_again; - } - - while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) { -search_again: - /* file system changed while we were in the fix_nodes */ - PROC_INFO_INC( th -> t_super, insert_item_restarted ); - retval = search_item (th->t_super, key, p_s_path); - if (retval == IO_ERROR) { - retval = -EIO; - goto error_out ; + /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; } - if (retval == ITEM_FOUND) { - reiserfs_warning (th->t_super, "PAP-5760: reiserfs_insert_item: " - "key %K already exists in the tree", key); - retval = -EEXIST ; - goto error_out; + + while ((retval = + fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, + p_c_body)) == REPEAT_SEARCH) { + search_again: + /* file system changed while we were in the fix_nodes */ + PROC_INFO_INC(th->t_super, insert_item_restarted); + retval = search_item(th->t_super, key, p_s_path); + if (retval == IO_ERROR) { + retval = -EIO; + goto error_out; + } + if (retval == ITEM_FOUND) { + reiserfs_warning(th->t_super, + "PAP-5760: reiserfs_insert_item: " + "key %K already exists in the tree", + key); + retval = -EEXIST; + goto error_out; + } } - } - /* make balancing after all resources will be collected at a time */ - if ( retval == CARRY_ON ) { - do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT); - return 0; - } + /* make balancing after all 
resources will be collected at a time */ + if (retval == CARRY_ON) { + do_balance(&s_ins_balance, p_s_ih, p_c_body, M_INSERT); + return 0; + } - retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; -error_out: - /* also releases the path */ - unfix_nodes(&s_ins_balance); + retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; + error_out: + /* also releases the path */ + unfix_nodes(&s_ins_balance); #ifdef REISERQUOTA_DEBUG - reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih)); + reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE, + "reiserquota insert_item(): freeing %u id=%u type=%c", + quota_bytes, inode->i_uid, head2type(p_s_ih)); #endif - if (inode) - DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ; - return retval; + if (inode) + DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes); + return retval; } - - - - diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index b35b8774498..6951c35755b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -35,83 +35,81 @@ static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING; static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING; static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING; -int is_reiserfs_3_5 (struct reiserfs_super_block * rs) +int is_reiserfs_3_5(struct reiserfs_super_block *rs) { - return !strncmp (rs->s_v1.s_magic, reiserfs_3_5_magic_string, - strlen (reiserfs_3_5_magic_string)); + return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string, + strlen(reiserfs_3_5_magic_string)); } - -int is_reiserfs_3_6 (struct reiserfs_super_block * rs) +int is_reiserfs_3_6(struct reiserfs_super_block *rs) { - return !strncmp (rs->s_v1.s_magic, reiserfs_3_6_magic_string, - strlen (reiserfs_3_6_magic_string)); + return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string, + strlen(reiserfs_3_6_magic_string)); } - -int is_reiserfs_jr (struct 
reiserfs_super_block * rs) +int is_reiserfs_jr(struct reiserfs_super_block *rs) { - return !strncmp (rs->s_v1.s_magic, reiserfs_jr_magic_string, - strlen (reiserfs_jr_magic_string)); + return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string, + strlen(reiserfs_jr_magic_string)); } - -static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs) +static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) { - return (is_reiserfs_3_5 (rs) || is_reiserfs_3_6 (rs) || - is_reiserfs_jr (rs)); + return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) || + is_reiserfs_jr(rs)); } -static int reiserfs_remount (struct super_block * s, int * flags, char * data); -static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf); +static int reiserfs_remount(struct super_block *s, int *flags, char *data); +static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf); -static int reiserfs_sync_fs (struct super_block * s, int wait) +static int reiserfs_sync_fs(struct super_block *s, int wait) { - if (!(s->s_flags & MS_RDONLY)) { - struct reiserfs_transaction_handle th; - reiserfs_write_lock(s); - if (!journal_begin(&th, s, 1)) - if (!journal_end_sync(&th, s, 1)) - reiserfs_flush_old_commits(s); - s->s_dirt = 0; /* Even if it's not true. - * We'll loop forever in sync_supers otherwise */ - reiserfs_write_unlock(s); - } else { - s->s_dirt = 0; - } - return 0; + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th, s, 1)) + reiserfs_flush_old_commits(s); + s->s_dirt = 0; /* Even if it's not true. 
+ * We'll loop forever in sync_supers otherwise */ + reiserfs_write_unlock(s); + } else { + s->s_dirt = 0; + } + return 0; } static void reiserfs_write_super(struct super_block *s) { - reiserfs_sync_fs(s, 1); + reiserfs_sync_fs(s, 1); } -static void reiserfs_write_super_lockfs (struct super_block * s) +static void reiserfs_write_super_lockfs(struct super_block *s) { - struct reiserfs_transaction_handle th ; - reiserfs_write_lock(s); - if (!(s->s_flags & MS_RDONLY)) { - int err = journal_begin(&th, s, 1) ; - if (err) { - reiserfs_block_writes(&th) ; - } else { - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - reiserfs_block_writes(&th) ; - journal_end_sync(&th, s, 1) ; - } - } - s->s_dirt = 0; - reiserfs_write_unlock(s); + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + if (!(s->s_flags & MS_RDONLY)) { + int err = journal_begin(&th, s, 1); + if (err) { + reiserfs_block_writes(&th); + } else { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + reiserfs_block_writes(&th); + journal_end_sync(&th, s, 1); + } + } + s->s_dirt = 0; + reiserfs_write_unlock(s); } -static void reiserfs_unlockfs(struct super_block *s) { - reiserfs_allow_writes(s) ; +static void reiserfs_unlockfs(struct super_block *s) +{ + reiserfs_allow_writes(s); } -extern const struct in_core_key MAX_IN_CORE_KEY; - +extern const struct in_core_key MAX_IN_CORE_KEY; /* this is used to delete "save link" when there are no items of a file it points to. 
It can either happen if unlink is completed but @@ -120,364 +118,387 @@ extern const struct in_core_key MAX_IN_CORE_KEY; protecting unlink is bigger that a key lf "save link" which protects truncate), so there left no items to make truncate completion on */ -static int remove_save_link_only (struct super_block * s, struct reiserfs_key * key, int oid_free) +static int remove_save_link_only(struct super_block *s, + struct reiserfs_key *key, int oid_free) { - struct reiserfs_transaction_handle th; - int err; - - /* we are going to do one balancing */ - err = journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT); - if (err) - return err; - - reiserfs_delete_solid_item (&th, NULL, key); - if (oid_free) - /* removals are protected by direct items */ - reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid)); - - return journal_end (&th, s, JOURNAL_PER_BALANCE_CNT); + struct reiserfs_transaction_handle th; + int err; + + /* we are going to do one balancing */ + err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + reiserfs_delete_solid_item(&th, NULL, key); + if (oid_free) + /* removals are protected by direct items */ + reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid)); + + return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT); } - + #ifdef CONFIG_QUOTA static int reiserfs_quota_on_mount(struct super_block *, int); #endif - + /* look for uncompleted unlinks and truncates and complete them */ -static int finish_unfinished (struct super_block * s) +static int finish_unfinished(struct super_block *s) { - INITIALIZE_PATH (path); - struct cpu_key max_cpu_key, obj_key; - struct reiserfs_key save_link_key; - int retval = 0; - struct item_head * ih; - struct buffer_head * bh; - int item_pos; - char * item; - int done; - struct inode * inode; - int truncate; + INITIALIZE_PATH(path); + struct cpu_key max_cpu_key, obj_key; + struct reiserfs_key save_link_key; + int retval = 0; + struct item_head *ih; + struct buffer_head *bh; + int item_pos; + 
char *item; + int done; + struct inode *inode; + int truncate; #ifdef CONFIG_QUOTA - int i; - int ms_active_set; + int i; + int ms_active_set; #endif - - - /* compose key to look for "save" links */ - max_cpu_key.version = KEY_FORMAT_3_5; - max_cpu_key.on_disk_key.k_dir_id = ~0U; - max_cpu_key.on_disk_key.k_objectid = ~0U; - set_cpu_key_k_offset (&max_cpu_key, ~0U); - max_cpu_key.key_length = 3; + + /* compose key to look for "save" links */ + max_cpu_key.version = KEY_FORMAT_3_5; + max_cpu_key.on_disk_key.k_dir_id = ~0U; + max_cpu_key.on_disk_key.k_objectid = ~0U; + set_cpu_key_k_offset(&max_cpu_key, ~0U); + max_cpu_key.key_length = 3; #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - if (s->s_flags & MS_ACTIVE) { - ms_active_set = 0; - } else { - ms_active_set = 1; - s->s_flags |= MS_ACTIVE; - } - /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { - if (REISERFS_SB(s)->s_qf_names[i]) { - int ret = reiserfs_quota_on_mount(s, i); - if (ret < 0) - reiserfs_warning(s, "reiserfs: cannot turn on journalled quota: error %d", ret); - } - } + /* Needed for iput() to work correctly and not trash data */ + if (s->s_flags & MS_ACTIVE) { + ms_active_set = 0; + } else { + ms_active_set = 1; + s->s_flags |= MS_ACTIVE; + } + /* Turn on quotas so that they are updated correctly */ + for (i = 0; i < MAXQUOTAS; i++) { + if (REISERFS_SB(s)->s_qf_names[i]) { + int ret = reiserfs_quota_on_mount(s, i); + if (ret < 0) + reiserfs_warning(s, + "reiserfs: cannot turn on journalled quota: error %d", + ret); + } + } #endif - - done = 0; - REISERFS_SB(s)->s_is_unlinked_ok = 1; - while (!retval) { - retval = search_item (s, &max_cpu_key, &path); - if (retval != ITEM_NOT_FOUND) { - reiserfs_warning (s, "vs-2140: finish_unfinished: search_by_key returned %d", - retval); - break; - } - - bh = get_last_bh (&path); - item_pos = get_item_pos (&path); - if (item_pos != B_NR_ITEMS (bh)) { - reiserfs_warning (s, "vs-2060: 
finish_unfinished: wrong position found"); - break; - } - item_pos --; - ih = B_N_PITEM_HEAD (bh, item_pos); - - if (le32_to_cpu (ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) - /* there are no "save" links anymore */ - break; - - save_link_key = ih->ih_key; - if (is_indirect_le_ih (ih)) - truncate = 1; - else - truncate = 0; - - /* reiserfs_iget needs k_dirid and k_objectid only */ - item = B_I_PITEM (bh, ih); - obj_key.on_disk_key.k_dir_id = le32_to_cpu (*(__le32 *)item); - obj_key.on_disk_key.k_objectid = le32_to_cpu (ih->ih_key.k_objectid); - obj_key.on_disk_key.k_offset = 0; - obj_key.on_disk_key.k_type = 0; - - pathrelse (&path); - - inode = reiserfs_iget (s, &obj_key); - if (!inode) { - /* the unlink almost completed, it just did not manage to remove - "save" link and release objectid */ - reiserfs_warning (s, "vs-2180: finish_unfinished: iget failed for %K", - &obj_key); - retval = remove_save_link_only (s, &save_link_key, 1); - continue; - } - - if (!truncate && inode->i_nlink) { - /* file is not unlinked */ - reiserfs_warning (s, "vs-2185: finish_unfinished: file %K is not unlinked", - &obj_key); - retval = remove_save_link_only (s, &save_link_key, 0); - continue; - } - DQUOT_INIT(inode); - - if (truncate && S_ISDIR (inode->i_mode) ) { - /* We got a truncate request for a dir which is impossible. - The only imaginable way is to execute unfinished truncate request - then boot into old kernel, remove the file and create dir with - the same key. */ - reiserfs_warning(s, "green-2101: impossible truncate on a directory %k. Please report", INODE_PKEY (inode)); - retval = remove_save_link_only (s, &save_link_key, 0); - truncate = 0; - iput (inode); - continue; - } - - if (truncate) { - REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask; - /* not completed truncate found. 
New size was committed together - with "save" link */ - reiserfs_info (s, "Truncating %k to %Ld ..", - INODE_PKEY (inode), inode->i_size); - reiserfs_truncate_file (inode, 0/*don't update modification time*/); - retval = remove_save_link (inode, truncate); - } else { - REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask; - /* not completed unlink (rmdir) found */ - reiserfs_info (s, "Removing %k..", INODE_PKEY (inode)); - /* removal gets completed in iput */ - retval = 0; - } - - iput (inode); - printk ("done\n"); - done ++; - } - REISERFS_SB(s)->s_is_unlinked_ok = 0; - + + done = 0; + REISERFS_SB(s)->s_is_unlinked_ok = 1; + while (!retval) { + retval = search_item(s, &max_cpu_key, &path); + if (retval != ITEM_NOT_FOUND) { + reiserfs_warning(s, + "vs-2140: finish_unfinished: search_by_key returned %d", + retval); + break; + } + + bh = get_last_bh(&path); + item_pos = get_item_pos(&path); + if (item_pos != B_NR_ITEMS(bh)) { + reiserfs_warning(s, + "vs-2060: finish_unfinished: wrong position found"); + break; + } + item_pos--; + ih = B_N_PITEM_HEAD(bh, item_pos); + + if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID) + /* there are no "save" links anymore */ + break; + + save_link_key = ih->ih_key; + if (is_indirect_le_ih(ih)) + truncate = 1; + else + truncate = 0; + + /* reiserfs_iget needs k_dirid and k_objectid only */ + item = B_I_PITEM(bh, ih); + obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item); + obj_key.on_disk_key.k_objectid = + le32_to_cpu(ih->ih_key.k_objectid); + obj_key.on_disk_key.k_offset = 0; + obj_key.on_disk_key.k_type = 0; + + pathrelse(&path); + + inode = reiserfs_iget(s, &obj_key); + if (!inode) { + /* the unlink almost completed, it just did not manage to remove + "save" link and release objectid */ + reiserfs_warning(s, + "vs-2180: finish_unfinished: iget failed for %K", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 1); + continue; + } + + if (!truncate && inode->i_nlink) { + /* file is not unlinked */ 
+ reiserfs_warning(s, + "vs-2185: finish_unfinished: file %K is not unlinked", + &obj_key); + retval = remove_save_link_only(s, &save_link_key, 0); + continue; + } + DQUOT_INIT(inode); + + if (truncate && S_ISDIR(inode->i_mode)) { + /* We got a truncate request for a dir which is impossible. + The only imaginable way is to execute unfinished truncate request + then boot into old kernel, remove the file and create dir with + the same key. */ + reiserfs_warning(s, + "green-2101: impossible truncate on a directory %k. Please report", + INODE_PKEY(inode)); + retval = remove_save_link_only(s, &save_link_key, 0); + truncate = 0; + iput(inode); + continue; + } + + if (truncate) { + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + /* not completed truncate found. New size was committed together + with "save" link */ + reiserfs_info(s, "Truncating %k to %Ld ..", + INODE_PKEY(inode), inode->i_size); + reiserfs_truncate_file(inode, + 0 + /*don't update modification time */ + ); + retval = remove_save_link(inode, truncate); + } else { + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + /* not completed unlink (rmdir) found */ + reiserfs_info(s, "Removing %k..", INODE_PKEY(inode)); + /* removal gets completed in iput */ + retval = 0; + } + + iput(inode); + printk("done\n"); + done++; + } + REISERFS_SB(s)->s_is_unlinked_ok = 0; + #ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < MAXQUOTAS; i++) { - if (sb_dqopt(s)->files[i]) - vfs_quota_off_mount(s, i); - } - if (ms_active_set) - /* Restore the flag back */ - s->s_flags &= ~MS_ACTIVE; + /* Turn quotas off */ + for (i = 0; i < MAXQUOTAS; i++) { + if (sb_dqopt(s)->files[i]) + vfs_quota_off_mount(s, i); + } + if (ms_active_set) + /* Restore the flag back */ + s->s_flags &= ~MS_ACTIVE; #endif - pathrelse (&path); - if (done) - reiserfs_info (s, "There were %d uncompleted unlinks/truncates. 
" - "Completed\n", done); - return retval; + pathrelse(&path); + if (done) + reiserfs_info(s, "There were %d uncompleted unlinks/truncates. " + "Completed\n", done); + return retval; } - + /* to protect file being unlinked from getting lost we "safe" link files being unlinked. This link will be deleted in the same transaction with last item of file. mounting the filesytem we scan all these links and remove files which almost got lost */ -void add_save_link (struct reiserfs_transaction_handle * th, - struct inode * inode, int truncate) +void add_save_link(struct reiserfs_transaction_handle *th, + struct inode *inode, int truncate) { - INITIALIZE_PATH (path); - int retval; - struct cpu_key key; - struct item_head ih; - __le32 link; - - BUG_ON (!th->t_trans_id); - - /* file can only get one "save link" of each kind */ - RFALSE( truncate && - ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ), - "saved link already exists for truncated inode %lx", - ( long ) inode -> i_ino ); - RFALSE( !truncate && - ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ), - "saved link already exists for unlinked inode %lx", - ( long ) inode -> i_ino ); - - /* setup key of "save" link */ - key.version = KEY_FORMAT_3_5; - key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; - key.on_disk_key.k_objectid = inode->i_ino; - if (!truncate) { - /* unlink, rmdir, rename */ - set_cpu_key_k_offset (&key, 1 + inode->i_sb->s_blocksize); - set_cpu_key_k_type (&key, TYPE_DIRECT); - - /* item head of "safe" link */ - make_le_item_head (&ih, &key, key.version, 1 + inode->i_sb->s_blocksize, TYPE_DIRECT, - 4/*length*/, 0xffff/*free space*/); - } else { - /* truncate */ - if (S_ISDIR (inode->i_mode)) - reiserfs_warning(inode->i_sb, "green-2102: Adding a truncate savelink for a directory %k! 
Please report", INODE_PKEY(inode)); - set_cpu_key_k_offset (&key, 1); - set_cpu_key_k_type (&key, TYPE_INDIRECT); - - /* item head of "safe" link */ - make_le_item_head (&ih, &key, key.version, 1, TYPE_INDIRECT, - 4/*length*/, 0/*free space*/); - } - key.key_length = 3; - - /* look for its place in the tree */ - retval = search_item (inode->i_sb, &key, &path); - if (retval != ITEM_NOT_FOUND) { - if ( retval != -ENOSPC ) - reiserfs_warning (inode->i_sb, "vs-2100: add_save_link:" - "search_by_key (%K) returned %d", &key, retval); - pathrelse (&path); - return; - } - - /* body of "save" link */ - link = INODE_PKEY (inode)->k_dir_id; - - /* put "save" link inot tree, don't charge quota to anyone */ - retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link); - if (retval) { - if (retval != -ENOSPC) - reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d", - retval); - } else { - if( truncate ) - REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask; - else - REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask; - } -} + INITIALIZE_PATH(path); + int retval; + struct cpu_key key; + struct item_head ih; + __le32 link; + + BUG_ON(!th->t_trans_id); + + /* file can only get one "save link" of each kind */ + RFALSE(truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask), + "saved link already exists for truncated inode %lx", + (long)inode->i_ino); + RFALSE(!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask), + "saved link already exists for unlinked inode %lx", + (long)inode->i_ino); + + /* setup key of "save" link */ + key.version = KEY_FORMAT_3_5; + key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID; + key.on_disk_key.k_objectid = inode->i_ino; + if (!truncate) { + /* unlink, rmdir, rename */ + set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize); + set_cpu_key_k_type(&key, TYPE_DIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, + 1 + 
inode->i_sb->s_blocksize, TYPE_DIRECT, + 4 /*length */ , 0xffff /*free space */ ); + } else { + /* truncate */ + if (S_ISDIR(inode->i_mode)) + reiserfs_warning(inode->i_sb, + "green-2102: Adding a truncate savelink for a directory %k! Please report", + INODE_PKEY(inode)); + set_cpu_key_k_offset(&key, 1); + set_cpu_key_k_type(&key, TYPE_INDIRECT); + + /* item head of "safe" link */ + make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT, + 4 /*length */ , 0 /*free space */ ); + } + key.key_length = 3; + + /* look for its place in the tree */ + retval = search_item(inode->i_sb, &key, &path); + if (retval != ITEM_NOT_FOUND) { + if (retval != -ENOSPC) + reiserfs_warning(inode->i_sb, "vs-2100: add_save_link:" + "search_by_key (%K) returned %d", &key, + retval); + pathrelse(&path); + return; + } + /* body of "save" link */ + link = INODE_PKEY(inode)->k_dir_id; + + /* put "save" link inot tree, don't charge quota to anyone */ + retval = + reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); + if (retval) { + if (retval != -ENOSPC) + reiserfs_warning(inode->i_sb, + "vs-2120: add_save_link: insert_item returned %d", + retval); + } else { + if (truncate) + REISERFS_I(inode)->i_flags |= + i_link_saved_truncate_mask; + else + REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask; + } +} /* this opens transaction unlike add_save_link */ -int remove_save_link (struct inode * inode, int truncate) +int remove_save_link(struct inode *inode, int truncate) { - struct reiserfs_transaction_handle th; - struct reiserfs_key key; - int err; - - /* we are going to do one balancing only */ - err = journal_begin (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); - if (err) - return err; - - /* setup key of "save" link */ - key.k_dir_id = cpu_to_le32 (MAX_KEY_OBJECTID); - key.k_objectid = INODE_PKEY (inode)->k_objectid; - if (!truncate) { - /* unlink, rmdir, rename */ - set_le_key_k_offset (KEY_FORMAT_3_5, &key, - 1 + inode->i_sb->s_blocksize); - set_le_key_k_type (KEY_FORMAT_3_5, 
&key, TYPE_DIRECT); - } else { - /* truncate */ - set_le_key_k_offset (KEY_FORMAT_3_5, &key, 1); - set_le_key_k_type (KEY_FORMAT_3_5, &key, TYPE_INDIRECT); - } - - if( ( truncate && - ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ) ) || - ( !truncate && - ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ) ) ) - /* don't take quota bytes from anywhere */ - reiserfs_delete_solid_item (&th, NULL, &key); - if (!truncate) { - reiserfs_release_objectid (&th, inode->i_ino); - REISERFS_I(inode) -> i_flags &= ~i_link_saved_unlink_mask; - } else - REISERFS_I(inode) -> i_flags &= ~i_link_saved_truncate_mask; - - return journal_end (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); -} + struct reiserfs_transaction_handle th; + struct reiserfs_key key; + int err; + + /* we are going to do one balancing only */ + err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); + if (err) + return err; + + /* setup key of "save" link */ + key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID); + key.k_objectid = INODE_PKEY(inode)->k_objectid; + if (!truncate) { + /* unlink, rmdir, rename */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, + 1 + inode->i_sb->s_blocksize); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT); + } else { + /* truncate */ + set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1); + set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT); + } + if ((truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) || + (!truncate && + (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask))) + /* don't take quota bytes from anywhere */ + reiserfs_delete_solid_item(&th, NULL, &key); + if (!truncate) { + reiserfs_release_objectid(&th, inode->i_ino); + REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask; + } else + REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask; + + return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT); +} -static void reiserfs_put_super (struct super_block * s) +static void reiserfs_put_super(struct super_block *s) 
{ - int i; - struct reiserfs_transaction_handle th ; - th.t_trans_id = 0; - - if (REISERFS_SB(s)->xattr_root) { - d_invalidate (REISERFS_SB(s)->xattr_root); - dput (REISERFS_SB(s)->xattr_root); - } - - if (REISERFS_SB(s)->priv_root) { - d_invalidate (REISERFS_SB(s)->priv_root); - dput (REISERFS_SB(s)->priv_root); - } - - /* change file system state to current state if it was mounted with read-write permissions */ - if (!(s->s_flags & MS_RDONLY)) { - if (!journal_begin(&th, s, 10)) { - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; - set_sb_umount_state( SB_DISK_SUPER_BLOCK(s), REISERFS_SB(s)->s_mount_state ); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - } - } - - /* note, journal_release checks for readonly mount, and can decide not - ** to do a journal_end - */ - journal_release(&th, s) ; - - for (i = 0; i < SB_BMAP_NR (s); i ++) - brelse (SB_AP_BITMAP (s)[i].bh); - - vfree (SB_AP_BITMAP (s)); - - brelse (SB_BUFFER_WITH_SB (s)); - - print_statistics (s); - - if (REISERFS_SB(s)->s_kmallocs != 0) { - reiserfs_warning (s, "vs-2004: reiserfs_put_super: allocated memory left %d", - REISERFS_SB(s)->s_kmallocs); - } - - if (REISERFS_SB(s)->reserved_blocks != 0) { - reiserfs_warning (s, "green-2005: reiserfs_put_super: reserved blocks left %d", - REISERFS_SB(s)->reserved_blocks); - } - - reiserfs_proc_info_done( s ); - - kfree(s->s_fs_info); - s->s_fs_info = NULL; - - return; + int i; + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; + + if (REISERFS_SB(s)->xattr_root) { + d_invalidate(REISERFS_SB(s)->xattr_root); + dput(REISERFS_SB(s)->xattr_root); + } + + if (REISERFS_SB(s)->priv_root) { + d_invalidate(REISERFS_SB(s)->priv_root); + dput(REISERFS_SB(s)->priv_root); + } + + /* change file system state to current state if it was mounted with read-write permissions */ + if (!(s->s_flags & MS_RDONLY)) { + if (!journal_begin(&th, s, 10)) { + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), + 1); + 
set_sb_umount_state(SB_DISK_SUPER_BLOCK(s), + REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + } + } + + /* note, journal_release checks for readonly mount, and can decide not + ** to do a journal_end + */ + journal_release(&th, s); + + for (i = 0; i < SB_BMAP_NR(s); i++) + brelse(SB_AP_BITMAP(s)[i].bh); + + vfree(SB_AP_BITMAP(s)); + + brelse(SB_BUFFER_WITH_SB(s)); + + print_statistics(s); + + if (REISERFS_SB(s)->s_kmallocs != 0) { + reiserfs_warning(s, + "vs-2004: reiserfs_put_super: allocated memory left %d", + REISERFS_SB(s)->s_kmallocs); + } + + if (REISERFS_SB(s)->reserved_blocks != 0) { + reiserfs_warning(s, + "green-2005: reiserfs_put_super: reserved blocks left %d", + REISERFS_SB(s)->reserved_blocks); + } + + reiserfs_proc_info_done(s); + + kfree(s->s_fs_info); + s->s_fs_info = NULL; + + return; } -static kmem_cache_t * reiserfs_inode_cachep; +static kmem_cache_t *reiserfs_inode_cachep; static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; - ei = (struct reiserfs_inode_info *)kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL); + ei = (struct reiserfs_inode_info *) + kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; @@ -488,25 +509,26 @@ static void reiserfs_destroy_inode(struct inode *inode) kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } -static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) { - struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *) foo; + struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { - INIT_LIST_HEAD(&ei->i_prealloc_list) ; + INIT_LIST_HEAD(&ei->i_prealloc_list); inode_init_once(&ei->vfs_inode); ei->i_acl_access = NULL; 
ei->i_acl_default = NULL; } } - + static int init_inodecache(void) { reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache", - sizeof(struct reiserfs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT, - init_once, NULL); + sizeof(struct + reiserfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT, + init_once, NULL); if (reiserfs_inode_cachep == NULL) return -ENOMEM; return 0; @@ -515,72 +537,76 @@ static int init_inodecache(void) static void destroy_inodecache(void) { if (kmem_cache_destroy(reiserfs_inode_cachep)) - reiserfs_warning (NULL, "reiserfs_inode_cache: not all structures were freed"); + reiserfs_warning(NULL, + "reiserfs_inode_cache: not all structures were freed"); } /* we don't mark inodes dirty, we just log them */ -static void reiserfs_dirty_inode (struct inode * inode) { - struct reiserfs_transaction_handle th ; - - int err = 0; - if (inode->i_sb->s_flags & MS_RDONLY) { - reiserfs_warning(inode->i_sb, "clm-6006: writing inode %lu on readonly FS", - inode->i_ino) ; - return ; - } - reiserfs_write_lock(inode->i_sb); - - /* this is really only used for atime updates, so they don't have - ** to be included in O_SYNC or fsync - */ - err = journal_begin(&th, inode->i_sb, 1) ; - if (err) { - reiserfs_write_unlock (inode->i_sb); - return; - } - reiserfs_update_sd (&th, inode); - journal_end(&th, inode->i_sb, 1) ; - reiserfs_write_unlock(inode->i_sb); +static void reiserfs_dirty_inode(struct inode *inode) +{ + struct reiserfs_transaction_handle th; + + int err = 0; + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning(inode->i_sb, + "clm-6006: writing inode %lu on readonly FS", + inode->i_ino); + return; + } + reiserfs_write_lock(inode->i_sb); + + /* this is really only used for atime updates, so they don't have + ** to be included in O_SYNC or fsync + */ + err = journal_begin(&th, inode->i_sb, 1); + if (err) { + reiserfs_write_unlock(inode->i_sb); + return; + } + reiserfs_update_sd(&th, inode); + journal_end(&th, inode->i_sb, 1); + 
reiserfs_write_unlock(inode->i_sb); } -static void reiserfs_clear_inode (struct inode *inode) +static void reiserfs_clear_inode(struct inode *inode) { - struct posix_acl *acl; + struct posix_acl *acl; - acl = REISERFS_I(inode)->i_acl_access; - if (acl && !IS_ERR (acl)) - posix_acl_release (acl); - REISERFS_I(inode)->i_acl_access = NULL; + acl = REISERFS_I(inode)->i_acl_access; + if (acl && !IS_ERR(acl)) + posix_acl_release(acl); + REISERFS_I(inode)->i_acl_access = NULL; - acl = REISERFS_I(inode)->i_acl_default; - if (acl && !IS_ERR (acl)) - posix_acl_release (acl); - REISERFS_I(inode)->i_acl_default = NULL; + acl = REISERFS_I(inode)->i_acl_default; + if (acl && !IS_ERR(acl)) + posix_acl_release(acl); + REISERFS_I(inode)->i_acl_default = NULL; } #ifdef CONFIG_QUOTA -static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); -static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, loff_t); +static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, + size_t, loff_t); +static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, + loff_t); #endif -static struct super_operations reiserfs_sops = -{ - .alloc_inode = reiserfs_alloc_inode, - .destroy_inode = reiserfs_destroy_inode, - .write_inode = reiserfs_write_inode, - .dirty_inode = reiserfs_dirty_inode, - .delete_inode = reiserfs_delete_inode, - .clear_inode = reiserfs_clear_inode, - .put_super = reiserfs_put_super, - .write_super = reiserfs_write_super, - .sync_fs = reiserfs_sync_fs, - .write_super_lockfs = reiserfs_write_super_lockfs, - .unlockfs = reiserfs_unlockfs, - .statfs = reiserfs_statfs, - .remount_fs = reiserfs_remount, +static struct super_operations reiserfs_sops = { + .alloc_inode = reiserfs_alloc_inode, + .destroy_inode = reiserfs_destroy_inode, + .write_inode = reiserfs_write_inode, + .dirty_inode = reiserfs_dirty_inode, + .delete_inode = reiserfs_delete_inode, + .clear_inode = reiserfs_clear_inode, + .put_super 
= reiserfs_put_super, + .write_super = reiserfs_write_super, + .sync_fs = reiserfs_sync_fs, + .write_super_lockfs = reiserfs_write_super_lockfs, + .unlockfs = reiserfs_unlockfs, + .statfs = reiserfs_statfs, + .remount_fs = reiserfs_remount, #ifdef CONFIG_QUOTA - .quota_read = reiserfs_quota_read, - .quota_write = reiserfs_quota_write, + .quota_read = reiserfs_quota_read, + .quota_write = reiserfs_quota_write, #endif }; @@ -596,50 +622,48 @@ static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); static int reiserfs_quota_on(struct super_block *, int, int, char *); -static struct dquot_operations reiserfs_quota_operations = -{ - .initialize = reiserfs_dquot_initialize, - .drop = reiserfs_dquot_drop, - .alloc_space = dquot_alloc_space, - .alloc_inode = dquot_alloc_inode, - .free_space = dquot_free_space, - .free_inode = dquot_free_inode, - .transfer = dquot_transfer, - .write_dquot = reiserfs_write_dquot, - .acquire_dquot = reiserfs_acquire_dquot, - .release_dquot = reiserfs_release_dquot, - .mark_dirty = reiserfs_mark_dquot_dirty, - .write_info = reiserfs_write_info, +static struct dquot_operations reiserfs_quota_operations = { + .initialize = reiserfs_dquot_initialize, + .drop = reiserfs_dquot_drop, + .alloc_space = dquot_alloc_space, + .alloc_inode = dquot_alloc_inode, + .free_space = dquot_free_space, + .free_inode = dquot_free_inode, + .transfer = dquot_transfer, + .write_dquot = reiserfs_write_dquot, + .acquire_dquot = reiserfs_acquire_dquot, + .release_dquot = reiserfs_release_dquot, + .mark_dirty = reiserfs_mark_dquot_dirty, + .write_info = reiserfs_write_info, }; -static struct quotactl_ops reiserfs_qctl_operations = -{ - .quota_on = reiserfs_quota_on, - .quota_off = vfs_quota_off, - .quota_sync = vfs_quota_sync, - .get_info = vfs_get_dqinfo, - .set_info = vfs_set_dqinfo, - .get_dqblk = vfs_get_dqblk, - .set_dqblk = vfs_set_dqblk, +static struct quotactl_ops reiserfs_qctl_operations = { + .quota_on = 
reiserfs_quota_on, + .quota_off = vfs_quota_off, + .quota_sync = vfs_quota_sync, + .get_info = vfs_get_dqinfo, + .set_info = vfs_set_dqinfo, + .get_dqblk = vfs_get_dqblk, + .set_dqblk = vfs_set_dqblk, }; #endif static struct export_operations reiserfs_export_ops = { - .encode_fh = reiserfs_encode_fh, - .decode_fh = reiserfs_decode_fh, - .get_parent = reiserfs_get_parent, - .get_dentry = reiserfs_get_dentry, -} ; + .encode_fh = reiserfs_encode_fh, + .decode_fh = reiserfs_decode_fh, + .get_parent = reiserfs_get_parent, + .get_dentry = reiserfs_get_dentry, +}; /* this struct is used in reiserfs_getopt () for containing the value for those mount options that have values rather than being toggles. */ typedef struct { - char * value; - int setmask; /* bitmask which is to set on mount_options bitmask when this - value is found, 0 is no bits are to be changed. */ - int clrmask; /* bitmask which is to clear on mount_options bitmask when this - value is found, 0 is no bits are to be changed. This is - applied BEFORE setmask */ + char *value; + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. */ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. This is + applied BEFORE setmask */ } arg_desc_t; /* Set this bit in arg_required to allow empty arguments */ @@ -648,67 +672,70 @@ typedef struct { /* this struct is used in reiserfs_getopt() for describing the set of reiserfs mount options */ typedef struct { - char * option_name; - int arg_required; /* 0 if argument is not required, not 0 otherwise */ - const arg_desc_t * values; /* list of values accepted by an option */ - int setmask; /* bitmask which is to set on mount_options bitmask when this - value is found, 0 is no bits are to be changed. */ - int clrmask; /* bitmask which is to clear on mount_options bitmask when this - value is found, 0 is no bits are to be changed. 
This is - applied BEFORE setmask */ + char *option_name; + int arg_required; /* 0 if argument is not required, not 0 otherwise */ + const arg_desc_t *values; /* list of values accepted by an option */ + int setmask; /* bitmask which is to set on mount_options bitmask when this + value is found, 0 is no bits are to be changed. */ + int clrmask; /* bitmask which is to clear on mount_options bitmask when this + value is found, 0 is no bits are to be changed. This is + applied BEFORE setmask */ } opt_desc_t; /* possible values for -o data= */ static const arg_desc_t logging_mode[] = { - {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)}, - {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)}, - {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)}, - {NULL, 0} + {"ordered", 1 << REISERFS_DATA_ORDERED, + (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)}, + {"journal", 1 << REISERFS_DATA_LOG, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)}, + {"writeback", 1 << REISERFS_DATA_WRITEBACK, + (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)}, + {NULL, 0} }; /* possible values for -o barrier= */ static const arg_desc_t barrier_mode[] = { - {"none", 1<<REISERFS_BARRIER_NONE, 1<<REISERFS_BARRIER_FLUSH}, - {"flush", 1<<REISERFS_BARRIER_FLUSH, 1<<REISERFS_BARRIER_NONE}, - {NULL, 0} + {"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH}, + {"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE}, + {NULL, 0} }; /* possible values for "-o block-allocator=" and bits which are to be set in s_mount_opt of reiserfs specific part of in-core super block */ static const arg_desc_t balloc[] = { - {"noborder", 1<<REISERFS_NO_BORDER, 0}, - {"border", 0, 1<<REISERFS_NO_BORDER}, - {"no_unhashed_relocation", 1<<REISERFS_NO_UNHASHED_RELOCATION, 0}, - {"hashed_relocation", 1<<REISERFS_HASHED_RELOCATION, 0}, - {"test4", 1<<REISERFS_TEST4, 
0}, - {"notest4", 0, 1<<REISERFS_TEST4}, - {NULL, 0, 0} + {"noborder", 1 << REISERFS_NO_BORDER, 0}, + {"border", 0, 1 << REISERFS_NO_BORDER}, + {"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0}, + {"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0}, + {"test4", 1 << REISERFS_TEST4, 0}, + {"notest4", 0, 1 << REISERFS_TEST4}, + {NULL, 0, 0} }; static const arg_desc_t tails[] = { - {"on", 1<<REISERFS_LARGETAIL, 1<<REISERFS_SMALLTAIL}, - {"off", 0, (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)}, - {"small", 1<<REISERFS_SMALLTAIL, 1<<REISERFS_LARGETAIL}, - {NULL, 0, 0} + {"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL}, + {"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL}, + {NULL, 0, 0} }; static const arg_desc_t error_actions[] = { - {"panic", 1 << REISERFS_ERROR_PANIC, - (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, - {"ro-remount", 1 << REISERFS_ERROR_RO, - (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, + {"panic", 1 << REISERFS_ERROR_PANIC, + (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)}, + {"ro-remount", 1 << REISERFS_ERROR_RO, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)}, #ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG - {"continue", 1 << REISERFS_ERROR_CONTINUE, - (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, + {"continue", 1 << REISERFS_ERROR_CONTINUE, + (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)}, #endif - {NULL, 0, 0}, + {NULL, 0, 0}, }; -int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 128k. - There might be broken applications that are - confused by this. Use nolargeio mount option - to get usual i/o size = PAGE_SIZE. - */ +int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 128k. + There might be broken applications that are + confused by this. Use nolargeio mount option + to get usual i/o size = PAGE_SIZE. 
+ */ /* proceed only one option from a list *cur - string containing of mount options opts - array of options which are accepted @@ -716,476 +743,530 @@ int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 12 in the input - pointer to the argument is stored here bit_flags - if option requires to set a certain bit - it is set here return -1 if unknown option is found, opt->arg_required otherwise */ -static int reiserfs_getopt ( struct super_block * s, char ** cur, opt_desc_t * opts, char ** opt_arg, - unsigned long * bit_flags) +static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts, + char **opt_arg, unsigned long *bit_flags) { - char * p; - /* foo=bar, - ^ ^ ^ - | | +-- option_end - | +-- arg_start - +-- option_start - */ - const opt_desc_t * opt; - const arg_desc_t * arg; - - - p = *cur; - - /* assume argument cannot contain commas */ - *cur = strchr (p, ','); - if (*cur) { - *(*cur) = '\0'; - (*cur) ++; - } - - if ( !strncmp (p, "alloc=", 6) ) { - /* Ugly special case, probably we should redo options parser so that - it can understand several arguments for some options, also so that - it can fill several bitfields with option values. 
*/ - if ( reiserfs_parse_alloc_options( s, p + 6) ) { - return -1; - } else { - return 0; - } - } - - - /* for every option in the list */ - for (opt = opts; opt->option_name; opt ++) { - if (!strncmp (p, opt->option_name, strlen (opt->option_name))) { - if (bit_flags) { - if (opt->clrmask == (1 << REISERFS_UNSUPPORTED_OPT)) - reiserfs_warning (s, "%s not supported.", p); - else - *bit_flags &= ~opt->clrmask; - if (opt->setmask == (1 << REISERFS_UNSUPPORTED_OPT)) - reiserfs_warning (s, "%s not supported.", p); - else - *bit_flags |= opt->setmask; - } - break; - } - } - if (!opt->option_name) { - reiserfs_warning (s, "unknown mount option \"%s\"", p); - return -1; - } - - p += strlen (opt->option_name); - switch (*p) { - case '=': - if (!opt->arg_required) { - reiserfs_warning (s, "the option \"%s\" does not require an argument", - opt->option_name); - return -1; - } - break; - - case 0: - if (opt->arg_required) { - reiserfs_warning (s, "the option \"%s\" requires an argument", opt->option_name); - return -1; - } - break; - default: - reiserfs_warning (s, "head of option \"%s\" is only correct", opt->option_name); - return -1; - } - - /* move to the argument, or to next option if argument is not required */ - p ++; - - if ( opt->arg_required && !(opt->arg_required & (1<<REISERFS_OPT_ALLOWEMPTY)) && !strlen (p) ) { - /* this catches "option=," if not allowed */ - reiserfs_warning (s, "empty argument for \"%s\"", opt->option_name); + char *p; + /* foo=bar, + ^ ^ ^ + | | +-- option_end + | +-- arg_start + +-- option_start + */ + const opt_desc_t *opt; + const arg_desc_t *arg; + + p = *cur; + + /* assume argument cannot contain commas */ + *cur = strchr(p, ','); + if (*cur) { + *(*cur) = '\0'; + (*cur)++; + } + + if (!strncmp(p, "alloc=", 6)) { + /* Ugly special case, probably we should redo options parser so that + it can understand several arguments for some options, also so that + it can fill several bitfields with option values. 
*/ + if (reiserfs_parse_alloc_options(s, p + 6)) { + return -1; + } else { + return 0; + } + } + + /* for every option in the list */ + for (opt = opts; opt->option_name; opt++) { + if (!strncmp(p, opt->option_name, strlen(opt->option_name))) { + if (bit_flags) { + if (opt->clrmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "%s not supported.", + p); + else + *bit_flags &= ~opt->clrmask; + if (opt->setmask == + (1 << REISERFS_UNSUPPORTED_OPT)) + reiserfs_warning(s, "%s not supported.", + p); + else + *bit_flags |= opt->setmask; + } + break; + } + } + if (!opt->option_name) { + reiserfs_warning(s, "unknown mount option \"%s\"", p); + return -1; + } + + p += strlen(opt->option_name); + switch (*p) { + case '=': + if (!opt->arg_required) { + reiserfs_warning(s, + "the option \"%s\" does not require an argument", + opt->option_name); + return -1; + } + break; + + case 0: + if (opt->arg_required) { + reiserfs_warning(s, + "the option \"%s\" requires an argument", + opt->option_name); + return -1; + } + break; + default: + reiserfs_warning(s, "head of option \"%s\" is only correct", + opt->option_name); + return -1; + } + + /* move to the argument, or to next option if argument is not required */ + p++; + + if (opt->arg_required + && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY)) + && !strlen(p)) { + /* this catches "option=," if not allowed */ + reiserfs_warning(s, "empty argument for \"%s\"", + opt->option_name); + return -1; + } + + if (!opt->values) { + /* *=NULLopt_arg contains pointer to argument */ + *opt_arg = p; + return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY); + } + + /* values possible for this option are listed in opt->values */ + for (arg = opt->values; arg->value; arg++) { + if (!strcmp(p, arg->value)) { + if (bit_flags) { + *bit_flags &= ~arg->clrmask; + *bit_flags |= arg->setmask; + } + return opt->arg_required; + } + } + + reiserfs_warning(s, "bad value \"%s\" for option \"%s\"", p, + opt->option_name); return -1; - } 
- - if (!opt->values) { - /* *=NULLopt_arg contains pointer to argument */ - *opt_arg = p; - return opt->arg_required & ~(1<<REISERFS_OPT_ALLOWEMPTY); - } - - /* values possible for this option are listed in opt->values */ - for (arg = opt->values; arg->value; arg ++) { - if (!strcmp (p, arg->value)) { - if (bit_flags) { - *bit_flags &= ~arg->clrmask; - *bit_flags |= arg->setmask; - } - return opt->arg_required; - } - } - - reiserfs_warning (s, "bad value \"%s\" for option \"%s\"", p, opt->option_name); - return -1; } /* returns 0 if something is wrong in option string, 1 - otherwise */ -static int reiserfs_parse_options (struct super_block * s, char * options, /* string given via mount's -o */ - unsigned long * mount_options, - /* after the parsing phase, contains the - collection of bitflags defining what - mount options were selected. */ - unsigned long * blocks, /* strtol-ed from NNN of resize=NNN */ - char ** jdev_name, - unsigned int * commit_max_age) +static int reiserfs_parse_options(struct super_block *s, char *options, /* string given via mount's -o */ + unsigned long *mount_options, + /* after the parsing phase, contains the + collection of bitflags defining what + mount options were selected. 
*/ + unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */ + char **jdev_name, + unsigned int *commit_max_age) { - int c; - char * arg = NULL; - char * pos; - opt_desc_t opts[] = { - /* Compatibility stuff, so that -o notail for old setups still work */ - {"tails", .arg_required = 't', .values = tails}, - {"notail", .clrmask = (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)}, - {"conv", .setmask = 1<<REISERFS_CONVERT}, - {"attrs", .setmask = 1<<REISERFS_ATTRS}, - {"noattrs", .clrmask = 1<<REISERFS_ATTRS}, + int c; + char *arg = NULL; + char *pos; + opt_desc_t opts[] = { + /* Compatibility stuff, so that -o notail for old setups still work */ + {"tails",.arg_required = 't',.values = tails}, + {"notail",.clrmask = + (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)}, + {"conv",.setmask = 1 << REISERFS_CONVERT}, + {"attrs",.setmask = 1 << REISERFS_ATTRS}, + {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, #ifdef CONFIG_REISERFS_FS_XATTR - {"user_xattr", .setmask = 1<<REISERFS_XATTRS_USER}, - {"nouser_xattr",.clrmask = 1<<REISERFS_XATTRS_USER}, + {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, + {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, #else - {"user_xattr", .setmask = 1<<REISERFS_UNSUPPORTED_OPT}, - {"nouser_xattr",.clrmask = 1<<REISERFS_UNSUPPORTED_OPT}, + {"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, #endif #ifdef CONFIG_REISERFS_FS_POSIX_ACL - {"acl", .setmask = 1<<REISERFS_POSIXACL}, - {"noacl", .clrmask = 1<<REISERFS_POSIXACL}, + {"acl",.setmask = 1 << REISERFS_POSIXACL}, + {"noacl",.clrmask = 1 << REISERFS_POSIXACL}, #else - {"acl", .setmask = 1<<REISERFS_UNSUPPORTED_OPT}, - {"noacl", .clrmask = 1<<REISERFS_UNSUPPORTED_OPT}, + {"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT}, + {"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT}, #endif - {"nolog",}, /* This is unsupported */ - {"replayonly", .setmask = 1<<REPLAYONLY}, - {"block-allocator", .arg_required = 'a', 
.values = balloc}, - {"data", .arg_required = 'd', .values = logging_mode}, - {"barrier", .arg_required = 'b', .values = barrier_mode}, - {"resize", .arg_required = 'r', .values = NULL}, - {"jdev", .arg_required = 'j', .values = NULL}, - {"nolargeio", .arg_required = 'w', .values = NULL}, - {"commit", .arg_required = 'c', .values = NULL}, - {"usrquota",}, - {"grpquota",}, - {"errors", .arg_required = 'e', .values = error_actions}, - {"usrjquota", .arg_required = 'u'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL}, - {"grpjquota", .arg_required = 'g'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL}, - {"jqfmt", .arg_required = 'f', .values = NULL}, - {NULL,} - }; - - *blocks = 0; - if (!options || !*options) - /* use default configuration: create tails, journaling on, no - conversion to newest format */ - return 1; - - for (pos = options; pos; ) { - c = reiserfs_getopt (s, &pos, opts, &arg, mount_options); - if (c == -1) - /* wrong option is given */ - return 0; - - if (c == 'r') { - char * p; - - p = NULL; - /* "resize=NNN" or "resize=auto" */ - - if (!strcmp(arg, "auto")) { - /* From JFS code, to auto-get the size.*/ - *blocks = s->s_bdev->bd_inode->i_size >> s->s_blocksize_bits; - } else { - *blocks = simple_strtoul (arg, &p, 0); - if (*p != '\0') { - /* NNN does not look like a number */ - reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg); + {"nolog",}, /* This is unsupported */ + {"replayonly",.setmask = 1 << REPLAYONLY}, + {"block-allocator",.arg_required = 'a',.values = balloc}, + {"data",.arg_required = 'd',.values = logging_mode}, + {"barrier",.arg_required = 'b',.values = barrier_mode}, + {"resize",.arg_required = 'r',.values = NULL}, + {"jdev",.arg_required = 'j',.values = NULL}, + {"nolargeio",.arg_required = 'w',.values = NULL}, + {"commit",.arg_required = 'c',.values = NULL}, + {"usrquota",.setmask = 1 << REISERFS_QUOTA}, + {"grpquota",.setmask = 1 << REISERFS_QUOTA}, + {"noquota",.clrmask = 1 << REISERFS_QUOTA}, + {"errors",.arg_required = 
'e',.values = error_actions}, + {"usrjquota",.arg_required = + 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"grpjquota",.arg_required = + 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL}, + {"jqfmt",.arg_required = 'f',.values = NULL}, + {NULL,} + }; + + *blocks = 0; + if (!options || !*options) + /* use default configuration: create tails, journaling on, no + conversion to newest format */ + return 1; + + for (pos = options; pos;) { + c = reiserfs_getopt(s, &pos, opts, &arg, mount_options); + if (c == -1) + /* wrong option is given */ return 0; - } - } - } - if ( c == 'c' ) { - char *p = NULL; - unsigned long val = simple_strtoul (arg, &p, 0); - /* commit=NNN (time in seconds) */ - if ( *p != '\0' || val >= (unsigned int)-1) { - reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg); - return 0; + if (c == 'r') { + char *p; + + p = NULL; + /* "resize=NNN" or "resize=auto" */ + + if (!strcmp(arg, "auto")) { + /* From JFS code, to auto-get the size. */ + *blocks = + s->s_bdev->bd_inode->i_size >> s-> + s_blocksize_bits; + } else { + *blocks = simple_strtoul(arg, &p, 0); + if (*p != '\0') { + /* NNN does not look like a number */ + reiserfs_warning(s, + "reiserfs_parse_options: bad value %s", + arg); + return 0; + } + } } - *commit_max_age = (unsigned int)val; - } - if ( c == 'w' ) { - char *p=NULL; - int val = simple_strtoul (arg, &p, 0); - - if ( *p != '\0') { - reiserfs_warning (s, "reiserfs_parse_options: non-numeric value %s for nolargeio option", arg); - return 0; + if (c == 'c') { + char *p = NULL; + unsigned long val = simple_strtoul(arg, &p, 0); + /* commit=NNN (time in seconds) */ + if (*p != '\0' || val >= (unsigned int)-1) { + reiserfs_warning(s, + "reiserfs_parse_options: bad value %s", + arg); + return 0; + } + *commit_max_age = (unsigned int)val; } - if ( val ) - reiserfs_default_io_size = PAGE_SIZE; - else - reiserfs_default_io_size = 128 * 1024; - } - if (c == 'j') { - if (arg && *arg && jdev_name) { - if ( *jdev_name ) { 
//Hm, already assigned? - reiserfs_warning (s, "reiserfs_parse_options: journal device was already specified to be %s", *jdev_name); - return 0; + if (c == 'w') { + char *p = NULL; + int val = simple_strtoul(arg, &p, 0); + + if (*p != '\0') { + reiserfs_warning(s, + "reiserfs_parse_options: non-numeric value %s for nolargeio option", + arg); + return 0; + } + if (val) + reiserfs_default_io_size = PAGE_SIZE; + else + reiserfs_default_io_size = 128 * 1024; } - *jdev_name = arg; - } - } -#ifdef CONFIG_QUOTA - if (c == 'u' || c == 'g') { - int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; - - if (sb_any_quota_enabled(s)) { - reiserfs_warning(s, "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); - return 0; - } - if (*arg) { /* Some filename specified? */ - if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], arg)) { - reiserfs_warning(s, "reiserfs_parse_options: %s quota file already specified.", QTYPE2NAME(qtype)); - return 0; + if (c == 'j') { + if (arg && *arg && jdev_name) { + if (*jdev_name) { //Hm, already assigned? + reiserfs_warning(s, + "reiserfs_parse_options: journal device was already specified to be %s", + *jdev_name); + return 0; + } + *jdev_name = arg; + } } - if (strchr(arg, '/')) { - reiserfs_warning(s, "reiserfs_parse_options: quotafile must be on filesystem root."); - return 0; +#ifdef CONFIG_QUOTA + if (c == 'u' || c == 'g') { + int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; + + if (sb_any_quota_enabled(s)) { + reiserfs_warning(s, + "reiserfs_parse_options: cannot change journalled quota options when quota turned on."); + return 0; + } + if (*arg) { /* Some filename specified? 
*/ + if (REISERFS_SB(s)->s_qf_names[qtype] + && strcmp(REISERFS_SB(s)->s_qf_names[qtype], + arg)) { + reiserfs_warning(s, + "reiserfs_parse_options: %s quota file already specified.", + QTYPE2NAME(qtype)); + return 0; + } + if (strchr(arg, '/')) { + reiserfs_warning(s, + "reiserfs_parse_options: quotafile must be on filesystem root."); + return 0; + } + REISERFS_SB(s)->s_qf_names[qtype] = + kmalloc(strlen(arg) + 1, GFP_KERNEL); + if (!REISERFS_SB(s)->s_qf_names[qtype]) { + reiserfs_warning(s, + "reiserfs_parse_options: not enough memory for storing quotafile name."); + return 0; + } + strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); + *mount_options |= 1 << REISERFS_QUOTA; + } else { + if (REISERFS_SB(s)->s_qf_names[qtype]) { + kfree(REISERFS_SB(s)-> + s_qf_names[qtype]); + REISERFS_SB(s)->s_qf_names[qtype] = + NULL; + } + } } - REISERFS_SB(s)->s_qf_names[qtype] = kmalloc(strlen(arg)+1, GFP_KERNEL); - if (!REISERFS_SB(s)->s_qf_names[qtype]) { - reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name."); - return 0; + if (c == 'f') { + if (!strcmp(arg, "vfsold")) + REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; + else if (!strcmp(arg, "vfsv0")) + REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; + else { + reiserfs_warning(s, + "reiserfs_parse_options: unknown quota format specified."); + return 0; + } } - strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg); - } - else { - if (REISERFS_SB(s)->s_qf_names[qtype]) { - kfree(REISERFS_SB(s)->s_qf_names[qtype]); - REISERFS_SB(s)->s_qf_names[qtype] = NULL; +#else + if (c == 'u' || c == 'g' || c == 'f') { + reiserfs_warning(s, + "reiserfs_parse_options: journalled quota options not supported."); + return 0; } - } - } - if (c == 'f') { - if (!strcmp(arg, "vfsold")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD; - else if (!strcmp(arg, "vfsv0")) - REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0; - else { - reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified."); +#endif + } + +#ifdef 
CONFIG_QUOTA + if (!REISERFS_SB(s)->s_jquota_fmt + && (REISERFS_SB(s)->s_qf_names[USRQUOTA] + || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { + reiserfs_warning(s, + "reiserfs_parse_options: journalled quota format not specified."); return 0; - } } -#else - if (c == 'u' || c == 'g' || c == 'f') { - reiserfs_warning(s, "reiserfs_parse_options: journalled quota options not supported."); - return 0; + /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ + if (!(*mount_options & (1 << REISERFS_QUOTA)) + && sb_any_quota_enabled(s)) { + reiserfs_warning(s, + "reiserfs_parse_options: quota options must be present when quota is turned on."); + return 0; } #endif - } - -#ifdef CONFIG_QUOTA - if (!REISERFS_SB(s)->s_jquota_fmt && (REISERFS_SB(s)->s_qf_names[USRQUOTA] || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) { - reiserfs_warning(s, "reiserfs_parse_options: journalled quota format not specified."); - return 0; - } -#endif - return 1; + + return 1; } -static void switch_data_mode(struct super_block *s, unsigned long mode) { - REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | - (1 << REISERFS_DATA_ORDERED) | - (1 << REISERFS_DATA_WRITEBACK)); - REISERFS_SB(s)->s_mount_opt |= (1 << mode); +static void switch_data_mode(struct super_block *s, unsigned long mode) +{ + REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); } static void handle_data_mode(struct super_block *s, unsigned long mount_options) { - if (mount_options & (1 << REISERFS_DATA_LOG)) { - if (!reiserfs_data_log(s)) { - switch_data_mode(s, REISERFS_DATA_LOG); - reiserfs_info (s, "switching to journaled data mode\n"); - } - } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { - if (!reiserfs_data_ordered(s)) { - switch_data_mode(s, REISERFS_DATA_ORDERED); - reiserfs_info (s, "switching to ordered data mode\n"); - } - } else if (mount_options & (1 << 
REISERFS_DATA_WRITEBACK)) { - if (!reiserfs_data_writeback(s)) { - switch_data_mode(s, REISERFS_DATA_WRITEBACK); - reiserfs_info (s, "switching to writeback data mode\n"); - } - } + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + reiserfs_info(s, "switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + reiserfs_info(s, "switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + reiserfs_info(s, "switching to writeback data mode\n"); + } + } } -static void handle_barrier_mode(struct super_block *s, unsigned long bits) { - int flush = (1 << REISERFS_BARRIER_FLUSH); - int none = (1 << REISERFS_BARRIER_NONE); - int all_barrier = flush | none; - - if (bits & all_barrier) { - REISERFS_SB(s)->s_mount_opt &= ~all_barrier; - if (bits & flush) { - REISERFS_SB(s)->s_mount_opt |= flush; - printk("reiserfs: enabling write barrier flush mode\n"); - } else if (bits & none) { - REISERFS_SB(s)->s_mount_opt |= none; - printk("reiserfs: write barriers turned off\n"); - } - } +static void handle_barrier_mode(struct super_block *s, unsigned long bits) +{ + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 << REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if (bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } } -static void handle_attrs( struct super_block *s ) +static void handle_attrs(struct super_block *s) { - struct reiserfs_super_block * rs; + struct 
reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); - if( reiserfs_attrs( s ) ) { - rs = SB_DISK_SUPER_BLOCK (s); - if( old_format_only(s) ) { - reiserfs_warning(s, "reiserfs: cannot support attributes on 3.5.x disk format" ); - REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS ); + if (reiserfs_attrs(s)) { + if (old_format_only(s)) { + reiserfs_warning(s, + "reiserfs: cannot support attributes on 3.5.x disk format"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); return; } - if( !( le32_to_cpu( rs -> s_flags ) & reiserfs_attrs_cleared ) ) { - reiserfs_warning(s, "reiserfs: cannot support attributes until flag is set in super-block" ); - REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS ); + if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) { + reiserfs_warning(s, + "reiserfs: cannot support attributes until flag is set in super-block"); + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); } + } else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) { + REISERFS_SB(s)->s_mount_opt |= REISERFS_ATTRS; } } -static int reiserfs_remount (struct super_block * s, int * mount_flags, char * arg) +static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) { - struct reiserfs_super_block * rs; - struct reiserfs_transaction_handle th ; - unsigned long blocks; - unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; - unsigned long safe_mask = 0; - unsigned int commit_max_age = (unsigned int)-1; - struct reiserfs_journal *journal = SB_JOURNAL(s); - int err; + struct reiserfs_super_block *rs; + struct reiserfs_transaction_handle th; + unsigned long blocks; + unsigned long mount_options = REISERFS_SB(s)->s_mount_opt; + unsigned long safe_mask = 0; + unsigned int commit_max_age = (unsigned int)-1; + struct reiserfs_journal *journal = SB_JOURNAL(s); + int err; #ifdef CONFIG_QUOTA - int i; + int i; #endif - rs = SB_DISK_SUPER_BLOCK (s); + rs = SB_DISK_SUPER_BLOCK(s); - if (!reiserfs_parse_options(s, arg, 
&mount_options, &blocks, NULL, &commit_max_age)) { + if (!reiserfs_parse_options + (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) { #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) - if (REISERFS_SB(s)->s_qf_names[i]) { - kfree(REISERFS_SB(s)->s_qf_names[i]); - REISERFS_SB(s)->s_qf_names[i] = NULL; - } + for (i = 0; i < MAXQUOTAS; i++) + if (REISERFS_SB(s)->s_qf_names[i]) { + kfree(REISERFS_SB(s)->s_qf_names[i]); + REISERFS_SB(s)->s_qf_names[i] = NULL; + } #endif - return -EINVAL; - } - - handle_attrs(s); - - /* Add options that are safe here */ - safe_mask |= 1 << REISERFS_SMALLTAIL; - safe_mask |= 1 << REISERFS_LARGETAIL; - safe_mask |= 1 << REISERFS_NO_BORDER; - safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; - safe_mask |= 1 << REISERFS_HASHED_RELOCATION; - safe_mask |= 1 << REISERFS_TEST4; - safe_mask |= 1 << REISERFS_ATTRS; - safe_mask |= 1 << REISERFS_XATTRS_USER; - safe_mask |= 1 << REISERFS_POSIXACL; - safe_mask |= 1 << REISERFS_BARRIER_FLUSH; - safe_mask |= 1 << REISERFS_BARRIER_NONE; - safe_mask |= 1 << REISERFS_ERROR_RO; - safe_mask |= 1 << REISERFS_ERROR_CONTINUE; - safe_mask |= 1 << REISERFS_ERROR_PANIC; - - /* Update the bitmask, taking care to keep - * the bits we're not allowed to change here */ - REISERFS_SB(s)->s_mount_opt = (REISERFS_SB(s)->s_mount_opt & ~safe_mask) | (mount_options & safe_mask); - - if(commit_max_age != 0 && commit_max_age != (unsigned int)-1) { - journal->j_max_commit_age = commit_max_age; - journal->j_max_trans_age = commit_max_age; - } - else if(commit_max_age == 0) - { - /* 0 means restore defaults. 
*/ - journal->j_max_commit_age = journal->j_default_max_commit_age; - journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; - } - - if(blocks) { - int rc = reiserfs_resize(s, blocks); - if (rc != 0) - return rc; - } - - if (*mount_flags & MS_RDONLY) { - reiserfs_xattr_init (s, *mount_flags); - /* remount read-only */ - if (s->s_flags & MS_RDONLY) - /* it is read-only already */ - return 0; - /* try to remount file system with read-only permissions */ - if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { - return 0; - } - - err = journal_begin(&th, s, 10) ; - if (err) - return err; - - /* Mounting a rw partition read-only. */ - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; - set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state ); - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - } else { - /* remount read-write */ - if (!(s->s_flags & MS_RDONLY)) { - reiserfs_xattr_init (s, *mount_flags); - return 0; /* We are read-write already */ - } - - if (reiserfs_is_journal_aborted (journal)) - return journal->j_errno; - - handle_data_mode(s, mount_options); - handle_barrier_mode(s, mount_options); - REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ; - s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ - err = journal_begin(&th, s, 10) ; - if (err) - return err; - - /* Mount a partition which is read-only, read-write */ - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; - REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~MS_RDONLY; - set_sb_umount_state( rs, REISERFS_ERROR_FS ); - /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ; - } - /* this will force a full flush of all journal lists */ - SB_JOURNAL(s)->j_must_wait = 1 ; - err = journal_end(&th, s, 10) ; - if (err) - return err; - s->s_dirt = 0; - - if (!( *mount_flags & MS_RDONLY ) ) { - 
finish_unfinished( s ); - reiserfs_xattr_init (s, *mount_flags); - } - - return 0; + return -EINVAL; + } + + handle_attrs(s); + + /* Add options that are safe here */ + safe_mask |= 1 << REISERFS_SMALLTAIL; + safe_mask |= 1 << REISERFS_LARGETAIL; + safe_mask |= 1 << REISERFS_NO_BORDER; + safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION; + safe_mask |= 1 << REISERFS_HASHED_RELOCATION; + safe_mask |= 1 << REISERFS_TEST4; + safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; + safe_mask |= 1 << REISERFS_ERROR_RO; + safe_mask |= 1 << REISERFS_ERROR_CONTINUE; + safe_mask |= 1 << REISERFS_ERROR_PANIC; + safe_mask |= 1 << REISERFS_QUOTA; + + /* Update the bitmask, taking care to keep + * the bits we're not allowed to change here */ + REISERFS_SB(s)->s_mount_opt = + (REISERFS_SB(s)-> + s_mount_opt & ~safe_mask) | (mount_options & safe_mask); + + if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) { + journal->j_max_commit_age = commit_max_age; + journal->j_max_trans_age = commit_max_age; + } else if (commit_max_age == 0) { + /* 0 means restore defaults. */ + journal->j_max_commit_age = journal->j_default_max_commit_age; + journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE; + } + + if (blocks) { + int rc = reiserfs_resize(s, blocks); + if (rc != 0) + return rc; + } + + if (*mount_flags & MS_RDONLY) { + reiserfs_xattr_init(s, *mount_flags); + /* remount read-only */ + if (s->s_flags & MS_RDONLY) + /* it is read-only already */ + return 0; + /* try to remount file system with read-only permissions */ + if (sb_umount_state(rs) == REISERFS_VALID_FS + || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { + return 0; + } + + err = journal_begin(&th, s, 10); + if (err) + return err; + + /* Mounting a rw partition read-only. 
*/ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + } else { + /* remount read-write */ + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_xattr_init(s, *mount_flags); + return 0; /* We are read-write already */ + } + + if (reiserfs_is_journal_aborted(journal)) + return journal->j_errno; + + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ + err = journal_begin(&th, s, 10); + if (err) + return err; + + /* Mount a partition which is read-only, read-write */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); + s->s_flags &= ~MS_RDONLY; + set_sb_umount_state(rs, REISERFS_ERROR_FS); + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS; + } + /* this will force a full flush of all journal lists */ + SB_JOURNAL(s)->j_must_wait = 1; + err = journal_end(&th, s, 10); + if (err) + return err; + s->s_dirt = 0; + + if (!(*mount_flags & MS_RDONLY)) { + finish_unfinished(s); + reiserfs_xattr_init(s, *mount_flags); + } + + return 0; } /* load_bitmap_info_data - Sets up the reiserfs_bitmap_info structure from disk. @@ -1204,791 +1285,879 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a * free blocks at all. */ -static void load_bitmap_info_data (struct super_block *sb, - struct reiserfs_bitmap_info *bi) +static void load_bitmap_info_data(struct super_block *sb, + struct reiserfs_bitmap_info *bi) { - unsigned long *cur = (unsigned long *)bi->bh->b_data; - - while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) { - - /* No need to scan if all 0's or all 1's. 
- * Since we're only counting 0's, we can simply ignore all 1's */ - if (*cur == 0) { - if (bi->first_zero_hint == 0) { - bi->first_zero_hint = ((char *)cur - bi->bh->b_data) << 3; - } - bi->free_count += sizeof(unsigned long)*8; - } else if (*cur != ~0L) { - int b; - for (b = 0; b < sizeof(unsigned long)*8; b++) { - if (!reiserfs_test_le_bit (b, cur)) { - bi->free_count ++; - if (bi->first_zero_hint == 0) - bi->first_zero_hint = - (((char *)cur - bi->bh->b_data) << 3) + b; - } + unsigned long *cur = (unsigned long *)bi->bh->b_data; + + while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) { + + /* No need to scan if all 0's or all 1's. + * Since we're only counting 0's, we can simply ignore all 1's */ + if (*cur == 0) { + if (bi->first_zero_hint == 0) { + bi->first_zero_hint = + ((char *)cur - bi->bh->b_data) << 3; + } + bi->free_count += sizeof(unsigned long) * 8; + } else if (*cur != ~0L) { + int b; + for (b = 0; b < sizeof(unsigned long) * 8; b++) { + if (!reiserfs_test_le_bit(b, cur)) { + bi->free_count++; + if (bi->first_zero_hint == 0) + bi->first_zero_hint = + (((char *)cur - + bi->bh->b_data) << 3) + b; + } + } } - } - cur ++; - } + cur++; + } #ifdef CONFIG_REISERFS_CHECK // This outputs a lot of unneded info on big FSes // reiserfs_warning ("bitmap loaded from block %d: %d free blocks", -// bi->bh->b_blocknr, bi->free_count); +// bi->bh->b_blocknr, bi->free_count); #endif } - -static int read_bitmaps (struct super_block * s) + +static int read_bitmaps(struct super_block *s) { - int i, bmap_nr; + int i, bmap_nr; + + SB_AP_BITMAP(s) = + vmalloc(sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); + if (SB_AP_BITMAP(s) == 0) + return 1; + memset(SB_AP_BITMAP(s), 0, + sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); + for (i = 0, bmap_nr = + REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1; + i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) { + SB_AP_BITMAP(s)[i].bh = sb_getblk(s, bmap_nr); + if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) + 
ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh); + } + for (i = 0; i < SB_BMAP_NR(s); i++) { + wait_on_buffer(SB_AP_BITMAP(s)[i].bh); + if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { + reiserfs_warning(s, "sh-2029: reiserfs read_bitmaps: " + "bitmap block (#%lu) reading failed", + SB_AP_BITMAP(s)[i].bh->b_blocknr); + for (i = 0; i < SB_BMAP_NR(s); i++) + brelse(SB_AP_BITMAP(s)[i].bh); + vfree(SB_AP_BITMAP(s)); + SB_AP_BITMAP(s) = NULL; + return 1; + } + load_bitmap_info_data(s, SB_AP_BITMAP(s) + i); + } + return 0; +} - SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); - if (SB_AP_BITMAP (s) == 0) - return 1; - memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s)); - for (i = 0, bmap_nr = REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1; - i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) { - SB_AP_BITMAP (s)[i].bh = sb_getblk(s, bmap_nr); - if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) - ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh); - } - for (i = 0; i < SB_BMAP_NR(s); i++) { - wait_on_buffer(SB_AP_BITMAP (s)[i].bh); - if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { - reiserfs_warning(s,"sh-2029: reiserfs read_bitmaps: " - "bitmap block (#%lu) reading failed", - SB_AP_BITMAP(s)[i].bh->b_blocknr); - for (i = 0; i < SB_BMAP_NR(s); i++) - brelse(SB_AP_BITMAP(s)[i].bh); - vfree(SB_AP_BITMAP(s)); - SB_AP_BITMAP(s) = NULL; - return 1; +static int read_old_bitmaps(struct super_block *s) +{ + int i; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1; /* first of bitmap blocks */ + + /* read true bitmap */ + SB_AP_BITMAP(s) = + vmalloc(sizeof(struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); + if (SB_AP_BITMAP(s) == 0) + return 1; + + memset(SB_AP_BITMAP(s), 0, + sizeof(struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); + + for (i = 0; i < sb_bmap_nr(rs); i++) { + SB_AP_BITMAP(s)[i].bh = sb_bread(s, bmp1 + i); + if 
(!SB_AP_BITMAP(s)[i].bh) + return 1; + load_bitmap_info_data(s, SB_AP_BITMAP(s) + i); } - load_bitmap_info_data (s, SB_AP_BITMAP (s) + i); - } - return 0; + + return 0; } -static int read_old_bitmaps (struct super_block * s) +static int read_super_block(struct super_block *s, int offset) { - int i ; - struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK(s); - int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1; /* first of bitmap blocks */ + struct buffer_head *bh; + struct reiserfs_super_block *rs; + int fs_blocksize; - /* read true bitmap */ - SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); - if (SB_AP_BITMAP (s) == 0) - return 1; + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2006: read_super_block: " + "bread failed (dev %s, block %lu, size %lu)", + reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_blocksize); + return 1; + } - memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs)); + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_any_reiserfs_magic_string(rs)) { + brelse(bh); + return 1; + } + // + // ok, reiserfs signature (old or new) found in at the given offset + // + fs_blocksize = sb_blocksize(rs); + brelse(bh); + sb_set_blocksize(s, fs_blocksize); - for (i = 0; i < sb_bmap_nr(rs); i ++) { - SB_AP_BITMAP (s)[i].bh = sb_bread (s, bmp1 + i); - if (!SB_AP_BITMAP (s)[i].bh) - return 1; - load_bitmap_info_data (s, SB_AP_BITMAP (s) + i); - } + bh = sb_bread(s, offset / s->s_blocksize); + if (!bh) { + reiserfs_warning(s, "sh-2007: read_super_block: " + "bread failed (dev %s, block %lu, size %lu)\n", + reiserfs_bdevname(s), offset / s->s_blocksize, + s->s_blocksize); + return 1; + } - return 0; -} + rs = (struct reiserfs_super_block *)bh->b_data; + if (sb_blocksize(rs) != s->s_blocksize) { + reiserfs_warning(s, "sh-2011: read_super_block: " + "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n", + 
reiserfs_bdevname(s), + (unsigned long long)bh->b_blocknr, + s->s_blocksize); + brelse(bh); + return 1; + } -static int read_super_block (struct super_block * s, int offset) -{ - struct buffer_head * bh; - struct reiserfs_super_block * rs; - int fs_blocksize; - - - bh = sb_bread (s, offset / s->s_blocksize); - if (!bh) { - reiserfs_warning (s, "sh-2006: read_super_block: " - "bread failed (dev %s, block %lu, size %lu)", - reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize); - return 1; - } - - rs = (struct reiserfs_super_block *)bh->b_data; - if (!is_any_reiserfs_magic_string (rs)) { - brelse (bh); - return 1; - } - - // - // ok, reiserfs signature (old or new) found in at the given offset - // - fs_blocksize = sb_blocksize(rs); - brelse (bh); - sb_set_blocksize (s, fs_blocksize); - - bh = sb_bread (s, offset / s->s_blocksize); - if (!bh) { - reiserfs_warning (s, "sh-2007: read_super_block: " - "bread failed (dev %s, block %lu, size %lu)\n", - reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize); - return 1; - } - - rs = (struct reiserfs_super_block *)bh->b_data; - if (sb_blocksize(rs) != s->s_blocksize) { - reiserfs_warning (s, "sh-2011: read_super_block: " - "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n", - reiserfs_bdevname (s), (unsigned long long)bh->b_blocknr, s->s_blocksize); - brelse (bh); - return 1; - } - - if ( rs->s_v1.s_root_block == cpu_to_le32(-1) ) { - brelse(bh) ; - reiserfs_warning (s, "Unfinished reiserfsck --rebuild-tree run detected. Please run\n" - "reiserfsck --rebuild-tree and wait for a completion. 
If that fails\n" - "get newer reiserfsprogs package"); - return 1; - } - - SB_BUFFER_WITH_SB (s) = bh; - SB_DISK_SUPER_BLOCK (s) = rs; - - if (is_reiserfs_jr (rs)) { - /* magic is of non-standard journal filesystem, look at s_version to - find which format is in use */ - if (sb_version(rs) == REISERFS_VERSION_2) - reiserfs_warning (s, "read_super_block: found reiserfs format \"3.6\"" - " with non-standard journal"); - else if (sb_version(rs) == REISERFS_VERSION_1) - reiserfs_warning (s, "read_super_block: found reiserfs format \"3.5\"" - " with non-standard journal"); - else { - reiserfs_warning (s, "sh-2012: read_super_block: found unknown " - "format \"%u\" of reiserfs with non-standard magic", - sb_version(rs)); - return 1; + if (rs->s_v1.s_root_block == cpu_to_le32(-1)) { + brelse(bh); + reiserfs_warning(s, + "Unfinished reiserfsck --rebuild-tree run detected. Please run\n" + "reiserfsck --rebuild-tree and wait for a completion. If that fails\n" + "get newer reiserfsprogs package"); + return 1; } - } - else - /* s_version of standard format may contain incorrect information, - so we just look at the magic string */ - reiserfs_info (s, "found reiserfs format \"%s\" with standard journal\n", - is_reiserfs_3_5 (rs) ? 
"3.5" : "3.6"); - s->s_op = &reiserfs_sops; - s->s_export_op = &reiserfs_export_ops; + SB_BUFFER_WITH_SB(s) = bh; + SB_DISK_SUPER_BLOCK(s) = rs; + + if (is_reiserfs_jr(rs)) { + /* magic is of non-standard journal filesystem, look at s_version to + find which format is in use */ + if (sb_version(rs) == REISERFS_VERSION_2) + reiserfs_warning(s, + "read_super_block: found reiserfs format \"3.6\"" + " with non-standard journal"); + else if (sb_version(rs) == REISERFS_VERSION_1) + reiserfs_warning(s, + "read_super_block: found reiserfs format \"3.5\"" + " with non-standard journal"); + else { + reiserfs_warning(s, + "sh-2012: read_super_block: found unknown " + "format \"%u\" of reiserfs with non-standard magic", + sb_version(rs)); + return 1; + } + } else + /* s_version of standard format may contain incorrect information, + so we just look at the magic string */ + reiserfs_info(s, + "found reiserfs format \"%s\" with standard journal\n", + is_reiserfs_3_5(rs) ? "3.5" : "3.6"); + + s->s_op = &reiserfs_sops; + s->s_export_op = &reiserfs_export_ops; #ifdef CONFIG_QUOTA - s->s_qcop = &reiserfs_qctl_operations; - s->dq_op = &reiserfs_quota_operations; + s->s_qcop = &reiserfs_qctl_operations; + s->dq_op = &reiserfs_quota_operations; #endif - /* new format is limited by the 32 bit wide i_blocks field, want to - ** be one full block below that. - */ - s->s_maxbytes = (512LL << 32) - s->s_blocksize ; - return 0; + /* new format is limited by the 32 bit wide i_blocks field, want to + ** be one full block below that. 
+ */ + s->s_maxbytes = (512LL << 32) - s->s_blocksize; + return 0; } - - /* after journal replay, reread all bitmap and super blocks */ -static int reread_meta_blocks(struct super_block *s) { - int i ; - ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))) ; - wait_on_buffer(SB_BUFFER_WITH_SB(s)) ; - if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { - reiserfs_warning (s, "reread_meta_blocks, error reading the super") ; - return 1 ; - } - - for (i = 0; i < SB_BMAP_NR(s) ; i++) { - ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh)) ; - wait_on_buffer(SB_AP_BITMAP(s)[i].bh) ; - if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { - reiserfs_warning (s, "reread_meta_blocks, error reading bitmap block number %d at %llu", - i, (unsigned long long)SB_AP_BITMAP(s)[i].bh->b_blocknr) ; - return 1 ; - } - } - return 0 ; +static int reread_meta_blocks(struct super_block *s) +{ + int i; + ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); + wait_on_buffer(SB_BUFFER_WITH_SB(s)); + if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + reiserfs_warning(s, + "reread_meta_blocks, error reading the super"); + return 1; + } -} + for (i = 0; i < SB_BMAP_NR(s); i++) { + ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh)); + wait_on_buffer(SB_AP_BITMAP(s)[i].bh); + if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) { + reiserfs_warning(s, + "reread_meta_blocks, error reading bitmap block number %d at %llu", + i, + (unsigned long long)SB_AP_BITMAP(s)[i]. + bh->b_blocknr); + return 1; + } + } + return 0; +} ///////////////////////////////////////////////////// // hash detection stuff - // if root directory is empty - we set default - Yura's - hash and // warn about it // FIXME: we look for only one name in a directory. 
If tea and yura // bith have the same value - we ask user to send report to the // mailing list -static __u32 find_hash_out (struct super_block * s) +static __u32 find_hash_out(struct super_block *s) { - int retval; - struct inode * inode; - struct cpu_key key; - INITIALIZE_PATH (path); - struct reiserfs_dir_entry de; - __u32 hash = DEFAULT_HASH; - - inode = s->s_root->d_inode; - - do { // Some serious "goto"-hater was there ;) - u32 teahash, r5hash, yurahash; - - make_cpu_key (&key, inode, ~0, TYPE_DIRENTRY, 3); - retval = search_by_entry_key (s, &key, &path, &de); - if (retval == IO_ERROR) { - pathrelse (&path); - return UNSET_HASH ; - } - if (retval == NAME_NOT_FOUND) - de.de_entry_num --; - set_de_name_and_namelen (&de); - if (deh_offset( &(de.de_deh[de.de_entry_num]) ) == DOT_DOT_OFFSET) { - /* allow override in this case */ - if (reiserfs_rupasov_hash(s)) { - hash = YURA_HASH ; - } - reiserfs_warning(s,"FS seems to be empty, autodetect " - "is using the default hash"); - break; - } - r5hash=GET_HASH_VALUE (r5_hash (de.de_name, de.de_namelen)); - teahash=GET_HASH_VALUE (keyed_hash (de.de_name, de.de_namelen)); - yurahash=GET_HASH_VALUE (yura_hash (de.de_name, de.de_namelen)); - if ( ( (teahash == r5hash) && (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) ) || - ( (teahash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) || - ( (r5hash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) ) { - reiserfs_warning(s,"Unable to automatically detect hash function. 
" - "Please mount with -o hash={tea,rupasov,r5}", - reiserfs_bdevname (s)); - hash = UNSET_HASH; - break; - } - if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == yurahash) - hash = YURA_HASH; - else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == teahash) - hash = TEA_HASH; - else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == r5hash) - hash = R5_HASH; - else { - reiserfs_warning (s,"Unrecognised hash function"); - hash = UNSET_HASH; - } - } while (0); - - pathrelse (&path); - return hash; + int retval; + struct inode *inode; + struct cpu_key key; + INITIALIZE_PATH(path); + struct reiserfs_dir_entry de; + __u32 hash = DEFAULT_HASH; + + inode = s->s_root->d_inode; + + do { // Some serious "goto"-hater was there ;) + u32 teahash, r5hash, yurahash; + + make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key(s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse(&path); + return UNSET_HASH; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num--; + set_de_name_and_namelen(&de); + if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) { + hash = YURA_HASH; + } + reiserfs_warning(s, "FS seems to be empty, autodetect " + "is using the default hash"); + break; + } + r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen)); + teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen)); + yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen)); + if (((teahash == r5hash) + && + (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) + == r5hash)) || ((teahash == yurahash) + && (yurahash == + GET_HASH_VALUE(deh_offset + (& + (de. + de_deh[de. + de_entry_num]))))) + || ((r5hash == yurahash) + && (yurahash == + GET_HASH_VALUE(deh_offset + (&(de.de_deh[de.de_entry_num])))))) { + reiserfs_warning(s, + "Unable to automatically detect hash function. 
" + "Please mount with -o hash={tea,rupasov,r5}", + reiserfs_bdevname(s)); + hash = UNSET_HASH; + break; + } + if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) == + yurahash) + hash = YURA_HASH; + else if (GET_HASH_VALUE + (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash) + hash = TEA_HASH; + else if (GET_HASH_VALUE + (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) + hash = R5_HASH; + else { + reiserfs_warning(s, "Unrecognised hash function"); + hash = UNSET_HASH; + } + } while (0); + + pathrelse(&path); + return hash; } // finds out which hash names are sorted with -static int what_hash (struct super_block * s) +static int what_hash(struct super_block *s) { - __u32 code; - - code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); - - /* reiserfs_hash_detect() == true if any of the hash mount options - ** were used. We must check them to make sure the user isn't - ** using a bad hash value - */ - if (code == UNSET_HASH || reiserfs_hash_detect(s)) - code = find_hash_out (s); - - if (code != UNSET_HASH && reiserfs_hash_detect(s)) { - /* detection has found the hash, and we must check against the - ** mount options - */ - if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { - reiserfs_warning (s, "Error, %s hash detected, " - "unable to force rupasov hash", reiserfs_hashname(code)) ; - code = UNSET_HASH ; - } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { - reiserfs_warning (s, "Error, %s hash detected, " - "unable to force tea hash", reiserfs_hashname(code)) ; - code = UNSET_HASH ; - } else if (reiserfs_r5_hash(s) && code != R5_HASH) { - reiserfs_warning (s, "Error, %s hash detected, " - "unable to force r5 hash", reiserfs_hashname(code)) ; - code = UNSET_HASH ; - } - } else { - /* find_hash_out was not called or could not determine the hash */ - if (reiserfs_rupasov_hash(s)) { - code = YURA_HASH ; - } else if (reiserfs_tea_hash(s)) { - code = TEA_HASH ; - } else if (reiserfs_r5_hash(s)) { - code = R5_HASH ; - } - } - - /* if we are 
mounted RW, and we have a new valid hash code, update - ** the super - */ - if (code != UNSET_HASH && - !(s->s_flags & MS_RDONLY) && - code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { - set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); - } - return code; + __u32 code; + + code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s)); + + /* reiserfs_hash_detect() == true if any of the hash mount options + ** were used. We must check them to make sure the user isn't + ** using a bad hash value + */ + if (code == UNSET_HASH || reiserfs_hash_detect(s)) + code = find_hash_out(s); + + if (code != UNSET_HASH && reiserfs_hash_detect(s)) { + /* detection has found the hash, and we must check against the + ** mount options + */ + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { + reiserfs_warning(s, "Error, %s hash detected, " + "unable to force rupasov hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { + reiserfs_warning(s, "Error, %s hash detected, " + "unable to force tea hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } else if (reiserfs_r5_hash(s) && code != R5_HASH) { + reiserfs_warning(s, "Error, %s hash detected, " + "unable to force r5 hash", + reiserfs_hashname(code)); + code = UNSET_HASH; + } + } else { + /* find_hash_out was not called or could not determine the hash */ + if (reiserfs_rupasov_hash(s)) { + code = YURA_HASH; + } else if (reiserfs_tea_hash(s)) { + code = TEA_HASH; + } else if (reiserfs_r5_hash(s)) { + code = R5_HASH; + } + } + + /* if we are mounted RW, and we have a new valid hash code, update + ** the super + */ + if (code != UNSET_HASH && + !(s->s_flags & MS_RDONLY) && + code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) { + set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code); + } + return code; } // return pointer to appropriate function -static hashf_t hash_function (struct super_block * s) +static hashf_t hash_function(struct super_block *s) { - switch 
(what_hash (s)) { - case TEA_HASH: - reiserfs_info (s, "Using tea hash to sort names\n"); - return keyed_hash; - case YURA_HASH: - reiserfs_info (s, "Using rupasov hash to sort names\n"); - return yura_hash; - case R5_HASH: - reiserfs_info (s, "Using r5 hash to sort names\n"); - return r5_hash; - } - return NULL; + switch (what_hash(s)) { + case TEA_HASH: + reiserfs_info(s, "Using tea hash to sort names\n"); + return keyed_hash; + case YURA_HASH: + reiserfs_info(s, "Using rupasov hash to sort names\n"); + return yura_hash; + case R5_HASH: + reiserfs_info(s, "Using r5 hash to sort names\n"); + return r5_hash; + } + return NULL; } // this is used to set up correct value for old partitions -static int function2code (hashf_t func) +static int function2code(hashf_t func) { - if (func == keyed_hash) - return TEA_HASH; - if (func == yura_hash) - return YURA_HASH; - if (func == r5_hash) - return R5_HASH; + if (func == keyed_hash) + return TEA_HASH; + if (func == yura_hash) + return YURA_HASH; + if (func == r5_hash) + return R5_HASH; - BUG() ; // should never happen + BUG(); // should never happen - return 0; + return 0; } #define SWARN(silent, s, ...) 
\ if (!(silent)) \ reiserfs_warning (s, __VA_ARGS__) -static int reiserfs_fill_super (struct super_block * s, void * data, int silent) +static int reiserfs_fill_super(struct super_block *s, void *data, int silent) { - struct inode *root_inode; - int j; - struct reiserfs_transaction_handle th ; - int old_format = 0; - unsigned long blocks; - unsigned int commit_max_age = 0; - int jinit_done = 0 ; - struct reiserfs_iget_args args ; - struct reiserfs_super_block * rs; - char *jdev_name; - struct reiserfs_sb_info *sbi; - int errval = -EINVAL; - - sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); - if (!sbi) { - errval = -ENOMEM; - goto error; - } - s->s_fs_info = sbi; - memset (sbi, 0, sizeof (struct reiserfs_sb_info)); - /* Set default values for options: non-aggressive tails, RO on errors */ - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); - /* no preallocation minimum, be smart in - reiserfs_file_write instead */ - REISERFS_SB(s)->s_alloc_options.preallocmin = 0; - /* Preallocate by 16 blocks (17-1) at once */ - REISERFS_SB(s)->s_alloc_options.preallocsize = 17; - /* Initialize the rwsem for xattr dir */ - init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); - - /* setup default block allocator options */ - reiserfs_init_alloc_options(s); - - jdev_name = NULL; - if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) { - goto error; - } - - if (blocks) { - SWARN (silent, s, "jmacd-7: reiserfs_fill_super: resize option " - "for remount only"); - goto error; - } - - /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ - if (!read_super_block (s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) - old_format = 1; - /* try new format (64-th 1k block), which can contain reiserfs super block */ - else if (read_super_block (s, REISERFS_DISK_OFFSET_IN_BYTES)) { - SWARN(silent, s, "sh-2021: reiserfs_fill_super: can not find reiserfs 
on %s", reiserfs_bdevname (s)); - goto error; - } - - rs = SB_DISK_SUPER_BLOCK (s); - /* Let's do basic sanity check to verify that underlying device is not - smaller than the filesystem. If the check fails then abort and scream, - because bad stuff will happen otherwise. */ - if ( s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs)*sb_blocksize(rs)) { - SWARN (silent, s, "Filesystem on %s cannot be mounted because it is bigger than the device", reiserfs_bdevname(s)); - SWARN(silent, s, "You may need to run fsck or increase size of your LVM partition"); - SWARN(silent, s, "Or may be you forgot to reboot after fdisk when it told you to"); - goto error; - } - - sbi->s_mount_state = SB_REISERFS_STATE(s); - sbi->s_mount_state = REISERFS_VALID_FS ; - - if (old_format ? read_old_bitmaps(s) : read_bitmaps(s)) { - SWARN(silent, s, "jmacd-8: reiserfs_fill_super: unable to read bitmap"); - goto error; - } + struct inode *root_inode; + int j; + struct reiserfs_transaction_handle th; + int old_format = 0; + unsigned long blocks; + unsigned int commit_max_age = 0; + int jinit_done = 0; + struct reiserfs_iget_args args; + struct reiserfs_super_block *rs; + char *jdev_name; + struct reiserfs_sb_info *sbi; + int errval = -EINVAL; + + sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); + if (!sbi) { + errval = -ENOMEM; + goto error; + } + s->s_fs_info = sbi; + memset(sbi, 0, sizeof(struct reiserfs_sb_info)); + /* Set default values for options: non-aggressive tails, RO on errors */ + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL); + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO); + /* no preallocation minimum, be smart in + reiserfs_file_write instead */ + REISERFS_SB(s)->s_alloc_options.preallocmin = 0; + /* Preallocate by 16 blocks (17-1) at once */ + REISERFS_SB(s)->s_alloc_options.preallocsize = 17; + /* Initialize the rwsem for xattr dir */ + init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); + + /* setup default block 
allocator options */ + reiserfs_init_alloc_options(s); + + jdev_name = NULL; + if (reiserfs_parse_options + (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, + &commit_max_age) == 0) { + goto error; + } + + if (blocks) { + SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option " + "for remount only"); + goto error; + } + + /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ + if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES)) + old_format = 1; + /* try new format (64-th 1k block), which can contain reiserfs super block */ + else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { + SWARN(silent, s, + "sh-2021: reiserfs_fill_super: can not find reiserfs on %s", + reiserfs_bdevname(s)); + goto error; + } + + rs = SB_DISK_SUPER_BLOCK(s); + /* Let's do basic sanity check to verify that underlying device is not + smaller than the filesystem. If the check fails then abort and scream, + because bad stuff will happen otherwise. */ + if (s->s_bdev && s->s_bdev->bd_inode + && i_size_read(s->s_bdev->bd_inode) < + sb_block_count(rs) * sb_blocksize(rs)) { + SWARN(silent, s, + "Filesystem on %s cannot be mounted because it is bigger than the device", + reiserfs_bdevname(s)); + SWARN(silent, s, + "You may need to run fsck or increase size of your LVM partition"); + SWARN(silent, s, + "Or may be you forgot to reboot after fdisk when it told you to"); + goto error; + } + + sbi->s_mount_state = SB_REISERFS_STATE(s); + sbi->s_mount_state = REISERFS_VALID_FS; + + if (old_format ? 
read_old_bitmaps(s) : read_bitmaps(s)) { + SWARN(silent, s, + "jmacd-8: reiserfs_fill_super: unable to read bitmap"); + goto error; + } #ifdef CONFIG_REISERFS_CHECK - SWARN (silent, s, "CONFIG_REISERFS_CHECK is set ON"); - SWARN (silent, s, "- it is slow mode for debugging."); + SWARN(silent, s, "CONFIG_REISERFS_CHECK is set ON"); + SWARN(silent, s, "- it is slow mode for debugging."); #endif - /* make data=ordered the default */ - if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && - !reiserfs_data_writeback(s)) - { - REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); - } - - if (reiserfs_data_log(s)) { - reiserfs_info (s, "using journaled data mode\n"); - } else if (reiserfs_data_ordered(s)) { - reiserfs_info (s, "using ordered data mode\n"); - } else { - reiserfs_info (s, "using writeback data mode\n"); - } - if (reiserfs_barrier_flush(s)) { - printk("reiserfs: using flush barriers\n"); - } - - // set_device_ro(s->s_dev, 1) ; - if( journal_init(s, jdev_name, old_format, commit_max_age) ) { - SWARN(silent, s, "sh-2022: reiserfs_fill_super: unable to initialize journal space") ; - goto error ; - } else { - jinit_done = 1 ; /* once this is set, journal_release must be called - ** if we error out of the mount - */ - } - if (reread_meta_blocks(s)) { - SWARN(silent, s, "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init") ; - goto error ; - } - - if (replay_only (s)) - goto error; - - if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { - SWARN(silent, s, "clm-7000: Detected readonly device, marking FS readonly") ; - s->s_flags |= MS_RDONLY ; - } - args.objectid = REISERFS_ROOT_OBJECTID ; - args.dirid = REISERFS_ROOT_PARENT_OBJECTID ; - root_inode = iget5_locked (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args)); - if (!root_inode) { - SWARN(silent, s, "jmacd-10: reiserfs_fill_super: get root inode failed"); - goto error; - } - - if (root_inode->i_state & I_NEW) { - 
reiserfs_read_locked_inode(root_inode, &args); - unlock_new_inode(root_inode); - } - - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) { - iput(root_inode); - goto error; - } - - // define and initialize hash function - sbi->s_hash_function = hash_function (s); - if (sbi->s_hash_function == NULL) { - dput(s->s_root) ; - s->s_root = NULL ; - goto error ; - } - - if (is_reiserfs_3_5 (rs) || (is_reiserfs_jr (rs) && SB_VERSION (s) == REISERFS_VERSION_1)) - set_bit(REISERFS_3_5, &(sbi->s_properties)); - else - set_bit(REISERFS_3_6, &(sbi->s_properties)); - - if (!(s->s_flags & MS_RDONLY)) { - - errval = journal_begin(&th, s, 1) ; - if (errval) { - dput (s->s_root); - s->s_root = NULL; - goto error; - } - reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; - - set_sb_umount_state( rs, REISERFS_ERROR_FS ); - set_sb_fs_state (rs, 0); - - if (old_format_only(s)) { - /* filesystem of format 3.5 either with standard or non-standard - journal */ - if (convert_reiserfs (s)) { - /* and -o conv is given */ - if(!silent) - reiserfs_info (s,"converting 3.5 filesystem to the 3.6 format") ; - - if (is_reiserfs_3_5 (rs)) - /* put magic string of 3.6 format. 
2.2 will not be able to - mount this filesystem anymore */ - memcpy (rs->s_v1.s_magic, reiserfs_3_6_magic_string, - sizeof (reiserfs_3_6_magic_string)); - - set_sb_version(rs,REISERFS_VERSION_2); - reiserfs_convert_objectid_map_v1(s) ; - set_bit(REISERFS_3_6, &(sbi->s_properties)); - clear_bit(REISERFS_3_5, &(sbi->s_properties)); - } else if (!silent){ - reiserfs_info (s, "using 3.5.x disk format\n") ; - } - } - - journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - errval = journal_end(&th, s, 1) ; - if (errval) { - dput (s->s_root); - s->s_root = NULL; - goto error; - } - - if ((errval = reiserfs_xattr_init (s, s->s_flags))) { - dput (s->s_root); - s->s_root = NULL; - goto error; - } - - /* look for files which were to be removed in previous session */ - finish_unfinished (s); - } else { - if ( old_format_only(s) && !silent) { - reiserfs_info (s, "using 3.5.x disk format\n") ; - } - - if ((errval = reiserfs_xattr_init (s, s->s_flags))) { - dput (s->s_root); - s->s_root = NULL; - goto error; - } - } - // mark hash in super block: it could be unset. 
overwrite should be ok - set_sb_hash_function_code( rs, function2code(sbi->s_hash_function ) ); - - handle_attrs( s ); - - reiserfs_proc_info_init( s ); - - init_waitqueue_head (&(sbi->s_wait)); - spin_lock_init(&sbi->bitmap_lock); - - return (0); - - error: - if (jinit_done) { /* kill the commit thread, free journal ram */ - journal_release_error(NULL, s) ; - } - if (SB_DISK_SUPER_BLOCK (s)) { - for (j = 0; j < SB_BMAP_NR (s); j ++) { - if (SB_AP_BITMAP (s)) - brelse (SB_AP_BITMAP (s)[j].bh); - } - if (SB_AP_BITMAP (s)) - vfree (SB_AP_BITMAP (s)); - } - if (SB_BUFFER_WITH_SB (s)) - brelse(SB_BUFFER_WITH_SB (s)); + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) { + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + reiserfs_info(s, "using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + reiserfs_info(s, "using ordered data mode\n"); + } else { + reiserfs_info(s, "using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + // set_device_ro(s->s_dev, 1) ; + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, + "sh-2022: reiserfs_fill_super: unable to initialize journal space"); + goto error; + } else { + jinit_done = 1; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + if (reread_meta_blocks(s)) { + SWARN(silent, s, + "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init"); + goto error; + } + + if (replay_only(s)) + goto error; + + if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { + SWARN(silent, s, + "clm-7000: Detected readonly device, marking FS readonly"); + s->s_flags |= MS_RDONLY; + } + args.objectid = REISERFS_ROOT_OBJECTID; + args.dirid = REISERFS_ROOT_PARENT_OBJECTID; + root_inode = + iget5_locked(s, REISERFS_ROOT_OBJECTID, 
reiserfs_find_actor, + reiserfs_init_locked_inode, (void *)(&args)); + if (!root_inode) { + SWARN(silent, s, + "jmacd-10: reiserfs_fill_super: get root inode failed"); + goto error; + } + + if (root_inode->i_state & I_NEW) { + reiserfs_read_locked_inode(root_inode, &args); + unlock_new_inode(root_inode); + } + + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) { + iput(root_inode); + goto error; + } + // define and initialize hash function + sbi->s_hash_function = hash_function(s); + if (sbi->s_hash_function == NULL) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if (is_reiserfs_3_5(rs) + || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1)) + set_bit(REISERFS_3_5, &(sbi->s_properties)); + else + set_bit(REISERFS_3_6, &(sbi->s_properties)); + + if (!(s->s_flags & MS_RDONLY)) { + + errval = journal_begin(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); + + set_sb_umount_state(rs, REISERFS_ERROR_FS); + set_sb_fs_state(rs, 0); + + if (old_format_only(s)) { + /* filesystem of format 3.5 either with standard or non-standard + journal */ + if (convert_reiserfs(s)) { + /* and -o conv is given */ + if (!silent) + reiserfs_info(s, + "converting 3.5 filesystem to the 3.6 format"); + + if (is_reiserfs_3_5(rs)) + /* put magic string of 3.6 format. 
2.2 will not be able to + mount this filesystem anymore */ + memcpy(rs->s_v1.s_magic, + reiserfs_3_6_magic_string, + sizeof + (reiserfs_3_6_magic_string)); + + set_sb_version(rs, REISERFS_VERSION_2); + reiserfs_convert_objectid_map_v1(s); + set_bit(REISERFS_3_6, &(sbi->s_properties)); + clear_bit(REISERFS_3_5, &(sbi->s_properties)); + } else if (!silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + } + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + errval = journal_end(&th, s, 1); + if (errval) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + if ((errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + + /* look for files which were to be removed in previous session */ + finish_unfinished(s); + } else { + if (old_format_only(s) && !silent) { + reiserfs_info(s, "using 3.5.x disk format\n"); + } + + if ((errval = reiserfs_xattr_init(s, s->s_flags))) { + dput(s->s_root); + s->s_root = NULL; + goto error; + } + } + // mark hash in super block: it could be unset. 
overwrite should be ok + set_sb_hash_function_code(rs, function2code(sbi->s_hash_function)); + + handle_attrs(s); + + reiserfs_proc_info_init(s); + + init_waitqueue_head(&(sbi->s_wait)); + spin_lock_init(&sbi->bitmap_lock); + + return (0); + + error: + if (jinit_done) { /* kill the commit thread, free journal ram */ + journal_release_error(NULL, s); + } + if (SB_DISK_SUPER_BLOCK(s)) { + for (j = 0; j < SB_BMAP_NR(s); j++) { + if (SB_AP_BITMAP(s)) + brelse(SB_AP_BITMAP(s)[j].bh); + } + if (SB_AP_BITMAP(s)) + vfree(SB_AP_BITMAP(s)); + } + if (SB_BUFFER_WITH_SB(s)) + brelse(SB_BUFFER_WITH_SB(s)); #ifdef CONFIG_QUOTA - for (j = 0; j < MAXQUOTAS; j++) { - if (sbi->s_qf_names[j]) - kfree(sbi->s_qf_names[j]); - } + for (j = 0; j < MAXQUOTAS; j++) { + if (sbi->s_qf_names[j]) + kfree(sbi->s_qf_names[j]); + } #endif - if (sbi != NULL) { - kfree(sbi); - } + if (sbi != NULL) { + kfree(sbi); + } - s->s_fs_info = NULL; - return errval; + s->s_fs_info = NULL; + return errval; } - -static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf) +static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf) { - struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); - - buf->f_namelen = (REISERFS_MAX_NAME (s->s_blocksize)); - buf->f_bfree = sb_free_blocks(rs); - buf->f_bavail = buf->f_bfree; - buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; - buf->f_bsize = s->s_blocksize; - /* changed to accommodate gcc folks.*/ - buf->f_type = REISERFS_SUPER_MAGIC; - return 0; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + + buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); + buf->f_bfree = sb_free_blocks(rs); + buf->f_bavail = buf->f_bfree; + buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; + buf->f_bsize = s->s_blocksize; + /* changed to accommodate gcc folks. 
*/ + buf->f_type = REISERFS_SUPER_MAGIC; + return 0; } #ifdef CONFIG_QUOTA static int reiserfs_dquot_initialize(struct inode *inode, int type) { - struct reiserfs_transaction_handle th; - int ret; - - /* We may create quota structure so we need to reserve enough blocks */ - reiserfs_write_lock(inode->i_sb); - journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); - ret = dquot_initialize(inode, type); - journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); - reiserfs_write_unlock(inode->i_sb); - return ret; + struct reiserfs_transaction_handle th; + int ret, err; + + /* We may create quota structure so we need to reserve enough blocks */ + reiserfs_write_lock(inode->i_sb); + ret = + journal_begin(&th, inode->i_sb, + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb)); + if (ret) + goto out; + ret = dquot_initialize(inode, type); + err = + journal_end(&th, inode->i_sb, + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(inode->i_sb); + return ret; } static int reiserfs_dquot_drop(struct inode *inode) { - struct reiserfs_transaction_handle th; - int ret; - - /* We may delete quota structure so we need to reserve enough blocks */ - reiserfs_write_lock(inode->i_sb); - journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); - ret = dquot_drop(inode); - journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS); - reiserfs_write_unlock(inode->i_sb); - return ret; + struct reiserfs_transaction_handle th; + int ret, err; + + /* We may delete quota structure so we need to reserve enough blocks */ + reiserfs_write_lock(inode->i_sb); + ret = + journal_begin(&th, inode->i_sb, + 2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)); + if (ret) + goto out; + ret = dquot_drop(inode); + err = + journal_end(&th, inode->i_sb, + 2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(inode->i_sb); + return ret; } static int reiserfs_write_dquot(struct dquot *dquot) { - 
struct reiserfs_transaction_handle th; - int ret; - - reiserfs_write_lock(dquot->dq_sb); - journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS); - ret = dquot_commit(dquot); - journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS); - reiserfs_write_unlock(dquot->dq_sb); - return ret; + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + ret = dquot_commit(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; } static int reiserfs_acquire_dquot(struct dquot *dquot) { - struct reiserfs_transaction_handle th; - int ret; - - reiserfs_write_lock(dquot->dq_sb); - journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); - ret = dquot_acquire(dquot); - journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); - reiserfs_write_unlock(dquot->dq_sb); - return ret; + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + ret = dquot_acquire(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; } static int reiserfs_release_dquot(struct dquot *dquot) { - struct reiserfs_transaction_handle th; - int ret; - - reiserfs_write_lock(dquot->dq_sb); - journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); - ret = dquot_release(dquot); - journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS); - reiserfs_write_unlock(dquot->dq_sb); - return ret; + struct reiserfs_transaction_handle th; + int ret, err; + + reiserfs_write_lock(dquot->dq_sb); + ret = + journal_begin(&th, dquot->dq_sb, + 
REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (ret) + goto out; + ret = dquot_release(dquot); + err = + journal_end(&th, dquot->dq_sb, + REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(dquot->dq_sb); + return ret; } static int reiserfs_mark_dquot_dirty(struct dquot *dquot) { - /* Are we journalling quotas? */ - if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || - REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { - dquot_mark_dquot_dirty(dquot); - return reiserfs_write_dquot(dquot); - } - else - return dquot_mark_dquot_dirty(dquot); + /* Are we journalling quotas? */ + if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] || + REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) { + dquot_mark_dquot_dirty(dquot); + return reiserfs_write_dquot(dquot); + } else + return dquot_mark_dquot_dirty(dquot); } static int reiserfs_write_info(struct super_block *sb, int type) { - struct reiserfs_transaction_handle th; - int ret; - - /* Data block + inode block */ - reiserfs_write_lock(sb); - journal_begin(&th, sb, 2); - ret = dquot_commit_info(sb, type); - journal_end(&th, sb, 2); - reiserfs_write_unlock(sb); - return ret; + struct reiserfs_transaction_handle th; + int ret, err; + + /* Data block + inode block */ + reiserfs_write_lock(sb); + ret = journal_begin(&th, sb, 2); + if (ret) + goto out; + ret = dquot_commit_info(sb, type); + err = journal_end(&th, sb, 2); + if (!ret && err) + ret = err; + out: + reiserfs_write_unlock(sb); + return ret; } /* - * Turn on quotas during mount time - we need to find - * the quota file and such... + * Turn on quotas during mount time - we need to find the quota file and such... 
*/ static int reiserfs_quota_on_mount(struct super_block *sb, int type) { - int err; - struct dentry *dentry; - struct qstr name = { .name = REISERFS_SB(sb)->s_qf_names[type], - .hash = 0, - .len = strlen(REISERFS_SB(sb)->s_qf_names[type])}; - - dentry = lookup_hash(&name, sb->s_root); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - err = vfs_quota_on_mount(type, REISERFS_SB(sb)->s_jquota_fmt, dentry); - /* Now invalidate and put the dentry - quota got its own reference - * to inode and dentry has at least wrong hash so we had better - * throw it away */ - d_invalidate(dentry); - dput(dentry); - return err; + return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], + REISERFS_SB(sb)->s_jquota_fmt, type); } /* * Standard function to be called on quota_on */ -static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, char *path) +static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, + char *path) { - int err; - struct nameidata nd; - - err = path_lookup(path, LOOKUP_FOLLOW, &nd); - if (err) - return err; - /* Quotafile not on the same filesystem? */ - if (nd.mnt->mnt_sb != sb) { - path_release(&nd); - return -EXDEV; - } - /* We must not pack tails for quota files on reiserfs for quota IO to work */ - if (!REISERFS_I(nd.dentry->d_inode)->i_flags & i_nopack_mask) { - reiserfs_warning(sb, "reiserfs: Quota file must have tail packing disabled."); - path_release(&nd); - return -EINVAL; - } - /* Not journalling quota? No more tests needed... */ - if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && - !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { + int err; + struct nameidata nd; + + if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) + return -EINVAL; + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + return err; + /* Quotafile not on the same filesystem? 
*/ + if (nd.mnt->mnt_sb != sb) { + path_release(&nd); + return -EXDEV; + } + /* We must not pack tails for quota files on reiserfs for quota IO to work */ + if (!REISERFS_I(nd.dentry->d_inode)->i_flags & i_nopack_mask) { + reiserfs_warning(sb, + "reiserfs: Quota file must have tail packing disabled."); + path_release(&nd); + return -EINVAL; + } + /* Not journalling quota? No more tests needed... */ + if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && + !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { + path_release(&nd); + return vfs_quota_on(sb, type, format_id, path); + } + /* Quotafile not of fs root? */ + if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) + reiserfs_warning(sb, + "reiserfs: Quota file not on filesystem root. " + "Journalled quota will not work."); path_release(&nd); - return vfs_quota_on(sb, type, format_id, path); - } - /* Quotafile not of fs root? */ - if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode) - reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. " - "Journalled quota will not work."); - path_release(&nd); - return vfs_quota_on(sb, type, format_id, path); + return vfs_quota_on(sb, type, format_id, path); } /* Read data from quotafile - avoid pagecache and such because we cannot afford @@ -1998,42 +2167,44 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, ch static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off) { - struct inode *inode = sb_dqopt(sb)->files[type]; - unsigned long blk = off >> sb->s_blocksize_bits; - int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; - size_t toread; - struct buffer_head tmp_bh, *bh; - loff_t i_size = i_size_read(inode); - - if (off > i_size) - return 0; - if (off+len > i_size) - len = i_size-off; - toread = len; - while (toread > 0) { - tocopy = sb->s_blocksize - offset < toread ? 
sb->s_blocksize - offset : toread; - tmp_bh.b_state = 0; - /* Quota files are without tails so we can safely use this function */ - reiserfs_write_lock(sb); - err = reiserfs_get_block(inode, blk, &tmp_bh, 0); - reiserfs_write_unlock(sb); - if (err) - return err; - if (!buffer_mapped(&tmp_bh)) /* A hole? */ - memset(data, 0, tocopy); - else { - bh = sb_bread(sb, tmp_bh.b_blocknr); - if (!bh) - return -EIO; - memcpy(data, bh->b_data+offset, tocopy); - brelse(bh); - } - offset = 0; - toread -= tocopy; - data += tocopy; - blk++; - } - return len; + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + size_t toread; + struct buffer_head tmp_bh, *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = + sb->s_blocksize - offset < + toread ? sb->s_blocksize - offset : toread; + tmp_bh.b_state = 0; + /* Quota files are without tails so we can safely use this function */ + reiserfs_write_lock(sb); + err = reiserfs_get_block(inode, blk, &tmp_bh, 0); + reiserfs_write_unlock(sb); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? 
*/ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; } /* Write to quotafile (we know the transaction is already started and has @@ -2041,117 +2212,116 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, static ssize_t reiserfs_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off) { - struct inode *inode = sb_dqopt(sb)->files[type]; - unsigned long blk = off >> sb->s_blocksize_bits; - int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; - int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; - size_t towrite = len; - struct buffer_head tmp_bh, *bh; - - down(&inode->i_sem); - while (towrite > 0) { - tocopy = sb->s_blocksize - offset < towrite ? - sb->s_blocksize - offset : towrite; - tmp_bh.b_state = 0; - err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); - if (err) - goto out; - if (offset || tocopy != sb->s_blocksize) - bh = sb_bread(sb, tmp_bh.b_blocknr); - else - bh = sb_getblk(sb, tmp_bh.b_blocknr); - if (!bh) { - err = -EIO; - goto out; - } - lock_buffer(bh); - memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); - set_buffer_uptodate(bh); - unlock_buffer(bh); - reiserfs_prepare_for_journal(sb, bh, 1); - journal_mark_dirty(current->journal_info, sb, bh); - if (!journal_quota) - reiserfs_add_ordered_list(inode, bh); - brelse(bh); - offset = 0; - towrite -= tocopy; - data += tocopy; - blk++; - } -out: - if (len == towrite) - return err; - if (inode->i_size < off+len-towrite) - i_size_write(inode, off+len-towrite); - inode->i_version++; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); - up(&inode->i_sem); - return len - towrite; + struct inode *inode = sb_dqopt(sb)->files[type]; + unsigned long blk = off >> sb->s_blocksize_bits; + 
int err = 0, offset = off & (sb->s_blocksize - 1), tocopy; + int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL; + size_t towrite = len; + struct buffer_head tmp_bh, *bh; + + down(&inode->i_sem); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + tmp_bh.b_state = 0; + err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data + offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + reiserfs_prepare_for_journal(sb, bh, 1); + journal_mark_dirty(current->journal_info, sb, bh); + if (!journal_quota) + reiserfs_add_ordered_list(inode, bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } + out: + if (len == towrite) + return err; + if (inode->i_size < off + len - towrite) + i_size_write(inode, off + len - towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + up(&inode->i_sem); + return len - towrite; } #endif -static struct super_block* -get_super_block (struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static struct super_block *get_super_block(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) { return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); } -static int __init -init_reiserfs_fs ( void ) +static int __init init_reiserfs_fs(void) { int ret; - if ((ret = init_inodecache ())) { + if ((ret = init_inodecache())) { return ret; } - if ((ret = reiserfs_xattr_register_handlers ())) - goto failed_reiserfs_xattr_register_handlers; + if ((ret = reiserfs_xattr_register_handlers())) + goto failed_reiserfs_xattr_register_handlers; - 
reiserfs_proc_info_global_init (); - reiserfs_proc_register_global ("version", reiserfs_global_version_in_proc); + reiserfs_proc_info_global_init(); + reiserfs_proc_register_global("version", + reiserfs_global_version_in_proc); - ret = register_filesystem (& reiserfs_fs_type); + ret = register_filesystem(&reiserfs_fs_type); if (ret == 0) { return 0; } - reiserfs_xattr_unregister_handlers (); + reiserfs_xattr_unregister_handlers(); -failed_reiserfs_xattr_register_handlers: - reiserfs_proc_unregister_global ("version"); - reiserfs_proc_info_global_done (); - destroy_inodecache (); + failed_reiserfs_xattr_register_handlers: + reiserfs_proc_unregister_global("version"); + reiserfs_proc_info_global_done(); + destroy_inodecache(); return ret; } -static void __exit -exit_reiserfs_fs ( void ) +static void __exit exit_reiserfs_fs(void) { - reiserfs_xattr_unregister_handlers (); - reiserfs_proc_unregister_global ("version"); - reiserfs_proc_info_global_done (); - unregister_filesystem (& reiserfs_fs_type); - destroy_inodecache (); + reiserfs_xattr_unregister_handlers(); + reiserfs_proc_unregister_global("version"); + reiserfs_proc_info_global_done(); + unregister_filesystem(&reiserfs_fs_type); + destroy_inodecache(); } struct file_system_type reiserfs_fs_type = { - .owner = THIS_MODULE, - .name = "reiserfs", - .get_sb = get_super_block, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "reiserfs", + .get_sb = get_super_block, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, }; -MODULE_DESCRIPTION ("ReiserFS journaled filesystem"); -MODULE_AUTHOR ("Hans Reiser <reiser@namesys.com>"); -MODULE_LICENSE ("GPL"); +MODULE_DESCRIPTION("ReiserFS journaled filesystem"); +MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>"); +MODULE_LICENSE("GPL"); -module_init (init_reiserfs_fs); -module_exit (exit_reiserfs_fs); +module_init(init_reiserfs_fs); +module_exit(exit_reiserfs_fs); diff --git a/fs/reiserfs/tail_conversion.c 
b/fs/reiserfs/tail_conversion.c index 6191909d516..c92e124f628 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -11,156 +11,159 @@ /* access to tail : when one is going to read tail it must make sure, that is not running. direct2indirect and indirect2direct can not run concurrently */ - /* Converts direct items to an unformatted node. Panics if file has no tail. -ENOSPC if no disk space for conversion */ /* path points to first direct item of the file regarless of how many of them are there */ -int direct2indirect (struct reiserfs_transaction_handle *th, struct inode * inode, - struct path * path, struct buffer_head * unbh, - loff_t tail_offset) +int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode, + struct path *path, struct buffer_head *unbh, + loff_t tail_offset) { - struct super_block * sb = inode->i_sb; - struct buffer_head *up_to_date_bh ; - struct item_head * p_le_ih = PATH_PITEM_HEAD (path); - unsigned long total_tail = 0 ; - struct cpu_key end_key; /* Key to search for the last byte of the - converted item. */ - struct item_head ind_ih; /* new indirect item to be inserted or - key of unfm pointer to be pasted */ - int n_blk_size, - n_retval; /* returned value for reiserfs_insert_item and clones */ - unp_t unfm_ptr; /* Handle on an unformatted node - that will be inserted in the - tree. */ - - BUG_ON (!th->t_trans_id); - - REISERFS_SB(sb)->s_direct2indirect ++; - - n_blk_size = sb->s_blocksize; - - /* and key to search for append or insert pointer to the new - unformatted node. 
*/ - copy_item_head (&ind_ih, p_le_ih); - set_le_ih_k_offset (&ind_ih, tail_offset); - set_le_ih_k_type (&ind_ih, TYPE_INDIRECT); - - /* Set the key to search for the place for new unfm pointer */ - make_cpu_key (&end_key, inode, tail_offset, TYPE_INDIRECT, 4); - - // FIXME: we could avoid this - if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) { - reiserfs_warning (sb, "PAP-14030: direct2indirect: " - "pasted or inserted byte exists in the tree %K. " - "Use fsck to repair.", &end_key); - pathrelse(path); - return -EIO; - } - - p_le_ih = PATH_PITEM_HEAD (path); - - unfm_ptr = cpu_to_le32 (unbh->b_blocknr); - - if ( is_statdata_le_ih (p_le_ih) ) { - /* Insert new indirect item. */ - set_ih_free_space (&ind_ih, 0); /* delete at nearest future */ - put_ih_item_len( &ind_ih, UNFM_P_SIZE ); - PATH_LAST_POSITION (path)++; - n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode, + struct super_block *sb = inode->i_sb; + struct buffer_head *up_to_date_bh; + struct item_head *p_le_ih = PATH_PITEM_HEAD(path); + unsigned long total_tail = 0; + struct cpu_key end_key; /* Key to search for the last byte of the + converted item. */ + struct item_head ind_ih; /* new indirect item to be inserted or + key of unfm pointer to be pasted */ + int n_blk_size, n_retval; /* returned value for reiserfs_insert_item and clones */ + unp_t unfm_ptr; /* Handle on an unformatted node + that will be inserted in the + tree. */ + + BUG_ON(!th->t_trans_id); + + REISERFS_SB(sb)->s_direct2indirect++; + + n_blk_size = sb->s_blocksize; + + /* and key to search for append or insert pointer to the new + unformatted node. 
*/ + copy_item_head(&ind_ih, p_le_ih); + set_le_ih_k_offset(&ind_ih, tail_offset); + set_le_ih_k_type(&ind_ih, TYPE_INDIRECT); + + /* Set the key to search for the place for new unfm pointer */ + make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4); + + // FIXME: we could avoid this + if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) { + reiserfs_warning(sb, "PAP-14030: direct2indirect: " + "pasted or inserted byte exists in the tree %K. " + "Use fsck to repair.", &end_key); + pathrelse(path); + return -EIO; + } + + p_le_ih = PATH_PITEM_HEAD(path); + + unfm_ptr = cpu_to_le32(unbh->b_blocknr); + + if (is_statdata_le_ih(p_le_ih)) { + /* Insert new indirect item. */ + set_ih_free_space(&ind_ih, 0); /* delete at nearest future */ + put_ih_item_len(&ind_ih, UNFM_P_SIZE); + PATH_LAST_POSITION(path)++; + n_retval = + reiserfs_insert_item(th, path, &end_key, &ind_ih, inode, (char *)&unfm_ptr); - } else { - /* Paste into last indirect item of an object. */ - n_retval = reiserfs_paste_into_item(th, path, &end_key, inode, - (char *)&unfm_ptr, UNFM_P_SIZE); - } - if ( n_retval ) { - return n_retval; - } - - // note: from here there are two keys which have matching first - // three key components. They only differ by the fourth one. - - - /* Set the key to search for the direct items of the file */ - make_cpu_key (&end_key, inode, max_reiserfs_offset (inode), TYPE_DIRECT, 4); - - /* Move bytes from the direct items to the new unformatted node - and delete them. 
*/ - while (1) { - int tail_size; - - /* end_key.k_offset is set so, that we will always have found - last item of the file */ - if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) - reiserfs_panic (sb, "PAP-14050: direct2indirect: " - "direct item (%K) not found", &end_key); - p_le_ih = PATH_PITEM_HEAD (path); - RFALSE( !is_direct_le_ih (p_le_ih), - "vs-14055: direct item expected(%K), found %h", - &end_key, p_le_ih); - tail_size = (le_ih_k_offset (p_le_ih) & (n_blk_size - 1)) - + ih_item_len(p_le_ih) - 1; - - /* we only send the unbh pointer if the buffer is not up to date. - ** this avoids overwriting good data from writepage() with old data - ** from the disk or buffer cache - ** Special case: unbh->b_page will be NULL if we are coming through - ** DIRECT_IO handler here. - */ - if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) { - up_to_date_bh = NULL ; } else { - up_to_date_bh = unbh ; + /* Paste into last indirect item of an object. */ + n_retval = reiserfs_paste_into_item(th, path, &end_key, inode, + (char *)&unfm_ptr, + UNFM_P_SIZE); } - n_retval = reiserfs_delete_item (th, path, &end_key, inode, - up_to_date_bh) ; - - total_tail += n_retval ; - if (tail_size == n_retval) - // done: file does not have direct items anymore - break; - - } - /* if we've copied bytes from disk into the page, we need to zero - ** out the unused part of the block (it was not up to date before) - */ - if (up_to_date_bh) { - unsigned pgoff = (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); - char *kaddr=kmap_atomic(up_to_date_bh->b_page, KM_USER0); - memset(kaddr + pgoff, 0, n_blk_size - total_tail) ; - kunmap_atomic(kaddr, KM_USER0); - } - - REISERFS_I(inode)->i_first_direct_byte = U32_MAX; - - return 0; -} + if (n_retval) { + return n_retval; + } + // note: from here there are two keys which have matching first + // three key components. They only differ by the fourth one. 
+ + /* Set the key to search for the direct items of the file */ + make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT, + 4); + + /* Move bytes from the direct items to the new unformatted node + and delete them. */ + while (1) { + int tail_size; + + /* end_key.k_offset is set so, that we will always have found + last item of the file */ + if (search_for_position_by_key(sb, &end_key, path) == + POSITION_FOUND) + reiserfs_panic(sb, + "PAP-14050: direct2indirect: " + "direct item (%K) not found", &end_key); + p_le_ih = PATH_PITEM_HEAD(path); + RFALSE(!is_direct_le_ih(p_le_ih), + "vs-14055: direct item expected(%K), found %h", + &end_key, p_le_ih); + tail_size = (le_ih_k_offset(p_le_ih) & (n_blk_size - 1)) + + ih_item_len(p_le_ih) - 1; + + /* we only send the unbh pointer if the buffer is not up to date. + ** this avoids overwriting good data from writepage() with old data + ** from the disk or buffer cache + ** Special case: unbh->b_page will be NULL if we are coming through + ** DIRECT_IO handler here. 
+ */ + if (!unbh->b_page || buffer_uptodate(unbh) + || PageUptodate(unbh->b_page)) { + up_to_date_bh = NULL; + } else { + up_to_date_bh = unbh; + } + n_retval = reiserfs_delete_item(th, path, &end_key, inode, + up_to_date_bh); + + total_tail += n_retval; + if (tail_size == n_retval) + // done: file does not have direct items anymore + break; + } + /* if we've copied bytes from disk into the page, we need to zero + ** out the unused part of the block (it was not up to date before) + */ + if (up_to_date_bh) { + unsigned pgoff = + (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1); + char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0); + memset(kaddr + pgoff, 0, n_blk_size - total_tail); + kunmap_atomic(kaddr, KM_USER0); + } + + REISERFS_I(inode)->i_first_direct_byte = U32_MAX; + + return 0; +} /* stolen from fs/buffer.c */ -void reiserfs_unmap_buffer(struct buffer_head *bh) { - lock_buffer(bh) ; - if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { - BUG() ; - } - clear_buffer_dirty(bh) ; - /* Remove the buffer from whatever list it belongs to. We are mostly - interested in removing it from per-sb j_dirty_buffers list, to avoid - BUG() on attempt to write not mapped buffer */ - if ( (!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { - struct inode *inode = bh->b_page->mapping->host; - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); - spin_lock(&j->j_dirty_buffers_lock); - list_del_init(&bh->b_assoc_buffers); - reiserfs_free_jh(bh); - spin_unlock(&j->j_dirty_buffers_lock); - } - clear_buffer_mapped(bh) ; - clear_buffer_req(bh) ; - clear_buffer_new(bh); - bh->b_bdev = NULL; - unlock_buffer(bh) ; +void reiserfs_unmap_buffer(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + BUG(); + } + clear_buffer_dirty(bh); + /* Remove the buffer from whatever list it belongs to. 
We are mostly + interested in removing it from per-sb j_dirty_buffers list, to avoid + BUG() on attempt to write not mapped buffer */ + if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { + struct inode *inode = bh->b_page->mapping->host; + struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + spin_lock(&j->j_dirty_buffers_lock); + list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); + spin_unlock(&j->j_dirty_buffers_lock); + } + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + bh->b_bdev = NULL; + unlock_buffer(bh); } /* this first locks inode (neither reads nor sync are permitted), @@ -169,108 +172,108 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) { what we expect from it (number of cut bytes). But when tail remains in the unformatted node, we set mode to SKIP_BALANCING and unlock inode */ -int indirect2direct (struct reiserfs_transaction_handle *th, - struct inode * p_s_inode, - struct page *page, - struct path * p_s_path, /* path to the indirect item. */ - const struct cpu_key * p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ - loff_t n_new_file_size, /* New file size. */ - char * p_c_mode) +int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, struct path *p_s_path, /* path to the indirect item. */ + const struct cpu_key *p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ + loff_t n_new_file_size, /* New file size. 
*/ + char *p_c_mode) { - struct super_block * p_s_sb = p_s_inode->i_sb; - struct item_head s_ih; - unsigned long n_block_size = p_s_sb->s_blocksize; - char * tail; - int tail_len, round_tail_len; - loff_t pos, pos1; /* position of first byte of the tail */ - struct cpu_key key; + struct super_block *p_s_sb = p_s_inode->i_sb; + struct item_head s_ih; + unsigned long n_block_size = p_s_sb->s_blocksize; + char *tail; + int tail_len, round_tail_len; + loff_t pos, pos1; /* position of first byte of the tail */ + struct cpu_key key; - BUG_ON (!th->t_trans_id); + BUG_ON(!th->t_trans_id); - REISERFS_SB(p_s_sb)->s_indirect2direct ++; + REISERFS_SB(p_s_sb)->s_indirect2direct++; - *p_c_mode = M_SKIP_BALANCING; + *p_c_mode = M_SKIP_BALANCING; - /* store item head path points to. */ - copy_item_head (&s_ih, PATH_PITEM_HEAD(p_s_path)); - - tail_len = (n_new_file_size & (n_block_size - 1)); - if (get_inode_sd_version (p_s_inode) == STAT_DATA_V2) - round_tail_len = ROUND_UP (tail_len); - else - round_tail_len = tail_len; - - pos = le_ih_k_offset (&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; - pos1 = pos; - - // we are protected by i_sem. The tail can not disapper, not - // append can be done either - // we are in truncate or packing tail in file_release - - tail = (char *)kmap(page) ; /* this can schedule */ - - if (path_changed (&s_ih, p_s_path)) { - /* re-search indirect item */ - if ( search_for_position_by_key (p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ) - reiserfs_panic(p_s_sb, "PAP-5520: indirect2direct: " - "item to be converted %K does not exist", p_s_item_key); + /* store item head path points to. 
*/ copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + + tail_len = (n_new_file_size & (n_block_size - 1)); + if (get_inode_sd_version(p_s_inode) == STAT_DATA_V2) + round_tail_len = ROUND_UP(tail_len); + else + round_tail_len = tail_len; + + pos = + le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * p_s_sb->s_blocksize; + pos1 = pos; + + // we are protected by i_sem. The tail can not disapper, not + // append can be done either + // we are in truncate or packing tail in file_release + + tail = (char *)kmap(page); /* this can schedule */ + + if (path_changed(&s_ih, p_s_path)) { + /* re-search indirect item */ + if (search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) + == POSITION_NOT_FOUND) + reiserfs_panic(p_s_sb, + "PAP-5520: indirect2direct: " + "item to be converted %K does not exist", + p_s_item_key); + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); #ifdef CONFIG_REISERFS_CHECK - pos = le_ih_k_offset (&s_ih) - 1 + - (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; - if (pos != pos1) - reiserfs_panic (p_s_sb, "vs-5530: indirect2direct: " - "tail position changed while we were reading it"); + pos = le_ih_k_offset(&s_ih) - 1 + + (ih_item_len(&s_ih) / UNFM_P_SIZE - + 1) * p_s_sb->s_blocksize; + if (pos != pos1) + reiserfs_panic(p_s_sb, "vs-5530: indirect2direct: " + "tail position changed while we were reading it"); #endif - } - - - /* Set direct item header to insert. */ - make_le_item_head (&s_ih, NULL, get_inode_item_key_version (p_s_inode), pos1 + 1, - TYPE_DIRECT, round_tail_len, 0xffff/*ih_free_space*/); - - /* we want a pointer to the first byte of the tail in the page. 
- ** the page was locked and this part of the page was up to date when - ** indirect2direct was called, so we know the bytes are still valid - */ - tail = tail + (pos & (PAGE_CACHE_SIZE - 1)) ; - - PATH_LAST_POSITION(p_s_path)++; - - key = *p_s_item_key; - set_cpu_key_k_type (&key, TYPE_DIRECT); - key.key_length = 4; - /* Insert tail as new direct item in the tree */ - if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, - tail ? tail : NULL) < 0 ) { - /* No disk memory. So we can not convert last unformatted node - to the direct item. In this case we used to adjust - indirect items's ih_free_space. Now ih_free_space is not - used, it would be ideal to write zeros to corresponding - unformatted node. For now i_size is considered as guard for - going out of file size */ - kunmap(page) ; - return n_block_size - round_tail_len; - } - kunmap(page) ; - - /* make sure to get the i_blocks changes from reiserfs_insert_item */ - reiserfs_update_sd(th, p_s_inode); + } - // note: we have now the same as in above direct2indirect - // conversion: there are two keys which have matching first three - // key components. They only differ by the fouhth one. + /* Set direct item header to insert. */ + make_le_item_head(&s_ih, NULL, get_inode_item_key_version(p_s_inode), + pos1 + 1, TYPE_DIRECT, round_tail_len, + 0xffff /*ih_free_space */ ); + + /* we want a pointer to the first byte of the tail in the page. + ** the page was locked and this part of the page was up to date when + ** indirect2direct was called, so we know the bytes are still valid + */ + tail = tail + (pos & (PAGE_CACHE_SIZE - 1)); + + PATH_LAST_POSITION(p_s_path)++; + + key = *p_s_item_key; + set_cpu_key_k_type(&key, TYPE_DIRECT); + key.key_length = 4; + /* Insert tail as new direct item in the tree */ + if (reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, + tail ? tail : NULL) < 0) { + /* No disk memory. So we can not convert last unformatted node + to the direct item. 
In this case we used to adjust + indirect items's ih_free_space. Now ih_free_space is not + used, it would be ideal to write zeros to corresponding + unformatted node. For now i_size is considered as guard for + going out of file size */ + kunmap(page); + return n_block_size - round_tail_len; + } + kunmap(page); - /* We have inserted new direct item and must remove last - unformatted node. */ - *p_c_mode = M_CUT; + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, p_s_inode); - /* we store position of first direct item in the in-core inode */ - //mark_file_with_tail (p_s_inode, pos1 + 1); - REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1; - - return n_block_size - round_tail_len; -} + // note: we have now the same as in above direct2indirect + // conversion: there are two keys which have matching first three + // key components. They only differ by the fouhth one. + /* We have inserted new direct item and must remove last + unformatted node. 
*/ + *p_c_mode = M_CUT; + /* we store position of first direct item in the in-core inode */ + //mark_file_with_tail (p_s_inode, pos1 + 1); + REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1; + return n_block_size - round_tail_len; +} diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 45582fe8b46..87ac9dc8b38 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -39,7 +39,6 @@ #include <linux/xattr.h> #include <linux/reiserfs_xattr.h> #include <linux/reiserfs_acl.h> -#include <linux/mbcache.h> #include <asm/uaccess.h> #include <asm/checksum.h> #include <linux/smp_lock.h> @@ -51,67 +50,68 @@ #define PRIVROOT_NAME ".reiserfs_priv" #define XAROOT_NAME "xattrs" -static struct reiserfs_xattr_handler *find_xattr_handler_prefix (const char *prefix); +static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char + *prefix); -static struct dentry * -create_xa_root (struct super_block *sb) +static struct dentry *create_xa_root(struct super_block *sb) { - struct dentry *privroot = dget (REISERFS_SB(sb)->priv_root); - struct dentry *xaroot; - - /* This needs to be created at mount-time */ - if (!privroot) - return ERR_PTR(-EOPNOTSUPP); - - xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); - if (IS_ERR (xaroot)) { - goto out; - } else if (!xaroot->d_inode) { - int err; - down (&privroot->d_inode->i_sem); - err = privroot->d_inode->i_op->mkdir (privroot->d_inode, xaroot, 0700); - up (&privroot->d_inode->i_sem); - - if (err) { - dput (xaroot); - dput (privroot); - return ERR_PTR (err); - } - REISERFS_SB(sb)->xattr_root = dget (xaroot); - } - -out: - dput (privroot); - return xaroot; + struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root); + struct dentry *xaroot; + + /* This needs to be created at mount-time */ + if (!privroot) + return ERR_PTR(-EOPNOTSUPP); + + xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); + if (IS_ERR(xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + int err; + 
down(&privroot->d_inode->i_sem); + err = + privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot, + 0700); + up(&privroot->d_inode->i_sem); + + if (err) { + dput(xaroot); + dput(privroot); + return ERR_PTR(err); + } + REISERFS_SB(sb)->xattr_root = dget(xaroot); + } + + out: + dput(privroot); + return xaroot; } /* This will return a dentry, or error, refering to the xa root directory. * If the xa root doesn't exist yet, the dentry will be returned without * an associated inode. This dentry can be used with ->mkdir to create * the xa directory. */ -static struct dentry * -__get_xa_root (struct super_block *s) +static struct dentry *__get_xa_root(struct super_block *s) { - struct dentry *privroot = dget (REISERFS_SB(s)->priv_root); - struct dentry *xaroot = NULL; - - if (IS_ERR (privroot) || !privroot) - return privroot; - - xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); - if (IS_ERR (xaroot)) { - goto out; - } else if (!xaroot->d_inode) { - dput (xaroot); - xaroot = NULL; - goto out; - } - - REISERFS_SB(s)->xattr_root = dget (xaroot); - -out: - dput (privroot); - return xaroot; + struct dentry *privroot = dget(REISERFS_SB(s)->priv_root); + struct dentry *xaroot = NULL; + + if (IS_ERR(privroot) || !privroot) + return privroot; + + xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); + if (IS_ERR(xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + dput(xaroot); + xaroot = NULL; + goto out; + } + + REISERFS_SB(s)->xattr_root = dget(xaroot); + + out: + dput(privroot); + return xaroot; } /* Returns the dentry (or NULL) referring to the root of the extended @@ -119,147 +119,145 @@ out: * Otherwise, we attempt to retreive it from disk. It may also return * a pointer-encoded error. 
*/ -static inline struct dentry * -get_xa_root (struct super_block *s) +static inline struct dentry *get_xa_root(struct super_block *s) { - struct dentry *dentry = dget (REISERFS_SB(s)->xattr_root); + struct dentry *dentry = dget(REISERFS_SB(s)->xattr_root); - if (!dentry) - dentry = __get_xa_root (s); + if (!dentry) + dentry = __get_xa_root(s); - return dentry; + return dentry; } /* Opens the directory corresponding to the inode's extended attribute store. * If flags allow, the tree to the directory may be created. If creation is * prohibited, -ENODATA is returned. */ -static struct dentry * -open_xa_dir (const struct inode *inode, int flags) +static struct dentry *open_xa_dir(const struct inode *inode, int flags) { - struct dentry *xaroot, *xadir; - char namebuf[17]; - - xaroot = get_xa_root (inode->i_sb); - if (IS_ERR (xaroot)) { - return xaroot; - } else if (!xaroot) { - if (flags == 0 || flags & XATTR_CREATE) { - xaroot = create_xa_root (inode->i_sb); - if (IS_ERR (xaroot)) - return xaroot; - } - if (!xaroot) - return ERR_PTR (-ENODATA); - } - - /* ok, we have xaroot open */ - - snprintf (namebuf, sizeof (namebuf), "%X.%X", - le32_to_cpu (INODE_PKEY (inode)->k_objectid), - inode->i_generation); - xadir = lookup_one_len (namebuf, xaroot, strlen (namebuf)); - if (IS_ERR (xadir)) { - dput (xaroot); - return xadir; - } - - if (!xadir->d_inode) { - int err; - if (flags == 0 || flags & XATTR_CREATE) { - /* Although there is nothing else trying to create this directory, - * another directory with the same hash may be created, so we need - * to protect against that */ - err = xaroot->d_inode->i_op->mkdir (xaroot->d_inode, xadir, 0700); - if (err) { - dput (xaroot); - dput (xadir); - return ERR_PTR (err); - } - } - if (!xadir->d_inode) { - dput (xaroot); - dput (xadir); - return ERR_PTR (-ENODATA); - } - } - - dput (xaroot); - return xadir; + struct dentry *xaroot, *xadir; + char namebuf[17]; + + xaroot = get_xa_root(inode->i_sb); + if (IS_ERR(xaroot)) { + return 
xaroot; + } else if (!xaroot) { + if (flags == 0 || flags & XATTR_CREATE) { + xaroot = create_xa_root(inode->i_sb); + if (IS_ERR(xaroot)) + return xaroot; + } + if (!xaroot) + return ERR_PTR(-ENODATA); + } + + /* ok, we have xaroot open */ + + snprintf(namebuf, sizeof(namebuf), "%X.%X", + le32_to_cpu(INODE_PKEY(inode)->k_objectid), + inode->i_generation); + xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); + if (IS_ERR(xadir)) { + dput(xaroot); + return xadir; + } + + if (!xadir->d_inode) { + int err; + if (flags == 0 || flags & XATTR_CREATE) { + /* Although there is nothing else trying to create this directory, + * another directory with the same hash may be created, so we need + * to protect against that */ + err = + xaroot->d_inode->i_op->mkdir(xaroot->d_inode, xadir, + 0700); + if (err) { + dput(xaroot); + dput(xadir); + return ERR_PTR(err); + } + } + if (!xadir->d_inode) { + dput(xaroot); + dput(xadir); + return ERR_PTR(-ENODATA); + } + } + + dput(xaroot); + return xadir; } /* Returns a dentry corresponding to a specific extended attribute file * for the inode. If flags allow, the file is created. Otherwise, a * valid or negative dentry, or an error is returned. 
*/ -static struct dentry * -get_xa_file_dentry (const struct inode *inode, const char *name, int flags) +static struct dentry *get_xa_file_dentry(const struct inode *inode, + const char *name, int flags) { - struct dentry *xadir, *xafile; - int err = 0; - - xadir = open_xa_dir (inode, flags); - if (IS_ERR (xadir)) { - return ERR_PTR (PTR_ERR (xadir)); - } else if (xadir && !xadir->d_inode) { - dput (xadir); - return ERR_PTR (-ENODATA); - } - - xafile = lookup_one_len (name, xadir, strlen (name)); - if (IS_ERR (xafile)) { - dput (xadir); - return ERR_PTR (PTR_ERR (xafile)); - } - - if (xafile->d_inode) { /* file exists */ - if (flags & XATTR_CREATE) { - err = -EEXIST; - dput (xafile); - goto out; - } - } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { - goto out; - } else { - /* inode->i_sem is down, so nothing else can try to create - * the same xattr */ - err = xadir->d_inode->i_op->create (xadir->d_inode, xafile, - 0700|S_IFREG, NULL); - - if (err) { - dput (xafile); - goto out; - } - } - -out: - dput (xadir); - if (err) - xafile = ERR_PTR (err); - return xafile; -} + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir(inode, flags); + if (IS_ERR(xadir)) { + return ERR_PTR(PTR_ERR(xadir)); + } else if (xadir && !xadir->d_inode) { + dput(xadir); + return ERR_PTR(-ENODATA); + } + + xafile = lookup_one_len(name, xadir, strlen(name)); + if (IS_ERR(xafile)) { + dput(xadir); + return ERR_PTR(PTR_ERR(xafile)); + } + + if (xafile->d_inode) { /* file exists */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + dput(xafile); + goto out; + } + } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { + goto out; + } else { + /* inode->i_sem is down, so nothing else can try to create + * the same xattr */ + err = xadir->d_inode->i_op->create(xadir->d_inode, xafile, + 0700 | S_IFREG, NULL); + + if (err) { + dput(xafile); + goto out; + } + } + out: + dput(xadir); + if (err) + xafile = ERR_PTR(err); + return xafile; +} /* Opens a file pointer to the 
attribute associated with inode */ -static struct file * -open_xa_file (const struct inode *inode, const char *name, int flags) +static struct file *open_xa_file(const struct inode *inode, const char *name, + int flags) { - struct dentry *xafile; - struct file *fp; - - xafile = get_xa_file_dentry (inode, name, flags); - if (IS_ERR (xafile)) - return ERR_PTR (PTR_ERR (xafile)); - else if (!xafile->d_inode) { - dput (xafile); - return ERR_PTR (-ENODATA); - } + struct dentry *xafile; + struct file *fp; + + xafile = get_xa_file_dentry(inode, name, flags); + if (IS_ERR(xafile)) + return ERR_PTR(PTR_ERR(xafile)); + else if (!xafile->d_inode) { + dput(xafile); + return ERR_PTR(-ENODATA); + } - fp = dentry_open (xafile, NULL, O_RDWR); - /* dentry_open dputs the dentry if it fails */ + fp = dentry_open(xafile, NULL, O_RDWR); + /* dentry_open dputs the dentry if it fails */ - return fp; + return fp; } - /* * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but * we need to drop the path before calling the filldir struct. That @@ -273,139 +271,146 @@ open_xa_file (const struct inode *inode, const char *name, int flags) * we're called with i_sem held, so there are no worries about the directory * changing underneath us. 
*/ -static int __xattr_readdir(struct file * filp, void * dirent, filldir_t filldir) +static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; - struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ - INITIALIZE_PATH (path_to_entry); - struct buffer_head * bh; - int entry_num; - struct item_head * ih, tmp_ih; - int search_res; - char * local_buf; - loff_t next_pos; - char small_buf[32] ; /* avoid kmalloc if we can */ - struct reiserfs_de_head *deh; - int d_reclen; - char * d_name; - off_t d_off; - ino_t d_ino; - struct reiserfs_dir_entry de; - - - /* form key for search the next directory entry using f_pos field of - file structure */ - next_pos = max_reiserfs_offset(inode); - - while (1) { -research: - if (next_pos <= DOT_DOT_OFFSET) - break; - make_cpu_key (&pos_key, inode, next_pos, TYPE_DIRENTRY, 3); - - search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de); - if (search_res == IO_ERROR) { - // FIXME: we could just skip part of directory which could - // not be read - pathrelse(&path_to_entry); - return -EIO; - } - - if (search_res == NAME_NOT_FOUND) - de.de_entry_num--; + struct inode *inode = filp->f_dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH(path_to_entry); + struct buffer_head *bh; + int entry_num; + struct item_head *ih, tmp_ih; + int search_res; + char *local_buf; + loff_t next_pos; + char small_buf[32]; /* avoid kmalloc if we can */ + struct reiserfs_de_head *deh; + int d_reclen; + char *d_name; + off_t d_off; + ino_t d_ino; + struct reiserfs_dir_entry de; + + /* form key for search the next directory entry using f_pos field of + file structure */ + next_pos = max_reiserfs_offset(inode); + + while (1) { + research: + if (next_pos <= DOT_DOT_OFFSET) + break; + make_cpu_key(&pos_key, inode, next_pos, TYPE_DIRENTRY, 3); + + 
search_res = + search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, + &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + pathrelse(&path_to_entry); + return -EIO; + } - set_de_name_and_namelen(&de); - entry_num = de.de_entry_num; - deh = &(de.de_deh[entry_num]); + if (search_res == NAME_NOT_FOUND) + de.de_entry_num--; - bh = de.de_bh; - ih = de.de_ih; + set_de_name_and_namelen(&de); + entry_num = de.de_entry_num; + deh = &(de.de_deh[entry_num]); - if (!is_direntry_le_ih(ih)) { - reiserfs_warning(inode->i_sb, "not direntry %h", ih); - break; - } - copy_item_head(&tmp_ih, ih); + bh = de.de_bh; + ih = de.de_ih; - /* we must have found item, that is item of this directory, */ - RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), - "vs-9000: found item %h does not match to dir we readdir %K", - ih, &pos_key); + if (!is_direntry_le_ih(ih)) { + reiserfs_warning(inode->i_sb, "not direntry %h", ih); + break; + } + copy_item_head(&tmp_ih, ih); - if (deh_offset(deh) <= DOT_DOT_OFFSET) { - break; - } + /* we must have found item, that is item of this directory, */ + RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); - /* look for the previous entry in the directory */ - next_pos = deh_offset (deh) - 1; + if (deh_offset(deh) <= DOT_DOT_OFFSET) { + break; + } - if (!de_visible (deh)) - /* it is hidden entry */ - continue; + /* look for the previous entry in the directory */ + next_pos = deh_offset(deh) - 1; - d_reclen = entry_length(bh, ih, entry_num); - d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); - d_off = deh_offset (deh); - d_ino = deh_objectid (deh); + if (!de_visible(deh)) + /* it is hidden entry */ + continue; - if (!d_name[d_reclen - 1]) - d_reclen = strlen (d_name); + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh); + d_off = deh_offset(deh); + d_ino = deh_objectid(deh); - if 
(d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ - /* too big to send back to VFS */ - continue ; - } + if (!d_name[d_reclen - 1]) + d_reclen = strlen(d_name); - /* Ignore the .reiserfs_priv entry */ - if (reiserfs_xattrs (inode->i_sb) && - !old_format_only(inode->i_sb) && - deh_objectid (deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) - continue; + if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)) { + /* too big to send back to VFS */ + continue; + } - if (d_reclen <= 32) { - local_buf = small_buf ; - } else { - local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; - if (!local_buf) { - pathrelse (&path_to_entry); - return -ENOMEM ; - } - if (item_moved (&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; - - /* sigh, must retry. Do this same offset again */ - next_pos = d_off; - goto research; - } - } + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs(inode->i_sb) && + !old_format_only(inode->i_sb) && + deh_objectid(deh) == + le32_to_cpu(INODE_PKEY + (REISERFS_SB(inode->i_sb)->priv_root->d_inode)-> + k_objectid)) + continue; + + if (d_reclen <= 32) { + local_buf = small_buf; + } else { + local_buf = + reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb); + if (!local_buf) { + pathrelse(&path_to_entry); + return -ENOMEM; + } + if (item_moved(&tmp_ih, &path_to_entry)) { + reiserfs_kfree(local_buf, d_reclen, + inode->i_sb); + + /* sigh, must retry. Do this same offset again */ + next_pos = d_off; + goto research; + } + } - // Note, that we copy name to user space via temporary - // buffer (local_buf) because filldir will block if - // user space buffer is swapped out. At that time - // entry can move to somewhere else - memcpy (local_buf, d_name, d_reclen); + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. 
At that time + // entry can move to somewhere else + memcpy(local_buf, d_name, d_reclen); - /* the filldir function might need to start transactions, - * or do who knows what. Release the path now that we've - * copied all the important stuff out of the deh - */ - pathrelse (&path_to_entry); - - if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, - DT_UNKNOWN) < 0) { - if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; - } - goto end; - } - if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; - } - } /* while */ + /* the filldir function might need to start transactions, + * or do who knows what. Release the path now that we've + * copied all the important stuff out of the deh + */ + pathrelse(&path_to_entry); + + if (filldir(dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, + inode->i_sb); + } + goto end; + } + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb); + } + } /* while */ -end: - pathrelse (&path_to_entry); - return 0; + end: + pathrelse(&path_to_entry); + return 0; } /* @@ -417,63 +422,59 @@ end: static int xattr_readdir(struct file *file, filldir_t filler, void *buf) { - struct inode *inode = file->f_dentry->d_inode; - int res = -ENOTDIR; - if (!file->f_op || !file->f_op->readdir) - goto out; - down(&inode->i_sem); + struct inode *inode = file->f_dentry->d_inode; + int res = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out; + down(&inode->i_sem); // down(&inode->i_zombie); - res = -ENOENT; - if (!IS_DEADDIR(inode)) { - lock_kernel(); - res = __xattr_readdir(file, buf, filler); - unlock_kernel(); - } + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + lock_kernel(); + res = __xattr_readdir(file, buf, filler); + unlock_kernel(); + } // up(&inode->i_zombie); - up(&inode->i_sem); -out: - return res; + up(&inode->i_sem); + out: + return res; } - /* Internal operations on 
file data */ -static inline void -reiserfs_put_page(struct page *page) +static inline void reiserfs_put_page(struct page *page) { - kunmap(page); - page_cache_release(page); + kunmap(page); + page_cache_release(page); } -static struct page * -reiserfs_get_page(struct inode *dir, unsigned long n) +static struct page *reiserfs_get_page(struct inode *dir, unsigned long n) { - struct address_space *mapping = dir->i_mapping; - struct page *page; - /* We can deadlock if we try to free dentries, - and an unlink/rmdir has just occured - GFP_NOFS avoids this */ - mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS; - page = read_cache_page (mapping, n, - (filler_t*)mapping->a_ops->readpage, NULL); - if (!IS_ERR(page)) { - wait_on_page_locked(page); - kmap(page); - if (!PageUptodate(page)) - goto fail; - - if (PageError(page)) - goto fail; - } - return page; - -fail: - reiserfs_put_page(page); - return ERR_PTR(-EIO); + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* We can deadlock if we try to free dentries, + and an unlink/rmdir has just occured - GFP_NOFS avoids this */ + mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS; + page = read_cache_page(mapping, n, + (filler_t *) mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + + if (PageError(page)) + goto fail; + } + return page; + + fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); } -static inline __u32 -xattr_hash (const char *msg, int len) +static inline __u32 xattr_hash(const char *msg, int len) { - return csum_partial (msg, len, 0); + return csum_partial(msg, len, 0); } /* Generic extended attribute operations that can be used by xa plugins */ @@ -482,294 +483,300 @@ xattr_hash (const char *msg, int len) * inode->i_sem: down */ int -reiserfs_xattr_set (struct inode *inode, const char *name, const void *buffer, - size_t buffer_size, int flags) +reiserfs_xattr_set(struct 
inode *inode, const char *name, const void *buffer, + size_t buffer_size, int flags) { - int err = 0; - struct file *fp; - struct page *page; - char *data; - struct address_space *mapping; - size_t file_pos = 0; - size_t buffer_pos = 0; - struct inode *xinode; - struct iattr newattrs; - __u32 xahash = 0; - - if (IS_RDONLY (inode)) - return -EROFS; - - if (IS_IMMUTABLE (inode) || IS_APPEND (inode)) - return -EPERM; - - if (get_inode_sd_version (inode) == STAT_DATA_V1) - return -EOPNOTSUPP; - - /* Empty xattrs are ok, they're just empty files, no hash */ - if (buffer && buffer_size) - xahash = xattr_hash (buffer, buffer_size); - -open_file: - fp = open_xa_file (inode, name, flags); - if (IS_ERR (fp)) { - err = PTR_ERR (fp); - goto out; - } - - xinode = fp->f_dentry->d_inode; - REISERFS_I(inode)->i_flags |= i_has_xattr_dir; - - /* we need to copy it off.. */ - if (xinode->i_nlink > 1) { - fput(fp); - err = reiserfs_xattr_del (inode, name); - if (err < 0) - goto out; - /* We just killed the old one, we're not replacing anymore */ - if (flags & XATTR_REPLACE) - flags &= ~XATTR_REPLACE; - goto open_file; - } - - /* Resize it so we're ok to write there */ - newattrs.ia_size = buffer_size; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - down (&xinode->i_sem); - err = notify_change(fp->f_dentry, &newattrs); - if (err) - goto out_filp; - - mapping = xinode->i_mapping; - while (buffer_pos < buffer_size || buffer_pos == 0) { - size_t chunk; - size_t skip = 0; - size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); - if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) - chunk = PAGE_CACHE_SIZE; - else - chunk = buffer_size - buffer_pos; - - page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); - if (IS_ERR (page)) { - err = PTR_ERR (page); - goto out_filp; - } - - lock_page (page); - data = page_address (page); - - if (file_pos == 0) { - struct reiserfs_xattr_header *rxh; - skip = file_pos = sizeof (struct reiserfs_xattr_header); - if (chunk + skip > PAGE_CACHE_SIZE) 
- chunk = PAGE_CACHE_SIZE - skip; - rxh = (struct reiserfs_xattr_header *)data; - rxh->h_magic = cpu_to_le32 (REISERFS_XATTR_MAGIC); - rxh->h_hash = cpu_to_le32 (xahash); - } - - err = mapping->a_ops->prepare_write (fp, page, page_offset, - page_offset + chunk + skip); - if (!err) { - if (buffer) - memcpy (data + skip, buffer + buffer_pos, chunk); - err = mapping->a_ops->commit_write (fp, page, page_offset, - page_offset + chunk + skip); + int err = 0; + struct file *fp; + struct page *page; + char *data; + struct address_space *mapping; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct inode *xinode; + struct iattr newattrs; + __u32 xahash = 0; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + /* Empty xattrs are ok, they're just empty files, no hash */ + if (buffer && buffer_size) + xahash = xattr_hash(buffer, buffer_size); + + open_file: + fp = open_xa_file(inode, name, flags); + if (IS_ERR(fp)) { + err = PTR_ERR(fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* we need to copy it off.. 
*/ + if (xinode->i_nlink > 1) { + fput(fp); + err = reiserfs_xattr_del(inode, name); + if (err < 0) + goto out; + /* We just killed the old one, we're not replacing anymore */ + if (flags & XATTR_REPLACE) + flags &= ~XATTR_REPLACE; + goto open_file; + } + + /* Resize it so we're ok to write there */ + newattrs.ia_size = buffer_size; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + down(&xinode->i_sem); + err = notify_change(fp->f_dentry, &newattrs); + if (err) + goto out_filp; + + mapping = xinode->i_mapping; + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_filp; + } + + lock_page(page); + data = page_address(page); + + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32(xahash); + } + + err = mapping->a_ops->prepare_write(fp, page, page_offset, + page_offset + chunk + skip); + if (!err) { + if (buffer) + memcpy(data + skip, buffer + buffer_pos, chunk); + err = + mapping->a_ops->commit_write(fp, page, page_offset, + page_offset + chunk + + skip); + } + unlock_page(page); + reiserfs_put_page(page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + /* We can't mark the inode dirty if it's not hashed. This is the case + * when we're inheriting the default ACL. If we dirty it, the inode + * gets marked dirty, but won't (ever) make it onto the dirty list until + * it's synced explicitly to clear I_DIRTY. 
This is bad. */ + if (!hlist_unhashed(&inode->i_hash)) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); } - unlock_page (page); - reiserfs_put_page (page); - buffer_pos += chunk; - file_pos += chunk; - skip = 0; - if (err || buffer_size == 0 || !buffer) - break; - } - - /* We can't mark the inode dirty if it's not hashed. This is the case - * when we're inheriting the default ACL. If we dirty it, the inode - * gets marked dirty, but won't (ever) make it onto the dirty list until - * it's synced explicitly to clear I_DIRTY. This is bad. */ - if (!hlist_unhashed(&inode->i_hash)) { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty (inode); - } - -out_filp: - up (&xinode->i_sem); - fput(fp); - -out: - return err; + + out_filp: + up(&xinode->i_sem); + fput(fp); + + out: + return err; } /* * inode->i_sem: down */ int -reiserfs_xattr_get (const struct inode *inode, const char *name, void *buffer, - size_t buffer_size) +reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, + size_t buffer_size) { - ssize_t err = 0; - struct file *fp; - size_t isize; - size_t file_pos = 0; - size_t buffer_pos = 0; - struct page *page; - struct inode *xinode; - __u32 hash = 0; - - if (name == NULL) - return -EINVAL; - - /* We can't have xattrs attached to v1 items since they don't have - * generation numbers */ - if (get_inode_sd_version (inode) == STAT_DATA_V1) - return -EOPNOTSUPP; - - fp = open_xa_file (inode, name, FL_READONLY); - if (IS_ERR (fp)) { - err = PTR_ERR (fp); - goto out; - } - - xinode = fp->f_dentry->d_inode; - isize = xinode->i_size; - REISERFS_I(inode)->i_flags |= i_has_xattr_dir; - - /* Just return the size needed */ - if (buffer == NULL) { - err = isize - sizeof (struct reiserfs_xattr_header); - goto out_dput; - } - - if (buffer_size < isize - sizeof (struct reiserfs_xattr_header)) { - err = -ERANGE; - goto out_dput; - } - - while (file_pos < isize) { - size_t chunk; - char *data; - size_t skip = 0; - if (isize - file_pos > 
PAGE_CACHE_SIZE) - chunk = PAGE_CACHE_SIZE; - else - chunk = isize - file_pos; - - page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); - if (IS_ERR (page)) { - err = PTR_ERR (page); - goto out_dput; - } - - lock_page (page); - data = page_address (page); - if (file_pos == 0) { - struct reiserfs_xattr_header *rxh = - (struct reiserfs_xattr_header *)data; - skip = file_pos = sizeof (struct reiserfs_xattr_header); - chunk -= skip; - /* Magic doesn't match up.. */ - if (rxh->h_magic != cpu_to_le32 (REISERFS_XATTR_MAGIC)) { - unlock_page (page); - reiserfs_put_page (page); - reiserfs_warning (inode->i_sb, "Invalid magic for xattr (%s) " - "associated with %k", name, - INODE_PKEY (inode)); - err = -EIO; - goto out_dput; - } - hash = le32_to_cpu (rxh->h_hash); - } - memcpy (buffer + buffer_pos, data + skip, chunk); - unlock_page (page); - reiserfs_put_page (page); - file_pos += chunk; - buffer_pos += chunk; - skip = 0; - } - err = isize - sizeof (struct reiserfs_xattr_header); - - if (xattr_hash (buffer, isize - sizeof (struct reiserfs_xattr_header)) != hash) { - reiserfs_warning (inode->i_sb, "Invalid hash for xattr (%s) associated " - "with %k", name, INODE_PKEY (inode)); - err = -EIO; - } - -out_dput: - fput(fp); - -out: - return err; + ssize_t err = 0; + struct file *fp; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + struct inode *xinode; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* We can't have xattrs attached to v1 items since they don't have + * generation numbers */ + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + fp = open_xa_file(inode, name, FL_READONLY); + if (IS_ERR(fp)) { + err = PTR_ERR(fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + isize = xinode->i_size; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof(struct reiserfs_xattr_header); + goto out_dput; + } + + if 
(buffer_size < isize - sizeof(struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_dput; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_dput; + } + + lock_page(page); + data = page_address(page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof(struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. */ + if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) { + unlock_page(page); + reiserfs_put_page(page); + reiserfs_warning(inode->i_sb, + "Invalid magic for xattr (%s) " + "associated with %k", name, + INODE_PKEY(inode)); + err = -EIO; + goto out_dput; + } + hash = le32_to_cpu(rxh->h_hash); + } + memcpy(buffer + buffer_pos, data + skip, chunk); + unlock_page(page); + reiserfs_put_page(page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof(struct reiserfs_xattr_header); + + if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) != + hash) { + reiserfs_warning(inode->i_sb, + "Invalid hash for xattr (%s) associated " + "with %k", name, INODE_PKEY(inode)); + err = -EIO; + } + + out_dput: + fput(fp); + + out: + return err; } static int -__reiserfs_xattr_del (struct dentry *xadir, const char *name, int namelen) +__reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen) { - struct dentry *dentry; - struct inode *dir = xadir->d_inode; - int err = 0; - - dentry = lookup_one_len (name, xadir, namelen); - if (IS_ERR (dentry)) { - err = PTR_ERR (dentry); - goto out; - } else if (!dentry->d_inode) { - err = -ENODATA; - goto out_file; - } - - /* Skip directories.. 
*/ - if (S_ISDIR (dentry->d_inode->i_mode)) - goto out_file; - - if (!is_reiserfs_priv_object (dentry->d_inode)) { - reiserfs_warning (dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have " - "priv flag set [parent is %sset].", - le32_to_cpu (INODE_PKEY (dentry->d_inode)->k_objectid), - xadir->d_name.len, xadir->d_name.name, namelen, name, - is_reiserfs_priv_object (xadir->d_inode) ? "" : "not "); - dput (dentry); - return -EIO; - } - - err = dir->i_op->unlink (dir, dentry); - if (!err) - d_delete (dentry); - -out_file: - dput (dentry); - -out: - return err; -} + struct dentry *dentry; + struct inode *dir = xadir->d_inode; + int err = 0; + + dentry = lookup_one_len(name, xadir, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } else if (!dentry->d_inode) { + err = -ENODATA; + goto out_file; + } + + /* Skip directories.. */ + if (S_ISDIR(dentry->d_inode->i_mode)) + goto out_file; + + if (!is_reiserfs_priv_object(dentry->d_inode)) { + reiserfs_warning(dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have " + "priv flag set [parent is %sset].", + le32_to_cpu(INODE_PKEY(dentry->d_inode)-> + k_objectid), xadir->d_name.len, + xadir->d_name.name, namelen, name, + is_reiserfs_priv_object(xadir-> + d_inode) ? 
"" : + "not "); + dput(dentry); + return -EIO; + } + err = dir->i_op->unlink(dir, dentry); + if (!err) + d_delete(dentry); -int -reiserfs_xattr_del (struct inode *inode, const char *name) + out_file: + dput(dentry); + + out: + return err; +} + +int reiserfs_xattr_del(struct inode *inode, const char *name) { - struct dentry *dir; - int err; + struct dentry *dir; + int err; - if (IS_RDONLY (inode)) - return -EROFS; + if (IS_RDONLY(inode)) + return -EROFS; - dir = open_xa_dir (inode, FL_READONLY); - if (IS_ERR (dir)) { - err = PTR_ERR (dir); - goto out; - } + dir = open_xa_dir(inode, FL_READONLY); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } - err = __reiserfs_xattr_del (dir, name, strlen (name)); - dput (dir); + err = __reiserfs_xattr_del(dir, name, strlen(name)); + dput(dir); - if (!err) { - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty (inode); - } + if (!err) { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } -out: - return err; + out: + return err; } /* The following are side effects of other operations that aren't explicitly @@ -777,167 +784,163 @@ out: * or ownership changes, object deletions, etc. 
*/ static int -reiserfs_delete_xattrs_filler (void *buf, const char *name, int namelen, - loff_t offset, ino_t ino, unsigned int d_type) +reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) { - struct dentry *xadir = (struct dentry *)buf; + struct dentry *xadir = (struct dentry *)buf; - return __reiserfs_xattr_del (xadir, name, namelen); + return __reiserfs_xattr_del(xadir, name, namelen); } /* This is called w/ inode->i_sem downed */ -int -reiserfs_delete_xattrs (struct inode *inode) +int reiserfs_delete_xattrs(struct inode *inode) { - struct file *fp; - struct dentry *dir, *root; - int err = 0; - - /* Skip out, an xattr has no xattrs associated with it */ - if (is_reiserfs_priv_object (inode) || - get_inode_sd_version (inode) == STAT_DATA_V1 || - !reiserfs_xattrs(inode->i_sb)) - { - return 0; - } - reiserfs_read_lock_xattrs (inode->i_sb); - dir = open_xa_dir (inode, FL_READONLY); - reiserfs_read_unlock_xattrs (inode->i_sb); - if (IS_ERR (dir)) { - err = PTR_ERR (dir); - goto out; - } else if (!dir->d_inode) { - dput (dir); - return 0; - } - - fp = dentry_open (dir, NULL, O_RDWR); - if (IS_ERR (fp)) { - err = PTR_ERR (fp); - /* dentry_open dputs the dentry if it fails */ - goto out; - } - - lock_kernel (); - err = xattr_readdir (fp, reiserfs_delete_xattrs_filler, dir); - if (err) { - unlock_kernel (); - goto out_dir; - } - - /* Leftovers besides . and .. -- that's not good. 
*/ - if (dir->d_inode->i_nlink <= 2) { - root = get_xa_root (inode->i_sb); - reiserfs_write_lock_xattrs (inode->i_sb); - err = vfs_rmdir (root->d_inode, dir); - reiserfs_write_unlock_xattrs (inode->i_sb); - dput (root); - } else { - reiserfs_warning (inode->i_sb, - "Couldn't remove all entries in directory"); - } - unlock_kernel (); - -out_dir: - fput(fp); - -out: - if (!err) - REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; - return err; + struct file *fp; + struct dentry *dir, *root; + int err = 0; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object(inode) || + get_inode_sd_version(inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) { + return 0; + } + reiserfs_read_lock_xattrs(inode->i_sb); + dir = open_xa_dir(inode, FL_READONLY); + reiserfs_read_unlock_xattrs(inode->i_sb); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } else if (!dir->d_inode) { + dput(dir); + return 0; + } + + fp = dentry_open(dir, NULL, O_RDWR); + if (IS_ERR(fp)) { + err = PTR_ERR(fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + lock_kernel(); + err = xattr_readdir(fp, reiserfs_delete_xattrs_filler, dir); + if (err) { + unlock_kernel(); + goto out_dir; + } + + /* Leftovers besides . and .. -- that's not good. 
*/ + if (dir->d_inode->i_nlink <= 2) { + root = get_xa_root(inode->i_sb); + reiserfs_write_lock_xattrs(inode->i_sb); + err = vfs_rmdir(root->d_inode, dir); + reiserfs_write_unlock_xattrs(inode->i_sb); + dput(root); + } else { + reiserfs_warning(inode->i_sb, + "Couldn't remove all entries in directory"); + } + unlock_kernel(); + + out_dir: + fput(fp); + + out: + if (!err) + REISERFS_I(inode)->i_flags = + REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; + return err; } struct reiserfs_chown_buf { - struct inode *inode; - struct dentry *xadir; - struct iattr *attrs; + struct inode *inode; + struct dentry *xadir; + struct iattr *attrs; }; /* XXX: If there is a better way to do this, I'd love to hear about it */ static int -reiserfs_chown_xattrs_filler (void *buf, const char *name, int namelen, - loff_t offset, ino_t ino, unsigned int d_type) +reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) { - struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; - struct dentry *xafile, *xadir = chown_buf->xadir; - struct iattr *attrs = chown_buf->attrs; - int err = 0; - - xafile = lookup_one_len (name, xadir, namelen); - if (IS_ERR (xafile)) - return PTR_ERR (xafile); - else if (!xafile->d_inode) { - dput (xafile); - return -ENODATA; - } - - if (!S_ISDIR (xafile->d_inode->i_mode)) - err = notify_change (xafile, attrs); - dput (xafile); - - return err; + struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; + struct dentry *xafile, *xadir = chown_buf->xadir; + struct iattr *attrs = chown_buf->attrs; + int err = 0; + + xafile = lookup_one_len(name, xadir, namelen); + if (IS_ERR(xafile)) + return PTR_ERR(xafile); + else if (!xafile->d_inode) { + dput(xafile); + return -ENODATA; + } + + if (!S_ISDIR(xafile->d_inode->i_mode)) + err = notify_change(xafile, attrs); + dput(xafile); + + return err; } -int -reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) +int 
reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) { - struct file *fp; - struct dentry *dir; - int err = 0; - struct reiserfs_chown_buf buf; - unsigned int ia_valid = attrs->ia_valid; - - /* Skip out, an xattr has no xattrs associated with it */ - if (is_reiserfs_priv_object (inode) || - get_inode_sd_version (inode) == STAT_DATA_V1 || - !reiserfs_xattrs(inode->i_sb)) - { - return 0; - } - reiserfs_read_lock_xattrs (inode->i_sb); - dir = open_xa_dir (inode, FL_READONLY); - reiserfs_read_unlock_xattrs (inode->i_sb); - if (IS_ERR (dir)) { - if (PTR_ERR (dir) != -ENODATA) - err = PTR_ERR (dir); - goto out; - } else if (!dir->d_inode) { - dput (dir); - goto out; - } - - fp = dentry_open (dir, NULL, O_RDWR); - if (IS_ERR (fp)) { - err = PTR_ERR (fp); - /* dentry_open dputs the dentry if it fails */ - goto out; - } - - lock_kernel (); - - attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); - buf.xadir = dir; - buf.attrs = attrs; - buf.inode = inode; - - err = xattr_readdir (fp, reiserfs_chown_xattrs_filler, &buf); - if (err) { - unlock_kernel (); - goto out_dir; - } - - err = notify_change (dir, attrs); - unlock_kernel (); - -out_dir: - fput(fp); - -out: - attrs->ia_valid = ia_valid; - return err; -} + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_chown_buf buf; + unsigned int ia_valid = attrs->ia_valid; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object(inode) || + get_inode_sd_version(inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) { + return 0; + } + reiserfs_read_lock_xattrs(inode->i_sb); + dir = open_xa_dir(inode, FL_READONLY); + reiserfs_read_unlock_xattrs(inode->i_sb); + if (IS_ERR(dir)) { + if (PTR_ERR(dir) != -ENODATA) + err = PTR_ERR(dir); + goto out; + } else if (!dir->d_inode) { + dput(dir); + goto out; + } + + fp = dentry_open(dir, NULL, O_RDWR); + if (IS_ERR(fp)) { + err = PTR_ERR(fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + 
lock_kernel(); + + attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); + buf.xadir = dir; + buf.attrs = attrs; + buf.inode = inode; + + err = xattr_readdir(fp, reiserfs_chown_xattrs_filler, &buf); + if (err) { + unlock_kernel(); + goto out_dir; + } + + err = notify_change(dir, attrs); + unlock_kernel(); + + out_dir: + fput(fp); + + out: + attrs->ia_valid = ia_valid; + return err; +} /* Actual operations that are exported to VFS-land */ @@ -946,61 +949,60 @@ out: * Preliminary locking: we down dentry->d_inode->i_sem */ ssize_t -reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer, - size_t size) +reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, + size_t size) { - struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); - int err; - - if (!xah || !reiserfs_xattrs(dentry->d_sb) || - get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) - return -EOPNOTSUPP; - - reiserfs_read_lock_xattr_i (dentry->d_inode); - reiserfs_read_lock_xattrs (dentry->d_sb); - err = xah->get (dentry->d_inode, name, buffer, size); - reiserfs_read_unlock_xattrs (dentry->d_sb); - reiserfs_read_unlock_xattr_i (dentry->d_inode); - return err; + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); + int err; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i(dentry->d_inode); + reiserfs_read_lock_xattrs(dentry->d_sb); + err = xah->get(dentry->d_inode, name, buffer, size); + reiserfs_read_unlock_xattrs(dentry->d_sb); + reiserfs_read_unlock_xattr_i(dentry->d_inode); + return err; } - /* * Inode operation setxattr() * * dentry->d_inode->i_sem down */ int -reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) { - struct reiserfs_xattr_handler *xah = 
find_xattr_handler_prefix (name); - int err; - int lock; - - if (!xah || !reiserfs_xattrs(dentry->d_sb) || - get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) - return -EOPNOTSUPP; - - if (IS_RDONLY (dentry->d_inode)) - return -EROFS; - - if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) - return -EROFS; - - reiserfs_write_lock_xattr_i (dentry->d_inode); - lock = !has_xattr_dir (dentry->d_inode); - if (lock) - reiserfs_write_lock_xattrs (dentry->d_sb); - else - reiserfs_read_lock_xattrs (dentry->d_sb); - err = xah->set (dentry->d_inode, name, value, size, flags); - if (lock) - reiserfs_write_unlock_xattrs (dentry->d_sb); - else - reiserfs_read_unlock_xattrs (dentry->d_sb); - reiserfs_write_unlock_xattr_i (dentry->d_inode); - return err; + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name); + int err; + int lock; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (IS_RDONLY(dentry->d_inode)) + return -EROFS; + + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) + return -EROFS; + + reiserfs_write_lock_xattr_i(dentry->d_inode); + lock = !has_xattr_dir(dentry->d_inode); + if (lock) + reiserfs_write_lock_xattrs(dentry->d_sb); + else + reiserfs_read_lock_xattrs(dentry->d_sb); + err = xah->set(dentry->d_inode, name, value, size, flags); + if (lock) + reiserfs_write_unlock_xattrs(dentry->d_sb); + else + reiserfs_read_unlock_xattrs(dentry->d_sb); + reiserfs_write_unlock_xattr_i(dentry->d_inode); + return err; } /* @@ -1008,344 +1010,343 @@ reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value, * * dentry->d_inode->i_sem down */ -int -reiserfs_removexattr (struct dentry *dentry, const char *name) +int reiserfs_removexattr(struct dentry *dentry, const char *name) { - int err; - struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int err; + struct reiserfs_xattr_handler *xah = 
find_xattr_handler_prefix(name); - if (!xah || !reiserfs_xattrs(dentry->d_sb) || - get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) - return -EOPNOTSUPP; + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; - if (IS_RDONLY (dentry->d_inode)) - return -EROFS; + if (IS_RDONLY(dentry->d_inode)) + return -EROFS; - if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) - return -EPERM; + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) + return -EPERM; - reiserfs_write_lock_xattr_i (dentry->d_inode); - reiserfs_read_lock_xattrs (dentry->d_sb); + reiserfs_write_lock_xattr_i(dentry->d_inode); + reiserfs_read_lock_xattrs(dentry->d_sb); - /* Deletion pre-operation */ - if (xah->del) { - err = xah->del (dentry->d_inode, name); - if (err) - goto out; - } + /* Deletion pre-operation */ + if (xah->del) { + err = xah->del(dentry->d_inode, name); + if (err) + goto out; + } - err = reiserfs_xattr_del (dentry->d_inode, name); + err = reiserfs_xattr_del(dentry->d_inode, name); - dentry->d_inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty (dentry->d_inode); + dentry->d_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(dentry->d_inode); -out: - reiserfs_read_unlock_xattrs (dentry->d_sb); - reiserfs_write_unlock_xattr_i (dentry->d_inode); - return err; + out: + reiserfs_read_unlock_xattrs(dentry->d_sb); + reiserfs_write_unlock_xattr_i(dentry->d_inode); + return err; } - /* This is what filldir will use: * r_pos will always contain the amount of space required for the entire * list. If r_pos becomes larger than r_size, we need more space and we * return an error indicating this. 
If r_pos is less than r_size, then we've * filled the buffer successfully and we return success */ struct reiserfs_listxattr_buf { - int r_pos; - int r_size; - char *r_buf; - struct inode *r_inode; + int r_pos; + int r_size; + char *r_buf; + struct inode *r_inode; }; static int -reiserfs_listxattr_filler (void *buf, const char *name, int namelen, - loff_t offset, ino_t ino, unsigned int d_type) +reiserfs_listxattr_filler(void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) { - struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; - int len = 0; - if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { - struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); - if (!xah) return 0; /* Unsupported xattr name, skip it */ - - /* We call ->list() twice because the operation isn't required to just - * return the name back - we want to make sure we have enough space */ - len += xah->list (b->r_inode, name, namelen, NULL); - - if (len) { - if (b->r_pos + len + 1 <= b->r_size) { - char *p = b->r_buf + b->r_pos; - p += xah->list (b->r_inode, name, namelen, p); - *p++ = '\0'; - } - b->r_pos += len + 1; - } - } - - return 0; + struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; + int len = 0; + if (name[0] != '.' + || (namelen != 1 && (name[1] != '.' 
|| namelen != 2))) { + struct reiserfs_xattr_handler *xah = + find_xattr_handler_prefix(name); + if (!xah) + return 0; /* Unsupported xattr name, skip it */ + + /* We call ->list() twice because the operation isn't required to just + * return the name back - we want to make sure we have enough space */ + len += xah->list(b->r_inode, name, namelen, NULL); + + if (len) { + if (b->r_pos + len + 1 <= b->r_size) { + char *p = b->r_buf + b->r_pos; + p += xah->list(b->r_inode, name, namelen, p); + *p++ = '\0'; + } + b->r_pos += len + 1; + } + } + + return 0; } + /* * Inode operation listxattr() * * Preliminary locking: we down dentry->d_inode->i_sem */ -ssize_t -reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size) +ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { - struct file *fp; - struct dentry *dir; - int err = 0; - struct reiserfs_listxattr_buf buf; - - if (!dentry->d_inode) - return -EINVAL; - - if (!reiserfs_xattrs(dentry->d_sb) || - get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) - return -EOPNOTSUPP; - - reiserfs_read_lock_xattr_i (dentry->d_inode); - reiserfs_read_lock_xattrs (dentry->d_sb); - dir = open_xa_dir (dentry->d_inode, FL_READONLY); - reiserfs_read_unlock_xattrs (dentry->d_sb); - if (IS_ERR (dir)) { - err = PTR_ERR (dir); - if (err == -ENODATA) - err = 0; /* Not an error if there aren't any xattrs */ - goto out; - } - - fp = dentry_open (dir, NULL, O_RDWR); - if (IS_ERR (fp)) { - err = PTR_ERR (fp); - /* dentry_open dputs the dentry if it fails */ - goto out; - } - - buf.r_buf = buffer; - buf.r_size = buffer ? 
size : 0; - buf.r_pos = 0; - buf.r_inode = dentry->d_inode; - - REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir; - - err = xattr_readdir (fp, reiserfs_listxattr_filler, &buf); - if (err) - goto out_dir; - - if (buf.r_pos > buf.r_size && buffer != NULL) - err = -ERANGE; - else - err = buf.r_pos; - -out_dir: - fput(fp); - -out: - reiserfs_read_unlock_xattr_i (dentry->d_inode); - return err; + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_listxattr_buf buf; + + if (!dentry->d_inode) + return -EINVAL; + + if (!reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i(dentry->d_inode); + reiserfs_read_lock_xattrs(dentry->d_sb); + dir = open_xa_dir(dentry->d_inode, FL_READONLY); + reiserfs_read_unlock_xattrs(dentry->d_sb); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + fp = dentry_open(dir, NULL, O_RDWR); + if (IS_ERR(fp)) { + err = PTR_ERR(fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + buf.r_buf = buffer; + buf.r_size = buffer ? 
size : 0; + buf.r_pos = 0; + buf.r_inode = dentry->d_inode; + + REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir; + + err = xattr_readdir(fp, reiserfs_listxattr_filler, &buf); + if (err) + goto out_dir; + + if (buf.r_pos > buf.r_size && buffer != NULL) + err = -ERANGE; + else + err = buf.r_pos; + + out_dir: + fput(fp); + + out: + reiserfs_read_unlock_xattr_i(dentry->d_inode); + return err; } /* This is the implementation for the xattr plugin infrastructure */ -static struct list_head xattr_handlers = LIST_HEAD_INIT (xattr_handlers); +static struct list_head xattr_handlers = LIST_HEAD_INIT(xattr_handlers); static DEFINE_RWLOCK(handler_lock); -static struct reiserfs_xattr_handler * -find_xattr_handler_prefix (const char *prefix) +static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char + *prefix) { - struct reiserfs_xattr_handler *xah = NULL; - struct list_head *p; - - read_lock (&handler_lock); - list_for_each (p, &xattr_handlers) { - xah = list_entry (p, struct reiserfs_xattr_handler, handlers); - if (strncmp (xah->prefix, prefix, strlen (xah->prefix)) == 0) - break; - xah = NULL; - } - - read_unlock (&handler_lock); - return xah; + struct reiserfs_xattr_handler *xah = NULL; + struct list_head *p; + + read_lock(&handler_lock); + list_for_each(p, &xattr_handlers) { + xah = list_entry(p, struct reiserfs_xattr_handler, handlers); + if (strncmp(xah->prefix, prefix, strlen(xah->prefix)) == 0) + break; + xah = NULL; + } + + read_unlock(&handler_lock); + return xah; } -static void -__unregister_handlers (void) +static void __unregister_handlers(void) { - struct reiserfs_xattr_handler *xah; - struct list_head *p, *tmp; + struct reiserfs_xattr_handler *xah; + struct list_head *p, *tmp; - list_for_each_safe (p, tmp, &xattr_handlers) { - xah = list_entry (p, struct reiserfs_xattr_handler, handlers); - if (xah->exit) - xah->exit(); + list_for_each_safe(p, tmp, &xattr_handlers) { + xah = list_entry(p, struct reiserfs_xattr_handler, handlers); + if 
(xah->exit) + xah->exit(); - list_del_init (p); - } - INIT_LIST_HEAD (&xattr_handlers); + list_del_init(p); + } + INIT_LIST_HEAD(&xattr_handlers); } -int __init -reiserfs_xattr_register_handlers (void) +int __init reiserfs_xattr_register_handlers(void) { - int err = 0; - struct reiserfs_xattr_handler *xah; - struct list_head *p; + int err = 0; + struct reiserfs_xattr_handler *xah; + struct list_head *p; - write_lock (&handler_lock); + write_lock(&handler_lock); - /* If we're already initialized, nothing to do */ - if (!list_empty (&xattr_handlers)) { - write_unlock (&handler_lock); - return 0; - } + /* If we're already initialized, nothing to do */ + if (!list_empty(&xattr_handlers)) { + write_unlock(&handler_lock); + return 0; + } - /* Add the handlers */ - list_add_tail (&user_handler.handlers, &xattr_handlers); - list_add_tail (&trusted_handler.handlers, &xattr_handlers); + /* Add the handlers */ + list_add_tail(&user_handler.handlers, &xattr_handlers); + list_add_tail(&trusted_handler.handlers, &xattr_handlers); #ifdef CONFIG_REISERFS_FS_SECURITY - list_add_tail (&security_handler.handlers, &xattr_handlers); + list_add_tail(&security_handler.handlers, &xattr_handlers); #endif #ifdef CONFIG_REISERFS_FS_POSIX_ACL - list_add_tail (&posix_acl_access_handler.handlers, &xattr_handlers); - list_add_tail (&posix_acl_default_handler.handlers, &xattr_handlers); + list_add_tail(&posix_acl_access_handler.handlers, &xattr_handlers); + list_add_tail(&posix_acl_default_handler.handlers, &xattr_handlers); #endif - /* Run initializers, if available */ - list_for_each (p, &xattr_handlers) { - xah = list_entry (p, struct reiserfs_xattr_handler, handlers); - if (xah->init) { - err = xah->init (); - if (err) { - list_del_init (p); - break; - } - } - } - - /* Clean up other handlers, if any failed */ - if (err) - __unregister_handlers (); - - write_unlock (&handler_lock); - return err; + /* Run initializers, if available */ + list_for_each(p, &xattr_handlers) { + xah = list_entry(p, 
struct reiserfs_xattr_handler, handlers); + if (xah->init) { + err = xah->init(); + if (err) { + list_del_init(p); + break; + } + } + } + + /* Clean up other handlers, if any failed */ + if (err) + __unregister_handlers(); + + write_unlock(&handler_lock); + return err; } -void -reiserfs_xattr_unregister_handlers (void) +void reiserfs_xattr_unregister_handlers(void) { - write_lock (&handler_lock); - __unregister_handlers (); - write_unlock (&handler_lock); + write_lock(&handler_lock); + __unregister_handlers(); + write_unlock(&handler_lock); } /* This will catch lookups from the fs root to .reiserfs_priv */ static int -xattr_lookup_poison (struct dentry *dentry, struct qstr *q1, struct qstr *name) +xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) { - struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; - if (name->len == priv_root->d_name.len && - name->hash == priv_root->d_name.hash && - !memcmp (name->name, priv_root->d_name.name, name->len)) { - return -ENOENT; - } else if (q1->len == name->len && - !memcmp(q1->name, name->name, name->len)) - return 0; - return 1; + struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; + if (name->len == priv_root->d_name.len && + name->hash == priv_root->d_name.hash && + !memcmp(name->name, priv_root->d_name.name, name->len)) { + return -ENOENT; + } else if (q1->len == name->len && + !memcmp(q1->name, name->name, name->len)) + return 0; + return 1; } static struct dentry_operations xattr_lookup_poison_ops = { - .d_compare = xattr_lookup_poison, + .d_compare = xattr_lookup_poison, }; - /* We need to take a copy of the mount flags since things like * MS_RDONLY don't get set until *after* we're called. 
* mount_flags != mount_options */ -int -reiserfs_xattr_init (struct super_block *s, int mount_flags) +int reiserfs_xattr_init(struct super_block *s, int mount_flags) { - int err = 0; - - /* We need generation numbers to ensure that the oid mapping is correct - * v3.5 filesystems don't have them. */ - if (!old_format_only (s)) { - set_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); - } else if (reiserfs_xattrs_optional (s)) { - /* Old format filesystem, but optional xattrs have been enabled - * at mount time. Error out. */ - reiserfs_warning (s, "xattrs/ACLs not supported on pre v3.6 " - "format filesystem. Failing mount."); - err = -EOPNOTSUPP; - goto error; - } else { - /* Old format filesystem, but no optional xattrs have been enabled. This - * means we silently disable xattrs on the filesystem. */ - clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); - } - - /* If we don't have the privroot located yet - go find it */ - if (reiserfs_xattrs (s) && !REISERFS_SB(s)->priv_root) { - struct dentry *dentry; - dentry = lookup_one_len (PRIVROOT_NAME, s->s_root, - strlen (PRIVROOT_NAME)); - if (!IS_ERR (dentry)) { - if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { - struct inode *inode = dentry->d_parent->d_inode; - down (&inode->i_sem); - err = inode->i_op->mkdir (inode, dentry, 0700); - up (&inode->i_sem); - if (err) { - dput (dentry); - dentry = NULL; - } - - if (dentry && dentry->d_inode) - reiserfs_warning (s, "Created %s on %s - reserved for " - "xattr storage.", PRIVROOT_NAME, - reiserfs_bdevname (inode->i_sb)); - } else if (!dentry->d_inode) { - dput (dentry); - dentry = NULL; - } - } else - err = PTR_ERR (dentry); - - if (!err && dentry) { - s->s_root->d_op = &xattr_lookup_poison_ops; - reiserfs_mark_inode_private (dentry->d_inode); - REISERFS_SB(s)->priv_root = dentry; - } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ - /* If we're read-only it just means that the dir hasn't been - * created. 
Not an error -- just no xattrs on the fs. We'll - * check again if we go read-write */ - reiserfs_warning (s, "xattrs/ACLs enabled and couldn't " - "find/create .reiserfs_priv. Failing mount."); - err = -EOPNOTSUPP; - } - } - -error: - /* This is only nonzero if there was an error initializing the xattr - * directory or if there is a condition where we don't support them. */ - if (err) { - clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); - clear_bit (REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); - clear_bit (REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); - } - - /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ - s->s_flags = s->s_flags & ~MS_POSIXACL; - if (reiserfs_posixacl (s)) - s->s_flags |= MS_POSIXACL; - - return err; + int err = 0; + + /* We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. */ + if (!old_format_only(s)) { + set_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } else if (reiserfs_xattrs_optional(s)) { + /* Old format filesystem, but optional xattrs have been enabled + * at mount time. Error out. */ + reiserfs_warning(s, "xattrs/ACLs not supported on pre v3.6 " + "format filesystem. Failing mount."); + err = -EOPNOTSUPP; + goto error; + } else { + /* Old format filesystem, but no optional xattrs have been enabled. This + * means we silently disable xattrs on the filesystem. 
*/ + clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* If we don't have the privroot located yet - go find it */ + if (reiserfs_xattrs(s) && !REISERFS_SB(s)->priv_root) { + struct dentry *dentry; + dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, + strlen(PRIVROOT_NAME)); + if (!IS_ERR(dentry)) { + if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { + struct inode *inode = dentry->d_parent->d_inode; + down(&inode->i_sem); + err = inode->i_op->mkdir(inode, dentry, 0700); + up(&inode->i_sem); + if (err) { + dput(dentry); + dentry = NULL; + } + + if (dentry && dentry->d_inode) + reiserfs_warning(s, + "Created %s on %s - reserved for " + "xattr storage.", + PRIVROOT_NAME, + reiserfs_bdevname + (inode->i_sb)); + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + } else + err = PTR_ERR(dentry); + + if (!err && dentry) { + s->s_root->d_op = &xattr_lookup_poison_ops; + reiserfs_mark_inode_private(dentry->d_inode); + REISERFS_SB(s)->priv_root = dentry; + } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ + /* If we're read-only it just means that the dir hasn't been + * created. Not an error -- just no xattrs on the fs. We'll + * check again if we go read-write */ + reiserfs_warning(s, "xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. Failing mount."); + err = -EOPNOTSUPP; + } + } + + error: + /* This is only nonzero if there was an error initializing the xattr + * directory or if there is a condition where we don't support them. */ + if (err) { + clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. 
*/ + s->s_flags = s->s_flags & ~MS_POSIXACL; + if (reiserfs_posixacl(s)) + s->s_flags |= MS_POSIXACL; + + return err; } static int -__reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd, - int need_lock) +__reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, + int need_lock) { - umode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; if (mask & MAY_WRITE) { /* @@ -1363,50 +1364,50 @@ __reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd, } /* We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. */ - if (is_reiserfs_priv_object (inode)) + * Permissions are determined by the "owning" object. */ + if (is_reiserfs_priv_object(inode)) return 0; if (current->fsuid == inode->i_uid) { mode >>= 6; #ifdef CONFIG_REISERFS_FS_POSIX_ACL } else if (reiserfs_posixacl(inode->i_sb) && - get_inode_sd_version (inode) != STAT_DATA_V1) { - struct posix_acl *acl; + get_inode_sd_version(inode) != STAT_DATA_V1) { + struct posix_acl *acl; /* ACL can't contain additional permissions if the ACL_MASK entry is 0 */ if (!(mode & S_IRWXG)) goto check_groups; - if (need_lock) { - reiserfs_read_lock_xattr_i (inode); - reiserfs_read_lock_xattrs (inode->i_sb); + if (need_lock) { + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); + } + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + if (need_lock) { + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); } - acl = reiserfs_get_acl (inode, ACL_TYPE_ACCESS); - if (need_lock) { - reiserfs_read_unlock_xattrs (inode->i_sb); - reiserfs_read_unlock_xattr_i (inode); + if (IS_ERR(acl)) { + if (PTR_ERR(acl) == -ENODATA) + goto check_groups; + return PTR_ERR(acl); } - if (IS_ERR (acl)) { - if (PTR_ERR (acl) == -ENODATA) - goto check_groups; - return PTR_ERR (acl); - } - - if (acl) { - int err = posix_acl_permission (inode, acl, mask); - posix_acl_release (acl); - if (err == 
-EACCES) { - goto check_capabilities; - } - return err; + + if (acl) { + int err = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + if (err == -EACCES) { + goto check_capabilities; + } + return err; } else { goto check_groups; - } + } #endif } else { -check_groups: + check_groups: if (in_group_p(inode->i_gid)) mode >>= 3; } @@ -1414,10 +1415,10 @@ check_groups: /* * If the DACs are ok we don't need any capability check. */ - if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) + if (((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask)) return 0; -check_capabilities: + check_capabilities: /* * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. @@ -1437,14 +1438,13 @@ check_capabilities: return -EACCES; } -int -reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd) +int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) { - return __reiserfs_permission (inode, mask, nd, 1); + return __reiserfs_permission(inode, mask, nd, 1); } int -reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd) +reiserfs_permission_locked(struct inode *inode, int mask, struct nameidata *nd) { - return __reiserfs_permission (inode, mask, nd, 0); + return __reiserfs_permission(inode, mask, nd, 0); } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index e302071903a..6703efa3c43 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -4,12 +4,13 @@ #include <linux/errno.h> #include <linux/pagemap.h> #include <linux/xattr.h> -#include <linux/xattr_acl.h> +#include <linux/posix_acl_xattr.h> #include <linux/reiserfs_xattr.h> #include <linux/reiserfs_acl.h> #include <asm/uaccess.h> -static int reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl); +static int reiserfs_set_acl(struct inode *inode, int type, + struct posix_acl *acl); static int xattr_set_acl(struct inode *inode, int type, const void 
*value, size_t size) @@ -34,14 +35,13 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) } else acl = NULL; - error = reiserfs_set_acl (inode, type, acl); + error = reiserfs_set_acl(inode, type, acl); -release_and_out: + release_and_out: posix_acl_release(acl); return error; } - static int xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) { @@ -51,7 +51,7 @@ xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) if (!reiserfs_posixacl(inode->i_sb)) return -EOPNOTSUPP; - acl = reiserfs_get_acl (inode, type); + acl = reiserfs_get_acl(inode, type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -62,12 +62,10 @@ xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) return error; } - /* * Convert from filesystem to in-memory representation. */ -static struct posix_acl * -posix_acl_from_disk(const void *value, size_t size) +static struct posix_acl *posix_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; @@ -76,8 +74,8 @@ posix_acl_from_disk(const void *value, size_t size) if (!value) return NULL; if (size < sizeof(reiserfs_acl_header)) - return ERR_PTR(-EINVAL); - if (((reiserfs_acl_header *)value)->a_version != + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *) value)->a_version != cpu_to_le32(REISERFS_ACL_VERSION)) return ERR_PTR(-EINVAL); value = (char *)value + sizeof(reiserfs_acl_header); @@ -89,41 +87,39 @@ posix_acl_from_disk(const void *value, size_t size) acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); - for (n=0; n < count; n++) { - reiserfs_acl_entry *entry = - (reiserfs_acl_entry *)value; + for (n = 0; n < count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value; if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) goto fail; - acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); 
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(reiserfs_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; - break; - - case ACL_USER: - case ACL_GROUP: - value = (char *)value + sizeof(reiserfs_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); - break; - - default: + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) goto fail; + acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + break; + + default: + goto fail; } } if (value != end) goto fail; return acl; -fail: + fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } @@ -131,46 +127,46 @@ fail: /* * Convert from in-memory to filesystem representation. 
*/ -static void * -posix_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) { reiserfs_acl_header *ext_acl; char *e; int n; *size = reiserfs_acl_size(acl->a_count); - ext_acl = (reiserfs_acl_header *)kmalloc(sizeof(reiserfs_acl_header) + - acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS); + ext_acl = (reiserfs_acl_header *) kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * + sizeof(reiserfs_acl_entry), + GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); e = (char *)ext_acl + sizeof(reiserfs_acl_header); - for (n=0; n < acl->a_count; n++) { - reiserfs_acl_entry *entry = (reiserfs_acl_entry *)e; - entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + for (n = 0; n < acl->a_count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER: - case ACL_GROUP: - entry->e_id = - cpu_to_le32(acl->a_entries[n].e_id); - e += sizeof(reiserfs_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(reiserfs_acl_entry_short); - break; - - default: - goto fail; + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; } } return (char *)ext_acl; -fail: + fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } @@ -181,59 +177,58 @@ fail: * inode->i_sem: down * BKL held [before 2.5.x] */ -struct posix_acl * -reiserfs_get_acl(struct inode *inode, int type) +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) { char *name, *value; struct posix_acl 
*acl, **p_acl; size_t size; int retval; - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); - - switch (type) { - case ACL_TYPE_ACCESS: - name = XATTR_NAME_ACL_ACCESS; - p_acl = &reiserfs_i->i_acl_access; - break; - case ACL_TYPE_DEFAULT: - name = XATTR_NAME_ACL_DEFAULT; - p_acl = &reiserfs_i->i_acl_default; - break; - default: - return ERR_PTR (-EINVAL); - } - - if (IS_ERR (*p_acl)) { - if (PTR_ERR (*p_acl) == -ENODATA) - return NULL; - } else if (*p_acl != NULL) - return posix_acl_dup (*p_acl); - - size = reiserfs_xattr_get (inode, name, NULL, 0); - if ((int)size < 0) { - if (size == -ENODATA || size == -ENOSYS) { - *p_acl = ERR_PTR (-ENODATA); - return NULL; - } - return ERR_PTR (size); - } + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + break; + default: + return ERR_PTR(-EINVAL); + } + + if (IS_ERR(*p_acl)) { + if (PTR_ERR(*p_acl) == -ENODATA) + return NULL; + } else if (*p_acl != NULL) + return posix_acl_dup(*p_acl); + + size = reiserfs_xattr_get(inode, name, NULL, 0); + if ((int)size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + *p_acl = ERR_PTR(-ENODATA); + return NULL; + } + return ERR_PTR(size); + } - value = kmalloc (size, GFP_NOFS); - if (!value) - return ERR_PTR (-ENOMEM); + value = kmalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); retval = reiserfs_xattr_get(inode, name, value, size); if (retval == -ENODATA || retval == -ENOSYS) { /* This shouldn't actually happen as it should have been caught above.. 
but just in case */ acl = NULL; - *p_acl = ERR_PTR (-ENODATA); - } else if (retval < 0) { + *p_acl = ERR_PTR(-ENODATA); + } else if (retval < 0) { acl = ERR_PTR(retval); } else { acl = posix_acl_from_disk(value, retval); - *p_acl = posix_acl_dup (acl); - } + *p_acl = posix_acl_dup(acl); + } kfree(value); return acl; @@ -248,72 +243,72 @@ reiserfs_get_acl(struct inode *inode, int type) static int reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { - char *name; + char *name; void *value = NULL; struct posix_acl **p_acl; size_t size; int error; - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - switch (type) { - case ACL_TYPE_ACCESS: - name = XATTR_NAME_ACL_ACCESS; - p_acl = &reiserfs_i->i_acl_access; - if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode (acl, &mode); - if (error < 0) - return error; - else { - inode->i_mode = mode; - if (error == 0) - acl = NULL; - } - } - break; - case ACL_TYPE_DEFAULT: - name = XATTR_NAME_ACL_DEFAULT; - p_acl = &reiserfs_i->i_acl_default; - if (!S_ISDIR (inode->i_mode)) - return acl ? -EACCES : 0; - break; - default: - return -EINVAL; - } - - if (acl) { - value = posix_acl_to_disk(acl, &size); - if (IS_ERR(value)) - return (int)PTR_ERR(value); - error = reiserfs_xattr_set(inode, name, value, size, 0); + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + error = reiserfs_xattr_set(inode, name, value, size, 0); } else { - error = reiserfs_xattr_del (inode, name); - if (error == -ENODATA) { - /* This may seem odd here, but it means that the ACL was set - * with a value representable with mode bits. If there was - * an ACL before, reiserfs_xattr_del already dirtied the inode. - */ - mark_inode_dirty (inode); - error = 0; - } - } + error = reiserfs_xattr_del(inode, name); + if (error == -ENODATA) { + /* This may seem odd here, but it means that the ACL was set + * with a value representable with mode bits. If there was + * an ACL before, reiserfs_xattr_del already dirtied the inode. + */ + mark_inode_dirty(inode); + error = 0; + } + } if (value) kfree(value); - if (!error) { - /* Release the old one */ - if (!IS_ERR (*p_acl) && *p_acl) - posix_acl_release (*p_acl); + if (!error) { + /* Release the old one */ + if (!IS_ERR(*p_acl) && *p_acl) + posix_acl_release(*p_acl); - if (acl == NULL) - *p_acl = ERR_PTR (-ENODATA); - else - *p_acl = posix_acl_dup (acl); - } + if (acl == NULL) + *p_acl = ERR_PTR(-ENODATA); + else + *p_acl = posix_acl_dup(acl); + } return error; } @@ -321,196 +316,194 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) /* dir->i_sem: down, * inode is new and not released into the wild yet */ int -reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode) +reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, + struct inode *inode) { - struct posix_acl *acl; - int err = 0; - - /* ACLs only get applied to files and directories */ - if (S_ISLNK (inode->i_mode)) - return 0; - - /* ACLs can only be used on "new" objects, so if it's an old object - * there is nothing to inherit from */ - if (get_inode_sd_version (dir) == STAT_DATA_V1) - goto apply_umask; - - /* Don't apply 
ACLs to objects in the .reiserfs_priv tree.. This - * would be useless since permissions are ignored, and a pain because - * it introduces locking cycles */ - if (is_reiserfs_priv_object (dir)) { - reiserfs_mark_inode_private (inode); - goto apply_umask; - } - - acl = reiserfs_get_acl (dir, ACL_TYPE_DEFAULT); - if (IS_ERR (acl)) { - if (PTR_ERR (acl) == -ENODATA) - goto apply_umask; - return PTR_ERR (acl); - } - - if (acl) { - struct posix_acl *acl_copy; - mode_t mode = inode->i_mode; - int need_acl; - - /* Copy the default ACL to the default ACL of a new directory */ - if (S_ISDIR (inode->i_mode)) { - err = reiserfs_set_acl (inode, ACL_TYPE_DEFAULT, acl); - if (err) - goto cleanup; - } - - /* Now we reconcile the new ACL and the mode, - potentially modifying both */ - acl_copy = posix_acl_clone (acl, GFP_NOFS); - if (!acl_copy) { - err = -ENOMEM; - goto cleanup; - } - - - need_acl = posix_acl_create_masq (acl_copy, &mode); - if (need_acl >= 0) { - if (mode != inode->i_mode) { - inode->i_mode = mode; - } - - /* If we need an ACL.. */ - if (need_acl > 0) { - err = reiserfs_set_acl (inode, ACL_TYPE_ACCESS, acl_copy); - if (err) - goto cleanup_copy; - } - } -cleanup_copy: - posix_acl_release (acl_copy); -cleanup: - posix_acl_release (acl); - } else { -apply_umask: - /* no ACL, apply umask */ - inode->i_mode &= ~current->fs->umask; - } - - return err; + struct posix_acl *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK(inode->i_mode)) + return 0; + + /* ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from */ + if (get_inode_sd_version(dir) == STAT_DATA_V1) + goto apply_umask; + + /* Don't apply ACLs to objects in the .reiserfs_priv tree.. 
This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles */ + if (is_reiserfs_priv_object(dir)) { + reiserfs_mark_inode_private(inode); + goto apply_umask; + } + + acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) { + if (PTR_ERR(acl) == -ENODATA) + goto apply_umask; + return PTR_ERR(acl); + } + + if (acl) { + struct posix_acl *acl_copy; + mode_t mode = inode->i_mode; + int need_acl; + + /* Copy the default ACL to the default ACL of a new directory */ + if (S_ISDIR(inode->i_mode)) { + err = reiserfs_set_acl(inode, ACL_TYPE_DEFAULT, acl); + if (err) + goto cleanup; + } + + /* Now we reconcile the new ACL and the mode, + potentially modifying both */ + acl_copy = posix_acl_clone(acl, GFP_NOFS); + if (!acl_copy) { + err = -ENOMEM; + goto cleanup; + } + + need_acl = posix_acl_create_masq(acl_copy, &mode); + if (need_acl >= 0) { + if (mode != inode->i_mode) { + inode->i_mode = mode; + } + + /* If we need an ACL.. */ + if (need_acl > 0) { + err = + reiserfs_set_acl(inode, ACL_TYPE_ACCESS, + acl_copy); + if (err) + goto cleanup_copy; + } + } + cleanup_copy: + posix_acl_release(acl_copy); + cleanup: + posix_acl_release(acl); + } else { + apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current->fs->umask; + } + + return err; } /* Looks up and caches the result of the default ACL. * We do this so that we don't need to carry the xattr_sem into * reiserfs_new_inode if we don't need to */ -int -reiserfs_cache_default_acl (struct inode *inode) +int reiserfs_cache_default_acl(struct inode *inode) { - int ret = 0; - if (reiserfs_posixacl (inode->i_sb) && - !is_reiserfs_priv_object (inode)) { - struct posix_acl *acl; - reiserfs_read_lock_xattr_i (inode); - reiserfs_read_lock_xattrs (inode->i_sb); - acl = reiserfs_get_acl (inode, ACL_TYPE_DEFAULT); - reiserfs_read_unlock_xattrs (inode->i_sb); - reiserfs_read_unlock_xattr_i (inode); - ret = acl ? 
1 : 0; - posix_acl_release (acl); - } - - return ret; + int ret = 0; + if (reiserfs_posixacl(inode->i_sb) && !is_reiserfs_priv_object(inode)) { + struct posix_acl *acl; + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); + ret = acl ? 1 : 0; + posix_acl_release(acl); + } + + return ret; } -int -reiserfs_acl_chmod (struct inode *inode) +int reiserfs_acl_chmod(struct inode *inode) { - struct posix_acl *acl, *clone; - int error; + struct posix_acl *acl, *clone; + int error; - if (S_ISLNK(inode->i_mode)) - return -EOPNOTSUPP; + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; - if (get_inode_sd_version (inode) == STAT_DATA_V1 || - !reiserfs_posixacl(inode->i_sb)) - { - return 0; + if (get_inode_sd_version(inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) { + return 0; } - reiserfs_read_lock_xattrs (inode->i_sb); - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - reiserfs_read_unlock_xattrs (inode->i_sb); - if (!acl) - return 0; - if (IS_ERR(acl)) - return PTR_ERR(acl); - clone = posix_acl_clone(acl, GFP_NOFS); - posix_acl_release(acl); - if (!clone) - return -ENOMEM; - error = posix_acl_chmod_masq(clone, inode->i_mode); - if (!error) { - int lock = !has_xattr_dir (inode); - reiserfs_write_lock_xattr_i (inode); - if (lock) - reiserfs_write_lock_xattrs (inode->i_sb); - else - reiserfs_read_lock_xattrs (inode->i_sb); - error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); - if (lock) - reiserfs_write_unlock_xattrs (inode->i_sb); - else - reiserfs_read_unlock_xattrs (inode->i_sb); - reiserfs_write_unlock_xattr_i (inode); - } - posix_acl_release(clone); - return error; + reiserfs_read_lock_xattrs(inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_read_unlock_xattrs(inode->i_sb); + if (!acl) + return 0; + if (IS_ERR(acl)) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, 
GFP_NOFS); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + int lock = !has_xattr_dir(inode); + reiserfs_write_lock_xattr_i(inode); + if (lock) + reiserfs_write_lock_xattrs(inode->i_sb); + else + reiserfs_read_lock_xattrs(inode->i_sb); + error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + if (lock) + reiserfs_write_unlock_xattrs(inode->i_sb); + else + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_write_unlock_xattr_i(inode); + } + posix_acl_release(clone); + return error; } static int posix_acl_access_get(struct inode *inode, const char *name, - void *buffer, size_t size) + void *buffer, size_t size) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) return -EINVAL; return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); } static int posix_acl_access_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) + const void *value, size_t size, int flags) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) return -EINVAL; return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); } -static int -posix_acl_access_del (struct inode *inode, const char *name) +static int posix_acl_access_del(struct inode *inode, const char *name) { - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); - struct posix_acl **acl = &reiserfs_i->i_acl_access; - if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) - return -EINVAL; - if (!IS_ERR (*acl) && *acl) { - posix_acl_release (*acl); - *acl = ERR_PTR (-ENODATA); - } - - return 0; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct posix_acl **acl = &reiserfs_i->i_acl_access; + if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1) + return -EINVAL; + if (!IS_ERR(*acl) && *acl) { + posix_acl_release(*acl); + *acl = ERR_PTR(-ENODATA); + } + + return 0; } static 
int -posix_acl_access_list (struct inode *inode, const char *name, int namelen, char *out) +posix_acl_access_list(struct inode *inode, const char *name, int namelen, + char *out) { - int len = namelen; - if (!reiserfs_posixacl (inode->i_sb)) - return 0; - if (out) - memcpy (out, name, len); + int len = namelen; + if (!reiserfs_posixacl(inode->i_sb)) + return 0; + if (out) + memcpy(out, name, len); - return len; + return len; } struct reiserfs_xattr_handler posix_acl_access_handler = { - .prefix = XATTR_NAME_ACL_ACCESS, + .prefix = POSIX_ACL_XATTR_ACCESS, .get = posix_acl_access_get, .set = posix_acl_access_set, .del = posix_acl_access_del, @@ -518,52 +511,52 @@ struct reiserfs_xattr_handler posix_acl_access_handler = { }; static int -posix_acl_default_get (struct inode *inode, const char *name, - void *buffer, size_t size) +posix_acl_default_get(struct inode *inode, const char *name, + void *buffer, size_t size) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) return -EINVAL; return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); } static int posix_acl_default_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) + const void *value, size_t size, int flags) { - if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) return -EINVAL; return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); } -static int -posix_acl_default_del (struct inode *inode, const char *name) +static int posix_acl_default_del(struct inode *inode, const char *name) { - struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); - struct posix_acl **acl = &reiserfs_i->i_acl_default; - if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) - return -EINVAL; - if (!IS_ERR (*acl) && *acl) { - posix_acl_release (*acl); - *acl = ERR_PTR (-ENODATA); - } - - return 0; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct 
posix_acl **acl = &reiserfs_i->i_acl_default; + if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1) + return -EINVAL; + if (!IS_ERR(*acl) && *acl) { + posix_acl_release(*acl); + *acl = ERR_PTR(-ENODATA); + } + + return 0; } static int -posix_acl_default_list (struct inode *inode, const char *name, int namelen, char *out) +posix_acl_default_list(struct inode *inode, const char *name, int namelen, + char *out) { - int len = namelen; - if (!reiserfs_posixacl (inode->i_sb)) - return 0; - if (out) - memcpy (out, name, len); + int len = namelen; + if (!reiserfs_posixacl(inode->i_sb)) + return 0; + if (out) + memcpy(out, name, len); - return len; + return len; } struct reiserfs_xattr_handler posix_acl_default_handler = { - .prefix = XATTR_NAME_ACL_DEFAULT, + .prefix = POSIX_ACL_XATTR_DEFAULT, .get = posix_acl_default_get, .set = posix_acl_default_set, .del = posix_acl_default_del, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index e044d511711..5e90a95ad60 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -9,57 +9,55 @@ #define XATTR_SECURITY_PREFIX "security." 
static int -security_get (struct inode *inode, const char *name, void *buffer, size_t size) +security_get(struct inode *inode, const char *name, void *buffer, size_t size) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; - if (is_reiserfs_priv_object(inode)) - return -EPERM; + if (is_reiserfs_priv_object(inode)) + return -EPERM; - return reiserfs_xattr_get (inode, name, buffer, size); + return reiserfs_xattr_get(inode, name, buffer, size); } static int -security_set (struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +security_set(struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; - if (is_reiserfs_priv_object(inode)) - return -EPERM; + if (is_reiserfs_priv_object(inode)) + return -EPERM; - return reiserfs_xattr_set (inode, name, buffer, size, flags); + return reiserfs_xattr_set(inode, name, buffer, size, flags); } -static int -security_del (struct inode *inode, const char *name) +static int security_del(struct inode *inode, const char *name) { - if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) - return -EINVAL; + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; - if (is_reiserfs_priv_object(inode)) - return -EPERM; + if (is_reiserfs_priv_object(inode)) + return -EPERM; - return 0; + return 0; } static int -security_list (struct inode *inode, const char *name, int namelen, char *out) +security_list(struct inode *inode, const char *name, int namelen, char *out) { - int len = namelen; + int len = namelen; - if (is_reiserfs_priv_object(inode)) - return 0; + if (is_reiserfs_priv_object(inode)) + return 0; - if (out) - memcpy (out, name, len); + if (out) + memcpy(out, name, len); - return len; + return len; } - struct reiserfs_xattr_handler 
security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = security_get, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 43762197fb0..2501f7e66ab 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -9,69 +9,67 @@ #define XATTR_TRUSTED_PREFIX "trusted." static int -trusted_get (struct inode *inode, const char *name, void *buffer, size_t size) +trusted_get(struct inode *inode, const char *name, void *buffer, size_t size) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; - if (!reiserfs_xattrs (inode->i_sb)) - return -EOPNOTSUPP; + if (!reiserfs_xattrs(inode->i_sb)) + return -EOPNOTSUPP; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) - return -EPERM; + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; - return reiserfs_xattr_get (inode, name, buffer, size); + return reiserfs_xattr_get(inode, name, buffer, size); } static int -trusted_set (struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +trusted_set(struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; - if (!reiserfs_xattrs (inode->i_sb)) - return -EOPNOTSUPP; + if (!reiserfs_xattrs(inode->i_sb)) + return -EOPNOTSUPP; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) - return -EPERM; + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; - return reiserfs_xattr_set (inode, name, buffer, size, flags); + return reiserfs_xattr_set(inode, name, buffer, size, flags); } -static int -trusted_del (struct inode *inode, const char *name) +static int trusted_del(struct inode *inode, const char *name) { - if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) - return -EINVAL; + if 
(strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; - if (!reiserfs_xattrs (inode->i_sb)) - return -EOPNOTSUPP; + if (!reiserfs_xattrs(inode->i_sb)) + return -EOPNOTSUPP; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) - return -EPERM; + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; - return 0; + return 0; } static int -trusted_list (struct inode *inode, const char *name, int namelen, char *out) +trusted_list(struct inode *inode, const char *name, int namelen, char *out) { - int len = namelen; + int len = namelen; - if (!reiserfs_xattrs (inode->i_sb)) - return 0; + if (!reiserfs_xattrs(inode->i_sb)) + return 0; - if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) - return 0; + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return 0; - if (out) - memcpy (out, name, len); + if (out) + memcpy(out, name, len); - return len; + return len; } - struct reiserfs_xattr_handler trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = trusted_get, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 0772806466a..51458048ca6 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -13,81 +13,80 @@ #define XATTR_USER_PREFIX "user." 
static int -user_get (struct inode *inode, const char *name, void *buffer, size_t size) +user_get(struct inode *inode, const char *name, void *buffer, size_t size) { - int error; + int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; - if (!reiserfs_xattrs_user (inode->i_sb)) - return -EOPNOTSUPP; + if (!reiserfs_xattrs_user(inode->i_sb)) + return -EOPNOTSUPP; - error = reiserfs_permission_locked (inode, MAY_READ, NULL); - if (error) - return error; + error = reiserfs_permission_locked(inode, MAY_READ, NULL); + if (error) + return error; - return reiserfs_xattr_get (inode, name, buffer, size); + return reiserfs_xattr_get(inode, name, buffer, size); } static int -user_set (struct inode *inode, const char *name, const void *buffer, - size_t size, int flags) +user_set(struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - int error; + int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; - if (!reiserfs_xattrs_user (inode->i_sb)) - return -EOPNOTSUPP; + if (!reiserfs_xattrs_user(inode->i_sb)) + return -EOPNOTSUPP; - if (!S_ISREG (inode->i_mode) && - (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; + if (!S_ISREG(inode->i_mode) && + (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; - error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); - if (error) - return error; + error = reiserfs_permission_locked(inode, MAY_WRITE, NULL); + if (error) + return error; - return reiserfs_xattr_set (inode, name, buffer, size, flags); + return reiserfs_xattr_set(inode, name, buffer, size, flags); } -static int -user_del (struct inode *inode, const char *name) +static int user_del(struct inode *inode, const char *name) { - int error; + int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) - return -EINVAL; + if 
(strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; - if (!reiserfs_xattrs_user (inode->i_sb)) - return -EOPNOTSUPP; + if (!reiserfs_xattrs_user(inode->i_sb)) + return -EOPNOTSUPP; - if (!S_ISREG (inode->i_mode) && - (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; + if (!S_ISREG(inode->i_mode) && + (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; - error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); - if (error) - return error; + error = reiserfs_permission_locked(inode, MAY_WRITE, NULL); + if (error) + return error; - return 0; + return 0; } static int -user_list (struct inode *inode, const char *name, int namelen, char *out) +user_list(struct inode *inode, const char *name, int namelen, char *out) { - int len = namelen; - if (!reiserfs_xattrs_user (inode->i_sb)) - return 0; + int len = namelen; + if (!reiserfs_xattrs_user(inode->i_sb)) + return 0; - if (out) - memcpy (out, name, len); + if (out) + memcpy(out, name, len); - return len; + return len; } struct reiserfs_xattr_handler user_handler = { diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c index 8b069e06433..0c64bc3a012 100644 --- a/fs/smbfs/symlink.c +++ b/fs/smbfs/symlink.c @@ -34,7 +34,7 @@ int smb_symlink(struct inode *inode, struct dentry *dentry, const char *oldname) return smb_proc_symlink(server_from_dentry(dentry), dentry, oldname); } -static int smb_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *smb_follow_link(struct dentry *dentry, struct nameidata *nd) { char *link = __getname(); DEBUG1("followlink of %s/%s\n", DENTRY_PATH(dentry)); @@ -52,10 +52,10 @@ static int smb_follow_link(struct dentry *dentry, struct nameidata *nd) } } nd_set_link(nd, link); - return 0; + return NULL; } -static void smb_put_link(struct dentry *dentry, struct nameidata *nd) +static void smb_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); if (!IS_ERR(s)) diff --git a/fs/super.c 
b/fs/super.c index 3a1b8ca04ba..6e57ee252e1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -341,20 +341,22 @@ static inline void write_super(struct super_block *sb) */ void sync_supers(void) { - struct super_block * sb; -restart: + struct super_block *sb; + spin_lock(&sb_lock); - sb = sb_entry(super_blocks.next); - while (sb != sb_entry(&super_blocks)) +restart: + list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); write_super(sb); - drop_super(sb); - goto restart; - } else - sb = sb_entry(sb->s_list.next); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + } spin_unlock(&sb_lock); } @@ -381,20 +383,16 @@ void sync_filesystems(int wait) down(&mutex); /* Could be down_interruptible */ spin_lock(&sb_lock); - for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); - sb = sb_entry(sb->s_list.next)) { + list_for_each_entry(sb, &super_blocks, s_list) { if (!sb->s_op->sync_fs) continue; if (sb->s_flags & MS_RDONLY) continue; sb->s_need_sync_fs = 1; } - spin_unlock(&sb_lock); restart: - spin_lock(&sb_lock); - for (sb = sb_entry(super_blocks.next); sb != sb_entry(&super_blocks); - sb = sb_entry(sb->s_list.next)) { + list_for_each_entry(sb, &super_blocks, s_list) { if (!sb->s_need_sync_fs) continue; sb->s_need_sync_fs = 0; @@ -405,8 +403,11 @@ restart: down_read(&sb->s_umount); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); - drop_super(sb); - goto restart; + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; } spin_unlock(&sb_lock); up(&mutex); @@ -422,21 +423,25 @@ restart: struct super_block * get_super(struct block_device *bdev) { - struct list_head *p; + struct super_block *sb; + if (!bdev) return NULL; -rescan: + spin_lock(&sb_lock); - list_for_each(p, &super_blocks) { - struct super_block *s = 
sb_entry(p); - if (s->s_bdev == bdev) { - s->s_count++; +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdev == bdev) { + sb->s_count++; spin_unlock(&sb_lock); - down_read(&s->s_umount); - if (s->s_root) - return s; - drop_super(s); - goto rescan; + down_read(&sb->s_umount); + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto rescan; } } spin_unlock(&sb_lock); @@ -447,20 +452,22 @@ EXPORT_SYMBOL(get_super); struct super_block * user_get_super(dev_t dev) { - struct list_head *p; + struct super_block *sb; -rescan: spin_lock(&sb_lock); - list_for_each(p, &super_blocks) { - struct super_block *s = sb_entry(p); - if (s->s_dev == dev) { - s->s_count++; +rescan: + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_dev == dev) { + sb->s_count++; spin_unlock(&sb_lock); - down_read(&s->s_umount); - if (s->s_root) - return s; - drop_super(s); - goto rescan; + down_read(&sb->s_umount); + if (sb->s_root) + return sb; + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto rescan; } } spin_unlock(&sb_lock); @@ -833,8 +840,8 @@ do_kern_mount(const char *fstype, int flags, const char *name, void *data) mnt->mnt_root = dget(sb->s_root); mnt->mnt_mountpoint = sb->s_root; mnt->mnt_parent = mnt; - mnt->mnt_namespace = current->namespace; up_write(&sb->s_umount); + free_secdata(secdata); put_filesystem(type); return mnt; out_sb: diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index d4aaa88d021..78899eeab97 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -25,7 +25,7 @@ fill_read(struct dentry *dentry, char *buffer, loff_t off, size_t count) struct kobject * kobj = to_kobj(dentry->d_parent); if (!attr->read) - return -EINVAL; + return -EIO; return attr->read(kobj, buffer, off, count); } @@ -71,7 +71,7 @@ flush_write(struct dentry 
*dentry, char *buffer, loff_t offset, size_t count) struct kobject *kobj = to_kobj(dentry->d_parent); if (!attr->write) - return -EINVAL; + return -EIO; return attr->write(kobj, buffer, offset, count); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index fe198210bc2..59734ba1ee6 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -8,6 +8,7 @@ #include <linux/mount.h> #include <linux/module.h> #include <linux/kobject.h> +#include <linux/namei.h> #include "sysfs.h" DECLARE_RWSEM(sysfs_rename_sem); @@ -99,20 +100,21 @@ static int create_dir(struct kobject * k, struct dentry * p, umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; down(&p->d_inode->i_sem); - *d = sysfs_get_dentry(p,n); + *d = lookup_one_len(n, p, strlen(n)); if (!IS_ERR(*d)) { - error = sysfs_create(*d, mode, init_dir); + error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR); if (!error) { - error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, - SYSFS_DIR); + error = sysfs_create(*d, mode, init_dir); if (!error) { p->d_inode->i_nlink++; (*d)->d_op = &sysfs_dentry_ops; d_rehash(*d); } } - if (error && (error != -EEXIST)) + if (error && (error != -EEXIST)) { + sysfs_put((*d)->d_fsdata); d_drop(*d); + } dput(*d); } else error = PTR_ERR(*d); @@ -171,17 +173,19 @@ static int sysfs_attach_attr(struct sysfs_dirent * sd, struct dentry * dentry) init = init_file; } + dentry->d_fsdata = sysfs_get(sd); + sd->s_dentry = dentry; error = sysfs_create(dentry, (attr->mode & S_IALLUGO) | S_IFREG, init); - if (error) + if (error) { + sysfs_put(sd); return error; + } if (bin_attr) { dentry->d_inode->i_size = bin_attr->size; dentry->d_inode->i_fop = &bin_fops; } dentry->d_op = &sysfs_dentry_ops; - dentry->d_fsdata = sysfs_get(sd); - sd->s_dentry = dentry; d_rehash(dentry); return 0; @@ -191,13 +195,15 @@ static int sysfs_attach_link(struct sysfs_dirent * sd, struct dentry * dentry) { int err = 0; + dentry->d_fsdata = sysfs_get(sd); + sd->s_dentry = dentry; err = sysfs_create(dentry, S_IFLNK|S_IRWXUGO, 
init_symlink); if (!err) { dentry->d_op = &sysfs_dentry_ops; - dentry->d_fsdata = sysfs_get(sd); - sd->s_dentry = dentry; d_rehash(dentry); - } + } else + sysfs_put(sd); + return err; } @@ -228,6 +234,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, + .setattr = sysfs_setattr, }; static void remove_dir(struct dentry * d) @@ -309,7 +316,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) down(&parent->d_inode->i_sem); - new_dentry = sysfs_get_dentry(parent, new_name); + new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { if (!new_dentry->d_inode) { error = kobject_set_name(kobj, "%s", new_name); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 364208071e1..4013d7905e8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -3,8 +3,9 @@ */ #include <linux/module.h> -#include <linux/dnotify.h> +#include <linux/fsnotify.h> #include <linux/kobject.h> +#include <linux/namei.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -13,7 +14,7 @@ #define to_subsys(k) container_of(k,struct subsystem,kset.kobj) #define to_sattr(a) container_of(a,struct subsys_attribute,attr) -/** +/* * Subsystem file operations. * These operations allow subsystems to have files that can be * read/written. 
@@ -23,7 +24,7 @@ subsys_attr_show(struct kobject * kobj, struct attribute * attr, char * page) { struct subsystem * s = to_subsys(kobj); struct subsys_attribute * sattr = to_sattr(attr); - ssize_t ret = 0; + ssize_t ret = -EIO; if (sattr->show) ret = sattr->show(s,page); @@ -36,7 +37,7 @@ subsys_attr_store(struct kobject * kobj, struct attribute * attr, { struct subsystem * s = to_subsys(kobj); struct subsys_attribute * sattr = to_sattr(attr); - ssize_t ret = 0; + ssize_t ret = -EIO; if (sattr->store) ret = sattr->store(s,page,count); @@ -182,7 +183,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t return -ENOMEM; if (count >= PAGE_SIZE) - count = PAGE_SIZE - 1; + count = PAGE_SIZE; error = copy_from_user(buffer->page,buf,count); buffer->needs_read_fill = 1; return error ? -EFAULT : count; @@ -191,8 +192,9 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t /** * flush_write_buffer - push buffer to kobject. - * @file: file pointer. + * @dentry: dentry to the attribute * @buffer: data buffer for file. + * @count: number of bytes * * Get the correct pointers for the kobject and the attribute we're * dealing with, then call the store() method for the attribute, @@ -389,9 +391,6 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr) * sysfs_update_file - update the modified timestamp on an object attribute. * @kobj: object we're acting for. * @attr: attribute descriptor. - * - * Also call dnotify for the dentry, which lots of userspace programs - * use. 
*/ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) { @@ -400,13 +399,13 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) int res = -ENOENT; down(&dir->d_inode->i_sem); - victim = sysfs_get_dentry(dir, attr->name); + victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { /* make sure dentry is really there */ if (victim->d_inode && (victim->d_parent->d_inode == dir->d_inode)) { victim->d_inode->i_mtime = CURRENT_TIME; - dnotify_parent(victim, DN_MODIFY); + fsnotify_modify(victim); /** * Drop reference from initial sysfs_get_dentry(). @@ -438,22 +437,24 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) { struct dentry *dir = kobj->dentry; struct dentry *victim; - struct sysfs_dirent *sd; - umode_t umode = (mode & S_IALLUGO) | S_IFREG; + struct inode * inode; + struct iattr newattrs; int res = -ENOENT; down(&dir->d_inode->i_sem); - victim = sysfs_get_dentry(dir, attr->name); + victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { if (victim->d_inode && (victim->d_parent->d_inode == dir->d_inode)) { - sd = victim->d_fsdata; - attr->mode = mode; - sd->s_mode = umode; - victim->d_inode->i_mode = umode; - dput(victim); - res = 0; + inode = victim->d_inode; + down(&inode->i_sem); + newattrs.ia_mode = (mode & S_IALLUGO) | + (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + res = notify_change(victim, &newattrs); + up(&inode->i_sem); } + dput(victim); } up(&dir->d_inode->i_sem); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index f11ac5ea702..122145b0895 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -11,6 +11,7 @@ #include <linux/kobject.h> #include <linux/module.h> #include <linux/dcache.h> +#include <linux/namei.h> #include <linux/err.h> #include "sysfs.h" @@ -68,7 +69,8 @@ void sysfs_remove_group(struct kobject * kobj, struct dentry * dir; if (grp->name) - dir = 
sysfs_get_dentry(kobj->dentry,grp->name); + dir = lookup_one_len(grp->name, kobj->dentry, + strlen(grp->name)); else dir = dget(kobj->dentry); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index aff7b2dfa8e..970a33f0329 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -26,18 +26,107 @@ static struct backing_dev_info sysfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -struct inode * sysfs_new_inode(mode_t mode) +static struct inode_operations sysfs_inode_operations ={ + .setattr = sysfs_setattr, +}; + +int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = dentry->d_inode; + struct sysfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + error = inode_setattr(inode, iattr); + if (error) + return error; + + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + memset(sd_iattr, 0, sizeof(struct iattr)); + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = 0; + sd_iattr->ia_gid = 0; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + + /* attributes were changed atleast once in past */ + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + 
umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent * sd) { struct inode * inode = new_inode(sysfs_sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; + inode->i_op = &sysfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); } return inode; } @@ -48,7 +137,8 @@ int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) struct inode * inode = NULL; if (dentry) { if (!dentry->d_inode) { - if ((inode = sysfs_new_inode(mode))) { + struct sysfs_dirent * sd = dentry->d_fsdata; + if ((inode = sysfs_new_inode(mode, sd))) { if (dentry->d_parent && dentry->d_parent->d_inode) { struct inode *p_inode = dentry->d_parent->d_inode; p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; @@ -76,16 +166,6 @@ int sysfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) return error; } -struct dentry * 
sysfs_get_dentry(struct dentry * parent, const char * name) -{ - struct qstr qstr; - - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name,qstr.len); - return lookup_hash(&qstr,parent); -} - /* * Get the name for corresponding element represented by the given sysfs_dirent */ @@ -148,6 +228,10 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name) struct sysfs_dirent * sd; struct sysfs_dirent * parent_sd = dir->d_fsdata; + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + down(&dir->d_inode->i_sem); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 5c805bb1a4b..f1117e885bd 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -28,6 +28,7 @@ static struct sysfs_dirent sysfs_root = { .s_children = LIST_HEAD_INIT(sysfs_root.s_children), .s_element = NULL, .s_type = SYSFS_ROOT, + .s_iattr = NULL, }; static int sysfs_fill_super(struct super_block *sb, void *data, int silent) @@ -42,7 +43,8 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sysfs_sb = sb; - inode = sysfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + inode = sysfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &sysfs_root); if (inode) { inode->i_op = &sysfs_dir_inode_operations; inode->i_fop = &sysfs_dir_operations; diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index dfdf7017435..de402fa915f 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -43,7 +43,7 @@ static void fill_object_path(struct kobject * kobj, char * buffer, int length) } } -static int sysfs_add_link(struct dentry * parent, char * name, struct kobject * target) +static int sysfs_add_link(struct dentry * parent, const char * name, struct kobject * target) { struct sysfs_dirent * parent_sd = parent->d_fsdata; struct sysfs_symlink * sl; @@ -79,7 +79,7 @@ exit1: * @target: object we're pointing to. 
* @name: name of the symlink. */ -int sysfs_create_link(struct kobject * kobj, struct kobject * target, char * name) +int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name) { struct dentry * dentry = kobj->dentry; int error = 0; @@ -99,13 +99,13 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, char * nam * @name: name of the symlink to remove. */ -void sysfs_remove_link(struct kobject * kobj, char * name) +void sysfs_remove_link(struct kobject * kobj, const char * name) { sysfs_hash_and_remove(kobj->dentry,name); } static int sysfs_get_target_path(struct kobject * kobj, struct kobject * target, - char *path) + char *path) { char * s; int depth, size; @@ -151,17 +151,17 @@ static int sysfs_getlink(struct dentry *dentry, char * path) } -static int sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); if (page) error = sysfs_getlink(dentry, (char *) page); nd_set_link(nd, error ? 
ERR_PTR(error) : (char *)page); - return 0; + return NULL; } -static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd) +static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { char *page = nd_get_link(nd); if (!IS_ERR(page)) diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a8a24a0c0b3..3f8953e0e5d 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -2,12 +2,11 @@ extern struct vfsmount * sysfs_mount; extern kmem_cache_t *sysfs_dir_cachep; -extern struct inode * sysfs_new_inode(mode_t mode); +extern struct inode * sysfs_new_inode(mode_t mode, struct sysfs_dirent *); extern int sysfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, umode_t, int); -extern struct dentry * sysfs_get_dentry(struct dentry *, const char *); extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); @@ -17,6 +16,7 @@ extern void sysfs_remove_subdir(struct dentry *); extern const unsigned char * sysfs_get_name(struct sysfs_dirent *sd); extern void sysfs_drop_dentry(struct sysfs_dirent *sd, struct dentry *parent); +extern int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); extern struct rw_semaphore sysfs_rename_sem; extern struct super_block * sysfs_sb; @@ -75,6 +75,7 @@ static inline void release_sysfs_dirent(struct sysfs_dirent * sd) kobject_put(sl->target_kobj); kfree(sl); } + kfree(sd->s_iattr); kmem_cache_free(sysfs_dir_cachep, sd); } diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c index ed637db2dcb..b85ce61d635 100644 --- a/fs/sysv/symlink.c +++ b/fs/sysv/symlink.c @@ -8,10 +8,10 @@ #include "sysv.h" #include <linux/namei.h> -static int sysv_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) { nd_set_link(nd, (char *)SYSV_I(dentry->d_inode)->i_data); - 
return 0; + return NULL; } struct inode_operations sysv_fast_symlink_inode_operations = { diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 3f6dc7112bc..ac191ed7df0 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -159,14 +159,12 @@ udf_find_entry(struct inode *dir, struct dentry *dentry, char *nameptr; uint8_t lfi; uint16_t liu; - loff_t size = (udf_ext0_offset(dir) + dir->i_size) >> 2; + loff_t size; kernel_lb_addr bloc, eloc; uint32_t extoffset, elen, offset; struct buffer_head *bh = NULL; - if (!dir) - return NULL; - + size = (udf_ext0_offset(dir) + dir->i_size) >> 2; f_pos = (udf_ext0_offset(dir) >> 2); fibh->soffset = fibh->eoffset = (f_pos & ((dir->i_sb->s_blocksize - 1) >> 2)) << 2; diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c index a0e49149098..337512ed578 100644 --- a/fs/ufs/symlink.c +++ b/fs/ufs/symlink.c @@ -29,11 +29,11 @@ #include <linux/namei.h> #include <linux/ufs_fs.h> -static int ufs_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) { struct ufs_inode_info *p = UFS_I(dentry->d_inode); nd_set_link(nd, (char*)p->i_u1.i_symlink); - return 0; + return NULL; } struct inode_operations ufs_fast_symlink_inode_operations = { diff --git a/fs/xattr.c b/fs/xattr.c index 93dee70a1db..6acd5c63da9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -16,6 +16,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/module.h> +#include <linux/fsnotify.h> #include <asm/uaccess.h> /* @@ -57,8 +58,10 @@ setxattr(struct dentry *d, char __user *name, void __user *value, if (error) goto out; error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags); - if (!error) + if (!error) { + fsnotify_xattr(d); security_inode_post_setxattr(d, kname, kvalue, size, flags); + } out: up(&d->d_inode->i_sem); } diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 93ce257cd14..a3a4b5aaf5d 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ 
b/fs/xfs/linux-2.6/xfs_aops.c @@ -149,11 +149,12 @@ linvfs_unwritten_convert( */ STATIC void linvfs_unwritten_convert_direct( - struct inode *inode, + struct kiocb *iocb, loff_t offset, ssize_t size, void *private) { + struct inode *inode = iocb->ki_filp->f_dentry->d_inode; ASSERT(!private || inode == (struct inode *)private); /* private indicates an unwritten extent lay beneath this IO */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 997963e5362..df0cba239dd 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -61,12 +61,13 @@ * File wide globals */ -STATIC kmem_cache_t *pagebuf_cache; +STATIC kmem_cache_t *pagebuf_zone; STATIC kmem_shaker_t pagebuf_shake; -STATIC int pagebuf_daemon_wakeup(int, unsigned int); +STATIC int xfsbufd_wakeup(int, unsigned int); STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); -STATIC struct workqueue_struct *pagebuf_logio_workqueue; -STATIC struct workqueue_struct *pagebuf_dataio_workqueue; + +STATIC struct workqueue_struct *xfslogd_workqueue; +STATIC struct workqueue_struct *xfsdatad_workqueue; /* * Pagebuf debugging @@ -123,9 +124,9 @@ ktrace_t *pagebuf_trace_buf; #define pagebuf_allocate(flags) \ - kmem_zone_alloc(pagebuf_cache, pb_to_km(flags)) + kmem_zone_alloc(pagebuf_zone, pb_to_km(flags)) #define pagebuf_deallocate(pb) \ - kmem_zone_free(pagebuf_cache, (pb)); + kmem_zone_free(pagebuf_zone, (pb)); /* * Page Region interfaces. @@ -425,7 +426,7 @@ _pagebuf_lookup_pages( __FUNCTION__, gfp_mask); XFS_STATS_INC(pb_page_retries); - pagebuf_daemon_wakeup(0, gfp_mask); + xfsbufd_wakeup(0, gfp_mask); blk_congestion_wait(WRITE, HZ/50); goto retry; } @@ -1136,8 +1137,8 @@ pagebuf_iodone( if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) { if (schedule) { INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb); - queue_work(dataio ? pagebuf_dataio_workqueue : - pagebuf_logio_workqueue, &pb->pb_iodone_work); + queue_work(dataio ? 
xfsdatad_workqueue : + xfslogd_workqueue, &pb->pb_iodone_work); } else { pagebuf_iodone_work(pb); } @@ -1562,16 +1563,6 @@ xfs_free_buftarg( kmem_free(btp, sizeof(*btp)); } -void -xfs_incore_relse( - xfs_buftarg_t *btp, - int delwri_only, - int wait) -{ - invalidate_bdev(btp->pbr_bdev, 1); - truncate_inode_pages(btp->pbr_mapping, 0LL); -} - STATIC int xfs_setsize_buftarg_flags( xfs_buftarg_t *btp, @@ -1742,27 +1733,27 @@ pagebuf_runall_queues( } /* Defines for pagebuf daemon */ -STATIC DECLARE_COMPLETION(pagebuf_daemon_done); -STATIC struct task_struct *pagebuf_daemon_task; -STATIC int pagebuf_daemon_active; -STATIC int force_flush; -STATIC int force_sleep; +STATIC DECLARE_COMPLETION(xfsbufd_done); +STATIC struct task_struct *xfsbufd_task; +STATIC int xfsbufd_active; +STATIC int xfsbufd_force_flush; +STATIC int xfsbufd_force_sleep; STATIC int -pagebuf_daemon_wakeup( +xfsbufd_wakeup( int priority, unsigned int mask) { - if (force_sleep) + if (xfsbufd_force_sleep) return 0; - force_flush = 1; + xfsbufd_force_flush = 1; barrier(); - wake_up_process(pagebuf_daemon_task); + wake_up_process(xfsbufd_task); return 0; } STATIC int -pagebuf_daemon( +xfsbufd( void *data) { struct list_head tmp; @@ -1774,17 +1765,17 @@ pagebuf_daemon( daemonize("xfsbufd"); current->flags |= PF_MEMALLOC; - pagebuf_daemon_task = current; - pagebuf_daemon_active = 1; + xfsbufd_task = current; + xfsbufd_active = 1; barrier(); INIT_LIST_HEAD(&tmp); do { - if (unlikely(current->flags & PF_FREEZE)) { - force_sleep = 1; - refrigerator(PF_FREEZE); + if (unlikely(freezing(current))) { + xfsbufd_force_sleep = 1; + refrigerator(); } else { - force_sleep = 0; + xfsbufd_force_sleep = 0; } set_current_state(TASK_INTERRUPTIBLE); @@ -1797,7 +1788,7 @@ pagebuf_daemon( ASSERT(pb->pb_flags & PBF_DELWRI); if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { - if (!force_flush && + if (!xfsbufd_force_flush && time_before(jiffies, pb->pb_queuetime + age)) { pagebuf_unlock(pb); @@ -1824,10 +1815,10 @@ pagebuf_daemon( 
if (as_list_len > 0) purge_addresses(); - force_flush = 0; - } while (pagebuf_daemon_active); + xfsbufd_force_flush = 0; + } while (xfsbufd_active); - complete_and_exit(&pagebuf_daemon_done, 0); + complete_and_exit(&xfsbufd_done, 0); } /* @@ -1844,8 +1835,8 @@ xfs_flush_buftarg( xfs_buf_t *pb, *n; int pincount = 0; - pagebuf_runall_queues(pagebuf_dataio_workqueue); - pagebuf_runall_queues(pagebuf_logio_workqueue); + pagebuf_runall_queues(xfsdatad_workqueue); + pagebuf_runall_queues(xfslogd_workqueue); INIT_LIST_HEAD(&tmp); spin_lock(&pbd_delwrite_lock); @@ -1898,43 +1889,43 @@ xfs_flush_buftarg( } STATIC int -pagebuf_daemon_start(void) +xfs_buf_daemons_start(void) { - int rval; + int error = -ENOMEM; - pagebuf_logio_workqueue = create_workqueue("xfslogd"); - if (!pagebuf_logio_workqueue) - return -ENOMEM; + xfslogd_workqueue = create_workqueue("xfslogd"); + if (!xfslogd_workqueue) + goto out; - pagebuf_dataio_workqueue = create_workqueue("xfsdatad"); - if (!pagebuf_dataio_workqueue) { - destroy_workqueue(pagebuf_logio_workqueue); - return -ENOMEM; - } + xfsdatad_workqueue = create_workqueue("xfsdatad"); + if (!xfsdatad_workqueue) + goto out_destroy_xfslogd_workqueue; - rval = kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES); - if (rval < 0) { - destroy_workqueue(pagebuf_logio_workqueue); - destroy_workqueue(pagebuf_dataio_workqueue); - } + error = kernel_thread(xfsbufd, NULL, CLONE_FS|CLONE_FILES); + if (error < 0) + goto out_destroy_xfsdatad_workqueue; + return 0; - return rval; + out_destroy_xfsdatad_workqueue: + destroy_workqueue(xfsdatad_workqueue); + out_destroy_xfslogd_workqueue: + destroy_workqueue(xfslogd_workqueue); + out: + return error; } /* - * pagebuf_daemon_stop - * * Note: do not mark as __exit, it is called from pagebuf_terminate. 
*/ STATIC void -pagebuf_daemon_stop(void) +xfs_buf_daemons_stop(void) { - pagebuf_daemon_active = 0; + xfsbufd_active = 0; barrier(); - wait_for_completion(&pagebuf_daemon_done); + wait_for_completion(&xfsbufd_done); - destroy_workqueue(pagebuf_logio_workqueue); - destroy_workqueue(pagebuf_dataio_workqueue); + destroy_workqueue(xfslogd_workqueue); + destroy_workqueue(xfsdatad_workqueue); } /* @@ -1944,27 +1935,37 @@ pagebuf_daemon_stop(void) int __init pagebuf_init(void) { - pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0, - SLAB_HWCACHE_ALIGN, NULL, NULL); - if (pagebuf_cache == NULL) { - printk("XFS: couldn't init xfs_buf_t cache\n"); - pagebuf_terminate(); - return -ENOMEM; - } + int error = -ENOMEM; + + pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf"); + if (!pagebuf_zone) + goto out; #ifdef PAGEBUF_TRACE pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); #endif - pagebuf_daemon_start(); + error = xfs_buf_daemons_start(); + if (error) + goto out_free_buf_zone; - pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup); - if (pagebuf_shake == NULL) { - pagebuf_terminate(); - return -ENOMEM; + pagebuf_shake = kmem_shake_register(xfsbufd_wakeup); + if (!pagebuf_shake) { + error = -ENOMEM; + goto out_stop_daemons; } return 0; + + out_stop_daemons: + xfs_buf_daemons_stop(); + out_free_buf_zone: +#ifdef PAGEBUF_TRACE + ktrace_free(pagebuf_trace_buf); +#endif + kmem_zone_destroy(pagebuf_zone); + out: + return error; } @@ -1976,12 +1977,12 @@ pagebuf_init(void) void pagebuf_terminate(void) { - pagebuf_daemon_stop(); + xfs_buf_daemons_stop(); #ifdef PAGEBUF_TRACE ktrace_free(pagebuf_trace_buf); #endif - kmem_zone_destroy(pagebuf_cache); + kmem_zone_destroy(pagebuf_zone); kmem_shake_deregister(pagebuf_shake); } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 74deed8e6d9..3f8f69a66ae 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -576,7 +576,6 @@ extern xfs_buftarg_t 
*xfs_alloc_buftarg(struct block_device *, int); extern void xfs_free_buftarg(xfs_buftarg_t *, int); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); -extern void xfs_incore_relse(xfs_buftarg_t *, int, int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); #define xfs_getsize_buftarg(buftarg) \ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 24fa3b101b9..f1ce4323f56 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -57,7 +57,9 @@ #include <linux/smp_lock.h> static struct vm_operations_struct linvfs_file_vm_ops; - +#ifdef CONFIG_XFS_DMAPI +static struct vm_operations_struct linvfs_dmapi_file_vm_ops; +#endif STATIC inline ssize_t __linvfs_read( @@ -388,6 +390,14 @@ done: return -error; } +#ifdef CONFIG_XFS_DMAPI +STATIC void +linvfs_mmap_close( + struct vm_area_struct *vma) +{ + xfs_dm_mm_put(vma); +} +#endif /* CONFIG_XFS_DMAPI */ STATIC int linvfs_file_mmap( @@ -399,16 +409,19 @@ linvfs_file_mmap( vattr_t va = { .va_mask = XFS_AT_UPDATIME }; int error; + vma->vm_ops = &linvfs_file_vm_ops; + if (vp->v_vfsp->vfs_flag & VFS_DMI) { xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); error = -XFS_SEND_MMAP(mp, vma, 0); if (error) return error; +#ifdef CONFIG_XFS_DMAPI + vma->vm_ops = &linvfs_dmapi_file_vm_ops; +#endif } - vma->vm_ops = &linvfs_file_vm_ops; - VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error); if (!error) vn_revalidate(vp); /* update Linux inode flags */ @@ -609,7 +622,15 @@ struct file_operations linvfs_dir_operations = { static struct vm_operations_struct linvfs_file_vm_ops = { .nopage = filemap_nopage, .populate = filemap_populate, +}; + +#ifdef CONFIG_XFS_DMAPI +static struct vm_operations_struct linvfs_dmapi_file_vm_ops = { + .close = linvfs_mmap_close, + .nopage = filemap_nopage, + .populate = filemap_populate, #ifdef HAVE_VMOP_MPROTECT .mprotect = linvfs_mprotect, #endif }; +#endif /* CONFIG_XFS_DMAPI */ diff --git 
a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 69809eef8a5..05a447e51cc 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -1174,7 +1174,8 @@ xfs_ioc_xattr( switch (cmd) { case XFS_IOC_FSGETXATTR: { - va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS; + va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \ + XFS_AT_NEXTENTS | XFS_AT_PROJID; VOP_GETATTR(vp, &va, 0, NULL, error); if (error) return -error; @@ -1182,6 +1183,7 @@ xfs_ioc_xattr( fa.fsx_xflags = va.va_xflags; fa.fsx_extsize = va.va_extsize; fa.fsx_nextents = va.va_nextents; + fa.fsx_projid = va.va_projid; if (copy_to_user(arg, &fa, sizeof(fa))) return -XFS_ERROR(EFAULT); @@ -1196,9 +1198,10 @@ xfs_ioc_xattr( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) attr_flags |= ATTR_NONBLOCK; - va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE; + va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; va.va_xflags = fa.fsx_xflags; va.va_extsize = fa.fsx_extsize; + va.va_projid = fa.fsx_projid; VOP_SETATTR(vp, &va, attr_flags, NULL, error); if (!error) @@ -1207,7 +1210,8 @@ xfs_ioc_xattr( } case XFS_IOC_FSGETXATTRA: { - va.va_mask = XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_ANEXTENTS; + va.va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \ + XFS_AT_ANEXTENTS | XFS_AT_PROJID; VOP_GETATTR(vp, &va, 0, NULL, error); if (error) return -error; @@ -1215,6 +1219,7 @@ xfs_ioc_xattr( fa.fsx_xflags = va.va_xflags; fa.fsx_extsize = va.va_extsize; fa.fsx_nextents = va.va_anextents; + fa.fsx_projid = va.va_projid; if (copy_to_user(arg, &fa, sizeof(fa))) return -XFS_ERROR(EFAULT); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 407e9935939..f252605514e 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -374,7 +374,7 @@ linvfs_rename( * we need to be very careful about how much stack we use. * uio is kmalloced for this reason... 
*/ -STATIC int +STATIC void * linvfs_follow_link( struct dentry *dentry, struct nameidata *nd) @@ -391,14 +391,14 @@ linvfs_follow_link( link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL); if (!link) { nd_set_link(nd, ERR_PTR(-ENOMEM)); - return 0; + return NULL; } uio = (uio_t *)kmalloc(sizeof(uio_t), GFP_KERNEL); if (!uio) { kfree(link); nd_set_link(nd, ERR_PTR(-ENOMEM)); - return 0; + return NULL; } vp = LINVFS_GET_VP(dentry->d_inode); @@ -422,10 +422,10 @@ linvfs_follow_link( kfree(uio); nd_set_link(nd, link); - return 0; + return NULL; } -static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd) +static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); if (!IS_ERR(s)) diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 71bb41019a1..42dc5e4662e 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -145,10 +145,10 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh) #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val -#ifndef __smp_processor_id -#define __smp_processor_id() smp_processor_id() +#ifndef raw_smp_processor_id +#define raw_smp_processor_id() smp_processor_id() #endif -#define current_cpu() __smp_processor_id() +#define current_cpu() raw_smp_processor_id() #define current_pid() (current->pid) #define current_fsuid(cred) (current->fsuid) #define current_fsgid(cred) (current->fsgid) @@ -230,8 +230,10 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh) * field (see the QCMD macro in quota.h). These macros help keep the * code portable - they are not visible from the syscall interface. 
*/ -#define Q_XSETGQLIM XQM_CMD(0x8) /* set groups disk limits */ -#define Q_XGETGQUOTA XQM_CMD(0x9) /* get groups disk limits */ +#define Q_XSETGQLIM XQM_CMD(8) /* set groups disk limits */ +#define Q_XGETGQUOTA XQM_CMD(9) /* get groups disk limits */ +#define Q_XSETPQLIM XQM_CMD(10) /* set projects disk limits */ +#define Q_XGETPQUOTA XQM_CMD(11) /* get projects disk limits */ /* IRIX uses a dynamic sizing algorithm (ndquot = 200 + numprocs*2) */ /* we may well need to fine-tune this if it ever becomes an issue. */ diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index aa9daaea6c3..acab58c4804 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -209,30 +209,6 @@ unlock: return (-status); } -/* - * xfs_inval_cached_pages - * - * This routine is responsible for keeping direct I/O and buffered I/O - * somewhat coherent. From here we make sure that we're at least - * temporarily holding the inode I/O lock exclusively and then call - * the page cache to flush and invalidate any cached pages. If there - * are no cached pages this routine will be very quick. 
- */ -void -xfs_inval_cached_pages( - vnode_t *vp, - xfs_iocore_t *io, - xfs_off_t offset, - int write, - int relock) -{ - if (VN_CACHED(vp)) { - xfs_inval_cached_trace(io, offset, -1, ctooff(offtoct(offset)), -1); - VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(offset)), -1, FI_REMAPF_LOCKED); - } - -} - ssize_t /* bytes read, or (-) error */ xfs_read( bhv_desc_t *bdp, @@ -304,10 +280,11 @@ xfs_read( if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { vrwlock_t locktype = VRWLOCK_READ; + int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, size, - FILP_DELAY_FLAG(file), &locktype); + dmflags, &locktype); if (ret) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); goto unlock_isem; @@ -867,11 +844,15 @@ retry: !(ioflags & IO_INVIS)) { xfs_rwunlock(bdp, locktype); + if (need_isem) + up(&inode->i_sem); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) - goto out_unlock_isem; + goto out_nounlocks; + if (need_isem) + down(&inode->i_sem); xfs_rwlock(bdp, locktype); pos = xip->i_d.di_size; ret = 0; @@ -986,6 +967,7 @@ retry: out_unlock_isem: if (need_isem) up(&inode->i_sem); + out_nounlocks: return -error; } diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index d723e35254a..f197a720e39 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -94,8 +94,6 @@ extern int xfs_bdstrat_cb(struct xfs_buf *); extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t, xfs_fsize_t, xfs_fsize_t); -extern void xfs_inval_cached_pages(struct vnode *, struct xfs_iocore *, - xfs_off_t, int, int); extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, const struct iovec *, unsigned int, loff_t *, int, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 455e2b2fb96..f6dd7de2592 100644 --- 
a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -483,7 +483,7 @@ xfssyncd( set_current_state(TASK_INTERRUPTIBLE); timeleft = schedule_timeout(timeleft); /* swsusp */ - try_to_freeze(PF_FREEZE); + try_to_freeze(); if (vfsp->vfs_flag & VFS_UMOUNT) break; @@ -590,8 +590,10 @@ linvfs_sync_super( int error; int flags = SYNC_FSDATA; - if (wait) - flags |= SYNC_WAIT; + if (unlikely(sb->s_frozen == SB_FREEZE_WRITE)) + flags = SYNC_QUIESCE; + else + flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0); VFS_SYNC(vfsp, flags, NULL, error); sb->s_dirt = 0; @@ -701,7 +703,8 @@ linvfs_getxquota( struct vfs *vfsp = LINVFS_GET_VFS(sb); int error, getmode; - getmode = (type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETQUOTA; + getmode = (type == USRQUOTA) ? Q_XGETQUOTA : + ((type == GRPQUOTA) ? Q_XGETGQUOTA : Q_XGETPQUOTA); VFS_QUOTACTL(vfsp, getmode, id, (caddr_t)fdq, error); return -error; } @@ -716,7 +719,8 @@ linvfs_setxquota( struct vfs *vfsp = LINVFS_GET_VFS(sb); int error, setmode; - setmode = (type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETQLIM; + setmode = (type == USRQUOTA) ? Q_XSETQLIM : + ((type == GRPQUOTA) ? Q_XSETGQLIM : Q_XSETPQLIM); VFS_QUOTACTL(vfsp, setmode, id, (caddr_t)fdq, error); return -error; } diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index 76493991578..7ee1f714e9b 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -107,6 +107,7 @@ typedef enum { #define SYNC_FSDATA 0x0020 /* flush fs data (e.g. 
superblocks) */ #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ +#define SYNC_QUIESCE 0x0100 /* quiesce fileystem for a snapshot */ typedef int (*vfs_mount_t)(bhv_desc_t *, struct xfs_mount_args *, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index a832d165f24..250cad54e89 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -411,13 +411,13 @@ vn_remove( /* 0 */ (void *)(__psint_t)(vk), \ /* 1 */ (void *)(s), \ /* 2 */ (void *)(__psint_t) line, \ -/* 3 */ (void *)(vn_count(vp)), \ +/* 3 */ (void *)(__psint_t)(vn_count(vp)), \ /* 4 */ (void *)(ra), \ /* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \ /* 6 */ (void *)(__psint_t)current_cpu(), \ /* 7 */ (void *)(__psint_t)current_pid(), \ /* 8 */ (void *)__return_address, \ -/* 9 */ 0, 0, 0, 0, 0, 0, 0) +/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL) /* * Vnode tracing code. diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 00466c3194a..a6e57c647be 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -426,7 +426,7 @@ typedef struct vattr { u_long va_extsize; /* file extent size */ u_long va_nextents; /* number of extents in file */ u_long va_anextents; /* number of attr extents in file */ - int va_projid; /* project id */ + prid_t va_projid; /* project id */ } vattr_t; /* diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 740d20d3318..46ce1e3ce1d 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -101,7 +101,7 @@ int xfs_dqerror_mod = 33; * is the d_id field. The idea is to fill in the entire q_core * when we read in the on disk dquot. */ -xfs_dquot_t * +STATIC xfs_dquot_t * xfs_qm_dqinit( xfs_mount_t *mp, xfs_dqid_t id, @@ -286,7 +286,9 @@ xfs_qm_adjust_dqlimits( * We also return 0 as the values of the timers in Q_GETQUOTA calls, when * enforcement's off. 
* In contrast, warnings are a little different in that they don't - * 'automatically' get started when limits get exceeded. + * 'automatically' get started when limits get exceeded. They do + * get reset to zero, however, when we find the count to be under + * the soft limit (they are only ever set non-zero via userspace). */ void xfs_qm_adjust_dqtimers( @@ -315,6 +317,8 @@ xfs_qm_adjust_dqtimers( INT_GET(d->d_blk_hardlimit, ARCH_CONVERT)))) { INT_SET(d->d_btimer, ARCH_CONVERT, get_seconds() + XFS_QI_BTIMELIMIT(mp)); + } else { + d->d_bwarns = 0; } } else { if ((!d->d_blk_softlimit || @@ -336,6 +340,8 @@ xfs_qm_adjust_dqtimers( INT_GET(d->d_ino_hardlimit, ARCH_CONVERT)))) { INT_SET(d->d_itimer, ARCH_CONVERT, get_seconds() + XFS_QI_ITIMELIMIT(mp)); + } else { + d->d_iwarns = 0; } } else { if ((!d->d_ino_softlimit || @@ -357,6 +363,8 @@ xfs_qm_adjust_dqtimers( INT_GET(d->d_rtb_hardlimit, ARCH_CONVERT)))) { INT_SET(d->d_rtbtimer, ARCH_CONVERT, get_seconds() + XFS_QI_RTBTIMELIMIT(mp)); + } else { + d->d_rtbwarns = 0; } } else { if ((!d->d_rtb_softlimit || @@ -371,68 +379,6 @@ xfs_qm_adjust_dqtimers( } /* - * Increment or reset warnings of a given dquot. - */ -int -xfs_qm_dqwarn( - xfs_disk_dquot_t *d, - uint flags) -{ - int warned; - - /* - * root's limits are not real limits. 
- */ - if (!d->d_id) - return (0); - - warned = 0; - if (INT_GET(d->d_blk_softlimit, ARCH_CONVERT) && - (INT_GET(d->d_bcount, ARCH_CONVERT) >= - INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) { - if (flags & XFS_QMOPT_DOWARN) { - INT_MOD(d->d_bwarns, ARCH_CONVERT, +1); - warned++; - } - } else { - if (!d->d_blk_softlimit || - (INT_GET(d->d_bcount, ARCH_CONVERT) < - INT_GET(d->d_blk_softlimit, ARCH_CONVERT))) { - d->d_bwarns = 0; - } - } - - if (INT_GET(d->d_ino_softlimit, ARCH_CONVERT) > 0 && - (INT_GET(d->d_icount, ARCH_CONVERT) >= - INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) { - if (flags & XFS_QMOPT_DOWARN) { - INT_MOD(d->d_iwarns, ARCH_CONVERT, +1); - warned++; - } - } else { - if (!d->d_ino_softlimit || - (INT_GET(d->d_icount, ARCH_CONVERT) < - INT_GET(d->d_ino_softlimit, ARCH_CONVERT))) { - d->d_iwarns = 0; - } - } -#ifdef QUOTADEBUG - if (INT_GET(d->d_iwarns, ARCH_CONVERT)) - cmn_err(CE_DEBUG, - "--------@@Inode warnings running : %Lu >= %Lu", - INT_GET(d->d_icount, ARCH_CONVERT), - INT_GET(d->d_ino_softlimit, ARCH_CONVERT)); - if (INT_GET(d->d_bwarns, ARCH_CONVERT)) - cmn_err(CE_DEBUG, - "--------@@Blks warnings running : %Lu >= %Lu", - INT_GET(d->d_bcount, ARCH_CONVERT), - INT_GET(d->d_blk_softlimit, ARCH_CONVERT)); -#endif - return (warned); -} - - -/* * initialize a buffer full of dquots and log the whole thing */ STATIC void @@ -461,9 +407,9 @@ xfs_qm_init_dquot_blk( for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) xfs_qm_dqinit_core(curid, type, d); xfs_trans_dquot_buf(tp, bp, - type & XFS_DQ_USER ? - XFS_BLI_UDQUOT_BUF : - XFS_BLI_GDQUOT_BUF); + (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : + XFS_BLI_GDQUOT_BUF))); xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); } @@ -544,8 +490,7 @@ xfs_qm_dqalloc( * the entire thing. 
*/ xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT), - dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP), - bp); + dqp->dq_flags & XFS_DQ_ALLTYPES, bp); if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) { goto error1; @@ -675,8 +620,7 @@ xfs_qm_dqtobp( /* * A simple sanity check in case we got a corrupted dquot... */ - if (xfs_qm_dqcheck(ddq, id, - dqp->dq_flags & (XFS_DQ_USER|XFS_DQ_GROUP), + if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), "dqtobp")) { if (!(flags & XFS_QMOPT_DQREPAIR)) { @@ -953,8 +897,8 @@ int xfs_qm_dqget( xfs_mount_t *mp, xfs_inode_t *ip, /* locked inode (optional) */ - xfs_dqid_t id, /* gid or uid, depending on type */ - uint type, /* UDQUOT or GDQUOT */ + xfs_dqid_t id, /* uid/projid/gid depending on type */ + uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ { @@ -965,6 +909,7 @@ xfs_qm_dqget( ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || + (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || (! 
XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { return (ESRCH); } @@ -983,7 +928,9 @@ xfs_qm_dqget( again: #ifdef DEBUG - ASSERT(type == XFS_DQ_USER || type == XFS_DQ_GROUP); + ASSERT(type == XFS_DQ_USER || + type == XFS_DQ_PROJ || + type == XFS_DQ_GROUP); if (ip) { ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); if (type == XFS_DQ_USER) @@ -1306,8 +1253,8 @@ xfs_qm_dqflush( return (error); } - if (xfs_qm_dqcheck(&dqp->q_core, INT_GET(ddqp->d_id, ARCH_CONVERT), 0, XFS_QMOPT_DOWARN, - "dqflush (incore copy)")) { + if (xfs_qm_dqcheck(&dqp->q_core, INT_GET(ddqp->d_id, ARCH_CONVERT), + 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { xfs_force_shutdown(dqp->q_mount, XFS_CORRUPT_INCORE); return XFS_ERROR(EIO); } @@ -1459,7 +1406,8 @@ xfs_dqlock2( { if (d1 && d2) { ASSERT(d1 != d2); - if (INT_GET(d1->q_core.d_id, ARCH_CONVERT) > INT_GET(d2->q_core.d_id, ARCH_CONVERT)) { + if (INT_GET(d1->q_core.d_id, ARCH_CONVERT) > + INT_GET(d2->q_core.d_id, ARCH_CONVERT)) { xfs_dqlock(d2); xfs_dqlock(d1); } else { @@ -1582,8 +1530,7 @@ xfs_qm_dqprint(xfs_dquot_t *dqp) cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); cmn_err(CE_DEBUG, "---- dquotID = %d", (int)INT_GET(dqp->q_core.d_id, ARCH_CONVERT)); - cmn_err(CE_DEBUG, "---- type = %s", - XFS_QM_ISUDQ(dqp) ? 
"USR" : "GRP"); + cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h index 0c3fe3175ba..39175103c8e 100644 --- a/fs/xfs/quota/xfs_dquot.h +++ b/fs/xfs/quota/xfs_dquot.h @@ -114,25 +114,18 @@ typedef struct xfs_dquot { #define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) /* - * Quota Accounting flags + * Quota Accounting/Enforcement flags */ -#define XFS_ALL_QUOTA_ACCT (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT) -#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD) -#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD) -#define XFS_ALL_QUOTA_ACTV (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE) -#define XFS_ALL_QUOTA_ACCT_ENFD (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_GQUOTA_ACCT|XFS_GQUOTA_ENFD) +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) -#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) - -/* - * Quota Limit Enforcement flags - */ +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) #define XFS_IS_QUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD) -#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) -#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) #ifdef DEBUG static inline int @@ -167,6 +160,8 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) 
#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) #define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) #define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) +#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) +#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) #define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) #define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \ XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ @@ -174,7 +169,7 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) #define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \ (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ - (XFS_IS_GQUOTA_ON((d)->q_mount)))) + (XFS_IS_OQUOTA_ON((d)->q_mount)))) #ifdef XFS_DQUOT_TRACE /* @@ -211,7 +206,6 @@ extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, xfs_disk_dquot_t *); -extern int xfs_qm_dqwarn(xfs_disk_dquot_t *, uint); extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, xfs_dqid_t, uint, uint, xfs_dquot_t **); extern void xfs_qm_dqput(xfs_dquot_t *); diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index a5425ee6e7b..f5271b7b1e8 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -428,7 +428,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -struct xfs_item_ops xfs_dquot_item_ops = { +STATIC struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_dquot_logitem_format, @@ -646,7 +646,7 @@ xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) return; } -struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, @@ -669,7 +669,7 @@ struct xfs_item_ops 
xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. */ -struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_qm_qoff_logitem_format, diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 89f2cd656eb..f665ca8f9e9 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -81,12 +81,18 @@ struct xfs_qm *xfs_Gqm; kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; -kmem_shaker_t xfs_qm_shaker; +STATIC kmem_shaker_t xfs_qm_shaker; STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); +STATIC void xfs_qm_freelist_init(xfs_frlist_t *); +STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); +STATIC int xfs_qm_mplist_nowait(xfs_mount_t *); +STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *); + STATIC int xfs_qm_init_quotainos(xfs_mount_t *); +STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); STATIC int xfs_qm_shake(int, unsigned int); #ifdef DEBUG @@ -184,7 +190,7 @@ xfs_Gqm_init(void) /* * Destroy the global quota manager when its reference count goes to zero. */ -void +STATIC void xfs_qm_destroy( struct xfs_qm *xqm) { @@ -304,9 +310,9 @@ xfs_qm_mount_quotainit( uint flags) { /* - * User or group quotas has to be on. + * User, projects or group quotas has to be on. */ - ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA)); + ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)); /* * Initialize the flags in the mount structure. 
From this point @@ -324,7 +330,11 @@ xfs_qm_mount_quotainit( if (flags & XFSMNT_GQUOTA) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); if (flags & XFSMNT_GQUOTAENF) - mp->m_qflags |= XFS_GQUOTA_ENFD; + mp->m_qflags |= XFS_OQUOTA_ENFD; + } else if (flags & XFSMNT_PQUOTA) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + if (flags & XFSMNT_PQUOTAENF) + mp->m_qflags |= XFS_OQUOTA_ENFD; } } @@ -357,11 +367,11 @@ xfs_qm_mount_quotas( /* * If a file system had quotas running earlier, but decided to - * mount without -o quota/uquota/gquota options, revoke the + * mount without -o uquota/pquota/gquota options, revoke the * quotachecked license, and bail out. */ if (! XFS_IS_QUOTA_ON(mp) && - (mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) { + (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT)) { mp->m_qflags = 0; goto write_changes; } @@ -509,7 +519,7 @@ out: * Flush all dquots of the given file system to disk. The dquots are * _not_ purged from memory here, just their data written to disk. */ -int +STATIC int xfs_qm_dqflush_all( xfs_mount_t *mp, int flags) @@ -613,7 +623,7 @@ xfs_qm_detach_gdquots( STATIC int xfs_qm_dqpurge_int( xfs_mount_t *mp, - uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/GQUOTA */ + uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ { xfs_dquot_t *dqp; uint dqtype; @@ -625,6 +635,7 @@ xfs_qm_dqpurge_int( return (0); dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; + dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; xfs_qm_mplist_lock(mp); @@ -734,11 +745,11 @@ xfs_qm_dqattach_one( /* * udqhint is the i_udquot field in inode, and is non-NULL only - * when the type arg is XFS_DQ_GROUP. Its purpose is to save a + * when the type arg is group/project. Its purpose is to save a * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside * the user dquot. 
*/ - ASSERT(!udqhint || type == XFS_DQ_GROUP); + ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); if (udqhint && !dolock) xfs_dqlock(udqhint); @@ -897,8 +908,8 @@ xfs_qm_dqattach_grouphint( /* - * Given a locked inode, attach dquot(s) to it, taking UQUOTAON / GQUOTAON - * in to account. + * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON + * into account. * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty * much made this code a complete mess, but it has been pretty useful. @@ -937,8 +948,13 @@ xfs_qm_dqattach( nquotas++; } ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + if (XFS_IS_OQUOTA_ON(mp)) { + error = XFS_IS_GQUOTA_ON(mp) ? + xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + flags & XFS_QMOPT_DQALLOC, + flags & XFS_QMOPT_DQLOCK, + ip->i_udquot, &ip->i_gdquot) : + xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, flags & XFS_QMOPT_DQALLOC, flags & XFS_QMOPT_DQLOCK, ip->i_udquot, &ip->i_gdquot); @@ -989,7 +1005,7 @@ xfs_qm_dqattach( } if (XFS_IS_UQUOTA_ON(mp)) ASSERT(ip->i_udquot); - if (XFS_IS_GQUOTA_ON(mp)) + if (XFS_IS_OQUOTA_ON(mp)) ASSERT(ip->i_gdquot); } #endif @@ -1018,13 +1034,13 @@ xfs_qm_dqdetach( ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); - if (ip->i_udquot) - xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip); if (ip->i_udquot) { + xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip); xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } if (ip->i_gdquot) { + xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip); xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } @@ -1149,7 +1165,7 @@ xfs_qm_sync( * This initializes all the quota information that's kept in the * mount structure */ -int +STATIC int xfs_qm_init_quotainfo( xfs_mount_t *mp) { @@ -1202,8 +1218,9 @@ 
xfs_qm_init_quotainfo( * and group quotas, at least not at this point. */ error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, - (XFS_IS_UQUOTA_RUNNING(mp)) ? - XFS_DQ_USER : XFS_DQ_GROUP, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, &dqp); if (! error) { @@ -1234,6 +1251,10 @@ xfs_qm_init_quotainfo( INT_GET(ddqp->d_iwarns, ARCH_CONVERT) ? INT_GET(ddqp->d_iwarns, ARCH_CONVERT) : XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = + INT_GET(ddqp->d_rtbwarns, ARCH_CONVERT) ? + INT_GET(ddqp->d_rtbwarns, ARCH_CONVERT) : + XFS_QM_RTBWARNLIMIT; qinf->qi_bhardlimit = INT_GET(ddqp->d_blk_hardlimit, ARCH_CONVERT); qinf->qi_bsoftlimit = @@ -1259,6 +1280,7 @@ xfs_qm_init_quotainfo( qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; } return (0); @@ -1366,13 +1388,20 @@ xfs_qm_dqget_noattach( ASSERT(udqp); } - if (XFS_IS_GQUOTA_ON(mp)) { + if (XFS_IS_OQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); if (udqp) xfs_dqunlock(udqp); - if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_gid, XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, - &gdqp))) { + error = XFS_IS_GQUOTA_ON(mp) ? 
+ xfs_qm_dqget(mp, ip, + ip->i_d.di_gid, XFS_DQ_GROUP, + XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, + &gdqp) : + xfs_qm_dqget(mp, ip, + ip->i_d.di_projid, XFS_DQ_PROJ, + XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, + &gdqp); + if (error) { if (udqp) xfs_qm_dqrele(udqp); ASSERT(error != ESRCH); @@ -1521,8 +1550,10 @@ xfs_qm_reset_dqcounts( INT_SET(ddq->d_rtbcount, ARCH_CONVERT, 0ULL); INT_SET(ddq->d_btimer, ARCH_CONVERT, (time_t)0); INT_SET(ddq->d_itimer, ARCH_CONVERT, (time_t)0); + INT_SET(ddq->d_rtbtimer, ARCH_CONVERT, (time_t)0); INT_SET(ddq->d_bwarns, ARCH_CONVERT, 0UL); INT_SET(ddq->d_iwarns, ARCH_CONVERT, 0UL); + INT_SET(ddq->d_rtbwarns, ARCH_CONVERT, 0UL); ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); } @@ -1541,11 +1572,14 @@ xfs_qm_dqiter_bufs( int error; int notcommitted; int incr; + int type; ASSERT(blkcnt > 0); notcommitted = 0; incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ? XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt; + type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : + (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); error = 0; /* @@ -1564,9 +1598,7 @@ xfs_qm_dqiter_bufs( if (error) break; - (void) xfs_qm_reset_dqcounts(mp, bp, firstid, - flags & XFS_QMOPT_UQUOTA ? - XFS_DQ_USER : XFS_DQ_GROUP); + (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_bdwrite(mp, bp); /* * goto the next block. @@ -1578,7 +1610,7 @@ xfs_qm_dqiter_bufs( } /* - * Iterate over all allocated USR/GRP dquots in the system, calling a + * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a * caller supplied function for every chunk of dquots that we find. 
*/ STATIC int @@ -1849,7 +1881,7 @@ xfs_qm_dqusage_adjust( xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); xfs_qm_dqput(udqp); } - if (XFS_IS_GQUOTA_ON(mp)) { + if (XFS_IS_OQUOTA_ON(mp)) { ASSERT(gdqp); xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); xfs_qm_dqput(gdqp); @@ -1898,7 +1930,7 @@ xfs_qm_quotacheck( cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); /* - * First we go thru all the dquots on disk, USR and GRP, and reset + * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset * their counters to zero. We need a clean slate. * We don't log our changes till later. */ @@ -1909,9 +1941,10 @@ xfs_qm_quotacheck( } if ((gip = XFS_QI_GQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA))) + if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) goto error_return; - flags |= XFS_GQUOTA_CHKD; + flags |= XFS_OQUOTA_CHKD; } do { @@ -1938,7 +1971,7 @@ xfs_qm_quotacheck( if (error) { xfs_qm_dqpurge_all(mp, XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA| - XFS_QMOPT_QUOTAOFF); + XFS_QMOPT_PQUOTA|XFS_QMOPT_QUOTAOFF); goto error_return; } /* @@ -1961,7 +1994,7 @@ xfs_qm_quotacheck( * quotachecked status, since we won't be doing accounting for * that type anymore. 
*/ - mp->m_qflags &= ~(XFS_GQUOTA_CHKD | XFS_UQUOTA_CHKD); + mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); mp->m_qflags |= flags; XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); @@ -2013,7 +2046,7 @@ xfs_qm_init_quotainos( 0, 0, &uip, 0))) return XFS_ERROR(error); } - if (XFS_IS_GQUOTA_ON(mp) && + if (XFS_IS_OQUOTA_ON(mp) && mp->m_sb.sb_gquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_gquotino > 0); if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, @@ -2043,10 +2076,12 @@ xfs_qm_init_quotainos( flags &= ~XFS_QMOPT_SBVERSION; } - if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { - if ((error = xfs_qm_qino_alloc(mp, &gip, - sbflags | XFS_SB_GQUOTINO, - flags | XFS_QMOPT_GQUOTA))) { + if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { + flags |= (XFS_IS_GQUOTA_ON(mp) ? + XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); + error = xfs_qm_qino_alloc(mp, &gip, + sbflags | XFS_SB_GQUOTINO, flags); + if (error) { if (uip) VN_RELE(XFS_ITOV(uip)); @@ -2452,6 +2487,7 @@ xfs_qm_vop_dqalloc( xfs_inode_t *ip, uid_t uid, gid_t gid, + prid_t prid, uint flags, xfs_dquot_t **O_udqpp, xfs_dquot_t **O_gdqpp) @@ -2483,8 +2519,7 @@ xfs_qm_vop_dqalloc( } uq = gq = NULL; - if ((flags & XFS_QMOPT_UQUOTA) && - XFS_IS_UQUOTA_ON(mp)) { + if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { if (ip->i_d.di_uid != uid) { /* * What we need is the dquot that has this uid, and @@ -2522,8 +2557,7 @@ xfs_qm_vop_dqalloc( xfs_dqunlock(uq); } } - if ((flags & XFS_QMOPT_GQUOTA) && - XFS_IS_GQUOTA_ON(mp)) { + if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, @@ -2546,6 +2580,29 @@ xfs_qm_vop_dqalloc( XFS_DQHOLD(gq); xfs_dqunlock(gq); } + } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + if (ip->i_d.di_projid != prid) { + xfs_iunlock(ip, lockflags); + if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + XFS_DQ_PROJ, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + 
&gq))) { + if (uq) + xfs_qm_dqrele(uq); + ASSERT(error != ENOENT); + return (error); + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = ip->i_gdquot; + xfs_dqlock(gq); + XFS_DQHOLD(gq); + xfs_dqunlock(gq); + } } if (uq) xfs_dqtrace_entry_ino(uq, "DQALLOC", ip); @@ -2574,6 +2631,9 @@ xfs_qm_vop_chown( xfs_dquot_t *newdq) { xfs_dquot_t *prevdq; + uint bfield = XFS_IS_REALTIME_INODE(ip) ? + XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; + ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); @@ -2582,20 +2642,12 @@ xfs_qm_vop_chown( ASSERT(prevdq); ASSERT(prevdq != newdq); - xfs_trans_mod_dquot(tp, prevdq, - XFS_TRANS_DQ_BCOUNT, - -(ip->i_d.di_nblocks)); - xfs_trans_mod_dquot(tp, prevdq, - XFS_TRANS_DQ_ICOUNT, - -1); + xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); + xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ - xfs_trans_mod_dquot(tp, newdq, - XFS_TRANS_DQ_BCOUNT, - ip->i_d.di_nblocks); - xfs_trans_mod_dquot(tp, newdq, - XFS_TRANS_DQ_ICOUNT, - 1); + xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* * Take an extra reference, because the inode @@ -2611,7 +2663,7 @@ xfs_qm_vop_chown( } /* - * Quota reservations for setattr(AT_UID|AT_GID). + * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). */ int xfs_qm_vop_chown_reserve( @@ -2623,7 +2675,7 @@ xfs_qm_vop_chown_reserve( { int error; xfs_mount_t *mp; - uint delblks; + uint delblks, blkflags; xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; ASSERT(XFS_ISLOCKED_INODE(ip)); @@ -2632,6 +2684,8 @@ xfs_qm_vop_chown_reserve( delblks = ip->i_delayed_blks; delblksudq = delblksgdq = unresudq = unresgdq = NULL; + blkflags = XFS_IS_REALTIME_INODE(ip) ? 
+ XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && ip->i_d.di_uid != (uid_t)INT_GET(udqp->q_core.d_id, ARCH_CONVERT)) { @@ -2646,18 +2700,22 @@ xfs_qm_vop_chown_reserve( unresudq = ip->i_udquot; } } - if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - ip->i_d.di_gid != INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)) { - delblksgdq = gdqp; - if (delblks) { - ASSERT(ip->i_gdquot); - unresgdq = ip->i_gdquot; + if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { + if ((XFS_IS_GQUOTA_ON(ip->i_mount) && ip->i_d.di_gid != + INT_GET(gdqp->q_core.d_id, ARCH_CONVERT)) || + (XFS_IS_PQUOTA_ON(ip->i_mount) && ip->i_d.di_projid != + INT_GET(gdqp->q_core.d_id, ARCH_CONVERT))) { + delblksgdq = gdqp; + if (delblks) { + ASSERT(ip->i_gdquot); + unresgdq = ip->i_gdquot; + } } } if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | XFS_QMOPT_RES_REGBLKS))) + flags | blkflags))) return (error); /* @@ -2674,11 +2732,11 @@ xfs_qm_vop_chown_reserve( ASSERT(unresudq || unresgdq); if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | XFS_QMOPT_RES_REGBLKS))) + flags | blkflags))) return (error); xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, - XFS_QMOPT_RES_REGBLKS); + blkflags); } return (0); @@ -2751,7 +2809,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( } /* ------------- list stuff -----------------*/ -void +STATIC void xfs_qm_freelist_init(xfs_frlist_t *ql) { ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; @@ -2760,7 +2818,7 @@ xfs_qm_freelist_init(xfs_frlist_t *ql) ql->qh_nelems = 0; } -void +STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *ql) { xfs_dquot_t *dqp, *nextdqp; @@ -2786,7 +2844,7 @@ xfs_qm_freelist_destroy(xfs_frlist_t *ql) ASSERT(ql->qh_nelems == 0); } -void +STATIC void xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq) { dq->dq_flnext = ql->qh_next; @@ -2816,7 
+2874,7 @@ xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); } -int +STATIC int xfs_qm_dqhashlock_nowait( xfs_dquot_t *dqp) { @@ -2836,7 +2894,7 @@ xfs_qm_freelist_lock_nowait( return (locked); } -int +STATIC int xfs_qm_mplist_nowait( xfs_mount_t *mp) { diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index dcf1a7a831d..b03eecf3b6c 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as @@ -133,8 +133,9 @@ typedef struct xfs_quotainfo { time_t qi_btimelimit; /* limit for blks timer */ time_t qi_itimelimit; /* limit for inodes timer */ time_t qi_rtbtimelimit;/* limit for rt blks timer */ - xfs_qwarncnt_t qi_bwarnlimit; /* limit for num warnings */ - xfs_qwarncnt_t qi_iwarnlimit; /* limit for num warnings */ + xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ mutex_t qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dqs in above chunk */ @@ -176,6 +177,7 @@ typedef struct xfs_dquot_acct { #define XFS_QM_BWARNLIMIT 5 #define XFS_QM_IWARNLIMIT 5 +#define XFS_QM_RTBWARNLIMIT 5 #define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD)) #define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) @@ -184,7 +186,6 @@ typedef struct xfs_dquot_acct { extern void xfs_mount_reset_sbqflags(xfs_mount_t *); -extern int xfs_qm_init_quotainfo(xfs_mount_t *); extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); extern int xfs_qm_mount_quotas(xfs_mount_t *, int); extern void 
xfs_qm_mount_quotainit(xfs_mount_t *, uint); @@ -203,7 +204,7 @@ extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); /* vop stuff */ extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *, - uid_t, gid_t, uint, + uid_t, gid_t, prid_t, uint, xfs_dquot_t **, xfs_dquot_t **); extern void xfs_qm_vop_dqattach_and_dqmod_newinode( xfs_trans_t *, xfs_inode_t *, @@ -215,14 +216,9 @@ extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *, xfs_dquot_t *, xfs_dquot_t *, uint); /* list stuff */ -extern void xfs_qm_freelist_init(xfs_frlist_t *); -extern void xfs_qm_freelist_destroy(xfs_frlist_t *); -extern void xfs_qm_freelist_insert(xfs_frlist_t *, xfs_dquot_t *); extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); extern void xfs_qm_freelist_unlink(xfs_dquot_t *); extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *); -extern int xfs_qm_mplist_nowait(xfs_mount_t *); -extern int xfs_qm_dqhashlock_nowait(xfs_dquot_t *); /* system call interface */ extern int xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t); diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index be67d9c265f..dc3c37a1e15 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -71,10 +71,13 @@ #define MNTOPT_NOQUOTA "noquota" /* no quotas */ #define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ #define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ #define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ #define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ #define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ STATIC int @@ -109,6 +112,14 @@ 
xfs_qm_parseargs( args->flags |= XFSMNT_UQUOTA; args->flags &= ~XFSMNT_UQUOTAENF; referenced = 1; + } else if (!strcmp(this_char, MNTOPT_PQUOTA) || + !strcmp(this_char, MNTOPT_PRJQUOTA)) { + args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; + referenced = 1; + } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + args->flags |= XFSMNT_PQUOTA; + args->flags &= ~XFSMNT_PQUOTAENF; + referenced = 1; } else if (!strcmp(this_char, MNTOPT_GQUOTA) || !strcmp(this_char, MNTOPT_GRPQUOTA)) { args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; @@ -127,6 +138,12 @@ xfs_qm_parseargs( *this_char++ = ','; } + if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { + cmn_err(CE_WARN, + "XFS: cannot mount with both project and group quota"); + return XFS_ERROR(EINVAL); + } + PVFS_PARSEARGS(BHV_NEXT(bhv), options, args, update, error); if (!error && !referenced) bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM); @@ -148,13 +165,19 @@ xfs_qm_showargs( seq_puts(m, "," MNTOPT_UQUOTANOENF); } + if (mp->m_qflags & XFS_PQUOTA_ACCT) { + (mp->m_qflags & XFS_OQUOTA_ENFD) ? + seq_puts(m, "," MNTOPT_PRJQUOTA) : + seq_puts(m, "," MNTOPT_PQUOTANOENF); + } + if (mp->m_qflags & XFS_GQUOTA_ACCT) { - (mp->m_qflags & XFS_GQUOTA_ENFD) ? + (mp->m_qflags & XFS_OQUOTA_ENFD) ? 
seq_puts(m, "," MNTOPT_GRPQUOTA) : seq_puts(m, "," MNTOPT_GQUOTANOENF); } - if (!(mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT))) + if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, "," MNTOPT_NOQUOTA); PVFS_SHOWARGS(BHV_NEXT(bhv), m, error); @@ -171,7 +194,7 @@ xfs_qm_mount( struct xfs_mount *mp = XFS_VFSTOM(vfsp); int error; - if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA)) + if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA)) xfs_qm_mount_quotainit(mp, args->flags); PVFS_MOUNT(BHV_NEXT(bhv), args, cr, error); return error; @@ -255,16 +278,17 @@ xfs_qm_newmount( uint *quotaflags) { uint quotaondisk; - uint uquotaondisk = 0, gquotaondisk = 0; + uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; *quotaflags = 0; *needquotamount = B_FALSE; quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && - mp->m_sb.sb_qflags & (XFS_UQUOTA_ACCT|XFS_GQUOTA_ACCT); + (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); if (quotaondisk) { uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; + pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT; gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; } @@ -277,13 +301,16 @@ xfs_qm_newmount( if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || + (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || + (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) || (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_GQUOTA_ON(mp))) && + (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && xfs_dev_is_read_only(mp, "changing quota state")) { cmn_err(CE_WARN, - "XFS: please mount with%s%s%s.", + "XFS: please mount with%s%s%s%s.", (!quotaondisk ? "out quota" : ""), (uquotaondisk ? " usrquota" : ""), + (pquotaondisk ? " prjquota" : ""), (gquotaondisk ? 
" grpquota" : "")); return XFS_ERROR(EPERM); } @@ -359,7 +386,7 @@ xfs_qm_dqrele_null( } -struct xfs_qmops xfs_qmcore_xfs = { +STATIC struct xfs_qmops xfs_qmcore_xfs = { .xfs_qminit = xfs_qm_newmount, .xfs_qmdone = xfs_qm_unmount_quotadestroy, .xfs_qmmount = xfs_qm_endmount, diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 229f5b5a2d2..68e98962dbe 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -118,40 +118,41 @@ xfs_qm_quotactl( * The following commands are valid even when quotaoff. */ switch (cmd) { + case Q_XQUOTARM: /* - * truncate quota files. quota must be off. + * Truncate quota files. quota must be off. */ - case Q_XQUOTARM: if (XFS_IS_QUOTA_ON(mp) || addr == NULL) return XFS_ERROR(EINVAL); if (vfsp->vfs_flag & VFS_RDONLY) return XFS_ERROR(EROFS); return (xfs_qm_scall_trunc_qfiles(mp, xfs_qm_import_qtype_flags(*(uint *)addr))); + + case Q_XGETQSTAT: /* * Get quota status information. */ - case Q_XGETQSTAT: return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr)); + case Q_XQUOTAON: /* - * QUOTAON for root f/s and quota enforcement on others.. - * Quota accounting for non-root f/s's must be turned on - * at mount time. + * QUOTAON - enabling quota enforcement. + * Quota accounting must be turned on at mount time. */ - case Q_XQUOTAON: if (addr == NULL) return XFS_ERROR(EINVAL); if (vfsp->vfs_flag & VFS_RDONLY) return XFS_ERROR(EROFS); return (xfs_qm_scall_quotaon(mp, xfs_qm_import_flags(*(uint *)addr))); - case Q_XQUOTAOFF: + + case Q_XQUOTAOFF: if (vfsp->vfs_flag & VFS_RDONLY) return XFS_ERROR(EROFS); break; - default: + default: break; } @@ -159,7 +160,7 @@ xfs_qm_quotactl( return XFS_ERROR(ESRCH); switch (cmd) { - case Q_XQUOTAOFF: + case Q_XQUOTAOFF: if (vfsp->vfs_flag & VFS_RDONLY) return XFS_ERROR(EROFS); error = xfs_qm_scall_quotaoff(mp, @@ -167,42 +168,39 @@ xfs_qm_quotactl( B_FALSE); break; - /* - * Defaults to XFS_GETUQUOTA. 
- */ - case Q_XGETQUOTA: + case Q_XGETQUOTA: error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER, (fs_disk_quota_t *)addr); break; - /* - * Set limits, both hard and soft. Defaults to Q_SETUQLIM. - */ - case Q_XSETQLIM: + case Q_XGETGQUOTA: + error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, + (fs_disk_quota_t *)addr); + break; + case Q_XGETPQUOTA: + error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, + (fs_disk_quota_t *)addr); + break; + + case Q_XSETQLIM: if (vfsp->vfs_flag & VFS_RDONLY) return XFS_ERROR(EROFS); error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER, (fs_disk_quota_t *)addr); break; - - case Q_XSETGQLIM: + case Q_XSETGQLIM: if (vfsp->vfs_flag & VFS_RDONLY) return XFS_ERROR(EROFS); error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, (fs_disk_quota_t *)addr); break; - - - case Q_XGETGQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, - (fs_disk_quota_t *)addr); + case Q_XSETPQLIM: + if (vfsp->vfs_flag & VFS_RDONLY) + return XFS_ERROR(EROFS); + error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, + (fs_disk_quota_t *)addr); break; - /* - * Quotas are entirely undefined after quotaoff in XFS quotas. - * For instance, there's no way to set limits when quotaoff. - */ - - default: + default: error = XFS_ERROR(EINVAL); break; } @@ -286,8 +284,12 @@ xfs_qm_scall_quotaoff( } if (flags & XFS_GQUOTA_ACCT) { dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); + flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); inactivate_flags |= XFS_GQUOTA_ACTIVE; + } else if (flags & XFS_PQUOTA_ACCT) { + dqtype |= XFS_QMOPT_PQUOTA; + flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); + inactivate_flags |= XFS_PQUOTA_ACTIVE; } /* @@ -364,7 +366,8 @@ xfs_qm_scall_quotaoff( /* * If quotas is completely disabled, close shop. 
*/ - if ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_ALL) { + if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || + ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); xfs_qm_destroy_quotainfo(mp); return (0); @@ -378,7 +381,7 @@ xfs_qm_scall_quotaoff( XFS_PURGE_INODE(XFS_QI_UQIP(mp)); XFS_QI_UQIP(mp) = NULL; } - if ((dqtype & XFS_QMOPT_GQUOTA) && XFS_QI_GQIP(mp)) { + if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { XFS_PURGE_INODE(XFS_QI_GQIP(mp)); XFS_QI_GQIP(mp) = NULL; } @@ -411,7 +414,8 @@ xfs_qm_scall_trunc_qfiles( } } - if ((flags & XFS_DQ_GROUP) && mp->m_sb.sb_gquotino != NULLFSINO) { + if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && + mp->m_sb.sb_gquotino != NULLFSINO) { error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); if (! error) { (void) xfs_truncate_file(mp, qip); @@ -434,7 +438,7 @@ xfs_qm_scall_quotaon( uint flags) { int error; - unsigned long s; + unsigned long s; uint qf; uint accflags; __int64_t sbflags; @@ -468,9 +472,13 @@ xfs_qm_scall_quotaon( (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && (flags & XFS_UQUOTA_ENFD)) || + ((flags & XFS_PQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_OQUOTA_ENFD)) + || ((flags & XFS_GQUOTA_ACCT) == 0 && (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && - (flags & XFS_GQUOTA_ENFD))) { + (flags & XFS_OQUOTA_ENFD))) { qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", flags, mp->m_sb.sb_qflags); return XFS_ERROR(EINVAL); @@ -504,6 +512,10 @@ xfs_qm_scall_quotaon( */ if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != (mp->m_qflags & XFS_UQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != + (mp->m_qflags & XFS_PQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != + (mp->m_qflags & XFS_GQUOTA_ACCT)) || (flags & XFS_ALL_QUOTA_ENFD) == 0) return (0); @@ -521,7 +533,6 @@ xfs_qm_scall_quotaon( } - /* * Return quota status information, such as uquota-off, enforcements, 
etc. */ @@ -606,7 +617,8 @@ xfs_qm_scall_setqlim( if (!capable(CAP_SYS_ADMIN)) return XFS_ERROR(EPERM); - if ((newlim->d_fieldmask & (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK)) == 0) + if ((newlim->d_fieldmask & + (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) return (0); tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); @@ -691,12 +703,23 @@ xfs_qm_scall_setqlim( qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); } + /* + * Update warnings counter(s) if requested + */ + if (newlim->d_fieldmask & FS_DQ_BWARNS) + INT_SET(ddq->d_bwarns, ARCH_CONVERT, newlim->d_bwarns); + if (newlim->d_fieldmask & FS_DQ_IWARNS) + INT_SET(ddq->d_iwarns, ARCH_CONVERT, newlim->d_iwarns); + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + INT_SET(ddq->d_rtbwarns, ARCH_CONVERT, newlim->d_rtbwarns); + if (id == 0) { /* * Timelimits for the super user set the relative time * the other users can be over quota for this file system. * If it is zero a default is used. Ditto for the default - * soft and hard limit values (already done, above). + * soft and hard limit values (already done, above), and + * for warnings. */ if (newlim->d_fieldmask & FS_DQ_BTIMER) { mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; @@ -710,7 +733,13 @@ xfs_qm_scall_setqlim( mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; INT_SET(ddq->d_rtbtimer, ARCH_CONVERT, newlim->d_rtbtimer); } - } else /* if (XFS_IS_QUOTA_ENFORCED(mp)) */ { + if (newlim->d_fieldmask & FS_DQ_BWARNS) + mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; + if (newlim->d_fieldmask & FS_DQ_IWARNS) + mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; + } else { /* * If the user is now over quota, start the timelimit. * The user will not be 'warned'. 
@@ -776,9 +805,9 @@ xfs_qm_log_quotaoff_end( xfs_qoff_logitem_t *startqoff, uint flags) { - xfs_trans_t *tp; + xfs_trans_t *tp; int error; - xfs_qoff_logitem_t *qoffi; + xfs_qoff_logitem_t *qoffi; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); @@ -928,18 +957,26 @@ xfs_qm_export_dquot( STATIC uint xfs_qm_import_qtype_flags( - uint uflags) + uint uflags) { + uint oflags = 0; + /* - * Can't be both at the same time. + * Can't be more than one, or none. */ if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == - (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) || - ((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == 0)) + (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) || + ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) == + (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) || + ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) == + (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) || + ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0)) return (0); - return (uflags & XFS_USER_QUOTA) ? - XFS_DQ_USER : XFS_DQ_GROUP; + oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0; + oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0; + oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0; + return oflags; } STATIC uint @@ -947,14 +984,19 @@ xfs_qm_export_qtype_flags( uint flags) { /* - * Can't be both at the same time. + * Can't be more than one, or none. */ - ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != - (XFS_GROUP_QUOTA | XFS_USER_QUOTA)); - ASSERT((flags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) != 0); + ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != + (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); + ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != + (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); + ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != + (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); + ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); return (flags & XFS_DQ_USER) ? - XFS_USER_QUOTA : XFS_GROUP_QUOTA; + XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? 
+ XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; } STATIC uint @@ -965,12 +1007,14 @@ xfs_qm_import_flags( if (uflags & XFS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; + if (uflags & XFS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; if (uflags & XFS_QUOTA_GDQ_ACCT) flags |= XFS_GQUOTA_ACCT; if (uflags & XFS_QUOTA_UDQ_ENFD) flags |= XFS_UQUOTA_ENFD; - if (uflags & XFS_QUOTA_GDQ_ENFD) - flags |= XFS_GQUOTA_ENFD; + if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) + flags |= XFS_OQUOTA_ENFD; return (flags); } @@ -984,12 +1028,16 @@ xfs_qm_export_flags( uflags = 0; if (flags & XFS_UQUOTA_ACCT) uflags |= XFS_QUOTA_UDQ_ACCT; + if (flags & XFS_PQUOTA_ACCT) + uflags |= XFS_QUOTA_PDQ_ACCT; if (flags & XFS_GQUOTA_ACCT) uflags |= XFS_QUOTA_GDQ_ACCT; if (flags & XFS_UQUOTA_ENFD) uflags |= XFS_QUOTA_UDQ_ENFD; - if (flags & XFS_GQUOTA_ENFD) - uflags |= XFS_QUOTA_GDQ_ENFD; + if (flags & (XFS_OQUOTA_ENFD)) { + uflags |= (flags & XFS_GQUOTA_ACCT) ? + XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; + } return (uflags); } @@ -1070,7 +1118,7 @@ again: xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } @@ -1160,7 +1208,6 @@ xfs_qm_dqtest_print( { cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); - cmn_err(CE_DEBUG, "---- type = %s", XFS_QM_ISUDQ(d)? "USR" : "GRP"); cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", d->d_bcount, (int)d->d_bcount); @@ -1231,7 +1278,7 @@ xfs_dqtest_cmp2( #ifdef QUOTADEBUG if (!err) { cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", - d->d_id, XFS_QM_ISUDQ(d) ? 
"USR" : "GRP", d->q_mount); + d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); } #endif return (err); @@ -1287,6 +1334,7 @@ STATIC void xfs_qm_internalqcheck_get_dquots( xfs_mount_t *mp, xfs_dqid_t uid, + xfs_dqid_t projid, xfs_dqid_t gid, xfs_dqtest_t **ud, xfs_dqtest_t **gd) @@ -1295,6 +1343,8 @@ xfs_qm_internalqcheck_get_dquots( xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud); if (XFS_IS_GQUOTA_ON(mp)) xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd); + else if (XFS_IS_PQUOTA_ON(mp)) + xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd); } @@ -1362,13 +1412,14 @@ xfs_qm_internalqcheck_adjust( } xfs_qm_internalqcheck_get_dquots(mp, (xfs_dqid_t) ip->i_d.di_uid, + (xfs_dqid_t) ip->i_d.di_projid, (xfs_dqid_t) ip->i_d.di_gid, &ud, &gd); if (XFS_IS_UQUOTA_ON(mp)) { ASSERT(ud); xfs_qm_internalqcheck_dqadjust(ip, ud); } - if (XFS_IS_GQUOTA_ON(mp)) { + if (XFS_IS_OQUOTA_ON(mp)) { ASSERT(gd); xfs_qm_internalqcheck_dqadjust(ip, gd); } diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 414b6004af2..bf413e70ec0 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -56,6 +56,7 @@ #define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit) #define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit) #define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit) +#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit) #define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit) #define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) @@ -102,7 +103,8 @@ static inline int XQMISLCKD(struct xfs_dqhash *h) (xfs_Gqm->qm_grp_dqhtable + \ XFS_DQ_HASHVAL(mp, id))) #define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? 
\ - XFS_IS_UQUOTA_ON(mp):XFS_IS_GQUOTA_ON(mp)) + XFS_IS_UQUOTA_ON(mp) : \ + XFS_IS_OQUOTA_ON(mp)) #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ !dqp->q_core.d_blk_hardlimit && \ !dqp->q_core.d_blk_softlimit && \ @@ -177,16 +179,11 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ (!((dqp)->q_core.d_id)) #define XFS_PURGE_INODE(ip) \ - { \ - vmap_t dqvmap; \ - vnode_t *dqvp; \ - dqvp = XFS_ITOV(ip); \ - VMAP(dqvp, dqvmap); \ - VN_RELE(dqvp); \ - } + IRELE(ip); #define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ - (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : "???")) + (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ + (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) #define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY") #endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 149b2a1fd94..3b99daf8a64 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -187,7 +187,7 @@ xfs_trans_dup_dqinfo( /* * Wrap around mod_dquot to account for both user and group quotas. */ -void +STATIC void xfs_trans_mod_dquot_byino( xfs_trans_t *tp, xfs_inode_t *ip, @@ -207,12 +207,10 @@ xfs_trans_mod_dquot_byino( if (tp->t_dqinfo == NULL) xfs_trans_alloc_dqinfo(tp); - if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) { + if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); - } - if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) { + if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot) (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); - } } STATIC xfs_dqtrx_t * @@ -368,7 +366,7 @@ xfs_trans_dqlockedjoin( * Unreserve just the reservations done by this transaction. * dquot is still left locked at exit. */ -void +STATIC void xfs_trans_apply_dquot_deltas( xfs_trans_t *tp) { @@ -499,7 +497,7 @@ xfs_trans_apply_dquot_deltas( * Adjust the RT reservation. 
*/ if (qtrx->qt_rtblk_res != 0) { - if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { + if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) { if (qtrx->qt_rtblk_res > qtrx->qt_rtblk_res_used) dqp->q_res_rtbcount -= (xfs_qcnt_t) @@ -532,12 +530,6 @@ xfs_trans_apply_dquot_deltas( (xfs_qcnt_t)qtrx->qt_icount_delta; } - -#ifdef QUOTADEBUG - if (qtrx->qt_rtblk_res != 0) - cmn_err(CE_DEBUG, "RT res %d for 0x%p\n", - (int) qtrx->qt_rtblk_res, dqp); -#endif ASSERT(dqp->q_res_bcount >= INT_GET(dqp->q_core.d_bcount, ARCH_CONVERT)); ASSERT(dqp->q_res_icount >= @@ -638,7 +630,10 @@ xfs_trans_dqresv( int error; xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; - time_t btimer; + time_t timer; + xfs_qwarncnt_t warns; + xfs_qwarncnt_t warnlimit; + xfs_qcnt_t count; xfs_qcnt_t *resbcountp; xfs_quotainfo_t *q = mp->m_quotainfo; @@ -653,7 +648,9 @@ xfs_trans_dqresv( softlimit = INT_GET(dqp->q_core.d_blk_softlimit, ARCH_CONVERT); if (!softlimit) softlimit = q->qi_bsoftlimit; - btimer = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT); + timer = INT_GET(dqp->q_core.d_btimer, ARCH_CONVERT); + warns = INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT); + warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -663,7 +660,9 @@ xfs_trans_dqresv( softlimit = INT_GET(dqp->q_core.d_rtb_softlimit, ARCH_CONVERT); if (!softlimit) softlimit = q->qi_rtbsoftlimit; - btimer = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT); + timer = INT_GET(dqp->q_core.d_rtbtimer, ARCH_CONVERT); + warns = INT_GET(dqp->q_core.d_rtbwarns, ARCH_CONVERT); + warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); resbcountp = &dqp->q_res_rtbcount; } error = 0; @@ -693,37 +692,36 @@ xfs_trans_dqresv( * If timer or warnings has expired, * return EDQUOT */ - if ((btimer != 0 && get_seconds() > btimer) || - (dqp->q_core.d_bwarns && - INT_GET(dqp->q_core.d_bwarns, ARCH_CONVERT) >= - XFS_QI_BWARNLIMIT(dqp->q_mount))) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && 
warns >= warnlimit)) { error = EDQUOT; goto error_return; } } } if (ninos > 0) { - hardlimit = INT_GET(dqp->q_core.d_ino_hardlimit, ARCH_CONVERT); + count = INT_GET(dqp->q_core.d_icount, ARCH_CONVERT); + timer = INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT); + warns = INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT); + warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); + hardlimit = INT_GET(dqp->q_core.d_ino_hardlimit, + ARCH_CONVERT); if (!hardlimit) hardlimit = q->qi_ihardlimit; - softlimit = INT_GET(dqp->q_core.d_ino_softlimit, ARCH_CONVERT); + softlimit = INT_GET(dqp->q_core.d_ino_softlimit, + ARCH_CONVERT); if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && - INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= hardlimit) { + if (hardlimit > 0ULL && count >= hardlimit) { error = EDQUOT; goto error_return; - } else if (softlimit > 0ULL && - INT_GET(dqp->q_core.d_icount, ARCH_CONVERT) >= softlimit) { + } else if (softlimit > 0ULL && count >= softlimit) { /* * If timer or warnings has expired, * return EDQUOT */ - if ((dqp->q_core.d_itimer && - get_seconds() > INT_GET(dqp->q_core.d_itimer, ARCH_CONVERT)) || - (dqp->q_core.d_iwarns && - INT_GET(dqp->q_core.d_iwarns, ARCH_CONVERT) >= - XFS_QI_IWARNLIMIT(dqp->q_mount))) { + if ((timer != 0 && get_seconds() > timer) || + (warns != 0 && warns >= warnlimit)) { error = EDQUOT; goto error_return; } diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 7d6e1f37df1..4ed7b6928cd 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -36,7 +36,6 @@ #include <linux/sched.h> #include <linux/kernel.h> -int doass = 1; static char message[256]; /* keep it off the stack */ static DEFINE_SPINLOCK(xfs_err_lock); diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 40b0f4c54d9..c5b9365a7e2 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -50,16 +50,11 @@ extern void cmn_err(int, char *, ...); #endif #ifdef DEBUG -# ifdef lint -# define ASSERT(EX) ((void)0) /* avoid 
"constant in conditional" babble */ -# else -# define ASSERT(EX) ((!doass||(EX))?((void)0):assfail(#EX, __FILE__, __LINE__)) -# endif /* lint */ +# define ASSERT(EX) ((EX) ? ((void)0) : assfail(#EX, __FILE__, __LINE__)) #else # define ASSERT(x) ((void)0) #endif -extern int doass; /* dynamically turn off asserts */ extern void assfail(char *, char *, int); #ifdef DEBUG extern unsigned long random(void); diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 36603db10fe..dcfe1970362 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -59,7 +59,7 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -int +STATIC int xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, @@ -2562,7 +2562,7 @@ xfs_alloc_clear_busy(xfs_trans_t *tp, /* * returns non-zero if any of (agno,bno):len is in a busy list */ -int +STATIC int xfs_alloc_search_busy(xfs_trans_t *tp, xfs_agnumber_t agno, xfs_agblock_t bno, diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index ee8b5904ec7..a41ad3a5e55 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -71,6 +71,11 @@ * Provide the external interfaces to manage attribute lists. */ +#define ATTR_SYSCOUNT 2 +STATIC struct attrnames posix_acl_access; +STATIC struct attrnames posix_acl_default; +STATIC struct attrnames *attr_system_names[ATTR_SYSCOUNT]; + /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -83,6 +88,7 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); /* * Internal routines when attribute list is one block. 
*/ +STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); @@ -90,6 +96,7 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); /* * Internal routines when attribute list is more than one block. */ +STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC int xfs_attr_node_addname(xfs_da_args_t *args); STATIC int xfs_attr_node_removename(xfs_da_args_t *args); STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); @@ -1102,7 +1109,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) * This leaf block cannot have a "remote" value, we only call this routine * if bmap_one_block() says there is only one block (ie: no remote blks). */ -int +STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) { xfs_dabuf_t *bp; @@ -1707,7 +1714,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) * block, ie: both true Btree attr lists and for single-leaf-blocks with * "remote" values taking up more blocks. 
*/ -int +STATIC int xfs_attr_node_get(xfs_da_args_t *args) { xfs_da_state_t *state; @@ -2398,7 +2405,7 @@ posix_acl_default_exists( return xfs_acl_vhasacl_default(vp); } -struct attrnames posix_acl_access = { +STATIC struct attrnames posix_acl_access = { .attr_name = "posix_acl_access", .attr_namelen = sizeof("posix_acl_access") - 1, .attr_get = posix_acl_access_get, @@ -2407,7 +2414,7 @@ struct attrnames posix_acl_access = { .attr_exists = posix_acl_access_exists, }; -struct attrnames posix_acl_default = { +STATIC struct attrnames posix_acl_default = { .attr_name = "posix_acl_default", .attr_namelen = sizeof("posix_acl_default") - 1, .attr_get = posix_acl_default_get, @@ -2416,7 +2423,7 @@ struct attrnames posix_acl_default = { .attr_exists = posix_acl_default_exists, }; -struct attrnames *attr_system_names[] = +STATIC struct attrnames *attr_system_names[] = { &posix_acl_access, &posix_acl_default }; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 67cd0f5ac1a..45ab1c542ba 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -76,11 +76,6 @@ extern struct attrnames attr_system; extern struct attrnames attr_trusted; extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT]; -#define ATTR_SYSCOUNT 2 -extern struct attrnames posix_acl_access; -extern struct attrnames posix_acl_default; -extern struct attrnames *attr_system_names[ATTR_SYSCOUNT]; - extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int); extern int attr_generic_list(struct vnode *, void *, size_t, int, ssize_t *); @@ -184,8 +179,6 @@ int xfs_attr_list(bhv_desc_t *, char *, int, int, struct attrlist_cursor_kern *, struct cred *); int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_node_get(struct xfs_da_args *); -int xfs_attr_leaf_get(struct xfs_da_args *); int xfs_attr_shortform_getvalue(struct xfs_da_args *); int xfs_attr_fetch(struct xfs_inode *, char *, int, char *, int *, int, struct cred *); diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 
b11256e58bf..1cdd574c63a 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -79,6 +79,8 @@ /* * Routines used for growing the Btree. */ +STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, + xfs_dabuf_t **bpp); STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, int freemap_index); STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); @@ -92,6 +94,16 @@ STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, int *number_usedbytes_in_blk1); /* + * Routines used for shrinking the Btree. + */ +STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dabuf_t *bp, int level); +STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dabuf_t *bp); +STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, + xfs_dablk_t blkno, int blkcnt); + +/* * Utility routines. */ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, @@ -99,6 +111,10 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, xfs_attr_leafblock_t *dst_leaf, int dst_start, int move_count, xfs_mount_t *mp); +STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context, + attrnames_t *, char *name, int namelen, + int valuelen); /*======================================================================== @@ -774,7 +790,7 @@ out: * Create the initial contents of a leaf attribute list * or a leaf in a node attribute list. */ -int +STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) { xfs_attr_leafblock_t *leaf; @@ -2209,7 +2225,7 @@ xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) * Calculate the number of bytes used to store the indicated attribute * (whether local or remote only calculate bytes in this block). 
*/ -int +STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) { xfs_attr_leaf_name_local_t *name_loc; @@ -2380,7 +2396,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) * we may be reading them directly out of a user buffer. */ /*ARGSUSED*/ -int +STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp, char *name, int namelen, int valuelen) { @@ -2740,7 +2756,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) * Recurse (gasp!) through the attribute nodes until we find leaves. * We're doing a depth-first traversal in order to invalidate everything. */ -int +STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, int level) { @@ -2849,7 +2865,7 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, * Note that we must release the lock on the buffer so that we are not * caught holding something that the logging code wants to flush to disk. */ -int +STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) { xfs_attr_leafblock_t *leaf; @@ -2934,7 +2950,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) * Look at all the extents for this logical region, * invalidate any buffers that are incore/in transactions. */ -int +STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dablk_t blkno, int blkcnt) { diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index b1480e0b334..0a4cfad6df9 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -261,8 +261,6 @@ int xfs_attr_leaf_flipflags(xfs_da_args_t *args); /* * Routines used for growing the Btree. 
*/ -int xfs_attr_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, - struct xfs_dabuf **bpp); int xfs_attr_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); @@ -284,12 +282,6 @@ void xfs_attr_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); -int xfs_attr_node_inactive(struct xfs_trans **trans, struct xfs_inode *dp, - struct xfs_dabuf *bp, int level); -int xfs_attr_leaf_inactive(struct xfs_trans **trans, struct xfs_inode *dp, - struct xfs_dabuf *bp); -int xfs_attr_leaf_freextent(struct xfs_trans **trans, struct xfs_inode *dp, - xfs_dablk_t blkno, int blkcnt); /* * Utility routines. @@ -299,10 +291,6 @@ int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, struct xfs_dabuf *leaf2_bp); int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int blocksize, int *local); -int xfs_attr_leaf_entsize(struct xfs_attr_leafblock *leaf, int index); -int xfs_attr_put_listent(struct xfs_attr_list_context *context, - struct attrnames *, char *name, int namelen, - int valuelen); int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index a20a6c3dc13..76c9ad3875e 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -45,7 +45,7 @@ /* * Index of high bit number in byte, -1 for none set, 0..7 otherwise. */ -const char xfs_highbit[256] = { +STATIC const char xfs_highbit[256] = { -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */ 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */ 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 
17 */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index de316241866..6f5d283888a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -301,6 +301,19 @@ xfs_bmap_search_extents( xfs_bmbt_irec_t *gotp, /* out: extent entry found */ xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + */ +STATIC int /* error */ +xfs_bmap_isaeof( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t off, /* file offset in fsblocks */ + int whichfork, /* data or attribute fork */ + char *aeof); /* return value */ + #ifdef XFS_BMAP_TRACE /* * Add a bmap trace buffer entry. Base routine for the others. @@ -4532,18 +4545,17 @@ xfs_bmapi( xfs_extlen_t alen; /* allocated extent length */ xfs_fileoff_t aoff; /* allocated file offset */ xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ - char contig; /* allocation must be one extent */ xfs_btree_cur_t *cur; /* bmap btree cursor */ - char delay; /* this request is for delayed alloc */ xfs_fileoff_t end; /* end of mapped file region */ int eof; /* we've hit the end of extent list */ + char contig; /* allocation must be one extent */ + char delay; /* this request is for delayed alloc */ + char exact; /* don't do all of wasdelayed extent */ xfs_bmbt_rec_t *ep; /* extent list entry pointer */ int error; /* error return */ - char exact; /* don't do all of wasdelayed extent */ xfs_bmbt_irec_t got; /* current extent list record */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_extlen_t indlen; /* indirect blocks length */ - char inhole; /* current location is hole in file */ xfs_extnum_t lastx; /* last useful extent number */ int logflags; /* flags for transaction logging */ xfs_extlen_t minleft; /* min blocks 
left after allocation */ @@ -4554,13 +4566,15 @@ xfs_bmapi( xfs_extnum_t nextents; /* number of extents in file */ xfs_fileoff_t obno; /* old block number (offset) */ xfs_bmbt_irec_t prev; /* previous extent list record */ - char stateless; /* ignore state flag set */ int tmp_logflags; /* temp flags holder */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char stateless; /* ignore state flag set */ char trim; /* output trimmed to match range */ char userdata; /* allocating non-metadata */ char wasdelay; /* old extent was delayed */ - int whichfork; /* data or attr fork */ char wr; /* this is a write request */ + char rt; /* this is a realtime file */ char rsvd; /* OK to allocate reserved blocks */ #ifdef DEBUG xfs_fileoff_t orig_bno; /* original block number value */ @@ -4590,6 +4604,7 @@ xfs_bmapi( } if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + rt = XFS_IS_REALTIME_INODE(ip); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); @@ -4694,9 +4709,16 @@ xfs_bmapi( } minlen = contig ? alen : 1; if (delay) { - indlen = (xfs_extlen_t) - xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); + xfs_extlen_t extsz = 0; + + /* Figure out the extent size, adjust alen */ + if (rt) { + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + alen = roundup(alen, extsz); + extsz = alen / mp->m_sb.sb_rextsize; + } + /* * Make a transaction-less quota reservation for * delayed allocation blocks. This number gets @@ -4704,8 +4726,10 @@ xfs_bmapi( * We return EDQUOT if we haven't allocated * blks already inside this loop; */ - if (XFS_TRANS_RESERVE_BLKQUOTA( - mp, NULL, ip, (long)alen)) { + if (XFS_TRANS_RESERVE_QUOTA_NBLKS( + mp, NULL, ip, (long)alen, 0, + rt ? 
XFS_QMOPT_RES_RTBLKS : + XFS_QMOPT_RES_REGBLKS)) { if (n == 0) { *nmap = 0; ASSERT(cur == NULL); @@ -4718,40 +4742,34 @@ xfs_bmapi( * Split changing sb for alen and indlen since * they could be coming from different places. */ - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { - xfs_extlen_t extsz; - xfs_extlen_t ralen; - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - ralen = roundup(alen, extsz); - ralen = ralen / mp->m_sb.sb_rextsize; - if (xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - -(ralen), rsvd)) { - if (XFS_IS_QUOTA_ON(ip->i_mount)) - XFS_TRANS_UNRESERVE_BLKQUOTA( - mp, NULL, ip, - (long)alen); - break; - } - } else { - if (xfs_mod_incore_sb(mp, - XFS_SBS_FDBLOCKS, - -(alen), rsvd)) { - if (XFS_IS_QUOTA_ON(ip->i_mount)) - XFS_TRANS_UNRESERVE_BLKQUOTA( - mp, NULL, ip, - (long)alen); - break; - } - } + indlen = (xfs_extlen_t) + xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); - if (xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - -(indlen), rsvd)) { - XFS_TRANS_UNRESERVE_BLKQUOTA( - mp, NULL, ip, (long)alen); + if (rt) + error = xfs_mod_incore_sb(mp, + XFS_SBS_FREXTENTS, + -(extsz), rsvd); + else + error = xfs_mod_incore_sb(mp, + XFS_SBS_FDBLOCKS, + -(alen), rsvd); + if (!error) + error = xfs_mod_incore_sb(mp, + XFS_SBS_FDBLOCKS, + -(indlen), rsvd); + + if (error) { + if (XFS_IS_QUOTA_ON(ip->i_mount)) + /* unreserve the blocks now */ + XFS_TRANS_UNRESERVE_QUOTA_NBLKS( + mp, NULL, ip, + (long)alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : + XFS_QMOPT_RES_REGBLKS); break; } + ip->i_delayed_blks += alen; abno = NULLSTARTBLOCK(indlen); } else { @@ -5376,13 +5394,24 @@ xfs_bunmapi( } if (wasdel) { ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); - xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - (int)del.br_blockcount, rsvd); - /* Unreserve our quota space */ - XFS_TRANS_RESERVE_QUOTA_NBLKS( - mp, NULL, ip, -((long)del.br_blockcount), 0, - isrt ? 
XFS_QMOPT_RES_RTBLKS : + /* Update realtim/data freespace, unreserve quota */ + if (isrt) { + xfs_filblks_t rtexts; + + rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); + do_div(rtexts, mp->m_sb.sb_rextsize); + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + (int)rtexts, rsvd); + XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, + -((long)del.br_blockcount), 0, + XFS_QMOPT_RES_RTBLKS); + } else { + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, + (int)del.br_blockcount, rsvd); + XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, + -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); + } ip->i_delayed_blks -= del.br_blockcount; if (cur) cur->bc_private.b.flags |= @@ -5714,7 +5743,7 @@ unlock_and_return: * blocks at the end of the file which do not start at the previous data block, * we will try to align the new blocks at stripe unit boundaries. */ -int /* error */ +STATIC int /* error */ xfs_bmap_isaeof( xfs_inode_t *ip, /* incore inode pointer */ xfs_fileoff_t off, /* file offset in fsblocks */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index f1bc22fb26a..e6d22ec9b2e 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -332,19 +332,6 @@ xfs_getbmap( int iflags); /* interface flags */ /* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -int -xfs_bmap_isaeof( - struct xfs_inode *ip, - xfs_fileoff_t off, - int whichfork, - char *aeof); - -/* * Check if the endoff is outside the last extent. 
If so the caller will grow * the allocation to a stripe unit boundary */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 163305a79fc..09c413576ba 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -2331,20 +2331,6 @@ xfs_bmbt_lookup_ge( return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat); } -int /* error */ -xfs_bmbt_lookup_le( - xfs_btree_cur_t *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - int *stat) /* success/failure */ -{ - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; - return xfs_bmbt_lookup(cur, XFS_LOOKUP_LE, stat); -} - /* * Give the bmap btree a new root block. Copy the old broot contents * down into a real block and make the broot point to it. diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 843ff12b4bf..0a40cf126c2 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -580,14 +580,6 @@ xfs_bmbt_lookup_ge( xfs_filblks_t, int *); -int -xfs_bmbt_lookup_le( - struct xfs_btree_cur *, - xfs_fileoff_t, - xfs_fsblock_t, - xfs_filblks_t, - int *); - /* * Give the bmap btree a new root block. Copy the old broot contents * down into a real block and make the broot point to it. diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 9dd22dd9548..0cc63d657a1 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -90,6 +90,16 @@ xfs_btree_maxrecs( */ /* + * Retrieve the block pointer from the cursor at the given level. + * This may be a bmap btree root or from a buffer. + */ +STATIC xfs_btree_block_t * /* generic btree block pointer */ +xfs_btree_get_block( + xfs_btree_cur_t *cur, /* btree cursor */ + int level, /* level in btree */ + struct xfs_buf **bpp); /* buffer containing the block */ + +/* * Checking routine: return maxrecs for the block. */ STATIC int /* number of records fitting in block */ @@ -497,7 +507,7 @@ xfs_btree_firstrec( * Retrieve the block pointer from the cursor at the given level. 
* This may be a bmap btree root or from a buffer. */ -xfs_btree_block_t * /* generic btree block pointer */ +STATIC xfs_btree_block_t * /* generic btree block pointer */ xfs_btree_get_block( xfs_btree_cur_t *cur, /* btree cursor */ int level, /* level in btree */ diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 93872bba41f..09b4e1532a3 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -325,16 +325,6 @@ xfs_btree_firstrec( int level); /* level to change */ /* - * Retrieve the block pointer from the cursor at the given level. - * This may be a bmap btree root or from a buffer. - */ -xfs_btree_block_t * /* generic btree block pointer */ -xfs_btree_get_block( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree */ - struct xfs_buf **bpp); /* buffer containing the block */ - -/* * Get a buffer for the block, return it with no data read. * Long-form addressing. */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 9ab0039f07d..30b8285ad47 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -172,7 +172,7 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); * * If the XFS_BLI_STALE flag has been set, then log nothing. */ -uint +STATIC uint xfs_buf_item_size( xfs_buf_log_item_t *bip) { @@ -240,7 +240,7 @@ xfs_buf_item_size( * format structure, and the rest point to contiguous chunks * within the buffer. */ -void +STATIC void xfs_buf_item_format( xfs_buf_log_item_t *bip, xfs_log_iovec_t *log_vector) @@ -365,7 +365,7 @@ xfs_buf_item_format( * item in memory so it cannot be written out. Simply call bpin() * on the buffer to do this. */ -void +STATIC void xfs_buf_item_pin( xfs_buf_log_item_t *bip) { @@ -391,7 +391,7 @@ xfs_buf_item_pin( * If the XFS_BLI_STALE flag is set and we are the last reference, * then free up the buf log item and unlock the buffer. 
*/ -void +STATIC void xfs_buf_item_unpin( xfs_buf_log_item_t *bip, int stale) @@ -446,7 +446,7 @@ xfs_buf_item_unpin( * so we need to free the item's descriptor (that points to the item) * in the transaction. */ -void +STATIC void xfs_buf_item_unpin_remove( xfs_buf_log_item_t *bip, xfs_trans_t *tp) @@ -493,7 +493,7 @@ xfs_buf_item_unpin_remove( * the lock right away, return 0. If we can get the lock, pull the * buffer from the free list, mark it busy, and return 1. */ -uint +STATIC uint xfs_buf_item_trylock( xfs_buf_log_item_t *bip) { @@ -537,7 +537,7 @@ xfs_buf_item_trylock( * This is for support of xfs_trans_bhold(). Make sure the * XFS_BLI_HOLD field is cleared if we don't free the item. */ -void +STATIC void xfs_buf_item_unlock( xfs_buf_log_item_t *bip) { @@ -635,7 +635,7 @@ xfs_buf_item_unlock( * by returning the original lsn of that transaction here rather than * the current one. */ -xfs_lsn_t +STATIC xfs_lsn_t xfs_buf_item_committed( xfs_buf_log_item_t *bip, xfs_lsn_t lsn) @@ -654,7 +654,7 @@ xfs_buf_item_committed( * and have aborted this transaction, we'll trap this buffer when it tries to * get written out. */ -void +STATIC void xfs_buf_item_abort( xfs_buf_log_item_t *bip) { @@ -674,7 +674,7 @@ xfs_buf_item_abort( * B_DELWRI set, then get it going out to disk with a call to bawrite(). * If not, then just release the buffer. */ -void +STATIC void xfs_buf_item_push( xfs_buf_log_item_t *bip) { @@ -693,7 +693,7 @@ xfs_buf_item_push( } /* ARGSUSED */ -void +STATIC void xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) { } @@ -701,7 +701,7 @@ xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) /* * This is the ops vector shared by all buf log items. 
*/ -struct xfs_item_ops xfs_buf_item_ops = { +STATIC struct xfs_item_ops xfs_buf_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_buf_item_format, diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 5f1b0c9308f..01aed5f2d57 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -80,7 +80,7 @@ typedef struct xfs_buf_log_format_t { * user or group dquots and may require special recovery handling. */ #define XFS_BLI_UDQUOT_BUF 0x4 -/* #define XFS_BLI_PDQUOT_BUF 0x8 */ +#define XFS_BLI_PDQUOT_BUF 0x8 #define XFS_BLI_GDQUOT_BUF 0x10 #define XFS_BLI_CHUNK 128 diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index d7fe2886676..8b792ddf216 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -113,7 +113,10 @@ STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra); - +STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, + xfs_da_state_blk_t *drop_blk, + xfs_da_state_blk_t *save_blk); +STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); /*======================================================================== * Routines used for growing the Btree. @@ -1424,7 +1427,7 @@ xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) /* * Unlink a block from a doubly linked list of blocks. */ -int /* error */ +STATIC int /* error */ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk) { @@ -2381,7 +2384,7 @@ xfs_da_state_alloc(void) /* * Kill the altpath contents of a da-state structure. 
*/ -void +STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state) { int i; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 9fc699d9699..3a9b9e809c6 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -296,8 +296,6 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, /* * Utility routines. */ -int xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk); int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); @@ -320,7 +318,6 @@ uint xfs_da_hashname(uchar_t *name_string, int name_length); uint xfs_da_log2_roundup(uint i); xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); -void xfs_da_state_kill_altpath(xfs_da_state_t *state); void xfs_da_buf_done(xfs_dabuf_t *dabuf); void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 63abdc2ac7f..681be5c93af 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -180,9 +180,10 @@ xfs_swapext( goto error0; } - if (VN_CACHED(tvp) != 0) - xfs_inval_cached_pages(XFS_ITOV(tip), &(tip->i_iocore), - (xfs_off_t)0, 0, 0); + if (VN_CACHED(tvp) != 0) { + xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1); + VOP_FLUSHINVAL_PAGES(tvp, 0, -1, FI_REMAPF_LOCKED); + } /* Verify O_DIRECT for ftmp */ if (VN_CACHED(tvp) != 0) { diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index db9887a107d..a0aa0e44ff9 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -304,7 +304,7 @@ xfs_dir2_data_freeinsert( /* * Remove a bestfree entry from the table. 
*/ -void +STATIC void xfs_dir2_data_freeremove( xfs_dir2_data_t *d, /* data block pointer */ xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h index 3f02294ccff..476cac920bf 100644 --- a/fs/xfs/xfs_dir2_data.h +++ b/fs/xfs/xfs_dir2_data.h @@ -193,10 +193,6 @@ extern xfs_dir2_data_free_t * xfs_dir2_data_unused_t *dup, int *loghead); extern void - xfs_dir2_data_freeremove(xfs_dir2_data_t *d, - xfs_dir2_data_free_t *dfp, int *loghead); - -extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d, int *loghead, char *aendp); diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 262d1e86df3..056f5283904 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -77,6 +77,10 @@ static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); #endif static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, int *indexp, xfs_dabuf_t **dbpp); +static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, + int first, int last); +static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); + /* * Convert a block form directory to a leaf form directory. @@ -1214,7 +1218,7 @@ xfs_dir2_leaf_init( /* * Log the bests entries indicated from a leaf1 block. */ -void +static void xfs_dir2_leaf_log_bests( xfs_trans_t *tp, /* transaction pointer */ xfs_dabuf_t *bp, /* leaf buffer */ @@ -1278,7 +1282,7 @@ xfs_dir2_leaf_log_header( /* * Log the tail of the leaf1 block. 
*/ -void +STATIC void xfs_dir2_leaf_log_tail( xfs_trans_t *tp, /* transaction pointer */ xfs_dabuf_t *bp) /* leaf buffer */ diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h index 7f20eee56a5..3303cd6f4c0 100644 --- a/fs/xfs/xfs_dir2_leaf.h +++ b/fs/xfs/xfs_dir2_leaf.h @@ -330,15 +330,8 @@ extern void int first, int last); extern void - xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, - int first, int last); - -extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, struct xfs_dabuf *bp); -extern void - xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); - extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c index 617018d6bbd..c2ea6171fb0 100644 --- a/fs/xfs/xfs_dir_leaf.c +++ b/fs/xfs/xfs_dir_leaf.c @@ -91,6 +91,10 @@ STATIC int xfs_dir_leaf_figure_balance(xfs_da_state_t *state, int *number_entries_in_blk1, int *number_namebytes_in_blk1); +STATIC int xfs_dir_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, + struct xfs_dabuf **bpp); + /* * Utility routines. */ @@ -781,7 +785,7 @@ xfs_dir_leaf_to_node(xfs_da_args_t *args) * Create the initial contents of a leaf directory * or a leaf in a node directory. */ -int +STATIC int xfs_dir_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) { xfs_dir_leafblock_t *leaf; diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h index 00d68d33cc7..dd423ce1bc8 100644 --- a/fs/xfs/xfs_dir_leaf.h +++ b/fs/xfs/xfs_dir_leaf.h @@ -202,8 +202,6 @@ int xfs_dir_leaf_to_shortform(struct xfs_da_args *args); /* * Routines used for growing the Btree. 
*/ -int xfs_dir_leaf_create(struct xfs_da_args *args, xfs_dablk_t which_block, - struct xfs_dabuf **bpp); int xfs_dir_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index 55ae3e67d24..55c17adaaa3 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -166,27 +166,32 @@ typedef enum { #define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ #define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ #define DM_FLAGS_ISEM 0x004 /* thread holds i_sem */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,21) -/* i_alloc_sem was added in 2.4.22-pre1 */ #define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ #define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ -#endif -#endif /* * Based on IO_ISDIRECT, decide which i_ flag is set. */ -#ifdef DM_FLAGS_IALLOCSEM_RD +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) +#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ + DM_FLAGS_ISEM : 0) +#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \ + (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22)) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM) #define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) -#else +#endif + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ 0 : DM_FLAGS_ISEM) #define DM_SEM_FLAG_WR (DM_FLAGS_ISEM) #endif + /* * Macros to turn caller specified delay/block flags into * dm_send_xxxx_event flag DM_FLAGS_NDELAY. 
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index bbe1dea11c0..dcd3fdd5c1f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -280,7 +280,7 @@ xfs_error_report( } } -void +STATIC void xfs_hex_dump(void *p, int length) { __uint8_t *uip = (__uint8_t*)p; diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 6bc0535c0a6..52ee2b90b5e 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -73,9 +73,6 @@ xfs_corruption_error( int linenum, inst_t *ra); -extern void -xfs_hex_dump(void *p, int length); - #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) #define XFS_CORRUPTION_ERROR(e, lvl, mp, mem) \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 5eafd5b6321..db7cbd1bc85 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -59,6 +59,18 @@ STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *); STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *); +void +xfs_efi_item_free(xfs_efi_log_item_t *efip) +{ + int nexts = efip->efi_format.efi_nextents; + + if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { + kmem_free(efip, sizeof(xfs_efi_log_item_t) + + (nexts - 1) * sizeof(xfs_extent_t)); + } else { + kmem_zone_free(xfs_efi_zone, efip); + } +} /* * This returns the number of iovecs needed to log the given efi item. @@ -120,8 +132,6 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip) STATIC void xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) { - int nexts; - int size; xfs_mount_t *mp; SPLDECL(s); @@ -132,21 +142,11 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) * xfs_trans_delete_ail() drops the AIL lock. 
*/ xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - - nexts = efip->efi_format.efi_nextents; - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - size = sizeof(xfs_efi_log_item_t); - size += (nexts - 1) * sizeof(xfs_extent_t); - kmem_free(efip, size); - } else { - kmem_zone_free(xfs_efi_zone, efip); - } + xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; AIL_UNLOCK(mp, s); } - - return; } /* @@ -159,8 +159,6 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) STATIC void xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) { - int nexts; - int size; xfs_mount_t *mp; xfs_log_item_desc_t *lidp; SPLDECL(s); @@ -178,23 +176,11 @@ xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - /* - * now free the item itself - */ - nexts = efip->efi_format.efi_nextents; - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - size = sizeof(xfs_efi_log_item_t); - size += (nexts - 1) * sizeof(xfs_extent_t); - kmem_free(efip, size); - } else { - kmem_zone_free(xfs_efi_zone, efip); - } + xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_COMMITTED; AIL_UNLOCK(mp, s); } - - return; } /* @@ -245,18 +231,7 @@ xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *efip) { - int nexts; - int size; - - nexts = efip->efi_format.efi_nextents; - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - size = sizeof(xfs_efi_log_item_t); - size += (nexts - 1) * sizeof(xfs_extent_t); - kmem_free(efip, size); - } else { - kmem_zone_free(xfs_efi_zone, efip); - } - return; + xfs_efi_item_free(efip); } /* @@ -288,7 +263,7 @@ xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) /* * This is the ops vector shared by all efi log items. 
*/ -struct xfs_item_ops xfs_efi_item_ops = { +STATIC struct xfs_item_ops xfs_efi_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efi_item_format, @@ -355,8 +330,6 @@ xfs_efi_release(xfs_efi_log_item_t *efip, { xfs_mount_t *mp; int extents_left; - uint size; - int nexts; SPLDECL(s); mp = efip->efi_item.li_mountp; @@ -372,20 +345,10 @@ xfs_efi_release(xfs_efi_log_item_t *efip, * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + xfs_efi_item_free(efip); } else { AIL_UNLOCK(mp, s); } - - if (extents_left == 0) { - nexts = efip->efi_format.efi_nextents; - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - size = sizeof(xfs_efi_log_item_t); - size += (nexts - 1) * sizeof(xfs_extent_t); - kmem_free(efip, size); - } else { - kmem_zone_free(xfs_efi_zone, efip); - } - } } /* @@ -398,8 +361,6 @@ STATIC void xfs_efi_cancel( xfs_efi_log_item_t *efip) { - int nexts; - int size; xfs_mount_t *mp; SPLDECL(s); @@ -410,26 +371,25 @@ xfs_efi_cancel( * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - - nexts = efip->efi_format.efi_nextents; - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - size = sizeof(xfs_efi_log_item_t); - size += (nexts - 1) * sizeof(xfs_extent_t); - kmem_free(efip, size); - } else { - kmem_zone_free(xfs_efi_zone, efip); - } + xfs_efi_item_free(efip); } else { efip->efi_flags |= XFS_EFI_CANCELED; AIL_UNLOCK(mp, s); } - - return; } +STATIC void +xfs_efd_item_free(xfs_efd_log_item_t *efdp) +{ + int nexts = efdp->efd_format.efd_nextents; - - + if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { + kmem_free(efdp, sizeof(xfs_efd_log_item_t) + + (nexts - 1) * sizeof(xfs_extent_t)); + } else { + kmem_zone_free(xfs_efd_zone, efdp); + } +} /* * This returns the number of iovecs needed to log the given efd item. 
@@ -533,9 +493,6 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) STATIC xfs_lsn_t xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) { - uint size; - int nexts; - /* * If we got a log I/O error, it's always the case that the LR with the * EFI got unpinned and freed before the EFD got aborted. @@ -543,15 +500,7 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); - nexts = efdp->efd_format.efd_nextents; - if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { - size = sizeof(xfs_efd_log_item_t); - size += (nexts - 1) * sizeof(xfs_extent_t); - kmem_free(efdp, size); - } else { - kmem_zone_free(xfs_efd_zone, efdp); - } - + xfs_efd_item_free(efdp); return (xfs_lsn_t)-1; } @@ -565,9 +514,6 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *efdp) { - int nexts; - int size; - /* * If we got a log I/O error, it's always the case that the LR with the * EFI got unpinned and freed before the EFD got aborted. So don't @@ -576,15 +522,7 @@ xfs_efd_item_abort(xfs_efd_log_item_t *efdp) if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) xfs_efi_cancel(efdp->efd_efip); - nexts = efdp->efd_format.efd_nextents; - if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { - size = sizeof(xfs_efd_log_item_t); - size += (nexts - 1) * sizeof(xfs_extent_t); - kmem_free(efdp, size); - } else { - kmem_zone_free(xfs_efd_zone, efdp); - } - return; + xfs_efd_item_free(efdp); } /* @@ -615,7 +553,7 @@ xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn) /* * This is the ops vector shared by all efd log items. 
*/ -struct xfs_item_ops xfs_efd_item_ops = { +STATIC struct xfs_item_ops xfs_efd_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_efd_item_format, diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 7122d6101d1..d433bac9f59 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -118,6 +118,8 @@ xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, uint); +void xfs_efi_item_free(xfs_efi_log_item_t *); + #endif /* __KERNEL__ */ #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6ee8443bf9d..095af0a5cff 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -60,7 +60,8 @@ struct fsxattr { __u32 fsx_xflags; /* xflags field value (get/set) */ __u32 fsx_extsize; /* extsize field value (get/set)*/ __u32 fsx_nextents; /* nextents field value (get) */ - unsigned char fsx_pad[16]; + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; }; #endif diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 21213057c27..ca535d61319 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -559,32 +559,6 @@ xfs_reserve_blocks( return(0); } -void -xfs_fs_log_dummy(xfs_mount_t *mp) -{ - xfs_trans_t *tp; - xfs_inode_t *ip; - - - tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - atomic_inc(&mp->m_active_trans); - if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { - xfs_trans_cancel(tp, 0); - return; - } - - ip = mp->m_rootip; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0, NULL); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - int xfs_fs_goingdown( xfs_mount_t *mp, diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index 
803c4d17a05..44be188674a 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -100,9 +100,13 @@ xfs_inofree_t xfs_inobt_mask(int i); #endif #if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_IS_FREE) int xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i); -#define XFS_INOBT_IS_FREE(rp,i) xfs_inobt_is_free(rp,i) +#define XFS_INOBT_IS_FREE(rp,i) xfs_inobt_is_free(rp,i) +#define XFS_INOBT_IS_FREE_DISK(rp,i) xfs_inobt_is_free_disk(rp,i) #else -#define XFS_INOBT_IS_FREE(rp,i) (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) +#define XFS_INOBT_IS_FREE(rp,i) \ + (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) +#define XFS_INOBT_IS_FREE_DISK(rp,i) \ + ((INT_GET((rp)->ir_free, ARCH_CONVERT) & XFS_INOBT_MASK(i)) != 0) #endif #if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_INOBT_SET_FREE) void xfs_inobt_set_free(xfs_inobt_rec_t *rp, int i); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index bc8c8c7f903..34bdf590968 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -146,51 +146,6 @@ xfs_inobp_check( #endif /* - * called from bwrite on xfs inode buffers - */ -void -xfs_inobp_bwcheck(xfs_buf_t *bp) -{ - xfs_mount_t *mp; - int i; - int j; - xfs_dinode_t *dip; - - ASSERT(XFS_BUF_FSPRIVATE3(bp, void *) != NULL); - - mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); - - - j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - - for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *) xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); - if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) { - cmn_err(CE_WARN, -"Bad magic # 0x%x in XFS inode buffer 0x%Lx, starting blockno %Ld, offset 0x%x", - INT_GET(dip->di_core.di_magic, ARCH_CONVERT), - (__uint64_t)(__psunsigned_t) bp, - (__int64_t) XFS_BUF_ADDR(bp), - xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); - xfs_fs_cmn_err(CE_WARN, mp, - "corrupt, unmount and run xfs_repair"); - } - if (!dip->di_next_unlinked) { - cmn_err(CE_WARN, -"Bad next_unlinked field (0) in XFS inode buffer 0x%p, starting blockno 
%Ld, offset 0x%x", - (__uint64_t)(__psunsigned_t) bp, - (__int64_t) XFS_BUF_ADDR(bp), - xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); - xfs_fs_cmn_err(CE_WARN, mp, - "corrupt, unmount and run xfs_repair"); - } - } - - return; -} - -/* * This routine is called to map an inode number within a file * system to the buffer containing the on-disk version of the * inode. It returns a pointer to the buffer containing the @@ -203,7 +158,7 @@ xfs_inobp_bwcheck(xfs_buf_t *bp) * Use xfs_imap() to determine the size and location of the * buffer to read from disk. */ -int +STATIC int xfs_inotobp( xfs_mount_t *mp, xfs_trans_t *tp, @@ -1247,26 +1202,32 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { - if ((mode & S_IFMT) == S_IFDIR) { - ip->i_d.di_flags |= XFS_DIFLAG_RTINHERIT; - } else { - ip->i_d.di_flags |= XFS_DIFLAG_REALTIME; + uint di_flags = 0; + + if ((mode & S_IFMT) == S_IFDIR) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + } else { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { + di_flags |= XFS_DIFLAG_REALTIME; ip->i_iocore.io_flags |= XFS_IOCORE_RT; } } if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && xfs_inherit_noatime) - ip->i_d.di_flags |= XFS_DIFLAG_NOATIME; + di_flags |= XFS_DIFLAG_NOATIME; if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && xfs_inherit_nodump) - ip->i_d.di_flags |= XFS_DIFLAG_NODUMP; + di_flags |= XFS_DIFLAG_NODUMP; if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && xfs_inherit_sync) - ip->i_d.di_flags |= XFS_DIFLAG_SYNC; + di_flags |= XFS_DIFLAG_SYNC; if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && xfs_inherit_nosymlinks) - ip->i_d.di_flags |= XFS_DIFLAG_NOSYMLINKS; + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + ip->i_d.di_flags |= di_flags; } /* FALLTHROUGH */ case S_IFLNK: @@ -2156,7 +2117,7 @@ static __inline__ int 
xfs_inode_clean(xfs_inode_t *ip) (ip->i_update_core == 0)); } -void +STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, xfs_trans_t *tp, @@ -2875,7 +2836,7 @@ xfs_iunpin( * be subsequently pinned once someone is waiting for it to be * unpinned. */ -void +STATIC void xfs_iunpin_wait( xfs_inode_t *ip) { @@ -3601,107 +3562,43 @@ corrupt_out: /* - * Flush all inactive inodes in mp. Return true if no user references - * were found, false otherwise. + * Flush all inactive inodes in mp. */ -int +void xfs_iflush_all( - xfs_mount_t *mp, - int flag) + xfs_mount_t *mp) { - int busy; - int done; - int purged; xfs_inode_t *ip; - vmap_t vmap; vnode_t *vp; - busy = done = 0; - while (!done) { - purged = 0; - XFS_MOUNT_ILOCK(mp); - ip = mp->m_inodes; - if (ip == NULL) { - break; - } - do { - /* Make sure we skip markers inserted by sync */ - if (ip->i_mount == NULL) { - ip = ip->i_mnext; - continue; - } - - /* - * It's up to our caller to purge the root - * and quota vnodes later. - */ - vp = XFS_ITOV_NULL(ip); - - if (!vp) { - XFS_MOUNT_IUNLOCK(mp); - xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); - purged = 1; - break; - } + again: + XFS_MOUNT_ILOCK(mp); + ip = mp->m_inodes; + if (ip == NULL) + goto out; - if (vn_count(vp) != 0) { - if (vn_count(vp) == 1 && - (ip == mp->m_rootip || - (mp->m_quotainfo && - (ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino)))) { + do { + /* Make sure we skip markers inserted by sync */ + if (ip->i_mount == NULL) { + ip = ip->i_mnext; + continue; + } - ip = ip->i_mnext; - continue; - } - if (!(flag & XFS_FLUSH_ALL)) { - busy = 1; - done = 1; - break; - } - /* - * Ignore busy inodes but continue flushing - * others. - */ - ip = ip->i_mnext; - continue; - } - /* - * Sample vp mapping while holding mp locked on MP - * systems, so we don't purge a reclaimed or - * nonexistent vnode. We break from the loop - * since we know that we modify - * it by pulling ourselves from it in xfs_reclaim() - * called via vn_purge() below. 
Set ip to the next - * entry in the list anyway so we'll know below - * whether we reached the end or not. - */ - VMAP(vp, vmap); + vp = XFS_ITOV_NULL(ip); + if (!vp) { XFS_MOUNT_IUNLOCK(mp); + xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); + goto again; + } - vn_purge(vp, &vmap); + ASSERT(vn_count(vp) == 0); - purged = 1; - break; - } while (ip != mp->m_inodes); - /* - * We need to distinguish between when we exit the loop - * after a purge and when we simply hit the end of the - * list. We can't use the (ip == mp->m_inodes) test, - * because when we purge an inode at the start of the list - * the next inode on the list becomes mp->m_inodes. That - * would cause such a test to bail out early. The purged - * variable tells us how we got out of the loop. - */ - if (!purged) { - done = 1; - } - } + ip = ip->i_mnext; + } while (ip != mp->m_inodes); + out: XFS_MOUNT_IUNLOCK(mp); - return !busy; } - /* * xfs_iaccess: check accessibility of inode for mode. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 37e1c316f3b..54d9e54c7c9 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -412,11 +412,6 @@ void xfs_ifork_next_set(xfs_inode_t *ip, int w, int n); #define XFS_IFLUSH_DELWRI 5 /* - * Flags for xfs_iflush_all. - */ -#define XFS_FLUSH_ALL 0x1 - -/* * Flags for xfs_itruncate_start(). */ #define XFS_ITRUNC_DEFINITE 0x1 @@ -487,8 +482,6 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int); /* * xfs_inode.c prototypes. 
*/ -int xfs_inotobp(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - xfs_dinode_t **, struct xfs_buf **, int *); int xfs_itobp(struct xfs_mount *, struct xfs_trans *, xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **, xfs_daddr_t); @@ -522,7 +515,7 @@ void xfs_ipin(xfs_inode_t *); void xfs_iunpin(xfs_inode_t *); int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int); int xfs_iflush(xfs_inode_t *, uint); -int xfs_iflush_all(struct xfs_mount *, int); +void xfs_iflush_all(struct xfs_mount *); int xfs_iaccess(xfs_inode_t *, mode_t, cred_t *); uint xfs_iroundup(uint); void xfs_ichgtime(xfs_inode_t *, int); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 768cb1816b8..0eed30f5cb1 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -910,7 +910,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -struct xfs_item_ops xfs_inode_item_ops = { +STATIC struct xfs_item_ops xfs_inode_item_ops = { .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) xfs_inode_item_format, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 469e1a7939d..2edd6769e5d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -385,15 +385,15 @@ xfs_iomap_write_direct( int nimaps, maps; int error; int bmapi_flag; + int quota_flag; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp; xfs_bmap_free_t free_list; int aeof; - xfs_filblks_t datablocks; + xfs_filblks_t datablocks, qblocks, resblks; int committed; int numrtextents; - uint resblks; /* * Make sure that the dquots are there. 
This doesn't hold @@ -419,7 +419,6 @@ xfs_iomap_write_direct( xfs_fileoff_t map_last_fsb; map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff; - if (map_last_fsb < last_fsb) { last_fsb = map_last_fsb; count_fsb = last_fsb - offset_fsb; @@ -428,56 +427,47 @@ xfs_iomap_write_direct( } /* - * determine if reserving space on - * the data or realtime partition. + * Determine if reserving space on the data or realtime partition. */ if ((rt = XFS_IS_REALTIME_INODE(ip))) { - int sbrtextsize, iprtextsize; + xfs_extlen_t extsz; - sbrtextsize = mp->m_sb.sb_rextsize; - iprtextsize = - ip->i_d.di_extsize ? ip->i_d.di_extsize : sbrtextsize; - numrtextents = (count_fsb + iprtextsize - 1); - do_div(numrtextents, sbrtextsize); + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + numrtextents = qblocks = (count_fsb + extsz - 1); + do_div(numrtextents, mp->m_sb.sb_rextsize); + quota_flag = XFS_QMOPT_RES_RTBLKS; datablocks = 0; } else { - datablocks = count_fsb; + datablocks = qblocks = count_fsb; + quota_flag = XFS_QMOPT_RES_REGBLKS; numrtextents = 0; } /* - * allocate and setup the transaction + * Allocate and setup the transaction */ xfs_iunlock(ip, XFS_ILOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks); - error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), numrtextents, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); /* - * check for running out of space + * Check for running out of space, note: need lock to return */ if (error) - /* - * Free the transaction structure. - */ xfs_trans_cancel(tp, 0); - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_out; /* Don't return in above if .. 
trans .., - need lock to return */ + goto error_out; - if (XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, resblks)) { + if (XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag)) { error = (EDQUOT); goto error1; } - nimaps = 1; bmapi_flag = XFS_BMAPI_WRITE; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -487,31 +477,29 @@ xfs_iomap_write_direct( bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * issue the bmapi() call to allocate the blocks + * Issue the bmapi() call to allocate the blocks */ XFS_BMAP_INIT(&free_list, &firstfsb); + nimaps = 1; imapp = &imap[0]; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag, &firstfsb, 0, imapp, &nimaps, &free_list); - if (error) { + if (error) goto error0; - } /* - * complete the transaction + * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); - if (error) { + if (error) goto error0; - } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { + if (error) goto error_out; - } - /* copy any maps to caller's array and return any error. */ + /* + * Copy any maps to caller's array and return any error. 
+ */ if (nimaps == 0) { error = (ENOSPC); goto error_out; @@ -530,10 +518,11 @@ xfs_iomap_write_direct( } return 0; - error0: /* Cancel bmap, unlock inode, and cancel trans */ +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_bmap_cancel(&free_list); + XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag); - error1: /* Just cancel transaction */ +error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); *nmaps = 0; /* nothing set-up here */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 092d5fb096b..1cd2ac16387 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -134,7 +134,7 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, #define xlog_verify_tail_lsn(a,b,c) #endif -int xlog_iclogs_empty(xlog_t *log); +STATIC int xlog_iclogs_empty(xlog_t *log); #ifdef DEBUG int xlog_do_error = 0; @@ -1857,7 +1857,7 @@ xlog_write(xfs_mount_t * mp, * * State Change: DIRTY -> ACTIVE */ -void +STATIC void xlog_state_clean_log(xlog_t *log) { xlog_in_core_t *iclog; @@ -3542,7 +3542,7 @@ xfs_log_force_umount( return (retval); } -int +STATIC int xlog_iclogs_empty(xlog_t *log) { xlog_in_core_t *iclog; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c31e3ce3be6..1a1d452f15f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -535,7 +535,6 @@ typedef struct log { /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); -extern int xlog_find_head(xlog_t *log, xfs_daddr_t *head_blk); extern int xlog_find_tail(xlog_t *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk, @@ -548,7 +547,6 @@ extern void xlog_recover_process_iunlinks(xlog_t *log); extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); -extern xfs_caddr_t xlog_align(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); /* iclog tracing */ #define 
XLOG_TRACE_GRAB_FLUSH 1 diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9824b5bf0ec..0aac28ddb81 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -148,7 +148,7 @@ xlog_bread( * The buffer is kept locked across the write and is returned locked. * This can only be used for synchronous log writes. */ -int +STATIC int xlog_bwrite( xlog_t *log, xfs_daddr_t blk_no, @@ -179,7 +179,7 @@ xlog_bwrite( return error; } -xfs_caddr_t +STATIC xfs_caddr_t xlog_align( xlog_t *log, xfs_daddr_t blk_no, @@ -528,7 +528,7 @@ out: * * Return: zero if normal, non-zero if error. */ -int +STATIC int xlog_find_head( xlog_t *log, xfs_daddr_t *return_head_blk) @@ -1964,7 +1964,8 @@ xlog_recover_do_reg_buffer( * probably a good thing to do for other buf types also. */ error = 0; - if (buf_f->blf_flags & (XFS_BLI_UDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + if (buf_f->blf_flags & + (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { error = xfs_qm_dqcheck((xfs_disk_dquot_t *) item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, @@ -2030,6 +2031,7 @@ xfs_qm_dqcheck( } if (INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_USER && + INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_PROJ && INT_GET(ddq->d_flags, ARCH_CONVERT) != XFS_DQ_GROUP) { if (flags & XFS_QMOPT_DOWARN) cmn_err(CE_ALERT, @@ -2135,6 +2137,8 @@ xlog_recover_do_dquot_buffer( type = 0; if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + type |= XFS_DQ_PROJ; if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* @@ -2247,7 +2251,8 @@ xlog_recover_do_buffer_trans( error = 0; if (flags & XFS_BLI_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (flags & (XFS_BLI_UDQUOT_BUF | XFS_BLI_GDQUOT_BUF)) { + } else if (flags & + (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); @@ 
-2619,7 +2624,7 @@ xlog_recover_do_dquot_trans( * This type of quotas was turned off, so ignore this record. */ type = INT_GET(recddq->d_flags, ARCH_CONVERT) & - (XFS_DQ_USER | XFS_DQ_GROUP); + (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); ASSERT(type); if (log->l_quotaoffs_flag & type) return (0); @@ -2742,7 +2747,6 @@ xlog_recover_do_efd_trans( xfs_efi_log_item_t *efip = NULL; xfs_log_item_t *lip; int gen; - int nexts; __uint64_t efi_id; SPLDECL(s); @@ -2777,22 +2781,15 @@ xlog_recover_do_efd_trans( } lip = xfs_trans_next_ail(mp, lip, &gen, NULL); } - if (lip == NULL) { - AIL_UNLOCK(mp, s); - } /* * If we found it, then free it up. If it wasn't there, it * must have been overwritten in the log. Oh well. */ if (lip != NULL) { - nexts = efip->efi_format.efi_nextents; - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - kmem_free(lip, sizeof(xfs_efi_log_item_t) + - ((nexts - 1) * sizeof(xfs_extent_t))); - } else { - kmem_zone_free(xfs_efi_zone, efip); - } + xfs_efi_item_free(efip); + } else { + AIL_UNLOCK(mp, s); } } diff --git a/fs/xfs/xfs_macros.c b/fs/xfs/xfs_macros.c index ce4f46c6b3a..698c2cd6285 100644 --- a/fs/xfs/xfs_macros.c +++ b/fs/xfs/xfs_macros.c @@ -1658,6 +1658,11 @@ xfs_inobt_is_free(xfs_inobt_rec_t *rp, int i) { return XFS_INOBT_IS_FREE(rp, i); } +int +xfs_inobt_is_free_disk(xfs_inobt_rec_t *rp, int i) +{ + return XFS_INOBT_IS_FREE_DISK(rp, i); +} #endif #if XFS_WANT_FUNCS_C || (XFS_WANT_SPACE_C && XFSSO_XFS_INOBT_IS_LAST_REC) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2ec967d93e5..82e1646e624 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -64,6 +64,7 @@ STATIC void xfs_mount_log_sbunit(xfs_mount_t *, __int64_t); STATIC int xfs_uuid_mount(xfs_mount_t *); STATIC void xfs_uuid_unmount(xfs_mount_t *mp); +STATIC void xfs_unmountfs_wait(xfs_mount_t *); static struct { short offset; @@ -555,7 +556,7 @@ xfs_readsb(xfs_mount_t *mp) * fields from the superblock associated with the given * mount structure */ -void +STATIC void 
xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) { int i; @@ -1081,7 +1082,7 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) int64_t fsid; #endif - xfs_iflush_all(mp, XFS_FLUSH_ALL); + xfs_iflush_all(mp); XFS_QM_DQPURGEALL(mp, XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING); @@ -1111,15 +1112,6 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) */ ASSERT(mp->m_inodes == NULL); - /* - * We may have bufs that are in the process of getting written still. - * We must wait for the I/O completion of those. The sync flag here - * does a two pass iteration thru the bufcache. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - xfs_incore_relse(mp->m_ddev_targp, 0, 1); /* synchronous */ - } - xfs_unmountfs_close(mp, cr); if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) xfs_uuid_unmount(mp); @@ -1146,7 +1138,7 @@ xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr) xfs_free_buftarg(mp->m_ddev_targp, 0); } -void +STATIC void xfs_unmountfs_wait(xfs_mount_t *mp) { if (mp->m_logdev_targp != mp->m_ddev_targp) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 30dd08fb9f5..5affba38a57 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -141,7 +141,7 @@ typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); typedef void (*xfs_dqdetach_t)(struct xfs_inode *); typedef int (*xfs_dqpurgeall_t)(struct xfs_mount *, uint); typedef int (*xfs_dqvopalloc_t)(struct xfs_mount *, - struct xfs_inode *, uid_t, gid_t, uint, + struct xfs_inode *, uid_t, gid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **); typedef void (*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *); @@ -185,8 +185,8 @@ typedef struct xfs_qmops { (*(mp)->m_qm_ops.xfs_dqdetach)(ip) #define XFS_QM_DQPURGEALL(mp, fl) \ (*(mp)->m_qm_ops.xfs_dqpurgeall)(mp, fl) -#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, fl, dq1, dq2) \ - (*(mp)->m_qm_ops.xfs_dqvopalloc)(mp, ip, uid, gid, fl, dq1, dq2) +#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \ + 
(*(mp)->m_qm_ops.xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2) #define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \ (*(mp)->m_qm_ops.xfs_dqvopcreate)(tp, ip, dq1, dq2) #define XFS_QM_DQVOPRENAME(mp, ip) \ @@ -544,7 +544,6 @@ extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv); extern int xfs_mountfs(struct vfs *, xfs_mount_t *mp, int); extern int xfs_unmountfs(xfs_mount_t *, struct cred *); -extern void xfs_unmountfs_wait(xfs_mount_t *); extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_unmount_flush(xfs_mount_t *, int); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 703ec4efcb4..7134576ae7f 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -96,7 +96,7 @@ typedef struct xfs_dqblk { * flags for q_flags field in the dquot. */ #define XFS_DQ_USER 0x0001 /* a user quota */ -/* #define XFS_DQ_PROJ 0x0002 -- project quota (IRIX) */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ #define XFS_DQ_GROUP 0x0004 /* a group quota */ #define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ #define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ @@ -104,6 +104,8 @@ typedef struct xfs_dqblk { #define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */ #define XFS_DQ_MARKER 0x0080 /* sentinel */ +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + /* * In the worst case, when both user and group quotas are on, * we can have a max of three dquots changing in a single transaction. 
@@ -124,7 +126,7 @@ typedef struct xfs_dqblk { typedef struct xfs_dq_logformat { __uint16_t qlf_type; /* dquot log item type */ __uint16_t qlf_size; /* size of this item */ - xfs_dqid_t qlf_id; /* usr/grp id number : 32 bits */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ __int64_t qlf_blkno; /* blkno of dquot buffer */ __int32_t qlf_len; /* len of dquot buffer */ __uint32_t qlf_boffset; /* off of dquot in buffer */ @@ -152,9 +154,9 @@ typedef struct xfs_qoff_logformat { #define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ #define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ #define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ -#define XFS_PQUOTA_ACCT 0x0008 /* (IRIX) project quota accounting ON */ -#define XFS_GQUOTA_ENFD 0x0010 /* group quota limits enforced */ -#define XFS_GQUOTA_CHKD 0x0020 /* quotacheck run on grp quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ #define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ /* @@ -162,17 +164,22 @@ typedef struct xfs_qoff_logformat { * are in the process of getting turned off. These flags are in m_qflags but * never in sb_qflags. */ -#define XFS_UQUOTA_ACTIVE 0x0080 /* uquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x0100 /* gquotas are being turned off */ +#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ /* * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees * quota will be not be switched off as long as that inode lock is held. 
*/ #define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE)) + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) #define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) #define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) /* * Flags to tell various functions what to do. Not all of these are meaningful @@ -182,7 +189,7 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */ #define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_GQUOTA 0x0000008 /* group dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ @@ -192,6 +199,7 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if necessary */ #define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */ #define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot, if damaged. 
*/ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be @@ -231,7 +239,8 @@ typedef struct xfs_qoff_logformat { #define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT -#define XFS_QMOPT_QUOTALL (XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) #ifdef __KERNEL__ @@ -246,21 +255,33 @@ typedef struct xfs_qoff_logformat { */ #define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ (ip)->i_udquot == NULL) || \ - (XFS_IS_GQUOTA_ON(mp) && \ + (XFS_IS_OQUOTA_ON(mp) && \ (ip)->i_gdquot == NULL)) -#define XFS_QM_NEED_QUOTACHECK(mp) ((XFS_IS_UQUOTA_ON(mp) && \ - (mp->m_sb.sb_qflags & \ - XFS_UQUOTA_CHKD) == 0) || \ - (XFS_IS_GQUOTA_ON(mp) && \ - (mp->m_sb.sb_qflags & \ - XFS_GQUOTA_CHKD) == 0)) +#define XFS_QM_NEED_QUOTACHECK(mp) \ + ((XFS_IS_UQUOTA_ON(mp) && \ + (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ + (XFS_IS_GQUOTA_ON(mp) && \ + ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ + (XFS_IS_PQUOTA_ON(mp) && \ + ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) + +#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) #define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ - XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD) + XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ + XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ + XFS_GQUOTA_ACCT) #define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE) + XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) /* @@ -331,15 +352,8 @@ typedef struct xfs_dqtrxops { 
#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \ XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots) -#define XFS_TRANS_RESERVE_BLKQUOTA(mp, tp, ip, nblks) \ - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \ - XFS_QMOPT_RES_REGBLKS) -#define XFS_TRANS_RESERVE_BLKQUOTA_FORCE(mp, tp, ip, nblks) \ - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, 0, \ - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES) -#define XFS_TRANS_UNRESERVE_BLKQUOTA(mp, tp, ip, nblks) \ - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), 0, \ - XFS_QMOPT_RES_REGBLKS) +#define XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, flags) \ + XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), -(ninos), flags) #define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \ XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS) diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index cb13f9a1d45..23b48ac1cb7 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -234,9 +234,6 @@ xfs_lock_for_rename( return 0; } - -int rename_which_error_return = 0; - /* * xfs_rename */ @@ -316,7 +313,6 @@ xfs_rename( &num_inodes); if (error) { - rename_which_error_return = __LINE__; /* * We have nothing locked, no inode references, and * no transaction, so just get out. 
@@ -332,7 +328,6 @@ xfs_rename( */ if (target_ip == NULL && (src_dp != target_dp) && target_dp->i_d.di_nlink >= XFS_MAXLINK) { - rename_which_error_return = __LINE__; error = XFS_ERROR(EMLINK); xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); goto rele_return; @@ -359,7 +354,6 @@ xfs_rename( XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); } if (error) { - rename_which_error_return = __LINE__; xfs_trans_cancel(tp, 0); goto rele_return; } @@ -369,7 +363,6 @@ xfs_rename( */ if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) { xfs_trans_cancel(tp, cancel_flags); - rename_which_error_return = __LINE__; goto rele_return; } @@ -413,7 +406,6 @@ xfs_rename( if (spaceres == 0 && (error = XFS_DIR_CANENTER(mp, tp, target_dp, target_name, target_namelen))) { - rename_which_error_return = __LINE__; goto error_return; } /* @@ -425,11 +417,9 @@ xfs_rename( target_namelen, src_ip->i_ino, &first_block, &free_list, spaceres); if (error == ENOSPC) { - rename_which_error_return = __LINE__; goto error_return; } if (error) { - rename_which_error_return = __LINE__; goto abort_return; } xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -437,7 +427,6 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) { - rename_which_error_return = __LINE__; goto abort_return; } } @@ -455,7 +444,6 @@ xfs_rename( if (!(XFS_DIR_ISEMPTY(target_ip->i_mount, target_ip)) || (target_ip->i_d.di_nlink > 2)) { error = XFS_ERROR(EEXIST); - rename_which_error_return = __LINE__; goto error_return; } } @@ -473,7 +461,6 @@ xfs_rename( target_namelen, src_ip->i_ino, &first_block, &free_list, spaceres); if (error) { - rename_which_error_return = __LINE__; goto abort_return; } xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -484,7 +471,6 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) { - rename_which_error_return = __LINE__; goto abort_return; } target_ip_dropped = 1; @@ -495,7 +481,6 @@ xfs_rename( */ error = xfs_droplink(tp, 
target_ip); if (error) { - rename_which_error_return = __LINE__; goto abort_return; } } @@ -519,7 +504,6 @@ xfs_rename( &free_list, spaceres); ASSERT(error != EEXIST); if (error) { - rename_which_error_return = __LINE__; goto abort_return; } xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -550,7 +534,6 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) { - rename_which_error_return = __LINE__; goto abort_return; } } @@ -558,7 +541,6 @@ xfs_rename( error = XFS_DIR_REMOVENAME(mp, tp, src_dp, src_name, src_namelen, src_ip->i_ino, &first_block, &free_list, spaceres); if (error) { - rename_which_error_return = __LINE__; goto abort_return; } xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3db0e220077..06dfca531f7 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -332,25 +332,6 @@ undo_blocks: /* - * This is called to set the a callback to be called when the given - * transaction is committed to disk. The transaction pointer and the - * argument pointer will be passed to the callback routine. - * - * Only one callback can be associated with any single transaction. - */ -void -xfs_trans_callback( - xfs_trans_t *tp, - xfs_trans_callback_t callback, - void *arg) -{ - ASSERT(tp->t_callback == NULL); - tp->t_callback = callback; - tp->t_callarg = arg; -} - - -/* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. * For now, just store the change in the transaction structure. @@ -551,7 +532,7 @@ xfs_trans_apply_sb_deltas( * * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). 
*/ -void +STATIC void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index bd37ccb85e7..ec541d66fa2 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -987,8 +987,6 @@ xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, uint, uint); -void xfs_trans_callback(xfs_trans_t *, - void (*)(xfs_trans_t *, void *), void *); void xfs_trans_mod_sb(xfs_trans_t *, uint, long); struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, int, uint); @@ -1010,7 +1008,6 @@ int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, xfs_ino_t , uint, uint, struct xfs_inode **); void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); -void xfs_trans_ihold_release(xfs_trans_t *, struct xfs_inode *); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index a9682b9510c..144da7a8546 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -976,6 +976,7 @@ xfs_trans_dquot_buf( ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); ASSERT(type == XFS_BLI_UDQUOT_BUF || + type == XFS_BLI_PDQUOT_BUF || type == XFS_BLI_GDQUOT_BUF); bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index e2c3706f453..7e7631ca497 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -253,24 +253,6 @@ xfs_trans_ihold( ip->i_itemp->ili_flags |= XFS_ILI_HOLD; } -/* - * Cancel the previous inode hold request made on this inode - * for this transaction. 
- */ -/*ARGSUSED*/ -void -xfs_trans_ihold_release( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - ASSERT(ip->i_transp == tp); - ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); - - ip->i_itemp->ili_flags &= ~XFS_ILI_HOLD; -} - /* * This is called to mark the fields indicated in fieldmask as needing diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index e4bf711e48f..16f5371ce10 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -55,7 +55,7 @@ typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; typedef enum { B_FALSE,B_TRUE } boolean_t; -typedef __int64_t prid_t; /* project ID */ +typedef __uint32_t prid_t; /* project ID */ typedef __uint32_t inst_t; /* an instruction */ typedef __s64 xfs_off_t; /* <file offset> type */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index d1f8146a06e..11351f08d43 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -428,7 +428,7 @@ xfs_truncate_file( if (ip->i_ino != mp->m_sb.sb_uquotino) ASSERT(ip->i_udquot); } - if (XFS_IS_GQUOTA_ON(mp)) { + if (XFS_IS_OQUOTA_ON(mp)) { if (ip->i_ino != mp->m_sb.sb_gquotino) ASSERT(ip->i_gdquot); } diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index b5373665010..42bcc021520 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -368,16 +368,6 @@ xfs_finish_flags( } /* - * disallow mount attempts with (IRIX) project quota enabled - */ - if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT)) { - cmn_err(CE_WARN, - "XFS: cannot mount a filesystem with IRIX project quota enabled"); - return XFS_ERROR(ENOSYS); - } - - /* * check for shared mount. 
*/ if (ap->flags & XFSMNT_SHARED) { @@ -622,7 +612,34 @@ out: return XFS_ERROR(error); } -#define REMOUNT_READONLY_FLAGS (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT) +STATIC int +xfs_quiesce_fs( + xfs_mount_t *mp) +{ + int count = 0, pincount; + + xfs_refcache_purge_mp(mp); + xfs_flush_buftarg(mp->m_ddev_targp, 0); + xfs_finish_reclaim_all(mp, 0); + + /* This loop must run at least twice. + * The first instance of the loop will flush + * most meta data but that will generate more + * meta data (typically directory updates). + * Which then must be flushed and logged before + * we can write the unmount record. + */ + do { + xfs_syncsub(mp, SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT, 0, NULL); + pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); + if (!pincount) { + delay(50); + count++; + } + } while (count < 2); + + return 0; +} STATIC int xfs_mntupdate( @@ -632,8 +649,7 @@ xfs_mntupdate( { struct vfs *vfsp = bhvtovfs(bdp); xfs_mount_t *mp = XFS_BHVTOM(bdp); - int pincount, error; - int count = 0; + int error; if (args->flags & XFSMNT_NOATIME) mp->m_flags |= XFS_MOUNT_NOATIME; @@ -645,25 +661,7 @@ xfs_mntupdate( } if (*flags & MS_RDONLY) { - xfs_refcache_purge_mp(mp); - xfs_flush_buftarg(mp->m_ddev_targp, 0); - xfs_finish_reclaim_all(mp, 0); - - /* This loop must run at least twice. - * The first instance of the loop will flush - * most meta data but that will generate more - * meta data (typically directory updates). - * Which then must be flushed and logged before - * we can write the unmount record. 
- */ - do { - VFS_SYNC(vfsp, REMOUNT_READONLY_FLAGS, NULL, error); - pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (!pincount) { - delay(50); - count++; - } - } while (count < 2); + xfs_quiesce_fs(mp); /* Ok now write out an unmount record */ xfs_log_unmount_write(mp); @@ -879,10 +877,12 @@ xfs_sync( int flags, cred_t *credp) { - xfs_mount_t *mp; + xfs_mount_t *mp = XFS_BHVTOM(bdp); - mp = XFS_BHVTOM(bdp); - return (xfs_syncsub(mp, flags, 0, NULL)); + if (unlikely(flags == SYNC_QUIESCE)) + return xfs_quiesce_fs(mp); + else + return xfs_syncsub(mp, flags, 0, NULL); } /* @@ -1681,7 +1681,7 @@ suffix_strtoul(const char *cp, char **endp, unsigned int base) return simple_strtoul(cp, endp, base) << shift_left_factor; } -int +STATIC int xfs_parseargs( struct bhv_desc *bhv, char *options, @@ -1867,7 +1867,7 @@ printk("XFS: irixsgid is now a sysctl(2) variable, option is deprecated.\n"); return 0; } -int +STATIC int xfs_showargs( struct bhv_desc *bhv, struct seq_file *m) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 25a526629b1..1377c868f3f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2005 Silicon Graphics, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as @@ -351,21 +351,28 @@ xfs_setattr( * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. 
*/ - if (XFS_IS_QUOTA_ON(mp) && (mask & (XFS_AT_UID|XFS_AT_GID))) { + if (XFS_IS_QUOTA_ON(mp) && + (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) { uint qflags = 0; - if (mask & XFS_AT_UID) { + if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) { uid = vap->va_uid; qflags |= XFS_QMOPT_UQUOTA; } else { uid = ip->i_d.di_uid; } - if (mask & XFS_AT_GID) { + if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) { gid = vap->va_gid; qflags |= XFS_QMOPT_GQUOTA; } else { gid = ip->i_d.di_gid; } + if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) { + projid = vap->va_projid; + qflags |= XFS_QMOPT_PQUOTA; + } else { + projid = ip->i_d.di_projid; + } /* * We take a reference when we initialize udqp and gdqp, * so it is important that we never blindly double trip on @@ -373,7 +380,8 @@ xfs_setattr( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - code = XFS_QM_DQVOPALLOC(mp, ip, uid,gid, qflags, &udqp, &gdqp); + code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags, + &udqp, &gdqp); if (code) return (code); } @@ -499,8 +507,6 @@ xfs_setattr( * that the group ID supplied to the chown() function * shall be equal to either the group ID or one of the * supplementary group IDs of the calling process. - * - * XXX: How does restricted_chown affect projid? */ if (restricted_chown && (iuid != uid || (igid != gid && @@ -510,10 +516,11 @@ xfs_setattr( goto error_return; } /* - * Do a quota reservation only if uid or gid is actually + * Do a quota reservation only if uid/projid/gid is actually * going to change. 
*/ if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || + (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) || (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { ASSERT(tp); code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, @@ -774,6 +781,7 @@ xfs_setattr( } if (igid != gid) { if (XFS_IS_GQUOTA_ON(mp)) { + ASSERT(!XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & XFS_AT_GID); ASSERT(gdqp); olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, @@ -782,6 +790,13 @@ xfs_setattr( ip->i_d.di_gid = gid; } if (iprojid != projid) { + if (XFS_IS_PQUOTA_ON(mp)) { + ASSERT(!XFS_IS_GQUOTA_ON(mp)); + ASSERT(mask & XFS_AT_PROJID); + ASSERT(gdqp); + olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, + &ip->i_gdquot, gdqp); + } ip->i_d.di_projid = projid; /* * We may have to rev the inode as well as @@ -843,6 +858,8 @@ xfs_setattr( di_flags |= XFS_DIFLAG_NOATIME; if (vap->va_xflags & XFS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; + if (vap->va_xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { if (vap->va_xflags & XFS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; @@ -1898,7 +1915,9 @@ xfs_create( /* Return through std_return after this point. */ udqp = gdqp = NULL; - if (vap->va_mask & XFS_AT_PROJID) + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = dp->i_d.di_projid; + else if (vap->va_mask & XFS_AT_PROJID) prid = (xfs_prid_t)vap->va_projid; else prid = (xfs_prid_t)dfltprid; @@ -1907,7 +1926,7 @@ xfs_create( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), + current_fsuid(credp), current_fsgid(credp), prid, XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; @@ -2604,17 +2623,7 @@ xfs_link( if (src_vp->v_type == VDIR) return XFS_ERROR(EPERM); - /* - * For now, manually find the XFS behavior descriptor for - * the source vnode. If it doesn't exist then something - * is wrong and we should just return an error. 
- * Eventually we need to figure out how link is going to - * work in the face of stacked vnodes. - */ src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops); - if (src_bdp == NULL) { - return XFS_ERROR(EXDEV); - } sip = XFS_BHVTOI(src_bdp); tdp = XFS_BHVTOI(target_dir_bdp); mp = tdp->i_mount; @@ -2681,6 +2690,17 @@ xfs_link( goto error_return; } + /* + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (tdp->i_d.di_projid != sip->i_d.di_projid))) { + error = XFS_ERROR(EPERM); + goto error_return; + } + if (resblks == 0 && (error = XFS_DIR_CANENTER(mp, tp, tdp, target_name, target_namelen))) @@ -2803,7 +2823,9 @@ xfs_mkdir( mp = dp->i_mount; udqp = gdqp = NULL; - if (vap->va_mask & XFS_AT_PROJID) + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = dp->i_d.di_projid; + else if (vap->va_mask & XFS_AT_PROJID) prid = (xfs_prid_t)vap->va_projid; else prid = (xfs_prid_t)dfltprid; @@ -2812,7 +2834,7 @@ xfs_mkdir( * Make sure that we have allocated dquot(s) on disk. */ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), + current_fsuid(credp), current_fsgid(credp), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; @@ -3357,7 +3379,9 @@ xfs_symlink( /* Return through std_return after this point. */ udqp = gdqp = NULL; - if (vap->va_mask & XFS_AT_PROJID) + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + prid = dp->i_d.di_projid; + else if (vap->va_mask & XFS_AT_PROJID) prid = (xfs_prid_t)vap->va_projid; else prid = (xfs_prid_t)dfltprid; @@ -3366,7 +3390,7 @@ xfs_symlink( * Make sure that we have allocated dquot(s) on disk. 
*/ error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), + current_fsuid(credp), current_fsgid(credp), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); if (error) goto std_return; @@ -4028,7 +4052,7 @@ xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock) * errno on error * */ -int +STATIC int xfs_alloc_file_space( xfs_inode_t *ip, xfs_off_t offset, @@ -4151,9 +4175,8 @@ retry: break; } xfs_ilock(ip, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, - ip->i_udquot, ip->i_gdquot, resblks, 0, rt ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + error = XFS_TRANS_RESERVE_QUOTA(mp, tp, + ip->i_udquot, ip->i_gdquot, resblks, 0, 0); if (error) goto error1; @@ -4305,6 +4328,7 @@ xfs_free_file_space( xfs_off_t len, int attr_flags) { + vnode_t *vp; int committed; int done; xfs_off_t end_dmi_offset; @@ -4325,9 +4349,11 @@ xfs_free_file_space( xfs_trans_t *tp; int need_iolock = 1; - vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); + vp = XFS_ITOV(ip); mp = ip->i_mount; + vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); + if ((error = XFS_QM_DQATTACH(mp, ip, 0))) return error; @@ -4344,7 +4370,7 @@ xfs_free_file_space( DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { if (end_dmi_offset > ip->i_d.di_size) end_dmi_offset = ip->i_d.di_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, offset, end_dmi_offset - offset, AT_DELAY_FLAG(attr_flags), NULL); if (error) @@ -4363,7 +4389,14 @@ xfs_free_file_space( ioffset = offset & ~(rounding - 1); if (ilen & (rounding - 1)) ilen = (ilen + rounding) & ~(rounding - 1); - xfs_inval_cached_pages(XFS_ITOV(ip), &(ip->i_iocore), ioffset, 0, 0); + + if (VN_CACHED(vp) != 0) { + xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1, + ctooff(offtoct(ioffset)), -1); + VOP_FLUSHINVAL_PAGES(vp, ctooff(offtoct(ioffset)), + -1, FI_REMAPF_LOCKED); + } + /* * Need to zero the stuff we're not freeing, 
on disk. * If its a realtime file & can't use unwritten extents then we |