diff options
Diffstat (limited to 'fs')
162 files changed, 10244 insertions, 2809 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index fa64867d6ed..599de54451a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -74,11 +74,11 @@ config EXT3_FS tristate "Ext3 journalling file system support" select JBD help - This is the journaling version of the Second extended file system + This is the journalling version of the Second extended file system (often called ext3), the de facto standard Linux file system (method to organize files on a storage device) for hard disks. - The journaling code included in this driver means you do not have + The journalling code included in this driver means you do not have to run e2fsck (file system checker) on your file systems after a crash. The journal keeps track of any changes that were being made at the time the system crashed, and can ensure that your file system @@ -143,7 +143,7 @@ config EXT3_FS_SECURITY config JBD tristate help - This is a generic journaling layer for block devices. It is + This is a generic journalling layer for block devices. It is currently used by the ext3 and OCFS2 file systems, but it could also be used to add journal support to other file systems or block devices such as RAID or LVM. @@ -183,7 +183,7 @@ config REISERFS_FS tristate "Reiserfs support" help Stores not just filenames but the files themselves in a balanced - tree. Uses journaling. + tree. Uses journalling. Balanced trees are more efficient than traditional file system architectural foundations. @@ -996,6 +996,18 @@ config AFFS_FS To compile this file system support as a module, choose M here: the module will be called affs. If unsure, say N. +config ECRYPT_FS + tristate "eCrypt filesystem layer support (EXPERIMENTAL)" + depends on EXPERIMENTAL && KEYS && CRYPTO + help + Encrypted filesystem that operates on the VFS layer. See + <file:Documentation/ecryptfs.txt> to learn more about + eCryptfs. Userspace components are required and can be + obtained from <http://ecryptfs.sf.net>. + + To compile this file system support as a module, choose M here: the + module will be called ecryptfs. + config HFS_FS tristate "Apple Macintosh file system support (EXPERIMENTAL)" depends on BLOCK && EXPERIMENTAL @@ -1033,7 +1045,7 @@ config BEFS_FS on files and directories, and database-like indeces on selected attributes. (Also note that this driver doesn't make those features available at this time). It is a 64 bit filesystem, so it supports - extremly large volumes and files. + extremely large volumes and files. If you use this filesystem, you should also say Y to at least one of the NLS (native language support) options below. @@ -1091,7 +1103,7 @@ config JFFS_FS tristate "Journalling Flash File System (JFFS) support" depends on MTD && BLOCK help - JFFS is the Journaling Flash File System developed by Axis + JFFS is the Journalling Flash File System developed by Axis Communications in Sweden, aimed at providing a crash/powerdown-safe file system for disk-less embedded devices. Further information is available at (<http://developer.axis.com/software/jffs/>). @@ -1261,7 +1273,7 @@ config JFFS2_CMODE_NONE config JFFS2_CMODE_PRIORITY bool "priority" help - Tries the compressors in a predefinied order and chooses the first + Tries the compressors in a predefined order and chooses the first successful one. config JFFS2_CMODE_SIZE @@ -1366,7 +1378,7 @@ config SYSV_FS If you have floppies or hard disk partitions like that, it is likely that they contain binaries from those other Unix systems; in order - to run these binaries, you will want to install linux-abi which is a + to run these binaries, you will want to install linux-abi which is a set of kernel modules that lets you run SCO, Xenix, Wyse, UnixWare, Dell Unix and System V programs under Linux. It is available via FTP (user: ftp) from @@ -1951,7 +1963,7 @@ config AFS_FS If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. - See <file:Documentation/filesystems/afs.txt> for more intormation. + See <file:Documentation/filesystems/afs.txt> for more information. If unsure, say N. diff --git a/fs/Makefile b/fs/Makefile index 215f7037817..df614eacee8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_BFS_FS) += bfs/ obj-$(CONFIG_ISO9660_FS) += isofs/ obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ obj-$(CONFIG_HFS_FS) += hfs/ +obj-$(CONFIG_ECRYPT_FS) += ecryptfs/ obj-$(CONFIG_VXFS_FS) += freevxfs/ obj-$(CONFIG_NFS_FS) += nfs/ obj-$(CONFIG_EXPORTFS) += exportfs/ diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 2fc99877cb0..cf8a2cb2850 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -30,7 +30,7 @@ static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir); static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd); static int afs_d_delete(struct dentry *dentry); static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, - loff_t fpos, ino_t ino, unsigned dtype); + loff_t fpos, u64 ino, unsigned dtype); const struct file_operations afs_dir_file_operations = { .open = afs_dir_open, @@ -409,7 +409,7 @@ static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir) * uniquifier through dtype */ static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, - loff_t fpos, ino_t ino, unsigned dtype) + loff_t fpos, u64 ino, unsigned dtype) { struct afs_dir_lookup_cookie *cookie = _cookie; @@ -675,7 +675,7 @@ static ssize_t aio_run_iocb(struct kiocb *iocb) } if (!(iocb->ki_retried & 0xff)) { - pr_debug("%ld retry: %d of %d\n", iocb->ki_retried, + pr_debug("%ld retry: %zd of %zd\n", iocb->ki_retried, iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes); } @@ -1008,7 +1008,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) pr_debug("added to ring %p at [%lu]\n", iocb, tail); - pr_debug("%ld retries: %d of %d\n", iocb->ki_retried, + pr_debug("%ld retries: %zd of %zd\n", iocb->ki_retried, iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes); put_rq: /* everything turned out well, dispose of the aiocb. */ diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h index 9095518e918..63ef1e18fb8 100644 --- a/fs/befs/befs_fs_types.h +++ b/fs/befs/befs_fs_types.h @@ -1,5 +1,5 @@ /* - * include/linux/befs_fs_types.h + * fs/befs/befs_fs_types.h * * Copyright (C) 2001 Will Dyson (will@cs.earlham.edu) * diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 32b5d625ce9..5bcdaaf4eae 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -29,6 +29,7 @@ #include <linux/personality.h> #include <linux/init.h> +#include <asm/a.out.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -194,6 +195,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) unsigned long som_entry; struct som_hdr *som_ex; struct som_exec_auxhdr *hpuxhdr; + struct files_struct *files; /* Get the exec-header */ som_ex = (struct som_hdr *) bprm->buf; @@ -208,15 +210,27 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) size = som_ex->aux_header_size; if (size > SOM_PAGESIZE) goto out; - hpuxhdr = (struct som_exec_auxhdr *) kmalloc(size, GFP_KERNEL); + hpuxhdr = kmalloc(size, GFP_KERNEL); if (!hpuxhdr) goto out; retval = kernel_read(bprm->file, som_ex->aux_header_location, (char *) hpuxhdr, size); + if (retval != size) { + if (retval >= 0) + retval = -EIO; + goto out_free; + } + + files = current->files; /* Refcounted so ok */ + retval = unshare_files(); if (retval < 0) goto out_free; -#error "Fix security hole before enabling me" + if (files == current->files) { + put_files_struct(files); + files = NULL; + } + retval = get_unused_fd(); if (retval < 0) goto out_free; diff --git a/fs/cifs/README b/fs/cifs/README index 5f0e1bd64fe..432e515431c 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -269,7 +269,7 @@ A partial list of the supported mount options follows: (gid) mount option is specified. For the uid (gid) of newly created files and directories, ie files created since the last mount of the server share, the expected uid - (gid) is cached as as long as the inode remains in + (gid) is cached as long as the inode remains in memory on the client. Also note that permission checks (authorization checks) on accesses to a file occur at the server, but there are cases in which an administrator @@ -375,7 +375,7 @@ A partial list of the supported mount options follows: the local process on newly created files, directories, and devices (create, mkdir, mknod). If the CIFS Unix Extensions are not negotiated, for newly created files and directories - instead of using the default uid and gid specified on the + instead of using the default uid and gid specified on the mount, cache the new file's uid and gid locally which means that the uid for the file can change when the inode is reloaded (or the user remounts the share). @@ -440,7 +440,7 @@ A partial list of the supported mount options follows: create device files and fifos in a format compatible with Services for Unix (SFU). In addition retrieve bits 10-12 of the mode via the SETFILEBITS extended attribute (as - SFU does). In the future the bottom 9 bits of the mode + SFU does). In the future the bottom 9 bits of the mode also will be emulated using queries of the security descriptor (ACL). sign Must use packet signing (helps avoid unwanted data modification diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0e9ba0b9d71..c78762051da 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -772,12 +772,12 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol) separator[1] = 0; memset(vol->source_rfc1001_name,0x20,15); - for(i=0;i < strnlen(system_utsname.nodename,15);i++) { + for(i=0;i < strnlen(utsname()->nodename,15);i++) { /* does not have to be a perfect mapping since the field is informational, only used for servers that do not support port 445 and it can be overridden at mount time */ vol->source_rfc1001_name[i] = - toupper(system_utsname.nodename[i]); + toupper(utsname()->nodename[i]); } vol->source_rfc1001_name[15] = 0; /* null target name indicates to use *SMBSERVR default called name @@ -2153,7 +2153,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; @@ -2180,8 +2180,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, } strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; } @@ -2445,7 +2445,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; /* null terminate Linux version */ @@ -2462,8 +2462,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, } else { /* ASCII */ strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; bcc_ptr++; /* empty domain field */ @@ -2836,7 +2836,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; /* null term version string */ @@ -2888,8 +2888,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; bcc_ptr++; /* null domain */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d1705ab8136..22b4c35dcfe 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -111,7 +111,7 @@ static void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses, bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, nls_cp); bcc_ptr += 2 * bytes_ret; - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release, 32, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* trailing null */ @@ -158,8 +158,8 @@ static void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses, strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; diff --git a/fs/compat.c b/fs/compat.c index 13fb08d096c..4d3fbcb2ddb 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -56,8 +56,6 @@ int compat_log = 1; -extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); - int compat_printk(const char *fmt, ...) { va_list ap; @@ -916,20 +914,24 @@ struct compat_readdir_callback { }; static int compat_fillonedir(void *__buf, const char *name, int namlen, - loff_t offset, ino_t ino, unsigned int d_type) + loff_t offset, u64 ino, unsigned int d_type) { struct compat_readdir_callback *buf = __buf; struct compat_old_linux_dirent __user *dirent; + compat_ulong_t d_ino; if (buf->result) return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + return -EOVERFLOW; buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(ino, &dirent->d_ino) || + if ( __put_user(d_ino, &dirent->d_ino) || __put_user(offset, &dirent->d_offset) || __put_user(namlen, &dirent->d_namlen) || __copy_to_user(dirent->d_name, name, namlen) || @@ -980,22 +982,26 @@ struct compat_getdents_callback { }; static int compat_filldir(void *__buf, const char *name, int namlen, - loff_t offset, ino_t ino, unsigned int d_type) + loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user * dirent; struct compat_getdents_callback *buf = __buf; + compat_ulong_t d_ino; int reclen = COMPAT_ROUND_UP(NAME_OFFSET(dirent) + namlen + 2); buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + return -EOVERFLOW; dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) goto efault; } dirent = buf->current_dir; - if (__put_user(ino, &dirent->d_ino)) + if (__put_user(d_ino, &dirent->d_ino)) goto efault; if (__put_user(reclen, &dirent->d_reclen)) goto efault; @@ -1066,7 +1072,7 @@ struct compat_getdents_callback64 { }; static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset, - ino_t ino, unsigned int d_type) + u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent; struct compat_getdents_callback64 *buf = __buf; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 85105e50f7d..e6d5754a715 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -137,8 +137,8 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp if ((retval = fill_read_buffer(file->f_dentry,buffer))) goto out; } - pr_debug("%s: count = %d, ppos = %lld, buf = %s\n", - __FUNCTION__,count,*ppos,buffer->page); + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", + __FUNCTION__, count, *ppos, buffer->page); retval = flush_read_buffer(buffer,buf,count,ppos); out: up(&buffer->sem); diff --git a/fs/dcache.c b/fs/dcache.c index fc2faa44f8d..2355bddad8d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -291,9 +291,9 @@ struct dentry * dget_locked(struct dentry *dentry) * it can be unhashed only if it has no children, or if it is the root * of a filesystem. * - * If the inode has a DCACHE_DISCONNECTED alias, then prefer + * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one unless @want_discon is set, - * in which case only return a DCACHE_DISCONNECTED alias. + * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias. */ static struct dentry * __d_find_alias(struct inode *inode, int want_discon) @@ -309,7 +309,8 @@ static struct dentry * __d_find_alias(struct inode *inode, int want_discon) prefetch(next); alias = list_entry(tmp, struct dentry, d_alias); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { - if (alias->d_flags & DCACHE_DISCONNECTED) + if (IS_ROOT(alias) && + (alias->d_flags & DCACHE_DISCONNECTED)) discon_alias = alias; else if (!want_discon) { __dget_locked(alias); @@ -1004,7 +1005,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { struct dentry *new = NULL; - if (inode) { + if (inode && S_ISDIR(inode->i_mode)) { spin_lock(&dcache_lock); new = __d_find_alias(inode, 1); if (new) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index ecf3da9edf2..e77676df671 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -252,7 +252,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); * * This function removes a file or directory in debugfs that was previously * created with a call to another debugfs function (like - * debufs_create_file() or variants thereof.) + * debugfs_create_file() or variants thereof.) * * This function is required to be called in order for the file to be * removed, no automatic cleanup of files will happen when a module is diff --git a/fs/dnotify.c b/fs/dnotify.c index f932591df5a..2b0442db67e 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -92,7 +92,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) prev = &odn->dn_next; } - error = f_setown(filp, current->pid, 0); + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); if (error) goto out_free; diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile new file mode 100644 index 00000000000..ca6562451ee --- /dev/null +++ b/fs/ecryptfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux 2.6 eCryptfs +# + +obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o + +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o crypto.o keystore.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c new file mode 100644 index 00000000000..ed35a9712fa --- /dev/null +++ b/fs/ecryptfs/crypto.c @@ -0,0 +1,1659 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * Michael C. Thompson <mcthomps@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/random.h> +#include <linux/compiler.h> +#include <linux/key.h> +#include <linux/namei.h> +#include <linux/crypto.h> +#include <linux/file.h> +#include <linux/scatterlist.h> +#include "ecryptfs_kernel.h" + +static int +ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv); +static int +ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv); + +/** + * ecryptfs_to_hex + * @dst: Buffer to take hex character representation of contents of + * src; must be at least of size (src_size * 2) + * @src: Buffer to be converted to a hex string respresentation + * @src_size: number of bytes to convert + */ +void ecryptfs_to_hex(char *dst, char *src, size_t src_size) +{ + int x; + + for (x = 0; x < src_size; x++) + sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]); +} + +/** + * ecryptfs_from_hex + * @dst: Buffer to take the bytes from src hex; must be at least of + * size (src_size / 2) + * @src: Buffer to be converted from a hex string respresentation to raw value + * @dst_size: size of dst buffer, or number of hex characters pairs to convert + */ +void ecryptfs_from_hex(char *dst, char *src, int dst_size) +{ + int x; + char tmp[3] = { 0, }; + + for (x = 0; x < dst_size; x++) { + tmp[0] = src[x * 2]; + tmp[1] = src[x * 2 + 1]; + dst[x] = (unsigned char)simple_strtol(tmp, NULL, 16); + } +} + +/** + * ecryptfs_calculate_md5 - calculates the md5 of @src + * @dst: Pointer to 16 bytes of allocated memory + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @src: Data to be md5'd + * @len: Length of @src + * + * Uses the allocated crypto context that crypt_stat references to + * generate the MD5 sum of the contents of src. + */ +static int ecryptfs_calculate_md5(char *dst, + struct ecryptfs_crypt_stat *crypt_stat, + char *src, int len) +{ + int rc = 0; + struct scatterlist sg; + + mutex_lock(&crypt_stat->cs_md5_tfm_mutex); + sg_init_one(&sg, (u8 *)src, len); + if (!crypt_stat->md5_tfm) { + crypt_stat->md5_tfm = + crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP); + if (!crypt_stat->md5_tfm) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error attempting to " + "allocate crypto context\n"); + goto out; + } + } + crypto_digest_init(crypt_stat->md5_tfm); + crypto_digest_update(crypt_stat->md5_tfm, &sg, 1); + crypto_digest_final(crypt_stat->md5_tfm, dst); + mutex_unlock(&crypt_stat->cs_md5_tfm_mutex); +out: + return rc; +} + +/** + * ecryptfs_derive_iv + * @iv: destination for the derived iv vale + * @crypt_stat: Pointer to crypt_stat struct for the current inode + * @offset: Offset of the page whose's iv we are to derive + * + * Generate the initialization vector from the given root IV and page + * offset. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + pgoff_t offset) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + char src[ECRYPTFS_MAX_IV_BYTES + 16]; + + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "root iv:\n"); + ecryptfs_dump_hex(crypt_stat->root_iv, crypt_stat->iv_bytes); + } + /* TODO: It is probably secure to just cast the least + * significant bits of the root IV into an unsigned long and + * add the offset to that rather than go through all this + * hashing business. -Halcrow */ + memcpy(src, crypt_stat->root_iv, crypt_stat->iv_bytes); + memset((src + crypt_stat->iv_bytes), 0, 16); + snprintf((src + crypt_stat->iv_bytes), 16, "%ld", offset); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "source:\n"); + ecryptfs_dump_hex(src, (crypt_stat->iv_bytes + 16)); + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, src, + (crypt_stat->iv_bytes + 16)); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating IV for a page\n"); + goto out; + } + memcpy(iv, dst, crypt_stat->iv_bytes); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "derived iv:\n"); + ecryptfs_dump_hex(iv, crypt_stat->iv_bytes); + } +out: + return rc; +} + +/** + * ecryptfs_init_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Initialize the crypt_stat structure. + */ +void +ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + memset((void *)crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); + mutex_init(&crypt_stat->cs_mutex); + mutex_init(&crypt_stat->cs_tfm_mutex); + mutex_init(&crypt_stat->cs_md5_tfm_mutex); + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_STRUCT_INITIALIZED); +} + +/** + * ecryptfs_destruct_crypt_stat + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * + * Releases all memory associated with a crypt_stat struct. + */ +void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->tfm) + crypto_free_tfm(crypt_stat->tfm); + if (crypt_stat->md5_tfm) + crypto_free_tfm(crypt_stat->md5_tfm); + memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat)); +} + +void ecryptfs_destruct_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + if (mount_crypt_stat->global_auth_tok_key) + key_put(mount_crypt_stat->global_auth_tok_key); + if (mount_crypt_stat->global_key_tfm) + crypto_free_tfm(mount_crypt_stat->global_key_tfm); + memset(mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); +} + +/** + * virt_to_scatterlist + * @addr: Virtual address + * @size: Size of data; should be an even multiple of the block size + * @sg: Pointer to scatterlist array; set to NULL to obtain only + * the number of scatterlist structs required in array + * @sg_size: Max array size + * + * Fills in a scatterlist array with page references for a passed + * virtual address. + * + * Returns the number of scatterlist structs in array used + */ +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size) +{ + int i = 0; + struct page *pg; + int offset; + int remainder_of_page; + + while (size > 0 && i < sg_size) { + pg = virt_to_page(addr); + offset = offset_in_page(addr); + if (sg) { + sg[i].page = pg; + sg[i].offset = offset; + } + remainder_of_page = PAGE_CACHE_SIZE - offset; + if (size >= remainder_of_page) { + if (sg) + sg[i].length = remainder_of_page; + addr += remainder_of_page; + size -= remainder_of_page; + } else { + if (sg) + sg[i].length = size; + addr += size; + size = 0; + } + i++; + } + if (size > 0) + return -ENOMEM; + return i; +} + +/** + * encrypt_scatterlist + * @crypt_stat: Pointer to the crypt_stat struct to initialize. + * @dest_sg: Destination of encrypted data + * @src_sg: Data to be encrypted + * @size: Length of data to be encrypted + * @iv: iv to use during encryption + * + * Returns the number of bytes encrypted; negative value on error + */ +static int encrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dest_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv) +{ + int rc = 0; + + BUG_ON(!crypt_stat || !crypt_stat->tfm + || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, + ECRYPTFS_STRUCT_INITIALIZED)); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Key size [%d]; key:\n", + crypt_stat->key_size); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } + /* Consider doing this once, when the file is opened */ + mutex_lock(&crypt_stat->cs_tfm_mutex); + rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes.\n", size); + crypto_cipher_encrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size, iv); + mutex_unlock(&crypt_stat->cs_tfm_mutex); +out: + return rc; +} + +static void +ecryptfs_extent_to_lwr_pg_idx_and_offset(unsigned long *lower_page_idx, + int *byte_offset, + struct ecryptfs_crypt_stat *crypt_stat, + unsigned long extent_num) +{ + unsigned long lower_extent_num; + int extents_occupied_by_headers_at_front; + int bytes_occupied_by_headers_at_front; + int extent_offset; + int extents_per_page; + + bytes_occupied_by_headers_at_front = + ( crypt_stat->header_extent_size + * crypt_stat->num_header_extents_at_front ); + extents_occupied_by_headers_at_front = + ( bytes_occupied_by_headers_at_front + / crypt_stat->extent_size ); + lower_extent_num = extents_occupied_by_headers_at_front + extent_num; + extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size; + (*lower_page_idx) = lower_extent_num / extents_per_page; + extent_offset = lower_extent_num % extents_per_page; + (*byte_offset) = extent_offset * crypt_stat->extent_size; + ecryptfs_printk(KERN_DEBUG, " * crypt_stat->header_extent_size = " + "[%d]\n", crypt_stat->header_extent_size); + ecryptfs_printk(KERN_DEBUG, " * crypt_stat->" + "num_header_extents_at_front = [%d]\n", + crypt_stat->num_header_extents_at_front); + ecryptfs_printk(KERN_DEBUG, " * extents_occupied_by_headers_at_" + "front = [%d]\n", extents_occupied_by_headers_at_front); + ecryptfs_printk(KERN_DEBUG, " * lower_extent_num = [0x%.16x]\n", + lower_extent_num); + ecryptfs_printk(KERN_DEBUG, " * extents_per_page = [%d]\n", + extents_per_page); + ecryptfs_printk(KERN_DEBUG, " * (*lower_page_idx) = [0x%.16x]\n", + (*lower_page_idx)); + ecryptfs_printk(KERN_DEBUG, " * extent_offset = [%d]\n", + extent_offset); + ecryptfs_printk(KERN_DEBUG, " * (*byte_offset) = [%d]\n", + (*byte_offset)); +} + +static int ecryptfs_write_out_page(struct ecryptfs_page_crypt_context *ctx, + struct page *lower_page, + struct inode *lower_inode, + int byte_offset_in_page, int bytes_to_write) +{ + int rc = 0; + + if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) { + rc = ecryptfs_commit_lower_page(lower_page, lower_inode, + ctx->param.lower_file, + byte_offset_in_page, + bytes_to_write); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error calling lower " + "commit; rc = [%d]\n", rc); + goto out; + } + } else { + rc = ecryptfs_writepage_and_release_lower_page(lower_page, + lower_inode, + ctx->param.wbc); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error calling lower " + "writepage(); rc = [%d]\n", rc); + goto out; + } + } +out: + return rc; +} + +static int ecryptfs_read_in_page(struct ecryptfs_page_crypt_context *ctx, + struct page **lower_page, + struct inode *lower_inode, + unsigned long lower_page_idx, + int byte_offset_in_page) +{ + int rc = 0; + + if (ctx->mode == ECRYPTFS_PREPARE_COMMIT_MODE) { + /* TODO: Limit this to only the data extents that are + * needed */ + rc = ecryptfs_get_lower_page(lower_page, lower_inode, + ctx->param.lower_file, + lower_page_idx, + byte_offset_in_page, + (PAGE_CACHE_SIZE + - byte_offset_in_page)); + if (rc) { + ecryptfs_printk( + KERN_ERR, "Error attempting to grab, map, " + "and prepare_write lower page with index " + "[0x%.16x]; rc = [%d]\n", lower_page_idx, rc); + goto out; + } + } else { + rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, + lower_inode, + lower_page_idx); + if (rc) { + ecryptfs_printk( + KERN_ERR, "Error attempting to grab and map " + "lower page with index [0x%.16x]; rc = [%d]\n", + lower_page_idx, rc); + goto out; + } + } +out: + return rc; +} + +/** + * ecryptfs_encrypt_page + * @ctx: The context of the page + * + * Encrypt an eCryptfs page. This is done on a per-extent basis. Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * The actual operations performed on each page depends on the + * contents of the ecryptfs_page_crypt_context struct. + * + * Returns zero on success; negative on error + */ +int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx) +{ + char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + unsigned long base_extent; + unsigned long extent_offset = 0; + unsigned long lower_page_idx = 0; + unsigned long prior_lower_page_idx = 0; + struct page *lower_page; + struct inode *lower_inode; + struct ecryptfs_inode_info *inode_info; + struct ecryptfs_crypt_stat *crypt_stat; + int rc = 0; + int lower_byte_offset = 0; + int orig_byte_offset = 0; + int num_extents_per_page; +#define ECRYPTFS_PAGE_STATE_UNREAD 0 +#define ECRYPTFS_PAGE_STATE_READ 1 +#define ECRYPTFS_PAGE_STATE_MODIFIED 2 +#define ECRYPTFS_PAGE_STATE_WRITTEN 3 + int page_state; + + lower_inode = ecryptfs_inode_to_lower(ctx->page->mapping->host); + inode_info = ecryptfs_inode_to_private(ctx->page->mapping->host); + crypt_stat = &inode_info->crypt_stat; + if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_copy_page_to_lower(ctx->page, lower_inode, + ctx->param.lower_file); + if (rc) + ecryptfs_printk(KERN_ERR, "Error attempting to copy " + "page at index [0x%.16x]\n", + ctx->page->index); + goto out; + } + num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size; + base_extent = (ctx->page->index * num_extents_per_page); + page_state = ECRYPTFS_PAGE_STATE_UNREAD; + while (extent_offset < num_extents_per_page) { + ecryptfs_extent_to_lwr_pg_idx_and_offset( + &lower_page_idx, &lower_byte_offset, crypt_stat, + (base_extent + extent_offset)); + if (prior_lower_page_idx != lower_page_idx + && page_state == ECRYPTFS_PAGE_STATE_MODIFIED) { + rc = ecryptfs_write_out_page(ctx, lower_page, + lower_inode, + orig_byte_offset, + (PAGE_CACHE_SIZE + - orig_byte_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting " + "to write out page; rc = [%d]" + "\n", rc); + goto out; + } + page_state = ECRYPTFS_PAGE_STATE_WRITTEN; + } + if (page_state == ECRYPTFS_PAGE_STATE_UNREAD + || page_state == ECRYPTFS_PAGE_STATE_WRITTEN) { + rc = ecryptfs_read_in_page(ctx, &lower_page, + lower_inode, lower_page_idx, + lower_byte_offset); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting " + "to read in lower page with " + "index [0x%.16x]; rc = [%d]\n", + lower_page_idx, rc); + goto out; + } + orig_byte_offset = lower_byte_offset; + prior_lower_page_idx = lower_page_idx; + page_state = ECRYPTFS_PAGE_STATE_READ; + } + BUG_ON(!(page_state == ECRYPTFS_PAGE_STATE_MODIFIED + || page_state == ECRYPTFS_PAGE_STATE_READ)); + rc = ecryptfs_derive_iv(extent_iv, crypt_stat, + (base_extent + extent_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to " + "derive IV for extent [0x%.16x]; " + "rc = [%d]\n", + (base_extent + extent_offset), rc); + goto out; + } + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Encrypting extent " + "with iv:\n"); + ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); + ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " + "encryption:\n"); + ecryptfs_dump_hex((char *) + (page_address(ctx->page) + + (extent_offset + * crypt_stat->extent_size)), 8); + } + rc = ecryptfs_encrypt_page_offset( + crypt_stat, lower_page, lower_byte_offset, ctx->page, + (extent_offset * crypt_stat->extent_size), + crypt_stat->extent_size, extent_iv); + ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16x]; " + "rc = [%d]\n", + (base_extent + extent_offset), rc); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " + "encryption:\n"); + ecryptfs_dump_hex((char *)(page_address(lower_page) + + lower_byte_offset), 8); + } + page_state = ECRYPTFS_PAGE_STATE_MODIFIED; + extent_offset++; + } + BUG_ON(orig_byte_offset != 0); + rc = ecryptfs_write_out_page(ctx, lower_page, lower_inode, 0, + (lower_byte_offset + + crypt_stat->extent_size)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to write out " + "page; rc = [%d]\n", rc); + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_decrypt_page + * @file: The ecryptfs file + * @page: The page in ecryptfs to decrypt + * + * Decrypt an eCryptfs page. This is done on a per-extent basis. Note + * that eCryptfs pages may straddle the lower pages -- for instance, + * if the file was created on a machine with an 8K page size + * (resulting in an 8K header), and then the file is copied onto a + * host with a 32K page size, then when reading page 0 of the eCryptfs + * file, 24K of page 0 of the lower file will be read and decrypted, + * and then 8K of page 1 of the lower file will be read and decrypted. + * + * Returns zero on success; negative on error + */ +int ecryptfs_decrypt_page(struct file *file, struct page *page) +{ + char extent_iv[ECRYPTFS_MAX_IV_BYTES]; + unsigned long base_extent; + unsigned long extent_offset = 0; + unsigned long lower_page_idx = 0; + unsigned long prior_lower_page_idx = 0; + struct page *lower_page; + char *lower_page_virt = NULL; + struct inode *lower_inode; + struct ecryptfs_crypt_stat *crypt_stat; + int rc = 0; + int byte_offset; + int num_extents_per_page; + int page_state; + + crypt_stat = &(ecryptfs_inode_to_private( + page->mapping->host)->crypt_stat); + lower_inode = ecryptfs_inode_to_lower(page->mapping->host); + if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_do_readpage(file, page, page->index); + if (rc) + ecryptfs_printk(KERN_ERR, "Error attempting to copy " + "page at index [0x%.16x]\n", + page->index); + goto out; + } + num_extents_per_page = PAGE_CACHE_SIZE / crypt_stat->extent_size; + base_extent = (page->index * num_extents_per_page); + lower_page_virt = kmem_cache_alloc(ecryptfs_lower_page_cache, + SLAB_KERNEL); + if (!lower_page_virt) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error getting page for encrypted " + "lower page(s)\n"); + goto out; + } + lower_page = virt_to_page(lower_page_virt); + page_state = ECRYPTFS_PAGE_STATE_UNREAD; + while (extent_offset < num_extents_per_page) { + ecryptfs_extent_to_lwr_pg_idx_and_offset( + &lower_page_idx, &byte_offset, crypt_stat, + (base_extent + extent_offset)); + if (prior_lower_page_idx != lower_page_idx + || page_state == ECRYPTFS_PAGE_STATE_UNREAD) { + rc = ecryptfs_do_readpage(file, lower_page, + lower_page_idx); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error reading " + "lower encrypted page; rc = " + "[%d]\n", rc); + goto out; + } + prior_lower_page_idx = lower_page_idx; + page_state = ECRYPTFS_PAGE_STATE_READ; + } + rc = ecryptfs_derive_iv(extent_iv, crypt_stat, + (base_extent + extent_offset)); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to " + "derive IV for extent [0x%.16x]; rc = " + "[%d]\n", + (base_extent + extent_offset), rc); + goto out; + } + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Decrypting extent " + "with iv:\n"); + ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes); + ecryptfs_printk(KERN_DEBUG, "First 8 bytes before " + "decryption:\n"); + ecryptfs_dump_hex((lower_page_virt + byte_offset), 8); + } + rc = ecryptfs_decrypt_page_offset(crypt_stat, page, + (extent_offset + * crypt_stat->extent_size), + lower_page, byte_offset, + crypt_stat->extent_size, + extent_iv); + if (rc != crypt_stat->extent_size) { + ecryptfs_printk(KERN_ERR, "Error attempting to " + "decrypt extent [0x%.16x]\n", + (base_extent + extent_offset)); + goto out; + } + rc = 0; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "First 8 bytes after " + "decryption:\n"); + ecryptfs_dump_hex((char *)(page_address(page) + + byte_offset), 8); + } + extent_offset++; + } +out: + if (lower_page_virt) + kmem_cache_free(ecryptfs_lower_page_cache, lower_page_virt); + return rc; +} + +/** + * decrypt_scatterlist + * + * Returns the number of bytes decrypted; negative value on error + */ +static int decrypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, + struct scatterlist *dest_sg, + struct scatterlist *src_sg, int size, + unsigned char *iv) +{ + int rc = 0; + + /* Consider doing this once, when the file is opened */ + mutex_lock(&crypt_stat->cs_tfm_mutex); + rc = crypto_cipher_setkey(crypt_stat->tfm, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error setting key; rc = [%d]\n", + rc); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + rc = -EINVAL; + goto out; + } + ecryptfs_printk(KERN_DEBUG, "Decrypting [%d] bytes.\n", size); + rc = crypto_cipher_decrypt_iv(crypt_stat->tfm, dest_sg, src_sg, size, + iv); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting; rc = [%d]\n", + rc); + goto out; + } + rc = size; +out: + return rc; +} + +/** + * ecryptfs_encrypt_page_offset + * + * Returns the number of bytes encrypted + */ +static int +ecryptfs_encrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv) +{ + struct scatterlist src_sg, dst_sg; + + src_sg.page = src_page; + src_sg.offset = src_offset; + src_sg.length = size; + dst_sg.page = dst_page; + dst_sg.offset = dst_offset; + dst_sg.length = size; + return encrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); +} + +/** + * ecryptfs_decrypt_page_offset + * + * Returns the number of bytes decrypted + */ +static int +ecryptfs_decrypt_page_offset(struct ecryptfs_crypt_stat *crypt_stat, + struct page *dst_page, int dst_offset, + struct page *src_page, int src_offset, int size, + unsigned char *iv) +{ + struct scatterlist src_sg, dst_sg; + + src_sg.page = src_page; + src_sg.offset = src_offset; + src_sg.length = size; + dst_sg.page = dst_page; + dst_sg.offset = dst_offset; + dst_sg.length = size; + return decrypt_scatterlist(crypt_stat, &dst_sg, &src_sg, size, iv); +} + +#define ECRYPTFS_MAX_SCATTERLIST_LEN 4 + +/** + * ecryptfs_init_crypt_ctx + * @crypt_stat: Uninitilized crypt stats structure + * + * Initialize the crypto context. + * + * TODO: Performance: Keep a cache of initialized cipher contexts; + * only init if needed + */ +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat) +{ + int rc = -EINVAL; + + if (!crypt_stat->cipher) { + ecryptfs_printk(KERN_ERR, "No cipher specified\n"); + goto out; + } + ecryptfs_printk(KERN_DEBUG, + "Initializing cipher [%s]; strlen = [%d]; " + "key_size_bits = [%d]\n", + crypt_stat->cipher, (int)strlen(crypt_stat->cipher), + crypt_stat->key_size << 3); + if (crypt_stat->tfm) { + rc = 0; + goto out; + } + mutex_lock(&crypt_stat->cs_tfm_mutex); + crypt_stat->tfm = crypto_alloc_tfm(crypt_stat->cipher, + ECRYPTFS_DEFAULT_CHAINING_MODE + | CRYPTO_TFM_REQ_WEAK_KEY); + mutex_unlock(&crypt_stat->cs_tfm_mutex); + if (!crypt_stat->tfm) { + ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): " + "Error initializing cipher [%s]\n", + crypt_stat->cipher); + goto out; + } + rc = 0; +out: + return rc; +} + +static void set_extent_mask_and_shift(struct ecryptfs_crypt_stat *crypt_stat) +{ + int extent_size_tmp; + + crypt_stat->extent_mask = 0xFFFFFFFF; + crypt_stat->extent_shift = 0; + if (crypt_stat->extent_size == 0) + return; + extent_size_tmp = crypt_stat->extent_size; + while ((extent_size_tmp & 0x01) == 0) { + extent_size_tmp >>= 1; + crypt_stat->extent_mask <<= 1; + crypt_stat->extent_shift++; + } +} + +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) +{ + /* Default values; may be overwritten as we are parsing the + * packets. */ + crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; + set_extent_mask_and_shift(crypt_stat); + crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; + if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) { + crypt_stat->header_extent_size = + ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + } else + crypt_stat->header_extent_size = PAGE_CACHE_SIZE; + crypt_stat->num_header_extents_at_front = 1; +} + +/** + * ecryptfs_compute_root_iv + * @crypt_stats + * + * On error, sets the root IV to all 0's. + */ +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat) +{ + int rc = 0; + char dst[MD5_DIGEST_SIZE]; + + BUG_ON(crypt_stat->iv_bytes > MD5_DIGEST_SIZE); + BUG_ON(crypt_stat->iv_bytes <= 0); + if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID)) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Session key not valid; " + "cannot generate root IV\n"); + goto out; + } + rc = ecryptfs_calculate_md5(dst, crypt_stat, crypt_stat->key, + crypt_stat->key_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to compute " + "MD5 while generating root IV\n"); + goto out; + } + memcpy(crypt_stat->root_iv, dst, crypt_stat->iv_bytes); +out: + if (rc) { + memset(crypt_stat->root_iv, 0, crypt_stat->iv_bytes); + ECRYPTFS_SET_FLAG(crypt_stat->flags, + ECRYPTFS_SECURITY_WARNING); + } + return rc; +} + +static void ecryptfs_generate_new_key(struct ecryptfs_crypt_stat *crypt_stat) +{ + get_random_bytes(crypt_stat->key, crypt_stat->key_size); + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + ecryptfs_compute_root_iv(crypt_stat); + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Generated new session key:\n"); + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + } +} + +/** + * ecryptfs_set_default_crypt_stat_vals + * @crypt_stat + * + * Default values in the event that policy does not override them. + */ +static void ecryptfs_set_default_crypt_stat_vals( + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) +{ + ecryptfs_set_default_sizes(crypt_stat); + strcpy(crypt_stat->cipher, ECRYPTFS_DEFAULT_CIPHER); + crypt_stat->key_size = ECRYPTFS_DEFAULT_KEY_BYTES; + ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + crypt_stat->file_version = ECRYPTFS_FILE_VERSION; + crypt_stat->mount_crypt_stat = mount_crypt_stat; +} + +/** + * ecryptfs_new_file_context + * @ecryptfs_dentry + * + * If the crypto context for the file has not yet been established, + * this is where we do that. Establishing a new crypto context + * involves the following decisions: + * - What cipher to use? + * - What set of authentication tokens to use? + * Here we just worry about getting enough information into the + * authentication tokens so that we know that they are available. + * We associate the available authentication tokens with the new file + * via the set of signatures in the crypt_stat struct. Later, when + * the headers are actually written out, we may again defer to + * userspace to perform the encryption of the session key; for the + * foreseeable future, this will be the case with public key packets. + * + * Returns zero on success; non-zero otherwise + */ +/* Associate an authentication token(s) with the file */ +int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +{ + int rc = 0; + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + int cipher_name_len; + + ecryptfs_set_default_crypt_stat_vals(crypt_stat, mount_crypt_stat); + /* See if there are mount crypt options */ + if (mount_crypt_stat->global_auth_tok) { + ecryptfs_printk(KERN_DEBUG, "Initializing context for new " + "file using mount_crypt_stat\n"); + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + memcpy(crypt_stat->keysigs[crypt_stat->num_keysigs++], + mount_crypt_stat->global_auth_tok_sig, + ECRYPTFS_SIG_SIZE_HEX); + cipher_name_len = + strlen(mount_crypt_stat->global_default_cipher_name); + memcpy(crypt_stat->cipher, + mount_crypt_stat->global_default_cipher_name, + cipher_name_len); + crypt_stat->cipher[cipher_name_len] = '\0'; + crypt_stat->key_size = + mount_crypt_stat->global_default_cipher_key_size; + ecryptfs_generate_new_key(crypt_stat); + } else + /* We should not encounter this scenario since we + * should detect lack of global_auth_tok at mount time + * TODO: Applies to 0.1 release only; remove in future + * release */ + BUG(); + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) + ecryptfs_printk(KERN_ERR, "Error initializing cryptographic " + "context for cipher [%s]: rc = [%d]\n", + crypt_stat->cipher, rc); + return rc; +} + +/** + * contains_ecryptfs_marker - check for the ecryptfs marker + * @data: The data block in which to check + * + * Returns one if marker found; zero if not found + */ +int contains_ecryptfs_marker(char *data) +{ + u32 m_1, m_2; + + memcpy(&m_1, data, 4); + m_1 = be32_to_cpu(m_1); + memcpy(&m_2, (data + 4), 4); + m_2 = be32_to_cpu(m_2); + if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) + return 1; + ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " + "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2, + MAGIC_ECRYPTFS_MARKER); + ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = " + "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER)); + return 0; +} + +struct ecryptfs_flag_map_elem { + u32 file_flag; + u32 local_flag; +}; + +/* Add support for additional flags by adding elements here. */ +static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { + {0x00000001, ECRYPTFS_ENABLE_HMAC}, + {0x00000002, ECRYPTFS_ENCRYPTED} +}; + +/** + * ecryptfs_process_flags + * @crypt_stat + * @page_virt: Source data to be parsed + * @bytes_read: Updated with the number of bytes read + * + * Returns zero on success; non-zero if the flag set is invalid + */ +static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, + char *page_virt, int *bytes_read) +{ + int rc = 0; + int i; + u32 flags; + + memcpy(&flags, page_virt, 4); + flags = be32_to_cpu(flags); + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (flags & ecryptfs_flag_map[i].file_flag) { + ECRYPTFS_SET_FLAG(crypt_stat->flags, + ecryptfs_flag_map[i].local_flag); + } else + ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, + ecryptfs_flag_map[i].local_flag); + /* Version is in top 8 bits of the 32-bit flag vector */ + crypt_stat->file_version = ((flags >> 24) & 0xFF); + (*bytes_read) = 4; + return rc; +} + +/** + * write_ecryptfs_marker + * @page_virt: The pointer to in a page to begin writing the marker + * @written: Number of bytes written + * + * Marker = 0x3c81b7f5 + */ +static void write_ecryptfs_marker(char *page_virt, size_t *written) +{ + u32 m_1, m_2; + + get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER); + m_1 = cpu_to_be32(m_1); + memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + m_2 = cpu_to_be32(m_2); + memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2, + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2)); + (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; +} + +static void +write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 flags = 0; + int i; + + for (i = 0; i < ((sizeof(ecryptfs_flag_map) + / sizeof(struct ecryptfs_flag_map_elem))); i++) + if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, + ecryptfs_flag_map[i].local_flag)) + flags |= ecryptfs_flag_map[i].file_flag; + /* Version is in top 8 bits of the 32-bit flag vector */ + flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000); + flags = cpu_to_be32(flags); + memcpy(page_virt, &flags, 4); + (*written) = 4; +} + +struct ecryptfs_cipher_code_str_map_elem { + char cipher_str[16]; + u16 cipher_code; +}; + +/* Add support for additional ciphers by adding elements here. The + * cipher_code is whatever OpenPGP applicatoins use to identify the + * ciphers. List in order of probability. */ +static struct ecryptfs_cipher_code_str_map_elem +ecryptfs_cipher_code_str_map[] = { + {"aes",RFC2440_CIPHER_AES_128 }, + {"blowfish", RFC2440_CIPHER_BLOWFISH}, + {"des3_ede", RFC2440_CIPHER_DES3_EDE}, + {"cast5", RFC2440_CIPHER_CAST_5}, + {"twofish", RFC2440_CIPHER_TWOFISH}, + {"cast6", RFC2440_CIPHER_CAST_6}, + {"aes", RFC2440_CIPHER_AES_192}, + {"aes", RFC2440_CIPHER_AES_256} +}; + +/** + * ecryptfs_code_for_cipher_string + * @str: The string representing the cipher name + * + * Returns zero on no match, or the cipher code on match + */ +u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) +{ + int i; + u16 code = 0; + struct ecryptfs_cipher_code_str_map_elem *map = + ecryptfs_cipher_code_str_map; + + if (strcmp(crypt_stat->cipher, "aes") == 0) { + switch (crypt_stat->key_size) { + case 16: + code = RFC2440_CIPHER_AES_128; + break; + case 24: + code = RFC2440_CIPHER_AES_192; + break; + case 32: + code = RFC2440_CIPHER_AES_256; + } + } else { + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ + code = map[i].cipher_code; + break; + } + } + return code; +} + +/** + * ecryptfs_cipher_code_to_string + * @str: Destination to write out the cipher name + * @cipher_code: The code to convert to cipher name string + * + * Returns zero on success + */ +int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code) +{ + int rc = 0; + int i; + + str[0] = '\0'; + for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) + if (cipher_code == ecryptfs_cipher_code_str_map[i].cipher_code) + strcpy(str, ecryptfs_cipher_code_str_map[i].cipher_str); + if (str[0] == '\0') { + ecryptfs_printk(KERN_WARNING, "Cipher code not recognized: " + "[%d]\n", cipher_code); + rc = -EINVAL; + } + return rc; +} + +/** + * ecryptfs_read_header_region + * @data + * @dentry + * @nd + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_read_header_region(char *data, struct dentry *dentry, + struct vfsmount *mnt) +{ + struct file *file; + mm_segment_t oldfs; + int rc; + + mnt = mntget(mnt); + file = dentry_open(dentry, mnt, O_RDONLY); + if (IS_ERR(file)) { + ecryptfs_printk(KERN_DEBUG, "Error opening file to " + "read header region\n"); + mntput(mnt); + rc = PTR_ERR(file); + goto out; + } + file->f_pos = 0; + oldfs = get_fs(); + set_fs(get_ds()); + /* For releases 0.1 and 0.2, all of the header information + * fits in the first data extent-sized region. */ + rc = file->f_op->read(file, (char __user *)data, + ECRYPTFS_DEFAULT_EXTENT_SIZE, &file->f_pos); + set_fs(oldfs); + fput(file); + rc = 0; +out: + return rc; +} + +static void +write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) +{ + u32 header_extent_size; + u16 num_header_extents_at_front; + + header_extent_size = (u32)crypt_stat->header_extent_size; + num_header_extents_at_front = + (u16)crypt_stat->num_header_extents_at_front; + header_extent_size = cpu_to_be32(header_extent_size); + memcpy(virt, &header_extent_size, 4); + virt += 4; + num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front); + memcpy(virt, &num_header_extents_at_front, 2); + (*written) = 6; +} + +struct kmem_cache *ecryptfs_header_cache_0; +struct kmem_cache *ecryptfs_header_cache_1; +struct kmem_cache *ecryptfs_header_cache_2; + +/** + * ecryptfs_write_headers_virt + * @page_virt + * @crypt_stat + * @ecryptfs_dentry + * + * Format version: 1 + * + * Header Extent: + * Octets 0-7: Unencrypted file size (big-endian) + * Octets 8-15: eCryptfs special marker + * Octets 16-19: Flags + * Octet 16: File format version number (between 0 and 255) + * Octets 17-18: Reserved + * Octet 19: Bit 1 (lsb): Reserved + * Bit 2: Encrypted? + * Bits 3-8: Reserved + * Octets 20-23: Header extent size (big-endian) + * Octets 24-25: Number of header extents at front of file + * (big-endian) + * Octet 26: Begin RFC 2440 authentication token packet set + * Data Extent 0: + * Lower data (CBC encrypted) + * Data Extent 1: + * Lower data (CBC encrypted) + * ... + * + * Returns zero on success + */ +int ecryptfs_write_headers_virt(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry) +{ + int rc; + size_t written; + size_t offset; + + offset = ECRYPTFS_FILE_SIZE_BYTES; + write_ecryptfs_marker((page_virt + offset), &written); + offset += written; + write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); + offset += written; + write_header_metadata((page_virt + offset), crypt_stat, &written); + offset += written; + rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat, + ecryptfs_dentry, &written, + PAGE_CACHE_SIZE - offset); + if (rc) + ecryptfs_printk(KERN_WARNING, "Error generating key packet " + "set; rc = [%d]\n", rc); + return rc; +} + +/** + * ecryptfs_write_headers + * @lower_file: The lower file struct, which was returned from dentry_open + * + * Write the file headers out. This will likely involve a userspace + * callout, in which the session key is encrypted with one or more + * public keys and/or the passphrase necessary to do the encryption is + * retrieved via a prompt. Exactly what happens at this point should + * be policy-dependent. + * + * Returns zero on success; non-zero on error + */ +int ecryptfs_write_headers(struct dentry *ecryptfs_dentry, + struct file *lower_file) +{ + mm_segment_t oldfs; + struct ecryptfs_crypt_stat *crypt_stat; + char *page_virt; + int current_header_page; + int header_pages; + int rc = 0; + + crypt_stat = &ecryptfs_inode_to_private( + ecryptfs_dentry->d_inode)->crypt_stat; + if (likely(ECRYPTFS_CHECK_FLAG(crypt_stat->flags, + ECRYPTFS_ENCRYPTED))) { + if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, + ECRYPTFS_KEY_VALID)) { + ecryptfs_printk(KERN_DEBUG, "Key is " + "invalid; bailing out\n"); + rc = -EINVAL; + goto out; + } + } else { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, + "Called with crypt_stat->encrypted == 0\n"); + goto out; + } + /* Released in this function */ + page_virt = kmem_cache_alloc(ecryptfs_header_cache_0, SLAB_USER); + if (!page_virt) { + ecryptfs_printk(KERN_ERR, "Out of memory\n"); + rc = -ENOMEM; + goto out; + } + memset(page_virt, 0, PAGE_CACHE_SIZE); + rc = ecryptfs_write_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry); + if (unlikely(rc)) { + ecryptfs_printk(KERN_ERR, "Error whilst writing headers\n"); + memset(page_virt, 0, PAGE_CACHE_SIZE); + goto out_free; + } + ecryptfs_printk(KERN_DEBUG, + "Writing key packet set to underlying file\n"); + lower_file->f_pos = 0; + oldfs = get_fs(); + set_fs(get_ds()); + ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->" + "write() w/ header page; lower_file->f_pos = " + "[0x%.16x]\n", lower_file->f_pos); + lower_file->f_op->write(lower_file, (char __user *)page_virt, + PAGE_CACHE_SIZE, &lower_file->f_pos); + header_pages = ((crypt_stat->header_extent_size + * crypt_stat->num_header_extents_at_front) + / PAGE_CACHE_SIZE); + memset(page_virt, 0, PAGE_CACHE_SIZE); + current_header_page = 1; + while (current_header_page < header_pages) { + ecryptfs_printk(KERN_DEBUG, "Calling lower_file->f_op->" + "write() w/ zero'd page; lower_file->f_pos = " + "[0x%.16x]\n", lower_file->f_pos); + lower_file->f_op->write(lower_file, (char __user *)page_virt, + PAGE_CACHE_SIZE, &lower_file->f_pos); + current_header_page++; + } + set_fs(oldfs); + ecryptfs_printk(KERN_DEBUG, + "Done writing key packet set to underlying file.\n"); +out_free: + kmem_cache_free(ecryptfs_header_cache_0, page_virt); +out: + return rc; +} + +static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, + char *virt, int *bytes_read) +{ + int rc = 0; + u32 header_extent_size; + u16 num_header_extents_at_front; + + memcpy(&header_extent_size, virt, 4); + header_extent_size = be32_to_cpu(header_extent_size); + virt += 4; + memcpy(&num_header_extents_at_front, virt, 2); + num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front); + crypt_stat->header_extent_size = (int)header_extent_size; + crypt_stat->num_header_extents_at_front = + (int)num_header_extents_at_front; + (*bytes_read) = 6; + if ((crypt_stat->header_extent_size + * crypt_stat->num_header_extents_at_front) + < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, "Invalid header extent size: " + "[%d]\n", crypt_stat->header_extent_size); + } + return rc; +} + +/** + * set_default_header_data + * + * For version 0 file format; this function is only for backwards + * compatibility for files created with the prior versions of + * eCryptfs. + */ +static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) +{ + crypt_stat->header_extent_size = 4096; + crypt_stat->num_header_extents_at_front = 1; +} + +/** + * ecryptfs_read_headers_virt + * + * Read/parse the header data. The header format is detailed in the + * comment block for the ecryptfs_write_headers_virt() function. + * + * Returns zero on success + */ +static int ecryptfs_read_headers_virt(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry) +{ + int rc = 0; + int offset; + int bytes_read; + + ecryptfs_set_default_sizes(crypt_stat); + crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + offset = ECRYPTFS_FILE_SIZE_BYTES; + rc = contains_ecryptfs_marker(page_virt + offset); + if (rc == 0) { + rc = -EINVAL; + goto out; + } + offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; + rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), + &bytes_read); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error processing flags\n"); + goto out; + } + if (crypt_stat->file_version > ECRYPTFS_SUPPORTED_FILE_VERSION) { + ecryptfs_printk(KERN_WARNING, "File version is [%d]; only " + "file version [%d] is supported by this " + "version of eCryptfs\n", + crypt_stat->file_version, + ECRYPTFS_SUPPORTED_FILE_VERSION); + rc = -EINVAL; + goto out; + } + offset += bytes_read; + if (crypt_stat->file_version >= 1) { + rc = parse_header_metadata(crypt_stat, (page_virt + offset), + &bytes_read); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error reading header " + "metadata; rc = [%d]\n", rc); + } + offset += bytes_read; + } else + set_default_header_data(crypt_stat); + rc = ecryptfs_parse_packet_set(crypt_stat, (page_virt + offset), + ecryptfs_dentry); +out: + return rc; +} + +/** + * ecryptfs_read_headers + * + * Returns zero if valid headers found and parsed; non-zero otherwise + */ +int ecryptfs_read_headers(struct dentry *ecryptfs_dentry, + struct file *lower_file) +{ + int rc = 0; + char *page_virt = NULL; + mm_segment_t oldfs; + ssize_t bytes_read; + struct ecryptfs_crypt_stat *crypt_stat = + &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + + /* Read the first page from the underlying file */ + page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, SLAB_USER); + if (!page_virt) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Unable to allocate page_virt\n"); + goto out; + } + lower_file->f_pos = 0; + oldfs = get_fs(); + set_fs(get_ds()); + bytes_read = lower_file->f_op->read(lower_file, + (char __user *)page_virt, + ECRYPTFS_DEFAULT_EXTENT_SIZE, + &lower_file->f_pos); + set_fs(oldfs); + if (bytes_read != ECRYPTFS_DEFAULT_EXTENT_SIZE) { + rc = -EINVAL; + goto out; + } + rc = ecryptfs_read_headers_virt(page_virt, crypt_stat, + ecryptfs_dentry); + if (rc) { + ecryptfs_printk(KERN_DEBUG, "Valid eCryptfs headers not " + "found\n"); + rc = -EINVAL; + } +out: + if (page_virt) { + memset(page_virt, 0, PAGE_CACHE_SIZE); + kmem_cache_free(ecryptfs_header_cache_1, page_virt); + } + return rc; +} + +/** + * ecryptfs_encode_filename - converts a plaintext file name to cipher text + * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @name: The plaintext name + * @length: The length of the plaintext + * @encoded_name: The encypted name + * + * Encrypts and encodes a filename into something that constitutes a + * valid filename for a filesystem, with printable characters. + * + * We assume that we have a properly initialized crypto context, + * pointed to by crypt_stat->tfm. + * + * TODO: Implement filename decoding and decryption here, in place of + * memcpy. We are keeping the framework around for now to (1) + * facilitate testing of the components needed to implement filename + * encryption and (2) to provide a code base from which other + * developers in the community can easily implement this feature. + * + * Returns the length of encoded filename; negative if error + */ +int +ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, + const char *name, int length, char **encoded_name) +{ + int error = 0; + + (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); + if (!(*encoded_name)) { + error = -ENOMEM; + goto out; + } + /* TODO: Filename encryption is a scheduled feature for a + * future version of eCryptfs. This function is here only for + * the purpose of providing a framework for other developers + * to easily implement filename encryption. Hint: Replace this + * memcpy() with a call to encrypt and encode the + * filename, the set the length accordingly. */ + memcpy((void *)(*encoded_name), (void *)name, length); + (*encoded_name)[length] = '\0'; + error = length + 1; +out: + return error; +} + +/** + * ecryptfs_decode_filename - converts the cipher text name to plaintext + * @crypt_stat: The crypt_stat struct associated with the file + * @name: The filename in cipher text + * @length: The length of the cipher text name + * @decrypted_name: The plaintext name + * + * Decodes and decrypts the filename. + * + * We assume that we have a properly initialized crypto context, + * pointed to by crypt_stat->tfm. + * + * TODO: Implement filename decoding and decryption here, in place of + * memcpy. We are keeping the framework around for now to (1) + * facilitate testing of the components needed to implement filename + * encryption and (2) to provide a code base from which other + * developers in the community can easily implement this feature. + * + * Returns the length of decoded filename; negative if error + */ +int +ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, + const char *name, int length, char **decrypted_name) +{ + int error = 0; + + (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); + if (!(*decrypted_name)) { + error = -ENOMEM; + goto out; + } + /* TODO: Filename encryption is a scheduled feature for a + * future version of eCryptfs. This function is here only for + * the purpose of providing a framework for other developers + * to easily implement filename encryption. Hint: Replace this + * memcpy() with a call to decode and decrypt the + * filename, the set the length accordingly. */ + memcpy((void *)(*decrypted_name), (void *)name, length); + (*decrypted_name)[length + 1] = '\0'; /* Only for convenience + * in printing out the + * string in debug + * messages */ + error = length; +out: + return error; +} + +/** + * ecryptfs_process_cipher - Perform cipher initialization. + * @tfm: Crypto context set by this function + * @key_tfm: Crypto context for key material, set by this function + * @cipher_name: Name of the cipher. + * @key_size: Size of the key in bytes. + * + * Returns zero on success. Any crypto_tfm structs allocated here + * should be released by other functions, such as on a superblock put + * event, regardless of whether this function succeeds for fails. + */ +int +ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm, + char *cipher_name, size_t key_size) +{ + char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; + int rc; + + *tfm = *key_tfm = NULL; + if (key_size > ECRYPTFS_MAX_KEY_BYTES) { + rc = -EINVAL; + printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " + "allowable is [%d]\n", key_size, ECRYPTFS_MAX_KEY_BYTES); + goto out; + } + *tfm = crypto_alloc_tfm(cipher_name, (ECRYPTFS_DEFAULT_CHAINING_MODE + | CRYPTO_TFM_REQ_WEAK_KEY)); + if (!(*tfm)) { + rc = -EINVAL; + printk(KERN_ERR "Unable to allocate crypto cipher with name " + "[%s]\n", cipher_name); + goto out; + } + *key_tfm = crypto_alloc_tfm(cipher_name, CRYPTO_TFM_REQ_WEAK_KEY); + if (!(*key_tfm)) { + rc = -EINVAL; + printk(KERN_ERR "Unable to allocate crypto cipher with name " + "[%s]\n", cipher_name); + goto out; + } + if (key_size < crypto_tfm_alg_min_keysize(*tfm)) { + rc = -EINVAL; + printk(KERN_ERR "Request key size is [%Zd]; minimum key size " + "supported by cipher [%s] is [%d]\n", key_size, + cipher_name, crypto_tfm_alg_min_keysize(*tfm)); + goto out; + } + if (key_size < crypto_tfm_alg_min_keysize(*key_tfm)) { + rc = -EINVAL; + printk(KERN_ERR "Request key size is [%Zd]; minimum key size " + "supported by cipher [%s] is [%d]\n", key_size, + cipher_name, crypto_tfm_alg_min_keysize(*key_tfm)); + goto out; + } + if (key_size > crypto_tfm_alg_max_keysize(*tfm)) { + rc = -EINVAL; + printk(KERN_ERR "Request key size is [%Zd]; maximum key size " + "supported by cipher [%s] is [%d]\n", key_size, + cipher_name, crypto_tfm_alg_min_keysize(*tfm)); + goto out; + } + if (key_size > crypto_tfm_alg_max_keysize(*key_tfm)) { + rc = -EINVAL; + printk(KERN_ERR "Request key size is [%Zd]; maximum key size " + "supported by cipher [%s] is [%d]\n", key_size, + cipher_name, crypto_tfm_alg_min_keysize(*key_tfm)); + goto out; + } + get_random_bytes(dummy_key, key_size); + rc = crypto_cipher_setkey(*tfm, dummy_key, key_size); + if (rc) { + printk(KERN_ERR "Error attempting to set key of size [%Zd] for " + "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc); + rc = -EINVAL; + goto out; + } + rc = crypto_cipher_setkey(*key_tfm, dummy_key, key_size); + if (rc) { + printk(KERN_ERR "Error attempting to set key of size [%Zd] for " + "cipher [%s]; rc = [%d]\n", key_size, cipher_name, rc); + rc = -EINVAL; + goto out; + } +out: + return rc; +} diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c new file mode 100644 index 00000000000..61f8e894284 --- /dev/null +++ b/fs/ecryptfs/debug.c @@ -0,0 +1,123 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Functions only useful for debugging. + * + * Copyright (C) 2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_dump_auth_tok - debug function to print auth toks + * + * This function will print the contents of an ecryptfs authentication + * token. + */ +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok) +{ + char salt[ECRYPTFS_SALT_SIZE * 2 + 1]; + char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; + + ecryptfs_printk(KERN_DEBUG, "Auth tok at mem loc [%p]:\n", + auth_tok); + if (ECRYPTFS_CHECK_FLAG(auth_tok->flags, ECRYPTFS_PRIVATE_KEY)) { + ecryptfs_printk(KERN_DEBUG, " * private key type\n"); + ecryptfs_printk(KERN_DEBUG, " * (NO PRIVATE KEY SUPPORT " + "IN ECRYPTFS VERSION 0.1)\n"); + } else { + ecryptfs_printk(KERN_DEBUG, " * passphrase type\n"); + ecryptfs_to_hex(salt, auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + salt[ECRYPTFS_SALT_SIZE * 2] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * salt = [%s]\n", salt); + if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags, + ECRYPTFS_PERSISTENT_PASSWORD)) { + ecryptfs_printk(KERN_DEBUG, " * persistent\n"); + } + memcpy(sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE_HEX); + sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + ecryptfs_printk(KERN_DEBUG, " * signature = [%s]\n", sig); + } + ecryptfs_printk(KERN_DEBUG, " * session_key.flags = [0x%x]\n", + auth_tok->session_key.flags); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace decrypt request set\n"); + if (auth_tok->session_key.flags + & ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT) + ecryptfs_printk(KERN_DEBUG, + " * Userspace encrypt request set\n"); + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_DECRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains decrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.decrypted_key_size = [0x%x]\n", + auth_tok->session_key.decrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Decrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.decrypted_key, + ECRYPTFS_DEFAULT_KEY_BYTES); + } + if (auth_tok->session_key.flags & ECRYPTFS_CONTAINS_ENCRYPTED_KEY) { + ecryptfs_printk(KERN_DEBUG, " * Contains encrypted key\n"); + ecryptfs_printk(KERN_DEBUG, + " * session_key.encrypted_key_size = [0x%x]\n", + auth_tok->session_key.encrypted_key_size); + ecryptfs_printk(KERN_DEBUG, " * Encrypted session key " + "dump:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(auth_tok->session_key.encrypted_key, + auth_tok->session_key. + encrypted_key_size); + } +} + +/** + * ecryptfs_dump_hex - debug hex printer + * @data: string of bytes to be printed + * @bytes: number of bytes to print + * + * Dump hexadecimal representation of char array + */ +void ecryptfs_dump_hex(char *data, int bytes) +{ + int i = 0; + int add_newline = 1; + + if (ecryptfs_verbosity < 1) + return; + if (bytes != 0) { + printk(KERN_DEBUG "0x%.2x.", (unsigned char)data[i]); + i++; + } + while (i < bytes) { + printk("0x%.2x.", (unsigned char)data[i]); + i++; + if (i % 16 == 0) { + printk("\n"); + add_newline = 0; + } else + add_newline = 1; + } + if (add_newline) + printk("\n"); +} + diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c new file mode 100644 index 00000000000..f0d2a433242 --- /dev/null +++ b/fs/ecryptfs/dentry.c @@ -0,0 +1,87 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/dcache.h> +#include <linux/namei.h> +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_d_revalidate - revalidate an ecryptfs dentry + * @dentry: The ecryptfs dentry + * @nd: The associated nameidata + * + * Called when the VFS needs to revalidate a dentry. This + * is called whenever a name lookup finds a dentry in the + * dcache. Most filesystems leave this as NULL, because all their + * dentries in the dcache are valid. + * + * Returns 1 if valid, 0 otherwise. + * + */ +static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct dentry *dentry_save; + struct vfsmount *vfsmount_save; + int rc = 1; + + if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) + goto out; + dentry_save = nd->dentry; + vfsmount_save = nd->mnt; + nd->dentry = lower_dentry; + nd->mnt = lower_mnt; + rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); + nd->dentry = dentry_save; + nd->mnt = vfsmount_save; +out: + return rc; +} + +struct kmem_cache *ecryptfs_dentry_info_cache; + +/** + * ecryptfs_d_release + * @dentry: The ecryptfs dentry + * + * Called when a dentry is really deallocated. + */ +static void ecryptfs_d_release(struct dentry *dentry) +{ + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (ecryptfs_dentry_to_private(dentry)) + kmem_cache_free(ecryptfs_dentry_info_cache, + ecryptfs_dentry_to_private(dentry)); + if (lower_dentry) + dput(lower_dentry); + return; +} + +struct dentry_operations ecryptfs_dops = { + .d_revalidate = ecryptfs_d_revalidate, + .d_release = ecryptfs_d_release, +}; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h new file mode 100644 index 00000000000..872c9958531 --- /dev/null +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -0,0 +1,482 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * Kernel declarations. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#ifndef ECRYPTFS_KERNEL_H +#define ECRYPTFS_KERNEL_H + +#include <keys/user-type.h> +#include <linux/fs.h> +#include <linux/scatterlist.h> + +/* Version verification for shared data structures w/ userspace */ +#define ECRYPTFS_VERSION_MAJOR 0x00 +#define ECRYPTFS_VERSION_MINOR 0x04 +#define ECRYPTFS_SUPPORTED_FILE_VERSION 0x01 +/* These flags indicate which features are supported by the kernel + * module; userspace tools such as the mount helper read + * ECRYPTFS_VERSIONING_MASK from a sysfs handle in order to determine + * how to behave. */ +#define ECRYPTFS_VERSIONING_PASSPHRASE 0x00000001 +#define ECRYPTFS_VERSIONING_PUBKEY 0x00000002 +#define ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH 0x00000004 +#define ECRYPTFS_VERSIONING_POLICY 0x00000008 +#define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ + | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH) + +#define ECRYPTFS_MAX_PASSWORD_LENGTH 64 +#define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH +#define ECRYPTFS_SALT_SIZE 8 +#define ECRYPTFS_SALT_SIZE_HEX (ECRYPTFS_SALT_SIZE*2) +/* The original signature size is only for what is stored on disk; all + * in-memory representations are expanded hex, so it better adapted to + * be passed around or referenced on the command line */ +#define ECRYPTFS_SIG_SIZE 8 +#define ECRYPTFS_SIG_SIZE_HEX (ECRYPTFS_SIG_SIZE*2) +#define ECRYPTFS_PASSWORD_SIG_SIZE ECRYPTFS_SIG_SIZE_HEX +#define ECRYPTFS_MAX_KEY_BYTES 64 +#define ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES 512 +#define ECRYPTFS_DEFAULT_IV_BYTES 16 +#define ECRYPTFS_FILE_VERSION 0x01 +#define ECRYPTFS_DEFAULT_HEADER_EXTENT_SIZE 8192 +#define ECRYPTFS_DEFAULT_EXTENT_SIZE 4096 +#define ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE 8192 + +#define RFC2440_CIPHER_DES3_EDE 0x02 +#define RFC2440_CIPHER_CAST_5 0x03 +#define RFC2440_CIPHER_BLOWFISH 0x04 +#define RFC2440_CIPHER_AES_128 0x07 +#define RFC2440_CIPHER_AES_192 0x08 +#define RFC2440_CIPHER_AES_256 0x09 +#define RFC2440_CIPHER_TWOFISH 0x0a +#define RFC2440_CIPHER_CAST_6 0x0b + +#define ECRYPTFS_SET_FLAG(flag_bit_vector, flag) (flag_bit_vector |= (flag)) +#define ECRYPTFS_CLEAR_FLAG(flag_bit_vector, flag) (flag_bit_vector &= ~(flag)) +#define ECRYPTFS_CHECK_FLAG(flag_bit_vector, flag) (flag_bit_vector & (flag)) + +/** + * For convenience, we may need to pass around the encrypted session + * key between kernel and userspace because the authentication token + * may not be extractable. For example, the TPM may not release the + * private key, instead requiring the encrypted data and returning the + * decrypted data. + */ +struct ecryptfs_session_key { +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT 0x00000001 +#define ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT 0x00000002 +#define ECRYPTFS_CONTAINS_DECRYPTED_KEY 0x00000004 +#define ECRYPTFS_CONTAINS_ENCRYPTED_KEY 0x00000008 + u32 flags; + u32 encrypted_key_size; + u32 decrypted_key_size; + u8 encrypted_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; + u8 decrypted_key[ECRYPTFS_MAX_KEY_BYTES]; +}; + +struct ecryptfs_password { + u32 password_bytes; + s32 hash_algo; + u32 hash_iterations; + u32 session_key_encryption_key_bytes; +#define ECRYPTFS_PERSISTENT_PASSWORD 0x01 +#define ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET 0x02 + u32 flags; + /* Iterated-hash concatenation of salt and passphrase */ + u8 session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + u8 signature[ECRYPTFS_PASSWORD_SIG_SIZE + 1]; + /* Always in expanded hex */ + u8 salt[ECRYPTFS_SALT_SIZE]; +}; + +enum ecryptfs_token_types {ECRYPTFS_PASSWORD, ECRYPTFS_PRIVATE_KEY}; + +/* May be a password or a private key */ +struct ecryptfs_auth_tok { + u16 version; /* 8-bit major and 8-bit minor */ + u16 token_type; + u32 flags; + struct ecryptfs_session_key session_key; + u8 reserved[32]; + union { + struct ecryptfs_password password; + /* Private key is in future eCryptfs releases */ + } token; +} __attribute__ ((packed)); + +void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); +extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); +extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); + +struct ecryptfs_key_record { + unsigned char type; + size_t enc_key_size; + unsigned char sig[ECRYPTFS_SIG_SIZE]; + unsigned char enc_key[ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES]; +}; + +struct ecryptfs_auth_tok_list { + struct ecryptfs_auth_tok *auth_tok; + struct list_head list; +}; + +struct ecryptfs_crypt_stat; +struct ecryptfs_mount_crypt_stat; + +struct ecryptfs_page_crypt_context { + struct page *page; +#define ECRYPTFS_PREPARE_COMMIT_MODE 0 +#define ECRYPTFS_WRITEPAGE_MODE 1 + unsigned int mode; + union { + struct file *lower_file; + struct writeback_control *wbc; + } param; +}; + +static inline struct ecryptfs_auth_tok * +ecryptfs_get_key_payload_data(struct key *key) +{ + return (struct ecryptfs_auth_tok *) + (((struct user_key_payload*)key->payload.data)->data); +} + +#define ECRYPTFS_SUPER_MAGIC 0xf15f +#define ECRYPTFS_MAX_KEYSET_SIZE 1024 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 +#define ECRYPTFS_MAX_NUM_ENC_KEYS 64 +#define ECRYPTFS_MAX_NUM_KEYSIGS 2 /* TODO: Make this a linked list */ +#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ +#define ECRYPTFS_SALT_BYTES 2 +#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 +#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ +#define ECRYPTFS_FILE_SIZE_BYTES 8 +#define ECRYPTFS_DEFAULT_CIPHER "aes" +#define ECRYPTFS_DEFAULT_KEY_BYTES 16 +#define ECRYPTFS_DEFAULT_CHAINING_MODE CRYPTO_TFM_MODE_CBC +#define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C +#define ECRYPTFS_TAG_11_PACKET_TYPE 0xED +#define MD5_DIGEST_SIZE 16 + +/** + * This is the primary struct associated with each encrypted file. + * + * TODO: cache align/pack? + */ +struct ecryptfs_crypt_stat { +#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 +#define ECRYPTFS_POLICY_APPLIED 0x00000002 +#define ECRYPTFS_NEW_FILE 0x00000004 +#define ECRYPTFS_ENCRYPTED 0x00000008 +#define ECRYPTFS_SECURITY_WARNING 0x00000010 +#define ECRYPTFS_ENABLE_HMAC 0x00000020 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 +#define ECRYPTFS_KEY_VALID 0x00000080 + u32 flags; + unsigned int file_version; + size_t iv_bytes; + size_t num_keysigs; + size_t header_extent_size; + size_t num_header_extents_at_front; + size_t extent_size; /* Data extent size; default is 4096 */ + size_t key_size; + size_t extent_shift; + unsigned int extent_mask; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct crypto_tfm *tfm; + struct crypto_tfm *md5_tfm; /* Crypto context for generating + * the initialization vectors */ + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; + unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; + unsigned char keysigs[ECRYPTFS_MAX_NUM_KEYSIGS][ECRYPTFS_SIG_SIZE_HEX]; + struct mutex cs_tfm_mutex; + struct mutex cs_md5_tfm_mutex; + struct mutex cs_mutex; +}; + +/* inode private data. */ +struct ecryptfs_inode_info { + struct inode vfs_inode; + struct inode *wii_inode; + struct ecryptfs_crypt_stat crypt_stat; +}; + +/* dentry private data. Each dentry must keep track of a lower + * vfsmount too. */ +struct ecryptfs_dentry_info { + struct dentry *wdi_dentry; + struct vfsmount *lower_mnt; + struct ecryptfs_crypt_stat *crypt_stat; +}; + +/** + * This struct is to enable a mount-wide passphrase/salt combo. This + * is more or less a stopgap to provide similar functionality to other + * crypto filesystems like EncFS or CFS until full policy support is + * implemented in eCryptfs. + */ +struct ecryptfs_mount_crypt_stat { + /* Pointers to memory we do not own, do not free these */ +#define ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED 0x00000001 + u32 flags; + struct ecryptfs_auth_tok *global_auth_tok; + struct key *global_auth_tok_key; + size_t global_default_cipher_key_size; + struct crypto_tfm *global_key_tfm; + struct mutex global_key_tfm_mutex; + unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + + 1]; + unsigned char global_auth_tok_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; +}; + +/* superblock private data. */ +struct ecryptfs_sb_info { + struct super_block *wsi_sb; + struct ecryptfs_mount_crypt_stat mount_crypt_stat; +}; + +/* file private data. */ +struct ecryptfs_file_info { + struct file *wfi_file; + struct ecryptfs_crypt_stat *crypt_stat; +}; + +/* auth_tok <=> encrypted_session_key mappings */ +struct ecryptfs_auth_tok_list_item { + unsigned char encrypted_session_key[ECRYPTFS_MAX_KEY_BYTES]; + struct list_head list; + struct ecryptfs_auth_tok auth_tok; +}; + +static inline struct ecryptfs_file_info * +ecryptfs_file_to_private(struct file *file) +{ + return (struct ecryptfs_file_info *)file->private_data; +} + +static inline void +ecryptfs_set_file_private(struct file *file, + struct ecryptfs_file_info *file_info) +{ + file->private_data = file_info; +} + +static inline struct file *ecryptfs_file_to_lower(struct file *file) +{ + return ((struct ecryptfs_file_info *)file->private_data)->wfi_file; +} + +static inline void +ecryptfs_set_file_lower(struct file *file, struct file *lower_file) +{ + ((struct ecryptfs_file_info *)file->private_data)->wfi_file = + lower_file; +} + +static inline struct ecryptfs_inode_info * +ecryptfs_inode_to_private(struct inode *inode) +{ + return container_of(inode, struct ecryptfs_inode_info, vfs_inode); +} + +static inline struct inode *ecryptfs_inode_to_lower(struct inode *inode) +{ + return ecryptfs_inode_to_private(inode)->wii_inode; +} + +static inline void +ecryptfs_set_inode_lower(struct inode *inode, struct inode *lower_inode) +{ + ecryptfs_inode_to_private(inode)->wii_inode = lower_inode; +} + +static inline struct ecryptfs_sb_info * +ecryptfs_superblock_to_private(struct super_block *sb) +{ + return (struct ecryptfs_sb_info *)sb->s_fs_info; +} + +static inline void +ecryptfs_set_superblock_private(struct super_block *sb, + struct ecryptfs_sb_info *sb_info) +{ + sb->s_fs_info = sb_info; +} + +static inline struct super_block * +ecryptfs_superblock_to_lower(struct super_block *sb) +{ + return ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb; +} + +static inline void +ecryptfs_set_superblock_lower(struct super_block *sb, + struct super_block *lower_sb) +{ + ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; +} + +static inline struct ecryptfs_dentry_info * +ecryptfs_dentry_to_private(struct dentry *dentry) +{ + return (struct ecryptfs_dentry_info *)dentry->d_fsdata; +} + +static inline void +ecryptfs_set_dentry_private(struct dentry *dentry, + struct ecryptfs_dentry_info *dentry_info) +{ + dentry->d_fsdata = dentry_info; +} + +static inline struct dentry * +ecryptfs_dentry_to_lower(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry; +} + +static inline void +ecryptfs_set_dentry_lower(struct dentry *dentry, struct dentry *lower_dentry) +{ + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->wdi_dentry = + lower_dentry; +} + +static inline struct vfsmount * +ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) +{ + return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt; +} + +static inline void +ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) +{ + ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_mnt = + lower_mnt; +} + +#define ecryptfs_printk(type, fmt, arg...) \ + __ecryptfs_printk(type "%s: " fmt, __FUNCTION__, ## arg); +void __ecryptfs_printk(const char *fmt, ...); + +extern const struct file_operations ecryptfs_main_fops; +extern const struct file_operations ecryptfs_dir_fops; +extern struct inode_operations ecryptfs_main_iops; +extern struct inode_operations ecryptfs_dir_iops; +extern struct inode_operations ecryptfs_symlink_iops; +extern struct super_operations ecryptfs_sops; +extern struct dentry_operations ecryptfs_dops; +extern struct address_space_operations ecryptfs_aops; +extern int ecryptfs_verbosity; + +extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; +extern struct kmem_cache *ecryptfs_file_info_cache; +extern struct kmem_cache *ecryptfs_dentry_info_cache; +extern struct kmem_cache *ecryptfs_inode_info_cache; +extern struct kmem_cache *ecryptfs_sb_info_cache; +extern struct kmem_cache *ecryptfs_header_cache_0; +extern struct kmem_cache *ecryptfs_header_cache_1; +extern struct kmem_cache *ecryptfs_header_cache_2; +extern struct kmem_cache *ecryptfs_lower_page_cache; + +int ecryptfs_interpose(struct dentry *hidden_dentry, + struct dentry *this_dentry, struct super_block *sb, + int flag); +int ecryptfs_fill_zeros(struct file *file, loff_t new_length); +int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, + const char *name, int length, + char **decrypted_name); +int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, + const char *name, int length, + char **encoded_name); +struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); +void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src); +void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src); +void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src); +void ecryptfs_dump_hex(char *data, int bytes); +int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, + int sg_size); +int ecryptfs_compute_root_iv(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_rotate_iv(unsigned char *iv); +void ecryptfs_init_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destruct_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat); +void ecryptfs_destruct_mount_crypt_stat( + struct ecryptfs_mount_crypt_stat *mount_crypt_stat); +int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_write_inode_size_to_header(struct file *lower_file, + struct inode *lower_inode, + struct inode *inode); +int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, + struct file *lower_file, + unsigned long lower_page_index, int byte_offset, + int region_bytes); +int +ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode, + struct file *lower_file, int byte_offset, + int region_size); +int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode, + struct file *lower_file); +int ecryptfs_do_readpage(struct file *file, struct page *page, + pgoff_t lower_page_index); +int ecryptfs_grab_and_map_lower_page(struct page **lower_page, + char **lower_virt, + struct inode *lower_inode, + unsigned long lower_page_index); +int ecryptfs_writepage_and_release_lower_page(struct page *lower_page, + struct inode *lower_inode, + struct writeback_control *wbc); +int ecryptfs_encrypt_page(struct ecryptfs_page_crypt_context *ctx); +int ecryptfs_decrypt_page(struct file *file, struct page *page); +int ecryptfs_write_headers(struct dentry *ecryptfs_dentry, + struct file *lower_file); +int ecryptfs_write_headers_virt(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry); +int ecryptfs_read_headers(struct dentry *ecryptfs_dentry, + struct file *lower_file); +int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +int contains_ecryptfs_marker(char *data); +int ecryptfs_read_header_region(char *data, struct dentry *dentry, + struct vfsmount *mnt); +u16 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_cipher_code_to_string(char *str, u16 cipher_code); +void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); +int ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, + size_t *len, size_t max); +int process_request_key_err(long err_code); +int +ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, struct dentry *ecryptfs_dentry); +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); +int +ecryptfs_process_cipher(struct crypto_tfm **tfm, struct crypto_tfm **key_tfm, + char *cipher_name, size_t key_size); +int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode); +int ecryptfs_inode_set(struct inode *inode, void *lower_inode); +void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode); + +#endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c new file mode 100644 index 00000000000..c8550c9f9cd --- /dev/null +++ b/fs/ecryptfs/file.c @@ -0,0 +1,440 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> + * Michael C. Thompson <mcthomps@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/security.h> +#include <linux/smp_lock.h> +#include <linux/compat.h> +#include "ecryptfs_kernel.h" + +/** + * ecryptfs_llseek + * @file: File we are seeking in + * @offset: The offset to seek to + * @origin: 2 - offset from i_size; 1 - offset from f_pos + * + * Returns the position we have seeked to, or negative on error + */ +static loff_t ecryptfs_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t rv; + loff_t new_end_pos; + int rc; + int expanding_file = 0; + struct inode *inode = file->f_mapping->host; + + /* If our offset is past the end of our file, we're going to + * need to grow it so we have a valid length of 0's */ + new_end_pos = offset; + switch (origin) { + case 2: + new_end_pos += i_size_read(inode); + expanding_file = 1; + break; + case 1: + new_end_pos += file->f_pos; + if (new_end_pos > i_size_read(inode)) { + ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) " + "> i_size_read(inode)(=[0x%.16x])\n", + new_end_pos, i_size_read(inode)); + expanding_file = 1; + } + break; + default: + if (new_end_pos > i_size_read(inode)) { + ecryptfs_printk(KERN_DEBUG, "new_end_pos(=[0x%.16x]) " + "> i_size_read(inode)(=[0x%.16x])\n", + new_end_pos, i_size_read(inode)); + expanding_file = 1; + } + } + ecryptfs_printk(KERN_DEBUG, "new_end_pos = [0x%.16x]\n", new_end_pos); + if (expanding_file) { + rc = ecryptfs_truncate(file->f_dentry, new_end_pos); + if (rc) { + rv = rc; + ecryptfs_printk(KERN_ERR, "Error on attempt to " + "truncate to (higher) offset [0x%.16x];" + " rc = [%d]\n", new_end_pos, rc); + goto out; + } + } + rv = generic_file_llseek(file, offset, origin); +out: + return rv; +} + +/** + * ecryptfs_read_update_atime + * + * generic_file_read updates the atime of upper layer inode. But, it + * doesn't give us a chance to update the atime of the lower layer + * inode. This function is a wrapper to generic_file_read. It + * updates the atime of the lower level inode if generic_file_read + * returns without any errors. This is to be used only for file reads. + * The function to be used for directory reads is ecryptfs_read. + */ +static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + int rc; + struct dentry *lower_dentry; + struct vfsmount *lower_vfsmount; + struct file *file = iocb->ki_filp; + + rc = generic_file_aio_read(iocb, iov, nr_segs, pos); + /* + * Even though this is a async interface, we need to wait + * for IO to finish to update atime + */ + if (-EIOCBQUEUED == rc) + rc = wait_on_sync_kiocb(iocb); + if (rc >= 0) { + lower_dentry = ecryptfs_dentry_to_lower(file->f_dentry); + lower_vfsmount = ecryptfs_dentry_to_lower_mnt(file->f_dentry); + touch_atime(lower_vfsmount, lower_dentry); + } + return rc; +} + +struct ecryptfs_getdents_callback { + void *dirent; + struct dentry *dentry; + filldir_t filldir; + int err; + int filldir_called; + int entries_written; +}; + +/* Inspired by generic filldir in fs/readir.c */ +static int +ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, + u64 ino, unsigned int d_type) +{ + struct ecryptfs_crypt_stat *crypt_stat; + struct ecryptfs_getdents_callback *buf = + (struct ecryptfs_getdents_callback *)dirent; + int rc; + int decoded_length; + char *decoded_name; + + crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat; + buf->filldir_called++; + decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, + &decoded_name); + if (decoded_length < 0) { + rc = decoded_length; + goto out; + } + rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, + ino, d_type); + kfree(decoded_name); + if (rc >= 0) + buf->entries_written++; +out: + return rc; +} + +/** + * ecryptfs_readdir + * @file: The ecryptfs file struct + * @dirent: Directory entry + * @filldir: The filldir callback function + */ +static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + int rc; + struct file *lower_file; + struct inode *inode; + struct ecryptfs_getdents_callback buf; + + lower_file = ecryptfs_file_to_lower(file); + lower_file->f_pos = file->f_pos; + inode = file->f_dentry->d_inode; + memset(&buf, 0, sizeof(buf)); + buf.dirent = dirent; + buf.dentry = file->f_dentry; + buf.filldir = filldir; +retry: + buf.filldir_called = 0; + buf.entries_written = 0; + buf.err = 0; + rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); + if (buf.err) + rc = buf.err; + if (buf.filldir_called && !buf.entries_written) + goto retry; + file->f_pos = lower_file->f_pos; + if (rc >= 0) + ecryptfs_copy_attr_atime(inode, lower_file->f_dentry->d_inode); + return rc; +} + +struct kmem_cache *ecryptfs_file_info_cache; + +/** + * ecryptfs_open + * @inode: inode speciying file to open + * @file: Structure to return filled in + * + * Opens the file specified by inode. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_open(struct inode *inode, struct file *file) +{ + int rc = 0; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct dentry *ecryptfs_dentry = file->f_dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + struct inode *lower_inode = NULL; + struct file *lower_file = NULL; + struct vfsmount *lower_mnt; + struct ecryptfs_file_info *file_info; + int lower_flags; + + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_alloc(ecryptfs_file_info_cache, SLAB_KERNEL); + ecryptfs_set_file_private(file, file_info); + if (!file_info) { + ecryptfs_printk(KERN_ERR, + "Error attempting to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + memset(file_info, 0, sizeof(*file_info)); + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + mutex_lock(&crypt_stat->cs_mutex); + if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) { + ecryptfs_printk(KERN_DEBUG, "Setting flags for stat...\n"); + /* Policy code enabled in future release */ + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED); + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); + } + mutex_unlock(&crypt_stat->cs_mutex); + /* This mntget & dget is undone via fput when the file is released */ + dget(lower_dentry); + lower_flags = file->f_flags; + if ((lower_flags & O_ACCMODE) == O_WRONLY) + lower_flags = (lower_flags & O_ACCMODE) | O_RDWR; + if (file->f_flags & O_APPEND) + lower_flags &= ~O_APPEND; + lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry); + mntget(lower_mnt); + /* Corresponding fput() in ecryptfs_release() */ + lower_file = dentry_open(lower_dentry, lower_mnt, lower_flags); + if (IS_ERR(lower_file)) { + rc = PTR_ERR(lower_file); + ecryptfs_printk(KERN_ERR, "Error opening lower file\n"); + goto out_puts; + } + ecryptfs_set_file_lower(file, lower_file); + /* Isn't this check the same as the one in lookup? */ + lower_inode = lower_dentry->d_inode; + if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); + rc = 0; + goto out; + } + mutex_lock(&crypt_stat->cs_mutex); + if (i_size_read(lower_inode) < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) { + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Attempt to read file that is " + "not in a valid eCryptfs format, and plaintext " + "passthrough mode is not enabled; returning " + "-EIO\n"); + mutex_unlock(&crypt_stat->cs_mutex); + goto out_puts; + } + crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + rc = 0; + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } else if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, + ECRYPTFS_POLICY_APPLIED) + || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, + ECRYPTFS_KEY_VALID)) { + rc = ecryptfs_read_headers(ecryptfs_dentry, lower_file); + if (rc) { + ecryptfs_printk(KERN_DEBUG, + "Valid headers not found\n"); + if (!(mount_crypt_stat->flags + & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) { + rc = -EIO; + printk(KERN_WARNING "Attempt to read file that " + "is not in a valid eCryptfs format, " + "and plaintext passthrough mode is not " + "enabled; returning -EIO\n"); + mutex_unlock(&crypt_stat->cs_mutex); + goto out_puts; + } + ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, + ECRYPTFS_ENCRYPTED); + rc = 0; + mutex_unlock(&crypt_stat->cs_mutex); + goto out; + } + } + mutex_unlock(&crypt_stat->cs_mutex); + ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = [0x%.16x] " + "size: [0x%.16x]\n", inode, inode->i_ino, + i_size_read(inode)); + ecryptfs_set_file_lower(file, lower_file); + goto out; +out_puts: + mntput(lower_mnt); + dput(lower_dentry); + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(file)); +out: + return rc; +} + +static int ecryptfs_flush(struct file *file, fl_owner_t td) +{ + int rc = 0; + struct file *lower_file = NULL; + + lower_file = ecryptfs_file_to_lower(file); + if (lower_file->f_op && lower_file->f_op->flush) + rc = lower_file->f_op->flush(lower_file, td); + return rc; +} + +static int ecryptfs_release(struct inode *inode, struct file *file) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + struct ecryptfs_file_info *file_info = ecryptfs_file_to_private(file); + struct inode *lower_inode = ecryptfs_inode_to_lower(inode); + + fput(lower_file); + inode->i_blocks = lower_inode->i_blocks; + kmem_cache_free(ecryptfs_file_info_cache, file_info); + return 0; +} + +static int +ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_inode = lower_dentry->d_inode; + int rc = -EINVAL; + + if (lower_inode->i_fop->fsync) { + mutex_lock(&lower_inode->i_mutex); + rc = lower_inode->i_fop->fsync(lower_file, lower_dentry, + datasync); + mutex_unlock(&lower_inode->i_mutex); + } + return rc; +} + +static int ecryptfs_fasync(int fd, struct file *file, int flag) +{ + int rc = 0; + struct file *lower_file = NULL; + + lower_file = ecryptfs_file_to_lower(file); + if (lower_file->f_op && lower_file->f_op->fasync) + rc = lower_file->f_op->fasync(fd, lower_file, flag); + return rc; +} + +static ssize_t ecryptfs_sendfile(struct file *file, loff_t * ppos, + size_t count, read_actor_t actor, void *target) +{ + struct file *lower_file = NULL; + int rc = -EINVAL; + + lower_file = ecryptfs_file_to_lower(file); + if (lower_file->f_op && lower_file->f_op->sendfile) + rc = lower_file->f_op->sendfile(lower_file, ppos, count, + actor, target); + + return rc; +} + +static int ecryptfs_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); + +const struct file_operations ecryptfs_dir_fops = { + .readdir = ecryptfs_readdir, + .ioctl = ecryptfs_ioctl, + .mmap = generic_file_mmap, + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .sendfile = ecryptfs_sendfile, +}; + +const struct file_operations ecryptfs_main_fops = { + .llseek = ecryptfs_llseek, + .read = do_sync_read, + .aio_read = ecryptfs_read_update_atime, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .readdir = ecryptfs_readdir, + .ioctl = ecryptfs_ioctl, + .mmap = generic_file_mmap, + .open = ecryptfs_open, + .flush = ecryptfs_flush, + .release = ecryptfs_release, + .fsync = ecryptfs_fsync, + .fasync = ecryptfs_fasync, + .sendfile = ecryptfs_sendfile, +}; + +static int +ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + int rc = 0; + struct file *lower_file = NULL; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->ioctl) + rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode), + lower_file, cmd, arg); + else + rc = -ENOTTY; + return rc; +} diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c new file mode 100644 index 00000000000..efdd2b7b62d --- /dev/null +++ b/fs/ecryptfs/inode.c @@ -0,0 +1,1079 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2004 Erez Zadok + * Copyright (C) 2001-2004 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * Michael C. Thompsion <mcthomps@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/file.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/dcache.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/crypto.h> +#include "ecryptfs_kernel.h" + +static struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *dir; + + dir = dget(dentry->d_parent); + mutex_lock(&(dir->d_inode->i_mutex)); + return dir; +} + +static void unlock_parent(struct dentry *dentry) +{ + mutex_unlock(&(dentry->d_parent->d_inode->i_mutex)); + dput(dentry->d_parent); +} + +static void unlock_dir(struct dentry *dir) +{ + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); +} + +void ecryptfs_copy_inode_size(struct inode *dst, const struct inode *src) +{ + i_size_write(dst, i_size_read((struct inode *)src)); + dst->i_blocks = src->i_blocks; +} + +void ecryptfs_copy_attr_atime(struct inode *dest, const struct inode *src) +{ + dest->i_atime = src->i_atime; +} + +static void ecryptfs_copy_attr_times(struct inode *dest, + const struct inode *src) +{ + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; +} + +static void ecryptfs_copy_attr_timesizes(struct inode *dest, + const struct inode *src) +{ + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + ecryptfs_copy_inode_size(dest, src); +} + +void ecryptfs_copy_attr_all(struct inode *dest, const struct inode *src) +{ + dest->i_mode = src->i_mode; + dest->i_nlink = src->i_nlink; + dest->i_uid = src->i_uid; + dest->i_gid = src->i_gid; + dest->i_rdev = src->i_rdev; + dest->i_atime = src->i_atime; + dest->i_mtime = src->i_mtime; + dest->i_ctime = src->i_ctime; + dest->i_blkbits = src->i_blkbits; + dest->i_flags = src->i_flags; +} + +/** + * ecryptfs_create_underlying_file + * @lower_dir_inode: inode of the parent in the lower fs of the new file + * @lower_dentry: New file's dentry in the lower fs + * @ecryptfs_dentry: New file's dentry in ecryptfs + * @mode: The mode of the new file + * @nd: nameidata of ecryptfs' parent's dentry & vfsmount + * + * Creates the file in the lower file system. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_create_underlying_file(struct inode *lower_dir_inode, + struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + struct dentry *dentry_save; + struct vfsmount *vfsmount_save; + int rc; + + dentry_save = nd->dentry; + vfsmount_save = nd->mnt; + nd->dentry = lower_dentry; + nd->mnt = lower_mnt; + rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); + nd->dentry = dentry_save; + nd->mnt = vfsmount_save; + return rc; +} + +/** + * ecryptfs_do_create + * @directory_inode: inode of the new file's dentry's parent in ecryptfs + * @ecryptfs_dentry: New file's dentry in ecryptfs + * @mode: The mode of the new file + * @nd: nameidata of ecryptfs' parent's dentry & vfsmount + * + * Creates the underlying file and the eCryptfs inode which will link to + * it. It will also update the eCryptfs directory inode to mimic the + * stat of the lower directory inode. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_do_create(struct inode *directory_inode, + struct dentry *ecryptfs_dentry, int mode, + struct nameidata *nd) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + if (unlikely(IS_ERR(lower_dir_dentry))) { + ecryptfs_printk(KERN_ERR, "Error locking directory of " + "dentry\n"); + rc = PTR_ERR(lower_dir_dentry); + goto out; + } + rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, + ecryptfs_dentry, mode, nd); + if (unlikely(rc)) { + ecryptfs_printk(KERN_ERR, + "Failure to create underlying file\n"); + goto out_lock; + } + rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, + directory_inode->i_sb, 0); + if (rc) { + ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + goto out_lock; + } + ecryptfs_copy_attr_timesizes(directory_inode, + lower_dir_dentry->d_inode); +out_lock: + unlock_dir(lower_dir_dentry); +out: + return rc; +} + +/** + * grow_file + * @ecryptfs_dentry: the ecryptfs dentry + * @lower_file: The lower file + * @inode: The ecryptfs inode + * @lower_inode: The lower inode + * + * This is the code which will grow the file to its correct size. + */ +static int grow_file(struct dentry *ecryptfs_dentry, struct file *lower_file, + struct inode *inode, struct inode *lower_inode) +{ + int rc = 0; + struct file fake_file; + struct ecryptfs_file_info tmp_file_info; + + memset(&fake_file, 0, sizeof(fake_file)); + fake_file.f_dentry = ecryptfs_dentry; + memset(&tmp_file_info, 0, sizeof(tmp_file_info)); + ecryptfs_set_file_private(&fake_file, &tmp_file_info); + ecryptfs_set_file_lower(&fake_file, lower_file); + rc = ecryptfs_fill_zeros(&fake_file, 1); + if (rc) { + ECRYPTFS_SET_FLAG( + ecryptfs_inode_to_private(inode)->crypt_stat.flags, + ECRYPTFS_SECURITY_WARNING); + ecryptfs_printk(KERN_WARNING, "Error attempting to fill zeros " + "in file; rc = [%d]\n", rc); + goto out; + } + i_size_write(inode, 0); + ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode); + ECRYPTFS_SET_FLAG(ecryptfs_inode_to_private(inode)->crypt_stat.flags, + ECRYPTFS_NEW_FILE); +out: + return rc; +} + +/** + * ecryptfs_initialize_file + * + * Cause the file to be changed from a basic empty file to an ecryptfs + * file with a header and first data page. + * + * Returns zero on success + */ +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +{ + int rc = 0; + int lower_flags; + struct ecryptfs_crypt_stat *crypt_stat; + struct dentry *lower_dentry; + struct dentry *tlower_dentry = NULL; + struct file *lower_file; + struct inode *inode, *lower_inode; + struct vfsmount *lower_mnt; + + lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + ecryptfs_printk(KERN_DEBUG, "lower_dentry->d_name.name = [%s]\n", + lower_dentry->d_name.name); + inode = ecryptfs_dentry->d_inode; + crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat; + tlower_dentry = dget(lower_dentry); + if (!tlower_dentry) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry\n"); + goto out; + } + lower_flags = ((O_CREAT | O_WRONLY | O_TRUNC) & O_ACCMODE) | O_RDWR; +#if BITS_PER_LONG != 32 + lower_flags |= O_LARGEFILE; +#endif + lower_mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry); + mntget(lower_mnt); + /* Corresponding fput() at end of this function */ + lower_file = dentry_open(tlower_dentry, lower_mnt, lower_flags); + if (IS_ERR(lower_file)) { + rc = PTR_ERR(lower_file); + ecryptfs_printk(KERN_ERR, + "Error opening dentry; rc = [%i]\n", rc); + goto out; + } + /* fput(lower_file) should handle the puts if we do this */ + lower_file->f_dentry = tlower_dentry; + lower_file->f_vfsmnt = lower_mnt; + lower_inode = tlower_dentry->d_inode; + if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED); + goto out_fput; + } + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE); + ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); + rc = ecryptfs_new_file_context(ecryptfs_dentry); + if (rc) { + ecryptfs_printk(KERN_DEBUG, "Error creating new file " + "context\n"); + goto out_fput; + } + rc = ecryptfs_write_headers(ecryptfs_dentry, lower_file); + if (rc) { + ecryptfs_printk(KERN_DEBUG, "Error writing headers\n"); + goto out_fput; + } + rc = grow_file(ecryptfs_dentry, lower_file, inode, lower_inode); +out_fput: + fput(lower_file); +out: + return rc; +} + +/** + * ecryptfs_create + * @dir: The inode of the directory in which to create the file. + * @dentry: The eCryptfs dentry + * @mode: The mode of the new file. + * @nd: nameidata + * + * Creates a new file. + * + * Returns zero on success; non-zero on error condition + */ +static int +ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, + int mode, struct nameidata *nd) +{ + int rc; + + rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); + if (unlikely(rc)) { + ecryptfs_printk(KERN_WARNING, "Failed to create file in" + "lower filesystem\n"); + goto out; + } + /* At this point, a file exists on "disk"; we need to make sure + * that this on disk file is prepared to be an ecryptfs file */ + rc = ecryptfs_initialize_file(ecryptfs_dentry); +out: + return rc; +} + +/** + * ecryptfs_lookup + * @dir: inode + * @dentry: The dentry + * @nd: nameidata, may be NULL + * + * Find a file on disk. If the file does not exist, then we'll add it to the + * dentry cache and continue on to read it from the disk. + */ +static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int rc = 0; + struct dentry *lower_dir_dentry; + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + struct dentry *tlower_dentry = NULL; + char *encoded_name; + unsigned int encoded_namelen; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + char *page_virt = NULL; + struct inode *lower_inode; + u64 file_size; + + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + dentry->d_op = &ecryptfs_dops; + if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) + || (dentry->d_name.len == 2 && !strcmp(dentry->d_name.name, ".."))) + goto out_drop; + encoded_namelen = ecryptfs_encode_filename(crypt_stat, + dentry->d_name.name, + dentry->d_name.len, + &encoded_name); + if (encoded_namelen < 0) { + rc = encoded_namelen; + goto out_drop; + } + ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen " + "= [%d]\n", encoded_name, encoded_namelen); + lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry, + encoded_namelen - 1); + kfree(encoded_name); + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + if (IS_ERR(lower_dentry)) { + ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n"); + rc = PTR_ERR(lower_dentry); + goto out_drop; + } + ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->" + "d_name.name = [%s]\n", lower_dentry, + lower_dentry->d_name.name); + lower_inode = lower_dentry->d_inode; + ecryptfs_copy_attr_atime(dir, lower_dir_dentry->d_inode); + BUG_ON(!atomic_read(&lower_dentry->d_count)); + ecryptfs_set_dentry_private(dentry, + kmem_cache_alloc(ecryptfs_dentry_info_cache, + SLAB_KERNEL)); + if (!ecryptfs_dentry_to_private(dentry)) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " + "to allocate ecryptfs_dentry_info struct\n"); + goto out_dput; + } + ecryptfs_set_dentry_lower(dentry, lower_dentry); + ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); + if (!lower_dentry->d_inode) { + /* We want to add because we couldn't find in lower */ + d_add(dentry, NULL); + goto out; + } + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error interposing\n"); + goto out_dput; + } + if (S_ISDIR(lower_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); + goto out; + } + if (S_ISLNK(lower_inode->i_mode)) { + ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n"); + goto out; + } + if (!nd) { + ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave" + "as we *think* we are about to unlink\n"); + goto out; + } + tlower_dentry = dget(lower_dentry); + if (!tlower_dentry || IS_ERR(tlower_dentry)) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Cannot dget lower_dentry\n"); + goto out_dput; + } + /* Released in this function */ + page_virt = + (char *)kmem_cache_alloc(ecryptfs_header_cache_2, + SLAB_USER); + if (!page_virt) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, + "Cannot ecryptfs_kmalloc a page\n"); + goto out_dput; + } + memset(page_virt, 0, PAGE_CACHE_SIZE); + rc = ecryptfs_read_header_region(page_virt, tlower_dentry, nd->mnt); + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + if (!ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + if (rc) { + rc = 0; + ecryptfs_printk(KERN_WARNING, "Error reading header region;" + " assuming unencrypted\n"); + } else { + if (!contains_ecryptfs_marker(page_virt + + ECRYPTFS_FILE_SIZE_BYTES)) { + kmem_cache_free(ecryptfs_header_cache_2, page_virt); + goto out; + } + memcpy(&file_size, page_virt, sizeof(file_size)); + file_size = be64_to_cpu(file_size); + i_size_write(dentry->d_inode, (loff_t)file_size); + } + kmem_cache_free(ecryptfs_header_cache_2, page_virt); + goto out; + +out_dput: + dput(lower_dentry); + if (tlower_dentry) + dput(tlower_dentry); +out_drop: + d_drop(dentry); +out: + return ERR_PTR(rc); +} + +static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_dir_dentry; + u64 file_size_save; + int rc; + + file_size_save = i_size_read(old_dentry->d_inode); + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_dir_dentry = lock_parent(lower_new_dentry); + rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, + lower_new_dentry); + if (rc || !lower_new_dentry->d_inode) + goto out_lock; + rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); + if (rc) + goto out_lock; + ecryptfs_copy_attr_timesizes(dir, lower_new_dentry->d_inode); + old_dentry->d_inode->i_nlink = + ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; + i_size_write(new_dentry->d_inode, file_size_save); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + if (!new_dentry->d_inode) + d_drop(new_dentry); + return rc; +} + +static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int rc = 0; + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir); + + lock_parent(lower_dentry); + rc = vfs_unlink(lower_dir_inode, lower_dentry); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error in vfs_unlink\n"); + goto out_unlock; + } + ecryptfs_copy_attr_times(dir, lower_dir_inode); + dentry->d_inode->i_nlink = + ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink; + dentry->d_inode->i_ctime = dir->i_ctime; +out_unlock: + unlock_parent(lower_dentry); + return rc; +} + +static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + umode_t mode; + char *encoded_symname; + unsigned int encoded_symlen; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + dget(lower_dentry); + lower_dir_dentry = lock_parent(lower_dentry); + mode = S_IALLUGO; + encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, + strlen(symname), + &encoded_symname); + if (encoded_symlen < 0) { + rc = encoded_symlen; + goto out_lock; + } + rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, + encoded_symname, mode); + kfree(encoded_symname); + if (rc || !lower_dentry->d_inode) + goto out_lock; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); + if (rc) + goto out_lock; + ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode); +out_lock: + unlock_dir(lower_dir_dentry); + dput(lower_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); + if (rc || !lower_dentry->d_inode) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); + if (rc) + goto out; + ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode); + dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; +out: + unlock_dir(lower_dir_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int rc = 0; + struct dentry *tdentry = NULL; + struct dentry *lower_dentry; + struct dentry *tlower_dentry = NULL; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!(tdentry = dget(dentry))) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "Error dget'ing dentry [%p]\n", + dentry); + goto out; + } + lower_dir_dentry = lock_parent(lower_dentry); + if (!(tlower_dentry = dget(lower_dentry))) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "Error dget'ing lower_dentry " + "[%p]\n", lower_dentry); + goto out; + } + rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + if (!rc) { + d_delete(tlower_dentry); + tlower_dentry = NULL; + } + ecryptfs_copy_attr_times(dir, lower_dir_dentry->d_inode); + dir->i_nlink = lower_dir_dentry->d_inode->i_nlink; + unlock_dir(lower_dir_dentry); + if (!rc) + d_drop(dentry); +out: + if (tdentry) + dput(tdentry); + if (tlower_dentry) + dput(tlower_dentry); + return rc; +} + +static int +ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + int rc; + struct dentry *lower_dentry; + struct dentry *lower_dir_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + lower_dir_dentry = lock_parent(lower_dentry); + rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); + if (rc || !lower_dentry->d_inode) + goto out; + rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); + if (rc) + goto out; + ecryptfs_copy_attr_timesizes(dir, lower_dir_dentry->d_inode); +out: + unlock_dir(lower_dir_dentry); + if (!dentry->d_inode) + d_drop(dentry); + return rc; +} + +static int +ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int rc; + struct dentry *lower_old_dentry; + struct dentry *lower_new_dentry; + struct dentry *lower_old_dir_dentry; + struct dentry *lower_new_dir_dentry; + + lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); + lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); + dget(lower_old_dentry); + dget(lower_new_dentry); + lower_old_dir_dentry = dget_parent(lower_old_dentry); + lower_new_dir_dentry = dget_parent(lower_new_dentry); + lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, + lower_new_dir_dentry->d_inode, lower_new_dentry); + if (rc) + goto out_lock; + ecryptfs_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + if (new_dir != old_dir) + ecryptfs_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); +out_lock: + unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + dput(lower_new_dentry); + dput(lower_old_dentry); + return rc; +} + +static int +ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) +{ + int rc; + struct dentry *lower_dentry; + char *decoded_name; + char *lower_buf; + mm_segment_t old_fs; + struct ecryptfs_crypt_stat *crypt_stat; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op || + !lower_dentry->d_inode->i_op->readlink) { + rc = -EINVAL; + goto out; + } + /* Released in this function */ + lower_buf = kmalloc(bufsiz, GFP_KERNEL); + if (lower_buf == NULL) { + ecryptfs_printk(KERN_ERR, "Out of memory\n"); + rc = -ENOMEM; + goto out; + } + old_fs = get_fs(); + set_fs(get_ds()); + ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " + "lower_dentry->d_name.name = [%s]\n", + lower_dentry->d_name.name); + rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, + (char __user *)lower_buf, + bufsiz); + set_fs(old_fs); + if (rc >= 0) { + crypt_stat = NULL; + rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, + &decoded_name); + if (rc == -ENOMEM) + goto out_free_lower_buf; + if (rc > 0) { + ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes " + "to userspace: [%*s]\n", rc, + decoded_name); + if (copy_to_user(buf, decoded_name, rc)) + rc = -EFAULT; + } + kfree(decoded_name); + ecryptfs_copy_attr_atime(dentry->d_inode, + lower_dentry->d_inode); + } +out_free_lower_buf: + kfree(lower_buf); +out: + return rc; +} + +static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *buf; + int len = PAGE_SIZE, rc; + mm_segment_t old_fs; + + /* Released in ecryptfs_put_link(); only release here on error */ + buf = kmalloc(len, GFP_KERNEL); + if (!buf) { + rc = -ENOMEM; + goto out; + } + old_fs = get_fs(); + set_fs(get_ds()); + ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " + "dentry->d_name.name = [%s]\n", dentry->d_name.name); + rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); + buf[rc] = '\0'; + set_fs(old_fs); + if (rc < 0) + goto out_free; + rc = 0; + nd_set_link(nd, buf); + goto out; +out_free: + kfree(buf); +out: + return ERR_PTR(rc); +} + +static void +ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) +{ + /* Free the char* */ + kfree(nd_get_link(nd)); +} + +/** + * upper_size_to_lower_size + * @crypt_stat: Crypt_stat associated with file + * @upper_size: Size of the upper file + * + * Calculate the requried size of the lower file based on the + * specified size of the upper file. This calculation is based on the + * number of headers in the underlying file and the extent size. + * + * Returns Calculated size of the lower file. + */ +static loff_t +upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, + loff_t upper_size) +{ + loff_t lower_size; + + lower_size = ( crypt_stat->header_extent_size + * crypt_stat->num_header_extents_at_front ); + if (upper_size != 0) { + loff_t num_extents; + + num_extents = upper_size >> crypt_stat->extent_shift; + if (upper_size & ~crypt_stat->extent_mask) + num_extents++; + lower_size += (num_extents * crypt_stat->extent_size); + } + return lower_size; +} + +/** + * ecryptfs_truncate + * @dentry: The ecryptfs layer dentry + * @new_length: The length to expand the file to + * + * Function to handle truncations modifying the size of the file. Note + * that the file sizes are interpolated. When expanding, we are simply + * writing strings of 0's out. When truncating, we need to modify the + * underlying file size according to the page index interpolations. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +{ + int rc = 0; + struct inode *inode = dentry->d_inode; + struct dentry *lower_dentry; + struct vfsmount *lower_mnt; + struct file fake_ecryptfs_file, *lower_file = NULL; + struct ecryptfs_crypt_stat *crypt_stat; + loff_t i_size = i_size_read(inode); + loff_t lower_size_before_truncate; + loff_t lower_size_after_truncate; + + if (unlikely((new_length == i_size))) + goto out; + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + /* Set up a fake ecryptfs file, this is used to interface with + * the file in the underlying filesystem so that the + * truncation has an effect there as well. */ + memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file)); + fake_ecryptfs_file.f_dentry = dentry; + /* Released at out_free: label */ + ecryptfs_set_file_private(&fake_ecryptfs_file, + kmem_cache_alloc(ecryptfs_file_info_cache, + SLAB_KERNEL)); + if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) { + rc = -ENOMEM; + goto out; + } + lower_dentry = ecryptfs_dentry_to_lower(dentry); + /* This dget & mntget is released through fput at out_fput: */ + dget(lower_dentry); + lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); + mntget(lower_mnt); + lower_file = dentry_open(lower_dentry, lower_mnt, O_RDWR); + if (unlikely(IS_ERR(lower_file))) { + rc = PTR_ERR(lower_file); + goto out_free; + } + ecryptfs_set_file_lower(&fake_ecryptfs_file, lower_file); + /* Switch on growing or shrinking file */ + if (new_length > i_size) { + rc = ecryptfs_fill_zeros(&fake_ecryptfs_file, new_length); + if (rc) { + ecryptfs_printk(KERN_ERR, + "Problem with fill_zeros\n"); + goto out_fput; + } + i_size_write(inode, new_length); + rc = ecryptfs_write_inode_size_to_header(lower_file, + lower_dentry->d_inode, + inode); + if (rc) { + ecryptfs_printk(KERN_ERR, + "Problem with ecryptfs_write" + "_inode_size\n"); + goto out_fput; + } + } else { /* new_length < i_size_read(inode) */ + vmtruncate(inode, new_length); + ecryptfs_write_inode_size_to_header(lower_file, + lower_dentry->d_inode, + inode); + /* We are reducing the size of the ecryptfs file, and need to + * know if we need to reduce the size of the lower file. */ + lower_size_before_truncate = + upper_size_to_lower_size(crypt_stat, i_size); + lower_size_after_truncate = + upper_size_to_lower_size(crypt_stat, new_length); + if (lower_size_after_truncate < lower_size_before_truncate) + vmtruncate(lower_dentry->d_inode, + lower_size_after_truncate); + } + /* Update the access times */ + lower_dentry->d_inode->i_mtime = lower_dentry->d_inode->i_ctime + = CURRENT_TIME; + mark_inode_dirty_sync(inode); +out_fput: + fput(lower_file); +out_free: + if (ecryptfs_file_to_private(&fake_ecryptfs_file)) + kmem_cache_free(ecryptfs_file_info_cache, + ecryptfs_file_to_private(&fake_ecryptfs_file)); +out: + return rc; +} + +static int +ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + int rc; + + if (nd) { + struct vfsmount *vfsmnt_save = nd->mnt; + struct dentry *dentry_save = nd->dentry; + + nd->mnt = ecryptfs_dentry_to_lower_mnt(nd->dentry); + nd->dentry = ecryptfs_dentry_to_lower(nd->dentry); + rc = permission(ecryptfs_inode_to_lower(inode), mask, nd); + nd->mnt = vfsmnt_save; + nd->dentry = dentry_save; + } else + rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL); + return rc; +} + +/** + * ecryptfs_setattr + * @dentry: dentry handle to the inode to modify + * @ia: Structure with flags of what to change and values + * + * Updates the metadata of an inode. If the update is to the size + * i.e. truncation, then ecryptfs_truncate will handle the size modification + * of both the ecryptfs inode and the lower inode. + * + * All other metadata changes will be passed right to the lower filesystem, + * and we will just update our inode to look like the lower. + */ +static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int rc = 0; + struct dentry *lower_dentry; + struct inode *inode; + struct inode *lower_inode; + struct ecryptfs_crypt_stat *crypt_stat; + + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + lower_dentry = ecryptfs_dentry_to_lower(dentry); + inode = dentry->d_inode; + lower_inode = ecryptfs_inode_to_lower(inode); + if (ia->ia_valid & ATTR_SIZE) { + ecryptfs_printk(KERN_DEBUG, + "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n", + ia->ia_valid, ATTR_SIZE); + rc = ecryptfs_truncate(dentry, ia->ia_size); + /* ecryptfs_truncate handles resizing of the lower file */ + ia->ia_valid &= ~ATTR_SIZE; + ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n", + ia->ia_valid); + if (rc < 0) + goto out; + } + rc = notify_change(lower_dentry, ia); +out: + ecryptfs_copy_attr_all(inode, lower_inode); + return rc; +} + +static int +ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->setxattr) { + rc = -ENOSYS; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, + size, flags); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +static ssize_t +ecryptfs_getxattr(struct dentry *dentry, const char *name, void *value, + size_t size) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->getxattr) { + rc = -ENOSYS; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value, + size); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +static ssize_t +ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->listxattr) { + rc = -ENOSYS; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +static int ecryptfs_removexattr(struct dentry *dentry, const char *name) +{ + int rc = 0; + struct dentry *lower_dentry; + + lower_dentry = ecryptfs_dentry_to_lower(dentry); + if (!lower_dentry->d_inode->i_op->removexattr) { + rc = -ENOSYS; + goto out; + } + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name); + mutex_unlock(&lower_dentry->d_inode->i_mutex); +out: + return rc; +} + +int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode) +{ + if ((ecryptfs_inode_to_lower(inode) + == (struct inode *)candidate_lower_inode)) + return 1; + else + return 0; +} + +int ecryptfs_inode_set(struct inode *inode, void *lower_inode) +{ + ecryptfs_init_inode(inode, (struct inode *)lower_inode); + return 0; +} + +struct inode_operations ecryptfs_symlink_iops = { + .readlink = ecryptfs_readlink, + .follow_link = ecryptfs_follow_link, + .put_link = ecryptfs_put_link, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +struct inode_operations ecryptfs_dir_iops = { + .create = ecryptfs_create, + .lookup = ecryptfs_lookup, + .link = ecryptfs_link, + .unlink = ecryptfs_unlink, + .symlink = ecryptfs_symlink, + .mkdir = ecryptfs_mkdir, + .rmdir = ecryptfs_rmdir, + .mknod = ecryptfs_mknod, + .rename = ecryptfs_rename, + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; + +struct inode_operations ecryptfs_main_iops = { + .permission = ecryptfs_permission, + .setattr = ecryptfs_setattr, + .setxattr = ecryptfs_setxattr, + .getxattr = ecryptfs_getxattr, + .listxattr = ecryptfs_listxattr, + .removexattr = ecryptfs_removexattr +}; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c new file mode 100644 index 00000000000..ba454785a0c --- /dev/null +++ b/fs/ecryptfs/keystore.c @@ -0,0 +1,1061 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * In-kernel key management code. Includes functions to parse and + * write authentication token-related packets with the underlying + * file. + * + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> + * Michael C. Thompson <mcthomps@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/string.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/pagemap.h> +#include <linux/key.h> +#include <linux/random.h> +#include <linux/crypto.h> +#include <linux/scatterlist.h> +#include "ecryptfs_kernel.h" + +/** + * request_key returned an error instead of a valid key address; + * determine the type of error, make appropriate log entries, and + * return an error code. + */ +int process_request_key_err(long err_code) +{ + int rc = 0; + + switch (err_code) { + case ENOKEY: + ecryptfs_printk(KERN_WARNING, "No key\n"); + rc = -ENOENT; + break; + case EKEYEXPIRED: + ecryptfs_printk(KERN_WARNING, "Key expired\n"); + rc = -ETIME; + break; + case EKEYREVOKED: + ecryptfs_printk(KERN_WARNING, "Key revoked\n"); + rc = -EINVAL; + break; + default: + ecryptfs_printk(KERN_WARNING, "Unknown error code: " + "[0x%.16x]\n", err_code); + rc = -EINVAL; + } + return rc; +} + +static void wipe_auth_tok_list(struct list_head *auth_tok_list_head) +{ + struct list_head *walker; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + + walker = auth_tok_list_head->next; + while (walker != auth_tok_list_head) { + auth_tok_list_item = + list_entry(walker, struct ecryptfs_auth_tok_list_item, + list); + walker = auth_tok_list_item->list.next; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); + } +} + +struct kmem_cache *ecryptfs_auth_tok_list_item_cache; + +/** + * parse_packet_length + * @data: Pointer to memory containing length at offset + * @size: This function writes the decoded size to this memory + * address; zero on error + * @length_size: The number of bytes occupied by the encoded length + * + * Returns Zero on success + */ +static int parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size) +{ + int rc = 0; + + (*length_size) = 0; + (*size) = 0; + if (data[0] < 192) { + /* One-byte length */ + (*size) = data[0]; + (*length_size) = 1; + } else if (data[0] < 224) { + /* Two-byte length */ + (*size) = ((data[0] - 192) * 256); + (*size) += (data[1] + 192); + (*length_size) = 2; + } else if (data[0] == 255) { + /* Five-byte length; we're not supposed to see this */ + ecryptfs_printk(KERN_ERR, "Five-byte packet length not " + "supported\n"); + rc = -EINVAL; + goto out; + } else { + ecryptfs_printk(KERN_ERR, "Error parsing packet length\n"); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * write_packet_length + * @dest: The byte array target into which to write the + * length. Must have at least 5 bytes allocated. + * @size: The length to write. + * @packet_size_length: The number of bytes used to encode the + * packet length is written to this address. + * + * Returns zero on success; non-zero on error. + */ +static int write_packet_length(char *dest, size_t size, + size_t *packet_size_length) +{ + int rc = 0; + + if (size < 192) { + dest[0] = size; + (*packet_size_length) = 1; + } else if (size < 65536) { + dest[0] = (((size - 192) / 256) + 192); + dest[1] = ((size - 192) % 256); + (*packet_size_length) = 2; + } else { + rc = -EINVAL; + ecryptfs_printk(KERN_WARNING, + "Unsupported packet size: [%d]\n", size); + } + return rc; +} + +/** + * parse_tag_3_packet + * @crypt_stat: The cryptographic context to modify based on packet + * contents. + * @data: The raw bytes of the packet. + * @auth_tok_list: eCryptfs parses packets into authentication tokens; + * a new authentication token will be placed at the end + * of this list for this packet. + * @new_auth_tok: Pointer to a pointer to memory that this function + * allocates; sets the memory address of the pointer to + * NULL on error. This object is added to the + * auth_tok_list. + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error. + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *data, struct list_head *auth_tok_list, + struct ecryptfs_auth_tok **new_auth_tok, + size_t *packet_size, size_t max_packet_size) +{ + int rc = 0; + size_t body_size; + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + size_t length_size; + + (*packet_size) = 0; + (*new_auth_tok) = NULL; + + /* we check that: + * one byte for the Tag 3 ID flag + * two bytes for the body size + * do not exceed the maximum_packet_size + */ + if (unlikely((*packet_size) + 3 > max_packet_size)) { + ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n"); + rc = -EINVAL; + goto out; + } + + /* check for Tag 3 identifyer - one byte */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_3_PACKET_TYPE) { + ecryptfs_printk(KERN_ERR, "Enter w/ first byte != 0x%.2x\n", + ECRYPTFS_TAG_3_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + /* Released: wipe_auth_tok_list called in ecryptfs_parse_packet_set or + * at end of function upon failure */ + auth_tok_list_item = + kmem_cache_alloc(ecryptfs_auth_tok_list_item_cache, SLAB_KERNEL); + if (!auth_tok_list_item) { + ecryptfs_printk(KERN_ERR, "Unable to allocate memory\n"); + rc = -ENOMEM; + goto out; + } + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + (*new_auth_tok) = &auth_tok_list_item->auth_tok; + + /* check for body size - one to two bytes */ + rc = parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " + "rc = [%d]\n", rc); + goto out_free; + } + if (unlikely(body_size < (0x05 + ECRYPTFS_SALT_SIZE))) { + ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n", + body_size); + rc = -EINVAL; + goto out_free; + } + (*packet_size) += length_size; + + /* now we know the length of the remainting Tag 3 packet size: + * 5 fix bytes for: version string, cipher, S2K ID, hash algo, + * number of hash iterations + * ECRYPTFS_SALT_SIZE bytes for salt + * body_size bytes minus the stuff above is the encrypted key size + */ + if (unlikely((*packet_size) + body_size > max_packet_size)) { + ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n"); + rc = -EINVAL; + goto out_free; + } + + /* There are 5 characters of additional information in the + * packet */ + (*new_auth_tok)->session_key.encrypted_key_size = + body_size - (0x05 + ECRYPTFS_SALT_SIZE); + ecryptfs_printk(KERN_DEBUG, "Encrypted key size = [%d]\n", + (*new_auth_tok)->session_key.encrypted_key_size); + + /* Version 4 (from RFC2440) - one byte */ + if (unlikely(data[(*packet_size)++] != 0x04)) { + ecryptfs_printk(KERN_DEBUG, "Unknown version number " + "[%d]\n", data[(*packet_size) - 1]); + rc = -EINVAL; + goto out_free; + } + + /* cipher - one byte */ + ecryptfs_cipher_code_to_string(crypt_stat->cipher, + (u16)data[(*packet_size)]); + /* A little extra work to differentiate among the AES key + * sizes; see RFC2440 */ + switch(data[(*packet_size)++]) { + case RFC2440_CIPHER_AES_192: + crypt_stat->key_size = 24; + break; + default: + crypt_stat->key_size = + (*new_auth_tok)->session_key.encrypted_key_size; + } + ecryptfs_init_crypt_ctx(crypt_stat); + /* S2K identifier 3 (from RFC2440) */ + if (unlikely(data[(*packet_size)++] != 0x03)) { + ecryptfs_printk(KERN_ERR, "Only S2K ID 3 is currently " + "supported\n"); + rc = -ENOSYS; + goto out_free; + } + + /* TODO: finish the hash mapping */ + /* hash algorithm - one byte */ + switch (data[(*packet_size)++]) { + case 0x01: /* See RFC2440 for these numbers and their mappings */ + /* Choose MD5 */ + /* salt - ECRYPTFS_SALT_SIZE bytes */ + memcpy((*new_auth_tok)->token.password.salt, + &data[(*packet_size)], ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; + + /* This conversion was taken straight from RFC2440 */ + /* number of hash iterations - one byte */ + (*new_auth_tok)->token.password.hash_iterations = + ((u32) 16 + (data[(*packet_size)] & 15)) + << ((data[(*packet_size)] >> 4) + 6); + (*packet_size)++; + + /* encrypted session key - + * (body_size-5-ECRYPTFS_SALT_SIZE) bytes */ + memcpy((*new_auth_tok)->session_key.encrypted_key, + &data[(*packet_size)], + (*new_auth_tok)->session_key.encrypted_key_size); + (*packet_size) += + (*new_auth_tok)->session_key.encrypted_key_size; + (*new_auth_tok)->session_key.flags &= + ~ECRYPTFS_CONTAINS_DECRYPTED_KEY; + (*new_auth_tok)->session_key.flags |= + ECRYPTFS_CONTAINS_ENCRYPTED_KEY; + (*new_auth_tok)->token.password.hash_algo = 0x01; + break; + default: + ecryptfs_printk(KERN_ERR, "Unsupported hash algorithm: " + "[%d]\n", data[(*packet_size) - 1]); + rc = -ENOSYS; + goto out_free; + } + (*new_auth_tok)->token_type = ECRYPTFS_PASSWORD; + /* TODO: Parametarize; we might actually want userspace to + * decrypt the session key. */ + ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags, + ECRYPTFS_USERSPACE_SHOULD_TRY_TO_DECRYPT); + ECRYPTFS_CLEAR_FLAG((*new_auth_tok)->session_key.flags, + ECRYPTFS_USERSPACE_SHOULD_TRY_TO_ENCRYPT); + list_add(&auth_tok_list_item->list, auth_tok_list); + goto out; +out_free: + (*new_auth_tok) = NULL; + memset(auth_tok_list_item, 0, + sizeof(struct ecryptfs_auth_tok_list_item)); + kmem_cache_free(ecryptfs_auth_tok_list_item_cache, + auth_tok_list_item); +out: + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * parse_tag_11_packet + * @data: The raw bytes of the packet + * @contents: This function writes the data contents of the literal + * packet into this memory location + * @max_contents_bytes: The maximum number of bytes that this function + * is allowed to write into contents + * @tag_11_contents_size: This function writes the size of the parsed + * contents into this memory location; zero on + * error + * @packet_size: This function writes the size of the parsed packet + * into this memory location; zero on error + * @max_packet_size: maximum number of bytes to parse + * + * Returns zero on success; non-zero on error. + */ +static int +parse_tag_11_packet(unsigned char *data, unsigned char *contents, + size_t max_contents_bytes, size_t *tag_11_contents_size, + size_t *packet_size, size_t max_packet_size) +{ + int rc = 0; + size_t body_size; + size_t length_size; + + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + + /* check that: + * one byte for the Tag 11 ID flag + * two bytes for the Tag 11 length + * do not exceed the maximum_packet_size + */ + if (unlikely((*packet_size) + 3 > max_packet_size)) { + ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n"); + rc = -EINVAL; + goto out; + } + + /* check for Tag 11 identifyer - one byte */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_11_PACKET_TYPE) { + ecryptfs_printk(KERN_WARNING, + "Invalid tag 11 packet format\n"); + rc = -EINVAL; + goto out; + } + + /* get Tag 11 content length - one or two bytes */ + rc = parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); + if (rc) { + ecryptfs_printk(KERN_WARNING, + "Invalid tag 11 packet format\n"); + goto out; + } + (*packet_size) += length_size; + + if (body_size < 13) { + ecryptfs_printk(KERN_WARNING, "Invalid body size ([%d])\n", + body_size); + rc = -EINVAL; + goto out; + } + /* We have 13 bytes of surrounding packet values */ + (*tag_11_contents_size) = (body_size - 13); + + /* now we know the length of the remainting Tag 11 packet size: + * 14 fix bytes for: special flag one, special flag two, + * 12 skipped bytes + * body_size bytes minus the stuff above is the Tag 11 content + */ + /* FIXME why is the body size one byte smaller than the actual + * size of the body? + * this seems to be an error here as well as in + * write_tag_11_packet() */ + if (unlikely((*packet_size) + body_size + 1 > max_packet_size)) { + ecryptfs_printk(KERN_ERR, "Packet size exceeds max\n"); + rc = -EINVAL; + goto out; + } + + /* special flag one - one byte */ + if (data[(*packet_size)++] != 0x62) { + ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + + /* special flag two - one byte */ + if (data[(*packet_size)++] != 0x08) { + ecryptfs_printk(KERN_WARNING, "Unrecognizable packet\n"); + rc = -EINVAL; + goto out; + } + + /* skip the next 12 bytes */ + (*packet_size) += 12; /* We don't care about the filename or + * the timestamp */ + + /* get the Tag 11 contents - tag_11_contents_size bytes */ + memcpy(contents, &data[(*packet_size)], (*tag_11_contents_size)); + (*packet_size) += (*tag_11_contents_size); + +out: + if (rc) { + (*packet_size) = 0; + (*tag_11_contents_size) = 0; + } + return rc; +} + +/** + * decrypt_session_key - Decrypt the session key with the given auth_tok. + * + * Returns Zero on success; non-zero error otherwise. + */ +static int decrypt_session_key(struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat) +{ + int rc = 0; + struct ecryptfs_password *password_s_ptr; + struct crypto_tfm *tfm = NULL; + struct scatterlist src_sg[2], dst_sg[2]; + struct mutex *tfm_mutex = NULL; + /* TODO: Use virt_to_scatterlist for these */ + char *encrypted_session_key; + char *session_key; + + password_s_ptr = &auth_tok->token.password; + if (ECRYPTFS_CHECK_FLAG(password_s_ptr->flags, + ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) + ecryptfs_printk(KERN_DEBUG, "Session key encryption key " + "set; skipping key generation\n"); + ecryptfs_printk(KERN_DEBUG, "Session key encryption key (size [%d])" + ":\n", + password_s_ptr->session_key_encryption_key_bytes); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(password_s_ptr->session_key_encryption_key, + password_s_ptr-> + session_key_encryption_key_bytes); + if (!strcmp(crypt_stat->cipher, + crypt_stat->mount_crypt_stat->global_default_cipher_name) + && crypt_stat->mount_crypt_stat->global_key_tfm) { + tfm = crypt_stat->mount_crypt_stat->global_key_tfm; + tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex; + } else { + tfm = crypto_alloc_tfm(crypt_stat->cipher, + CRYPTO_TFM_REQ_WEAK_KEY); + if (!tfm) { + printk(KERN_ERR "Error allocating crypto context\n"); + rc = -ENOMEM; + goto out; + } + } + if (password_s_ptr->session_key_encryption_key_bytes + < crypto_tfm_alg_min_keysize(tfm)) { + printk(KERN_WARNING "Session key encryption key is [%d] bytes; " + "minimum keysize for selected cipher is [%d] bytes.\n", + password_s_ptr->session_key_encryption_key_bytes, + crypto_tfm_alg_min_keysize(tfm)); + rc = -EINVAL; + goto out; + } + if (tfm_mutex) + mutex_lock(tfm_mutex); + crypto_cipher_setkey(tfm, password_s_ptr->session_key_encryption_key, + crypt_stat->key_size); + /* TODO: virt_to_scatterlist */ + encrypted_session_key = (char *)__get_free_page(GFP_KERNEL); + if (!encrypted_session_key) { + ecryptfs_printk(KERN_ERR, "Out of memory\n"); + rc = -ENOMEM; + goto out_free_tfm; + } + session_key = (char *)__get_free_page(GFP_KERNEL); + if (!session_key) { + kfree(encrypted_session_key); + ecryptfs_printk(KERN_ERR, "Out of memory\n"); + rc = -ENOMEM; + goto out_free_tfm; + } + memcpy(encrypted_session_key, auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size); + src_sg[0].page = virt_to_page(encrypted_session_key); + src_sg[0].offset = 0; + BUG_ON(auth_tok->session_key.encrypted_key_size > PAGE_CACHE_SIZE); + src_sg[0].length = auth_tok->session_key.encrypted_key_size; + dst_sg[0].page = virt_to_page(session_key); + dst_sg[0].offset = 0; + auth_tok->session_key.decrypted_key_size = + auth_tok->session_key.encrypted_key_size; + dst_sg[0].length = auth_tok->session_key.encrypted_key_size; + /* TODO: Handle error condition */ + crypto_cipher_decrypt(tfm, dst_sg, src_sg, + auth_tok->session_key.encrypted_key_size); + auth_tok->session_key.decrypted_key_size = + auth_tok->session_key.encrypted_key_size; + memcpy(auth_tok->session_key.decrypted_key, session_key, + auth_tok->session_key.decrypted_key_size); + auth_tok->session_key.flags |= ECRYPTFS_CONTAINS_DECRYPTED_KEY; + memcpy(crypt_stat->key, auth_tok->session_key.decrypted_key, + auth_tok->session_key.decrypted_key_size); + ECRYPTFS_SET_FLAG(crypt_stat->flags, ECRYPTFS_KEY_VALID); + ecryptfs_printk(KERN_DEBUG, "Decrypted session key:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(crypt_stat->key, + crypt_stat->key_size); + memset(encrypted_session_key, 0, PAGE_CACHE_SIZE); + free_page((unsigned long)encrypted_session_key); + memset(session_key, 0, PAGE_CACHE_SIZE); + free_page((unsigned long)session_key); +out_free_tfm: + if (tfm_mutex) + mutex_unlock(tfm_mutex); + else + crypto_free_tfm(tfm); +out: + return rc; +} + +/** + * ecryptfs_parse_packet_set + * @dest: The header page in memory + * @version: Version of file format, to guide parsing behavior + * + * Get crypt_stat to have the file's session key if the requisite key + * is available to decrypt the session key. + * + * Returns Zero if a valid authentication token was retrieved and + * processed; negative value for file not encrypted or for error + * conditions. + */ +int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, + unsigned char *src, + struct dentry *ecryptfs_dentry) +{ + size_t i = 0; + int rc = 0; + size_t found_auth_tok = 0; + size_t next_packet_is_auth_tok_packet; + char sig[ECRYPTFS_SIG_SIZE_HEX]; + struct list_head auth_tok_list; + struct list_head *walker; + struct ecryptfs_auth_tok *chosen_auth_tok = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + struct ecryptfs_auth_tok *candidate_auth_tok = NULL; + size_t packet_size; + struct ecryptfs_auth_tok *new_auth_tok; + unsigned char sig_tmp_space[ECRYPTFS_SIG_SIZE]; + size_t tag_11_contents_size; + size_t tag_11_packet_size; + + INIT_LIST_HEAD(&auth_tok_list); + /* Parse the header to find as many packets as we can, these will be + * added the our &auth_tok_list */ + next_packet_is_auth_tok_packet = 1; + while (next_packet_is_auth_tok_packet) { + size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i); + + switch (src[i]) { + case ECRYPTFS_TAG_3_PACKET_TYPE: + rc = parse_tag_3_packet(crypt_stat, + (unsigned char *)&src[i], + &auth_tok_list, &new_auth_tok, + &packet_size, max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error parsing " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += packet_size; + rc = parse_tag_11_packet((unsigned char *)&src[i], + sig_tmp_space, + ECRYPTFS_SIG_SIZE, + &tag_11_contents_size, + &tag_11_packet_size, + max_packet_size); + if (rc) { + ecryptfs_printk(KERN_ERR, "No valid " + "(ecryptfs-specific) literal " + "packet containing " + "authentication token " + "signature found after " + "tag 3 packet\n"); + rc = -EIO; + goto out_wipe_list; + } + i += tag_11_packet_size; + if (ECRYPTFS_SIG_SIZE != tag_11_contents_size) { + ecryptfs_printk(KERN_ERR, "Expected " + "signature of size [%d]; " + "read size [%d]\n", + ECRYPTFS_SIG_SIZE, + tag_11_contents_size); + rc = -EIO; + goto out_wipe_list; + } + ecryptfs_to_hex(new_auth_tok->token.password.signature, + sig_tmp_space, tag_11_contents_size); + new_auth_tok->token.password.signature[ + ECRYPTFS_PASSWORD_SIG_SIZE] = '\0'; + ECRYPTFS_SET_FLAG(crypt_stat->flags, + ECRYPTFS_ENCRYPTED); + break; + case ECRYPTFS_TAG_11_PACKET_TYPE: + ecryptfs_printk(KERN_WARNING, "Invalid packet set " + "(Tag 11 not allowed by itself)\n"); + rc = -EIO; + goto out_wipe_list; + break; + default: + ecryptfs_printk(KERN_DEBUG, "No packet at offset " + "[%d] of the file header; hex value of " + "character is [0x%.2x]\n", i, src[i]); + next_packet_is_auth_tok_packet = 0; + } + } + if (list_empty(&auth_tok_list)) { + rc = -EINVAL; /* Do not support non-encrypted files in + * the 0.1 release */ + goto out; + } + /* If we have a global auth tok, then we should try to use + * it */ + if (mount_crypt_stat->global_auth_tok) { + memcpy(sig, mount_crypt_stat->global_auth_tok_sig, + ECRYPTFS_SIG_SIZE_HEX); + chosen_auth_tok = mount_crypt_stat->global_auth_tok; + } else + BUG(); /* We should always have a global auth tok in + * the 0.1 release */ + /* Scan list to see if our chosen_auth_tok works */ + list_for_each(walker, &auth_tok_list) { + struct ecryptfs_auth_tok_list_item *auth_tok_list_item; + auth_tok_list_item = + list_entry(walker, struct ecryptfs_auth_tok_list_item, + list); + candidate_auth_tok = &auth_tok_list_item->auth_tok; + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, + "Considering cadidate auth tok:\n"); + ecryptfs_dump_auth_tok(candidate_auth_tok); + } + /* TODO: Replace ECRYPTFS_SIG_SIZE_HEX w/ dynamic value */ + if (candidate_auth_tok->token_type == ECRYPTFS_PASSWORD + && !strncmp(candidate_auth_tok->token.password.signature, + sig, ECRYPTFS_SIG_SIZE_HEX)) { + found_auth_tok = 1; + goto leave_list; + /* TODO: Transfer the common salt into the + * crypt_stat salt */ + } + } +leave_list: + if (!found_auth_tok) { + ecryptfs_printk(KERN_ERR, "Could not find authentication " + "token on temporary list for sig [%.*s]\n", + ECRYPTFS_SIG_SIZE_HEX, sig); + rc = -EIO; + goto out_wipe_list; + } else { + memcpy(&(candidate_auth_tok->token.password), + &(chosen_auth_tok->token.password), + sizeof(struct ecryptfs_password)); + rc = decrypt_session_key(candidate_auth_tok, crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error decrypting the " + "session key\n"); + goto out_wipe_list; + } + rc = ecryptfs_compute_root_iv(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error computing " + "the root IV\n"); + goto out_wipe_list; + } + } + rc = ecryptfs_init_crypt_ctx(crypt_stat); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error initializing crypto " + "context for cipher [%s]; rc = [%d]\n", + crypt_stat->cipher, rc); + } +out_wipe_list: + wipe_auth_tok_list(&auth_tok_list); +out: + return rc; +} + +/** + * write_tag_11_packet + * @dest: Target into which Tag 11 packet is to be written + * @max: Maximum packet length + * @contents: Byte array of contents to copy in + * @contents_length: Number of bytes in contents + * @packet_length: Length of the Tag 11 packet written; zero on error + * + * Returns zero on success; non-zero on error. + */ +static int +write_tag_11_packet(char *dest, int max, char *contents, size_t contents_length, + size_t *packet_length) +{ + int rc = 0; + size_t packet_size_length; + + (*packet_length) = 0; + if ((13 + contents_length) > max) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "Packet length larger than " + "maximum allowable\n"); + goto out; + } + /* General packet header */ + /* Packet tag */ + dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE; + /* Packet length */ + rc = write_packet_length(&dest[(*packet_length)], + (13 + contents_length), &packet_size_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 11 packet " + "header; cannot generate packet length\n"); + goto out; + } + (*packet_length) += packet_size_length; + /* Tag 11 specific */ + /* One-octet field that describes how the data is formatted */ + dest[(*packet_length)++] = 0x62; /* binary data */ + /* One-octet filename length followed by filename */ + dest[(*packet_length)++] = 8; + memcpy(&dest[(*packet_length)], "_CONSOLE", 8); + (*packet_length) += 8; + /* Four-octet number indicating modification date */ + memset(&dest[(*packet_length)], 0x00, 4); + (*packet_length) += 4; + /* Remainder is literal data */ + memcpy(&dest[(*packet_length)], contents, contents_length); + (*packet_length) += contents_length; + out: + if (rc) + (*packet_length) = 0; + return rc; +} + +/** + * write_tag_3_packet + * @dest: Buffer into which to write the packet + * @max: Maximum number of bytes that can be written + * @auth_tok: Authentication token + * @crypt_stat: The cryptographic context + * @key_rec: encrypted key + * @packet_size: This function will write the number of bytes that end + * up constituting the packet; set to zero on error + * + * Returns zero on success; non-zero on error. + */ +static int +write_tag_3_packet(char *dest, size_t max, struct ecryptfs_auth_tok *auth_tok, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_key_record *key_rec, size_t *packet_size) +{ + int rc = 0; + + size_t i; + size_t signature_is_valid = 0; + size_t encrypted_session_key_valid = 0; + char session_key_encryption_key[ECRYPTFS_MAX_KEY_BYTES]; + struct scatterlist dest_sg[2]; + struct scatterlist src_sg[2]; + struct crypto_tfm *tfm = NULL; + struct mutex *tfm_mutex = NULL; + size_t key_rec_size; + size_t packet_size_length; + size_t cipher_code; + + (*packet_size) = 0; + /* Check for a valid signature on the auth_tok */ + for (i = 0; i < ECRYPTFS_SIG_SIZE_HEX; i++) + signature_is_valid |= auth_tok->token.password.signature[i]; + if (!signature_is_valid) + BUG(); + ecryptfs_from_hex((*key_rec).sig, auth_tok->token.password.signature, + ECRYPTFS_SIG_SIZE); + encrypted_session_key_valid = 0; + for (i = 0; i < crypt_stat->key_size; i++) + encrypted_session_key_valid |= + auth_tok->session_key.encrypted_key[i]; + if (encrypted_session_key_valid) { + memcpy((*key_rec).enc_key, + auth_tok->session_key.encrypted_key, + auth_tok->session_key.encrypted_key_size); + goto encrypted_session_key_set; + } + if (auth_tok->session_key.encrypted_key_size == 0) + auth_tok->session_key.encrypted_key_size = + crypt_stat->key_size; + if (crypt_stat->key_size == 24 + && strcmp("aes", crypt_stat->cipher) == 0) { + memset((crypt_stat->key + 24), 0, 8); + auth_tok->session_key.encrypted_key_size = 32; + } + (*key_rec).enc_key_size = + auth_tok->session_key.encrypted_key_size; + if (ECRYPTFS_CHECK_FLAG(auth_tok->token.password.flags, + ECRYPTFS_SESSION_KEY_ENCRYPTION_KEY_SET)) { + ecryptfs_printk(KERN_DEBUG, "Using previously generated " + "session key encryption key of size [%d]\n", + auth_tok->token.password. + session_key_encryption_key_bytes); + memcpy(session_key_encryption_key, + auth_tok->token.password.session_key_encryption_key, + crypt_stat->key_size); + ecryptfs_printk(KERN_DEBUG, + "Cached session key " "encryption key: \n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + if (unlikely(ecryptfs_verbosity > 0)) { + ecryptfs_printk(KERN_DEBUG, "Session key encryption key:\n"); + ecryptfs_dump_hex(session_key_encryption_key, 16); + } + rc = virt_to_scatterlist(crypt_stat->key, + (*key_rec).enc_key_size, src_sg, 2); + if (!rc) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat session key\n"); + rc = -ENOMEM; + goto out; + } + rc = virt_to_scatterlist((*key_rec).enc_key, + (*key_rec).enc_key_size, dest_sg, 2); + if (!rc) { + ecryptfs_printk(KERN_ERR, "Error generating scatterlist " + "for crypt_stat encrypted session key\n"); + rc = -ENOMEM; + goto out; + } + if (!strcmp(crypt_stat->cipher, + crypt_stat->mount_crypt_stat->global_default_cipher_name) + && crypt_stat->mount_crypt_stat->global_key_tfm) { + tfm = crypt_stat->mount_crypt_stat->global_key_tfm; + tfm_mutex = &crypt_stat->mount_crypt_stat->global_key_tfm_mutex; + } else + tfm = crypto_alloc_tfm(crypt_stat->cipher, 0); + if (!tfm) { + ecryptfs_printk(KERN_ERR, "Could not initialize crypto " + "context for cipher [%s]\n", + crypt_stat->cipher); + rc = -EINVAL; + goto out; + } + if (tfm_mutex) + mutex_lock(tfm_mutex); + rc = crypto_cipher_setkey(tfm, session_key_encryption_key, + crypt_stat->key_size); + if (rc < 0) { + if (tfm_mutex) + mutex_unlock(tfm_mutex); + ecryptfs_printk(KERN_ERR, "Error setting key for crypto " + "context\n"); + goto out; + } + rc = 0; + ecryptfs_printk(KERN_DEBUG, "Encrypting [%d] bytes of the key\n", + crypt_stat->key_size); + crypto_cipher_encrypt(tfm, dest_sg, src_sg, + (*key_rec).enc_key_size); + if (tfm_mutex) + mutex_unlock(tfm_mutex); + ecryptfs_printk(KERN_DEBUG, "This should be the encrypted key:\n"); + if (ecryptfs_verbosity > 0) + ecryptfs_dump_hex((*key_rec).enc_key, + (*key_rec).enc_key_size); +encrypted_session_key_set: + /* Now we have a valid key_rec. Append it to the + * key_rec set. */ + key_rec_size = (sizeof(struct ecryptfs_key_record) + - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES + + ((*key_rec).enc_key_size)); + /* TODO: Include a packet size limit as a parameter to this + * function once we have multi-packet headers (for versions + * later than 0.1 */ + if (key_rec_size >= ECRYPTFS_MAX_KEYSET_SIZE) { + ecryptfs_printk(KERN_ERR, "Keyset too large\n"); + rc = -EINVAL; + goto out; + } + /* TODO: Packet size limit */ + /* We have 5 bytes of surrounding packet data */ + if ((0x05 + ECRYPTFS_SALT_SIZE + + (*key_rec).enc_key_size) >= max) { + ecryptfs_printk(KERN_ERR, "Authentication token is too " + "large\n"); + rc = -EINVAL; + goto out; + } + /* This format is inspired by OpenPGP; see RFC 2440 + * packet tag 3 */ + dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE; + /* ver+cipher+s2k+hash+salt+iter+enc_key */ + rc = write_packet_length(&dest[(*packet_size)], + (0x05 + ECRYPTFS_SALT_SIZE + + (*key_rec).enc_key_size), + &packet_size_length); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error generating tag 3 packet " + "header; cannot generate packet length\n"); + goto out; + } + (*packet_size) += packet_size_length; + dest[(*packet_size)++] = 0x04; /* version 4 */ + cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); + if (cipher_code == 0) { + ecryptfs_printk(KERN_WARNING, "Unable to generate code for " + "cipher [%s]\n", crypt_stat->cipher); + rc = -EINVAL; + goto out; + } + dest[(*packet_size)++] = cipher_code; + dest[(*packet_size)++] = 0x03; /* S2K */ + dest[(*packet_size)++] = 0x01; /* MD5 (TODO: parameterize) */ + memcpy(&dest[(*packet_size)], auth_tok->token.password.salt, + ECRYPTFS_SALT_SIZE); + (*packet_size) += ECRYPTFS_SALT_SIZE; /* salt */ + dest[(*packet_size)++] = 0x60; /* hash iterations (65536) */ + memcpy(&dest[(*packet_size)], (*key_rec).enc_key, + (*key_rec).enc_key_size); + (*packet_size) += (*key_rec).enc_key_size; +out: + if (tfm && !tfm_mutex) + crypto_free_tfm(tfm); + if (rc) + (*packet_size) = 0; + return rc; +} + +/** + * ecryptfs_generate_key_packet_set + * @dest: Virtual address from which to write the key record set + * @crypt_stat: The cryptographic context from which the + * authentication tokens will be retrieved + * @ecryptfs_dentry: The dentry, used to retrieve the mount crypt stat + * for the global parameters + * @len: The amount written + * @max: The maximum amount of data allowed to be written + * + * Generates a key packet set and writes it to the virtual address + * passed in. + * + * Returns zero on success; non-zero on error. + */ +int +ecryptfs_generate_key_packet_set(char *dest_base, + struct ecryptfs_crypt_stat *crypt_stat, + struct dentry *ecryptfs_dentry, size_t *len, + size_t max) +{ + int rc = 0; + struct ecryptfs_auth_tok *auth_tok; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + size_t written; + struct ecryptfs_key_record key_rec; + + (*len) = 0; + if (mount_crypt_stat->global_auth_tok) { + auth_tok = mount_crypt_stat->global_auth_tok; + if (auth_tok->token_type == ECRYPTFS_PASSWORD) { + rc = write_tag_3_packet((dest_base + (*len)), + max, auth_tok, + crypt_stat, &key_rec, + &written); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "writing tag 3 packet\n"); + goto out; + } + (*len) += written; + /* Write auth tok signature packet */ + rc = write_tag_11_packet( + (dest_base + (*len)), + (max - (*len)), + key_rec.sig, ECRYPTFS_SIG_SIZE, &written); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error writing " + "auth tok signature packet\n"); + goto out; + } + (*len) += written; + } else { + ecryptfs_printk(KERN_WARNING, "Unsupported " + "authentication token type\n"); + rc = -EINVAL; + goto out; + } + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error writing " + "authentication token packet with sig " + "= [%s]\n", + mount_crypt_stat->global_auth_tok_sig); + rc = -EIO; + goto out; + } + } else + BUG(); + if (likely((max - (*len)) > 0)) { + dest_base[(*len)] = 0x00; + } else { + ecryptfs_printk(KERN_ERR, "Error writing boundary byte\n"); + rc = -EIO; + } +out: + if (rc) + (*len) = 0; + return rc; +} diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c new file mode 100644 index 00000000000..7a11b8ae664 --- /dev/null +++ b/fs/ecryptfs/main.c @@ -0,0 +1,831 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * Michael C. Thompson <mcthomps@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/dcache.h> +#include <linux/file.h> +#include <linux/module.h> +#include <linux/namei.h> +#include <linux/skbuff.h> +#include <linux/crypto.h> +#include <linux/netlink.h> +#include <linux/mount.h> +#include <linux/dcache.h> +#include <linux/pagemap.h> +#include <linux/key.h> +#include <linux/parser.h> +#include "ecryptfs_kernel.h" + +/** + * Module parameter that defines the ecryptfs_verbosity level. + */ +int ecryptfs_verbosity = 0; + +module_param(ecryptfs_verbosity, int, 0); +MODULE_PARM_DESC(ecryptfs_verbosity, + "Initial verbosity level (0 or 1; defaults to " + "0, which is Quiet)"); + +void __ecryptfs_printk(const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + if (fmt[1] == '7') { /* KERN_DEBUG */ + if (ecryptfs_verbosity >= 1) + vprintk(fmt, args); + } else + vprintk(fmt, args); + va_end(args); +} + +/** + * ecryptfs_interpose + * @lower_dentry: Existing dentry in the lower filesystem + * @dentry: ecryptfs' dentry + * @sb: ecryptfs's super_block + * @flag: If set to true, then d_add is called, else d_instantiate is called + * + * Interposes upper and lower dentries. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, + struct super_block *sb, int flag) +{ + struct inode *lower_inode; + struct inode *inode; + int rc = 0; + + lower_inode = lower_dentry->d_inode; + if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) { + rc = -EXDEV; + goto out; + } + if (!igrab(lower_inode)) { + rc = -ESTALE; + goto out; + } + inode = iget5_locked(sb, (unsigned long)lower_inode, + ecryptfs_inode_test, ecryptfs_inode_set, + lower_inode); + if (!inode) { + rc = -EACCES; + iput(lower_inode); + goto out; + } + if (inode->i_state & I_NEW) + unlock_new_inode(inode); + else + iput(lower_inode); + if (S_ISLNK(lower_inode->i_mode)) + inode->i_op = &ecryptfs_symlink_iops; + else if (S_ISDIR(lower_inode->i_mode)) + inode->i_op = &ecryptfs_dir_iops; + if (S_ISDIR(lower_inode->i_mode)) + inode->i_fop = &ecryptfs_dir_fops; + /* TODO: Is there a better way to identify if the inode is + * special? */ + if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) || + S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode)) + init_special_inode(inode, lower_inode->i_mode, + lower_inode->i_rdev); + dentry->d_op = &ecryptfs_dops; + if (flag) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); + ecryptfs_copy_attr_all(inode, lower_inode); + /* This size will be overwritten for real files w/ headers and + * other metadata */ + ecryptfs_copy_inode_size(inode, lower_inode); +out: + return rc; +} + +enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_debug, + ecryptfs_opt_ecryptfs_debug, ecryptfs_opt_cipher, + ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, + ecryptfs_opt_passthrough, ecryptfs_opt_err }; + +static match_table_t tokens = { + {ecryptfs_opt_sig, "sig=%s"}, + {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, + {ecryptfs_opt_debug, "debug=%u"}, + {ecryptfs_opt_ecryptfs_debug, "ecryptfs_debug=%u"}, + {ecryptfs_opt_cipher, "cipher=%s"}, + {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, + {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, + {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, + {ecryptfs_opt_err, NULL} +}; + +/** + * ecryptfs_verify_version + * @version: The version number to confirm + * + * Returns zero on good version; non-zero otherwise + */ +static int ecryptfs_verify_version(u16 version) +{ + int rc = 0; + unsigned char major; + unsigned char minor; + + major = ((version >> 8) & 0xFF); + minor = (version & 0xFF); + if (major != ECRYPTFS_VERSION_MAJOR) { + ecryptfs_printk(KERN_ERR, "Major version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MAJOR, major); + rc = -EINVAL; + goto out; + } + if (minor != ECRYPTFS_VERSION_MINOR) { + ecryptfs_printk(KERN_ERR, "Minor version number mismatch. " + "Expected [%d]; got [%d]\n", + ECRYPTFS_VERSION_MINOR, minor); + rc = -EINVAL; + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_parse_options + * @sb: The ecryptfs super block + * @options: The options pased to the kernel + * + * Parse mount options: + * debug=N - ecryptfs_verbosity level for debug output + * sig=XXX - description(signature) of the key to use + * + * Returns the dentry object of the lower-level (lower/interposed) + * directory; We want to mount our stackable file system on top of + * that lower directory. + * + * The signature of the key to use must be the description of a key + * already in the keyring. Mounting will fail if the key can not be + * found. + * + * Returns zero on success; non-zero on error + */ +static int ecryptfs_parse_options(struct super_block *sb, char *options) +{ + char *p; + int rc = 0; + int sig_set = 0; + int cipher_name_set = 0; + int cipher_key_bytes; + int cipher_key_bytes_set = 0; + struct key *auth_tok_key = NULL; + struct ecryptfs_auth_tok *auth_tok = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; + substring_t args[MAX_OPT_ARGS]; + int token; + char *sig_src; + char *sig_dst; + char *debug_src; + char *cipher_name_dst; + char *cipher_name_src; + char *cipher_key_bytes_src; + struct crypto_tfm *tmp_tfm; + int cipher_name_len; + + if (!options) { + rc = -EINVAL; + goto out; + } + while ((p = strsep(&options, ",")) != NULL) { + if (!*p) + continue; + token = match_token(p, tokens, args); + switch (token) { + case ecryptfs_opt_sig: + case ecryptfs_opt_ecryptfs_sig: + sig_src = args[0].from; + sig_dst = + mount_crypt_stat->global_auth_tok_sig; + memcpy(sig_dst, sig_src, ECRYPTFS_SIG_SIZE_HEX); + sig_dst[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + ecryptfs_printk(KERN_DEBUG, + "The mount_crypt_stat " + "global_auth_tok_sig set to: " + "[%s]\n", sig_dst); + sig_set = 1; + break; + case ecryptfs_opt_debug: + case ecryptfs_opt_ecryptfs_debug: + debug_src = args[0].from; + ecryptfs_verbosity = + (int)simple_strtol(debug_src, &debug_src, + 0); + ecryptfs_printk(KERN_DEBUG, + "Verbosity set to [%d]" "\n", + ecryptfs_verbosity); + break; + case ecryptfs_opt_cipher: + case ecryptfs_opt_ecryptfs_cipher: + cipher_name_src = args[0].from; + cipher_name_dst = + mount_crypt_stat-> + global_default_cipher_name; + strncpy(cipher_name_dst, cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + ecryptfs_printk(KERN_DEBUG, + "The mount_crypt_stat " + "global_default_cipher_name set to: " + "[%s]\n", cipher_name_dst); + cipher_name_set = 1; + break; + case ecryptfs_opt_ecryptfs_key_bytes: + cipher_key_bytes_src = args[0].from; + cipher_key_bytes = + (int)simple_strtol(cipher_key_bytes_src, + &cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_cipher_key_size = + cipher_key_bytes; + ecryptfs_printk(KERN_DEBUG, + "The mount_crypt_stat " + "global_default_cipher_key_size " + "set to: [%d]\n", mount_crypt_stat-> + global_default_cipher_key_size); + cipher_key_bytes_set = 1; + break; + case ecryptfs_opt_passthrough: + mount_crypt_stat->flags |= + ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; + break; + case ecryptfs_opt_err: + default: + ecryptfs_printk(KERN_WARNING, + "eCryptfs: unrecognized option '%s'\n", + p); + } + } + /* Do not support lack of mount-wide signature in 0.1 + * release */ + if (!sig_set) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "You must supply a valid " + "passphrase auth tok signature as a mount " + "parameter; see the eCryptfs README\n"); + goto out; + } + if (!cipher_name_set) { + cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); + if (unlikely(cipher_name_len + >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) { + rc = -EINVAL; + BUG(); + goto out; + } + memcpy(mount_crypt_stat->global_default_cipher_name, + ECRYPTFS_DEFAULT_CIPHER, cipher_name_len); + mount_crypt_stat->global_default_cipher_name[cipher_name_len] + = '\0'; + } + if (!cipher_key_bytes_set) { + mount_crypt_stat->global_default_cipher_key_size = + ECRYPTFS_DEFAULT_KEY_BYTES; + ecryptfs_printk(KERN_DEBUG, "Cipher key size was not " + "specified. Defaulting to [%d]\n", + mount_crypt_stat-> + global_default_cipher_key_size); + } + rc = ecryptfs_process_cipher( + &tmp_tfm, + &mount_crypt_stat->global_key_tfm, + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size); + if (tmp_tfm) + crypto_free_tfm(tmp_tfm); + if (rc) { + printk(KERN_ERR "Error attempting to initialize cipher [%s] " + "with key size [%Zd] bytes; rc = [%d]\n", + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size, rc); + rc = -EINVAL; + goto out; + } + mutex_init(&mount_crypt_stat->global_key_tfm_mutex); + ecryptfs_printk(KERN_DEBUG, "Requesting the key with description: " + "[%s]\n", mount_crypt_stat->global_auth_tok_sig); + /* The reference to this key is held until umount is done The + * call to key_put is done in ecryptfs_put_super() */ + auth_tok_key = request_key(&key_type_user, + mount_crypt_stat->global_auth_tok_sig, + NULL); + if (!auth_tok_key || IS_ERR(auth_tok_key)) { + ecryptfs_printk(KERN_ERR, "Could not find key with " + "description: [%s]\n", + mount_crypt_stat->global_auth_tok_sig); + process_request_key_err(PTR_ERR(auth_tok_key)); + rc = -EINVAL; + goto out; + } + auth_tok = ecryptfs_get_key_payload_data(auth_tok_key); + if (ecryptfs_verify_version(auth_tok->version)) { + ecryptfs_printk(KERN_ERR, "Data structure version mismatch. " + "Userspace tools must match eCryptfs kernel " + "module with major version [%d] and minor " + "version [%d]\n", ECRYPTFS_VERSION_MAJOR, + ECRYPTFS_VERSION_MINOR); + rc = -EINVAL; + goto out; + } + if (auth_tok->token_type != ECRYPTFS_PASSWORD) { + ecryptfs_printk(KERN_ERR, "Invalid auth_tok structure " + "returned from key\n"); + rc = -EINVAL; + goto out; + } + mount_crypt_stat->global_auth_tok_key = auth_tok_key; + mount_crypt_stat->global_auth_tok = auth_tok; +out: + return rc; +} + +struct kmem_cache *ecryptfs_sb_info_cache; + +/** + * ecryptfs_fill_super + * @sb: The ecryptfs super block + * @raw_data: The options passed to mount + * @silent: Not used but required by function prototype + * + * Sets up what we can of the sb, rest is done in ecryptfs_read_super + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) +{ + int rc = 0; + + /* Released in ecryptfs_put_super() */ + ecryptfs_set_superblock_private(sb, + kmem_cache_alloc(ecryptfs_sb_info_cache, + SLAB_KERNEL)); + if (!ecryptfs_superblock_to_private(sb)) { + ecryptfs_printk(KERN_WARNING, "Out of memory\n"); + rc = -ENOMEM; + goto out; + } + memset(ecryptfs_superblock_to_private(sb), 0, + sizeof(struct ecryptfs_sb_info)); + sb->s_op = &ecryptfs_sops; + /* Released through deactivate_super(sb) from get_sb_nodev */ + sb->s_root = d_alloc(NULL, &(const struct qstr) { + .hash = 0,.name = "/",.len = 1}); + if (!sb->s_root) { + ecryptfs_printk(KERN_ERR, "d_alloc failed\n"); + rc = -ENOMEM; + goto out; + } + sb->s_root->d_op = &ecryptfs_dops; + sb->s_root->d_sb = sb; + sb->s_root->d_parent = sb->s_root; + /* Released in d_release when dput(sb->s_root) is called */ + /* through deactivate_super(sb) from get_sb_nodev() */ + ecryptfs_set_dentry_private(sb->s_root, + kmem_cache_alloc(ecryptfs_dentry_info_cache, + SLAB_KERNEL)); + if (!ecryptfs_dentry_to_private(sb->s_root)) { + ecryptfs_printk(KERN_ERR, + "dentry_info_cache alloc failed\n"); + rc = -ENOMEM; + goto out; + } + memset(ecryptfs_dentry_to_private(sb->s_root), 0, + sizeof(struct ecryptfs_dentry_info)); + rc = 0; +out: + /* Should be able to rely on deactivate_super called from + * get_sb_nodev */ + return rc; +} + +/** + * ecryptfs_read_super + * @sb: The ecryptfs super block + * @dev_name: The path to mount over + * + * Read the super block of the lower filesystem, and use + * ecryptfs_interpose to create our initial inode and super block + * struct. + */ +static int ecryptfs_read_super(struct super_block *sb, const char *dev_name) +{ + int rc; + struct nameidata nd; + struct dentry *lower_root; + struct vfsmount *lower_mnt; + + memset(&nd, 0, sizeof(struct nameidata)); + rc = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); + if (rc) { + ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); + goto out_free; + } + lower_root = nd.dentry; + if (!lower_root->d_inode) { + ecryptfs_printk(KERN_WARNING, + "No directory to interpose on\n"); + rc = -ENOENT; + goto out_free; + } + lower_mnt = nd.mnt; + ecryptfs_set_superblock_lower(sb, lower_root->d_sb); + sb->s_maxbytes = lower_root->d_sb->s_maxbytes; + ecryptfs_set_dentry_lower(sb->s_root, lower_root); + ecryptfs_set_dentry_lower_mnt(sb->s_root, lower_mnt); + if ((rc = ecryptfs_interpose(lower_root, sb->s_root, sb, 0))) + goto out_free; + rc = 0; + goto out; +out_free: + path_release(&nd); +out: + return rc; +} + +/** + * ecryptfs_get_sb + * @fs_type + * @flags + * @dev_name: The path to mount over + * @raw_data: The options passed into the kernel + * + * The whole ecryptfs_get_sb process is broken into 4 functions: + * ecryptfs_parse_options(): handle options passed to ecryptfs, if any + * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block + * with as much information as it can before needing + * the lower filesystem. + * ecryptfs_read_super(): this accesses the lower filesystem and uses + * ecryptfs_interpolate to perform most of the linking + * ecryptfs_interpolate(): links the lower filesystem into ecryptfs + */ +static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data, + struct vfsmount *mnt) +{ + int rc; + struct super_block *sb; + + rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); + if (rc < 0) { + printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); + goto out; + } + sb = mnt->mnt_sb; + rc = ecryptfs_parse_options(sb, raw_data); + if (rc) { + printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); + goto out_abort; + } + rc = ecryptfs_read_super(sb, dev_name); + if (rc) { + printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); + goto out_abort; + } + goto out; +out_abort: + dput(sb->s_root); + up_write(&sb->s_umount); + deactivate_super(sb); +out: + return rc; +} + +/** + * ecryptfs_kill_block_super + * @sb: The ecryptfs super block + * + * Used to bring the superblock down and free the private data. + * Private data is free'd in ecryptfs_put_super() + */ +static void ecryptfs_kill_block_super(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type ecryptfs_fs_type = { + .owner = THIS_MODULE, + .name = "ecryptfs", + .get_sb = ecryptfs_get_sb, + .kill_sb = ecryptfs_kill_block_super, + .fs_flags = 0 +}; + +/** + * inode_info_init_once + * + * Initializes the ecryptfs_inode_info_cache when it is created + */ +static void +inode_info_init_once(void *vptr, struct kmem_cache *cachep, unsigned long flags) +{ + struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&ei->vfs_inode); +} + +static struct ecryptfs_cache_info { + kmem_cache_t **cache; + const char *name; + size_t size; + void (*ctor)(void*, struct kmem_cache *, unsigned long); +} ecryptfs_cache_infos[] = { + { + .cache = &ecryptfs_auth_tok_list_item_cache, + .name = "ecryptfs_auth_tok_list_item", + .size = sizeof(struct ecryptfs_auth_tok_list_item), + }, + { + .cache = &ecryptfs_file_info_cache, + .name = "ecryptfs_file_cache", + .size = sizeof(struct ecryptfs_file_info), + }, + { + .cache = &ecryptfs_dentry_info_cache, + .name = "ecryptfs_dentry_info_cache", + .size = sizeof(struct ecryptfs_dentry_info), + }, + { + .cache = &ecryptfs_inode_info_cache, + .name = "ecryptfs_inode_cache", + .size = sizeof(struct ecryptfs_inode_info), + .ctor = inode_info_init_once, + }, + { + .cache = &ecryptfs_sb_info_cache, + .name = "ecryptfs_sb_cache", + .size = sizeof(struct ecryptfs_sb_info), + }, + { + .cache = &ecryptfs_header_cache_0, + .name = "ecryptfs_headers_0", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_header_cache_1, + .name = "ecryptfs_headers_1", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_header_cache_2, + .name = "ecryptfs_headers_2", + .size = PAGE_CACHE_SIZE, + }, + { + .cache = &ecryptfs_lower_page_cache, + .name = "ecryptfs_lower_page_cache", + .size = PAGE_CACHE_SIZE, + }, +}; + +static void ecryptfs_free_kmem_caches(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + if (*(info->cache)) + kmem_cache_destroy(*(info->cache)); + } +} + +/** + * ecryptfs_init_kmem_caches + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_init_kmem_caches(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { + struct ecryptfs_cache_info *info; + + info = &ecryptfs_cache_infos[i]; + *(info->cache) = kmem_cache_create(info->name, info->size, + 0, SLAB_HWCACHE_ALIGN, info->ctor, NULL); + if (!*(info->cache)) { + ecryptfs_free_kmem_caches(); + ecryptfs_printk(KERN_WARNING, "%s: " + "kmem_cache_create failed\n", + info->name); + return -ENOMEM; + } + } + return 0; +} + +struct ecryptfs_obj { + char *name; + struct list_head slot_list; + struct kobject kobj; +}; + +struct ecryptfs_attribute { + struct attribute attr; + ssize_t(*show) (struct ecryptfs_obj *, char *); + ssize_t(*store) (struct ecryptfs_obj *, const char *, size_t); +}; + +static ssize_t +ecryptfs_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t len) +{ + struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj, + kobj); + struct ecryptfs_attribute *attribute = + container_of(attr, struct ecryptfs_attribute, attr); + + return (attribute->store ? attribute->store(obj, buf, len) : 0); +} + +static ssize_t +ecryptfs_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct ecryptfs_obj *obj = container_of(kobj, struct ecryptfs_obj, + kobj); + struct ecryptfs_attribute *attribute = + container_of(attr, struct ecryptfs_attribute, attr); + + return (attribute->show ? attribute->show(obj, buf) : 0); +} + +static struct sysfs_ops ecryptfs_sysfs_ops = { + .show = ecryptfs_attr_show, + .store = ecryptfs_attr_store +}; + +static struct kobj_type ecryptfs_ktype = { + .sysfs_ops = &ecryptfs_sysfs_ops +}; + +static decl_subsys(ecryptfs, &ecryptfs_ktype, NULL); + +static ssize_t version_show(struct ecryptfs_obj *obj, char *buff) +{ + return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); +} + +static struct ecryptfs_attribute sysfs_attr_version = __ATTR_RO(version); + +struct ecryptfs_version_str_map_elem { + u32 flag; + char *str; +} ecryptfs_version_str_map[] = { + {ECRYPTFS_VERSIONING_PASSPHRASE, "passphrase"}, + {ECRYPTFS_VERSIONING_PUBKEY, "pubkey"}, + {ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH, "plaintext passthrough"}, + {ECRYPTFS_VERSIONING_POLICY, "policy"} +}; + +static ssize_t version_str_show(struct ecryptfs_obj *obj, char *buff) +{ + int i; + int remaining = PAGE_SIZE; + int total_written = 0; + + buff[0] = '\0'; + for (i = 0; i < ARRAY_SIZE(ecryptfs_version_str_map); i++) { + int entry_size; + + if (!(ECRYPTFS_VERSIONING_MASK + & ecryptfs_version_str_map[i].flag)) + continue; + entry_size = strlen(ecryptfs_version_str_map[i].str); + if ((entry_size + 2) > remaining) + goto out; + memcpy(buff, ecryptfs_version_str_map[i].str, entry_size); + buff[entry_size++] = '\n'; + buff[entry_size] = '\0'; + buff += entry_size; + total_written += entry_size; + remaining -= entry_size; + } +out: + return total_written; +} + +static struct ecryptfs_attribute sysfs_attr_version_str = __ATTR_RO(version_str); + +static int do_sysfs_registration(void) +{ + int rc; + + if ((rc = subsystem_register(&ecryptfs_subsys))) { + printk(KERN_ERR + "Unable to register ecryptfs sysfs subsystem\n"); + goto out; + } + rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj, + &sysfs_attr_version.attr); + if (rc) { + printk(KERN_ERR + "Unable to create ecryptfs version attribute\n"); + subsystem_unregister(&ecryptfs_subsys); + goto out; + } + rc = sysfs_create_file(&ecryptfs_subsys.kset.kobj, + &sysfs_attr_version_str.attr); + if (rc) { + printk(KERN_ERR + "Unable to create ecryptfs version_str attribute\n"); + sysfs_remove_file(&ecryptfs_subsys.kset.kobj, + &sysfs_attr_version.attr); + subsystem_unregister(&ecryptfs_subsys); + goto out; + } +out: + return rc; +} + +static int __init ecryptfs_init(void) +{ + int rc; + + if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) { + rc = -EINVAL; + ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " + "larger than the host's page size, and so " + "eCryptfs cannot run on this system. The " + "default eCryptfs extent size is [%d] bytes; " + "the page size is [%d] bytes.\n", + ECRYPTFS_DEFAULT_EXTENT_SIZE, PAGE_CACHE_SIZE); + goto out; + } + rc = ecryptfs_init_kmem_caches(); + if (rc) { + printk(KERN_ERR + "Failed to allocate one or more kmem_cache objects\n"); + goto out; + } + rc = register_filesystem(&ecryptfs_fs_type); + if (rc) { + printk(KERN_ERR "Failed to register filesystem\n"); + ecryptfs_free_kmem_caches(); + goto out; + } + kset_set_kset_s(&ecryptfs_subsys, fs_subsys); + sysfs_attr_version.attr.owner = THIS_MODULE; + sysfs_attr_version_str.attr.owner = THIS_MODULE; + rc = do_sysfs_registration(); + if (rc) { + printk(KERN_ERR "sysfs registration failed\n"); + unregister_filesystem(&ecryptfs_fs_type); + ecryptfs_free_kmem_caches(); + goto out; + } +out: + return rc; +} + +static void __exit ecryptfs_exit(void) +{ + sysfs_remove_file(&ecryptfs_subsys.kset.kobj, + &sysfs_attr_version.attr); + sysfs_remove_file(&ecryptfs_subsys.kset.kobj, + &sysfs_attr_version_str.attr); + subsystem_unregister(&ecryptfs_subsys); + unregister_filesystem(&ecryptfs_fs_type); + ecryptfs_free_kmem_caches(); +} + +MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>"); +MODULE_DESCRIPTION("eCryptfs"); + +MODULE_LICENSE("GPL"); + +module_init(ecryptfs_init) +module_exit(ecryptfs_exit) diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c new file mode 100644 index 00000000000..924dd90a4cf --- /dev/null +++ b/fs/ecryptfs/mmap.c @@ -0,0 +1,788 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * This is where eCryptfs coordinates the symmetric encryption and + * decryption of the file data as it passes between the lower + * encrypted file and the upper decrypted file. + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/page-flags.h> +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/crypto.h> +#include <linux/scatterlist.h> +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_lower_page_cache; + +/** + * ecryptfs_get1page + * + * Get one page from cache or lower f/s, return error otherwise. + * + * Returns unlocked and up-to-date page (if ok), with increased + * refcnt. + */ +static struct page *ecryptfs_get1page(struct file *file, int index) +{ + struct page *page; + struct dentry *dentry; + struct inode *inode; + struct address_space *mapping; + + dentry = file->f_dentry; + inode = dentry->d_inode; + mapping = inode->i_mapping; + page = read_cache_page(mapping, index, + (filler_t *)mapping->a_ops->readpage, + (void *)file); + if (IS_ERR(page)) + goto out; + wait_on_page_locked(page); +out: + return page; +} + +static +int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros); + +/** + * ecryptfs_fill_zeros + * @file: The ecryptfs file + * @new_length: The new length of the data in the underlying file; + * everything between the prior end of the file and the + * new end of the file will be filled with zero's. + * new_length must be greater than current length + * + * Function for handling lseek-ing past the end of the file. + * + * This function does not support shrinking, only growing a file. + * + * Returns zero on success; non-zero otherwise. + */ +int ecryptfs_fill_zeros(struct file *file, loff_t new_length) +{ + int rc = 0; + struct dentry *dentry = file->f_dentry; + struct inode *inode = dentry->d_inode; + pgoff_t old_end_page_index = 0; + pgoff_t index = old_end_page_index; + int old_end_pos_in_page = -1; + pgoff_t new_end_page_index; + int new_end_pos_in_page; + loff_t cur_length = i_size_read(inode); + + if (cur_length != 0) { + index = old_end_page_index = + ((cur_length - 1) >> PAGE_CACHE_SHIFT); + old_end_pos_in_page = ((cur_length - 1) & ~PAGE_CACHE_MASK); + } + new_end_page_index = ((new_length - 1) >> PAGE_CACHE_SHIFT); + new_end_pos_in_page = ((new_length - 1) & ~PAGE_CACHE_MASK); + ecryptfs_printk(KERN_DEBUG, "old_end_page_index = [0x%.16x]; " + "old_end_pos_in_page = [%d]; " + "new_end_page_index = [0x%.16x]; " + "new_end_pos_in_page = [%d]\n", + old_end_page_index, old_end_pos_in_page, + new_end_page_index, new_end_pos_in_page); + if (old_end_page_index == new_end_page_index) { + /* Start and end are in the same page; we just need to + * set a portion of the existing page to zero's */ + rc = write_zeros(file, index, (old_end_pos_in_page + 1), + (new_end_pos_in_page - old_end_pos_in_page)); + if (rc) + ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " + "index=[0x%.16x], " + "old_end_pos_in_page=[d], " + "(PAGE_CACHE_SIZE - new_end_pos_in_page" + "=[%d]" + ")=[d]) returned [%d]\n", file, index, + old_end_pos_in_page, + new_end_pos_in_page, + (PAGE_CACHE_SIZE - new_end_pos_in_page), + rc); + goto out; + } + /* Fill the remainder of the previous last page with zeros */ + rc = write_zeros(file, index, (old_end_pos_in_page + 1), + ((PAGE_CACHE_SIZE - 1) - old_end_pos_in_page)); + if (rc) { + ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " + "index=[0x%.16x], old_end_pos_in_page=[d], " + "(PAGE_CACHE_SIZE - old_end_pos_in_page)=[d]) " + "returned [%d]\n", file, index, + old_end_pos_in_page, + (PAGE_CACHE_SIZE - old_end_pos_in_page), rc); + goto out; + } + index++; + while (index < new_end_page_index) { + /* Fill all intermediate pages with zeros */ + rc = write_zeros(file, index, 0, PAGE_CACHE_SIZE); + if (rc) { + ecryptfs_printk(KERN_ERR, "write_zeros(file=[%p], " + "index=[0x%.16x], " + "old_end_pos_in_page=[d], " + "(PAGE_CACHE_SIZE - new_end_pos_in_page" + "=[%d]" + ")=[d]) returned [%d]\n", file, index, + old_end_pos_in_page, + new_end_pos_in_page, + (PAGE_CACHE_SIZE - new_end_pos_in_page), + rc); + goto out; + } + index++; + } + /* Fill the portion at the beginning of the last new page with + * zero's */ + rc = write_zeros(file, index, 0, (new_end_pos_in_page + 1)); + if (rc) { + ecryptfs_printk(KERN_ERR, "write_zeros(file=" + "[%p], index=[0x%.16x], 0, " + "new_end_pos_in_page=[%d]" + "returned [%d]\n", file, index, + new_end_pos_in_page, rc); + goto out; + } +out: + return rc; +} + +/** + * ecryptfs_writepage + * @page: Page that is locked before this call is made + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct ecryptfs_page_crypt_context ctx; + int rc; + + ctx.page = page; + ctx.mode = ECRYPTFS_WRITEPAGE_MODE; + ctx.param.wbc = wbc; + rc = ecryptfs_encrypt_page(&ctx); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting " + "page (upper index [0x%.16x])\n", page->index); + ClearPageUptodate(page); + goto out; + } + SetPageUptodate(page); + unlock_page(page); +out: + return rc; +} + +/** + * Reads the data from the lower file file at index lower_page_index + * and copies that data into page. + * + * @param page Page to fill + * @param lower_page_index Index of the page in the lower file to get + */ +int ecryptfs_do_readpage(struct file *file, struct page *page, + pgoff_t lower_page_index) +{ + int rc; + struct dentry *dentry; + struct file *lower_file; + struct dentry *lower_dentry; + struct inode *inode; + struct inode *lower_inode; + char *page_data; + struct page *lower_page = NULL; + char *lower_page_data; + const struct address_space_operations *lower_a_ops; + + dentry = file->f_dentry; + lower_file = ecryptfs_file_to_lower(file); + lower_dentry = ecryptfs_dentry_to_lower(dentry); + inode = dentry->d_inode; + lower_inode = ecryptfs_inode_to_lower(inode); + lower_a_ops = lower_inode->i_mapping->a_ops; + lower_page = read_cache_page(lower_inode->i_mapping, lower_page_index, + (filler_t *)lower_a_ops->readpage, + (void *)lower_file); + if (IS_ERR(lower_page)) { + rc = PTR_ERR(lower_page); + lower_page = NULL; + ecryptfs_printk(KERN_ERR, "Error reading from page cache\n"); + goto out; + } + wait_on_page_locked(lower_page); + page_data = (char *)kmap(page); + if (!page_data) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error mapping page\n"); + goto out; + } + lower_page_data = (char *)kmap(lower_page); + if (!lower_page_data) { + rc = -ENOMEM; + ecryptfs_printk(KERN_ERR, "Error mapping page\n"); + kunmap(page); + goto out; + } + memcpy(page_data, lower_page_data, PAGE_CACHE_SIZE); + kunmap(lower_page); + kunmap(page); + rc = 0; +out: + if (likely(lower_page)) + page_cache_release(lower_page); + if (rc == 0) + SetPageUptodate(page); + else + ClearPageUptodate(page); + return rc; +} + +/** + * ecryptfs_readpage + * @file: This is an ecryptfs file + * @page: ecryptfs associated page to stick the read data into + * + * Read in a page, decrypting if necessary. + * + * Returns zero on success; non-zero on error. + */ +static int ecryptfs_readpage(struct file *file, struct page *page) +{ + int rc = 0; + struct ecryptfs_crypt_stat *crypt_stat; + + BUG_ON(!(file && file->f_dentry && file->f_dentry->d_inode)); + crypt_stat = + &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat; + if (!crypt_stat + || !ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_ENCRYPTED) + || ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) { + ecryptfs_printk(KERN_DEBUG, + "Passing through unencrypted page\n"); + rc = ecryptfs_do_readpage(file, page, page->index); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error reading page; rc = " + "[%d]\n", rc); + goto out; + } + } else { + rc = ecryptfs_decrypt_page(file, page); + if (rc) { + + ecryptfs_printk(KERN_ERR, "Error decrypting page; " + "rc = [%d]\n", rc); + goto out; + } + } + SetPageUptodate(page); +out: + if (rc) + ClearPageUptodate(page); + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", + page->index); + unlock_page(page); + return rc; +} + +static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +{ + struct inode *inode = page->mapping->host; + int end_byte_in_page; + int rc = 0; + char *page_virt; + + if ((i_size_read(inode) / PAGE_CACHE_SIZE) == page->index) { + end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; + if (to > end_byte_in_page) + end_byte_in_page = to; + page_virt = kmap(page); + if (!page_virt) { + rc = -ENOMEM; + ecryptfs_printk(KERN_WARNING, + "Could not map page\n"); + goto out; + } + memset((page_virt + end_byte_in_page), 0, + (PAGE_CACHE_SIZE - end_byte_in_page)); + kunmap(page); + } +out: + return rc; +} + +static int ecryptfs_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + int rc = 0; + + kmap(page); + if (from == 0 && to == PAGE_CACHE_SIZE) + goto out; /* If we are writing a full page, it will be + up to date. */ + if (!PageUptodate(page)) + rc = ecryptfs_do_readpage(file, page, page->index); +out: + return rc; +} + +int ecryptfs_grab_and_map_lower_page(struct page **lower_page, + char **lower_virt, + struct inode *lower_inode, + unsigned long lower_page_index) +{ + int rc = 0; + + (*lower_page) = grab_cache_page(lower_inode->i_mapping, + lower_page_index); + if (!(*lower_page)) { + ecryptfs_printk(KERN_ERR, "grab_cache_page for " + "lower_page_index = [0x%.16x] failed\n", + lower_page_index); + rc = -EINVAL; + goto out; + } + if (lower_virt) + (*lower_virt) = kmap((*lower_page)); + else + kmap((*lower_page)); +out: + return rc; +} + +int ecryptfs_writepage_and_release_lower_page(struct page *lower_page, + struct inode *lower_inode, + struct writeback_control *wbc) +{ + int rc = 0; + + rc = lower_inode->i_mapping->a_ops->writepage(lower_page, wbc); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error calling lower writepage(); " + "rc = [%d]\n", rc); + goto out; + } + lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; + page_cache_release(lower_page); +out: + return rc; +} + +static void ecryptfs_unmap_and_release_lower_page(struct page *lower_page) +{ + kunmap(lower_page); + ecryptfs_printk(KERN_DEBUG, "Unlocking lower page with index = " + "[0x%.16x]\n", lower_page->index); + unlock_page(lower_page); + page_cache_release(lower_page); +} + +/** + * ecryptfs_write_inode_size_to_header + * + * Writes the lower file size to the first 8 bytes of the header. + * + * Returns zero on success; non-zero on error. + */ +int +ecryptfs_write_inode_size_to_header(struct file *lower_file, + struct inode *lower_inode, + struct inode *inode) +{ + int rc = 0; + struct page *header_page; + char *header_virt; + const struct address_space_operations *lower_a_ops; + u64 file_size; + + rc = ecryptfs_grab_and_map_lower_page(&header_page, &header_virt, + lower_inode, 0); + if (rc) { + ecryptfs_printk(KERN_ERR, "grab_cache_page for header page " + "failed\n"); + goto out; + } + lower_a_ops = lower_inode->i_mapping->a_ops; + rc = lower_a_ops->prepare_write(lower_file, header_page, 0, 8); + file_size = (u64)i_size_read(inode); + ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size); + file_size = cpu_to_be64(file_size); + memcpy(header_virt, &file_size, sizeof(u64)); + rc = lower_a_ops->commit_write(lower_file, header_page, 0, 8); + if (rc < 0) + ecryptfs_printk(KERN_ERR, "Error commiting header page " + "write\n"); + ecryptfs_unmap_and_release_lower_page(header_page); + lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); +out: + return rc; +} + +int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, + struct file *lower_file, + unsigned long lower_page_index, int byte_offset, + int region_bytes) +{ + int rc = 0; + + rc = ecryptfs_grab_and_map_lower_page(lower_page, NULL, lower_inode, + lower_page_index); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to grab and map " + "lower page with index [0x%.16x]\n", + lower_page_index); + goto out; + } + rc = lower_inode->i_mapping->a_ops->prepare_write(lower_file, + (*lower_page), + byte_offset, + region_bytes); + if (rc) { + ecryptfs_printk(KERN_ERR, "prepare_write for " + "lower_page_index = [0x%.16x] failed; rc = " + "[%d]\n", lower_page_index, rc); + } +out: + if (rc && (*lower_page)) { + ecryptfs_unmap_and_release_lower_page(*lower_page); + (*lower_page) = NULL; + } + return rc; +} + +/** + * ecryptfs_commit_lower_page + * + * Returns zero on success; non-zero on error + */ +int +ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode, + struct file *lower_file, int byte_offset, + int region_size) +{ + int rc = 0; + + rc = lower_inode->i_mapping->a_ops->commit_write( + lower_file, lower_page, byte_offset, region_size); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, + "Error committing write; rc = [%d]\n", rc); + } else + rc = 0; + ecryptfs_unmap_and_release_lower_page(lower_page); + return rc; +} + +/** + * ecryptfs_copy_page_to_lower + * + * Used for plaintext pass-through; no page index interpolation + * required. + */ +int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode, + struct file *lower_file) +{ + int rc = 0; + struct page *lower_page; + + rc = ecryptfs_get_lower_page(&lower_page, lower_inode, lower_file, + page->index, 0, PAGE_CACHE_SIZE); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error attempting to get page " + "at index [0x%.16x]\n", page->index); + goto out; + } + /* TODO: aops */ + memcpy((char *)page_address(lower_page), page_address(page), + PAGE_CACHE_SIZE); + rc = ecryptfs_commit_lower_page(lower_page, lower_inode, lower_file, + 0, PAGE_CACHE_SIZE); + if (rc) + ecryptfs_printk(KERN_ERR, "Error attempting to commit page " + "at index [0x%.16x]\n", page->index); +out: + return rc; +} + +static int +process_new_file(struct ecryptfs_crypt_stat *crypt_stat, + struct file *file, struct inode *inode) +{ + struct page *header_page; + const struct address_space_operations *lower_a_ops; + struct inode *lower_inode; + struct file *lower_file; + char *header_virt; + int rc = 0; + int current_header_page = 0; + int header_pages; + int more_header_data_to_be_written = 1; + + lower_inode = ecryptfs_inode_to_lower(inode); + lower_file = ecryptfs_file_to_lower(file); + lower_a_ops = lower_inode->i_mapping->a_ops; + header_pages = ((crypt_stat->header_extent_size + * crypt_stat->num_header_extents_at_front) + / PAGE_CACHE_SIZE); + BUG_ON(header_pages < 1); + while (current_header_page < header_pages) { + rc = ecryptfs_grab_and_map_lower_page(&header_page, + &header_virt, + lower_inode, + current_header_page); + if (rc) { + ecryptfs_printk(KERN_ERR, "grab_cache_page for " + "header page [%d] failed; rc = [%d]\n", + current_header_page, rc); + goto out; + } + rc = lower_a_ops->prepare_write(lower_file, header_page, 0, + PAGE_CACHE_SIZE); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error preparing to write " + "header page out; rc = [%d]\n", rc); + goto out; + } + memset(header_virt, 0, PAGE_CACHE_SIZE); + if (more_header_data_to_be_written) { + rc = ecryptfs_write_headers_virt(header_virt, + crypt_stat, + file->f_dentry); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error " + "generating header; rc = " + "[%d]\n", rc); + rc = -EIO; + memset(header_virt, 0, PAGE_CACHE_SIZE); + ecryptfs_unmap_and_release_lower_page( + header_page); + goto out; + } + if (current_header_page == 0) + memset(header_virt, 0, 8); + more_header_data_to_be_written = 0; + } + rc = lower_a_ops->commit_write(lower_file, header_page, 0, + PAGE_CACHE_SIZE); + ecryptfs_unmap_and_release_lower_page(header_page); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, + "Error commiting header page write; " + "rc = [%d]\n", rc); + break; + } + current_header_page++; + } + if (rc >= 0) { + rc = 0; + ecryptfs_printk(KERN_DEBUG, "lower_inode->i_blocks = " + "[0x%.16x]\n", lower_inode->i_blocks); + i_size_write(inode, 0); + lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); + } + ecryptfs_printk(KERN_DEBUG, "Clearing ECRYPTFS_NEW_FILE flag in " + "crypt_stat at memory location [%p]\n", crypt_stat); + ECRYPTFS_CLEAR_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE); +out: + return rc; +} + +/** + * ecryptfs_commit_write + * @file: The eCryptfs file object + * @page: The eCryptfs page + * @from: Ignored (we rotate the page IV on each write) + * @to: Ignored + * + * This is where we encrypt the data and pass the encrypted data to + * the lower filesystem. In OpenPGP-compatible mode, we operate on + * entire underlying packets. + */ +static int ecryptfs_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct ecryptfs_page_crypt_context ctx; + loff_t pos; + struct inode *inode; + struct inode *lower_inode; + struct file *lower_file; + struct ecryptfs_crypt_stat *crypt_stat; + int rc; + + inode = page->mapping->host; + lower_inode = ecryptfs_inode_to_lower(inode); + lower_file = ecryptfs_file_to_lower(file); + mutex_lock(&lower_inode->i_mutex); + crypt_stat = + &ecryptfs_inode_to_private(file->f_dentry->d_inode)->crypt_stat; + if (ECRYPTFS_CHECK_FLAG(crypt_stat->flags, ECRYPTFS_NEW_FILE)) { + ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in " + "crypt_stat at memory location [%p]\n", crypt_stat); + rc = process_new_file(crypt_stat, file, inode); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error processing new " + "file; rc = [%d]\n", rc); + goto out; + } + } else + ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); + ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" + "(page w/ index = [0x%.16x], to = [%d])\n", page->index, + to); + rc = fill_zeros_to_end_of_page(page, to); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error attempting to fill " + "zeros in page with index = [0x%.16x]\n", + page->index); + goto out; + } + ctx.page = page; + ctx.mode = ECRYPTFS_PREPARE_COMMIT_MODE; + ctx.param.lower_file = lower_file; + rc = ecryptfs_encrypt_page(&ctx); + if (rc) { + ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " + "index [0x%.16x])\n", page->index); + goto out; + } + rc = 0; + inode->i_blocks = lower_inode->i_blocks; + pos = (page->index << PAGE_CACHE_SHIFT) + to; + if (pos > i_size_read(inode)) { + i_size_write(inode, pos); + ecryptfs_printk(KERN_DEBUG, "Expanded file size to " + "[0x%.16x]\n", i_size_read(inode)); + } + ecryptfs_write_inode_size_to_header(lower_file, lower_inode, inode); + lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty_sync(inode); +out: + kunmap(page); /* mapped in prior call (prepare_write) */ + if (rc < 0) + ClearPageUptodate(page); + else + SetPageUptodate(page); + mutex_unlock(&lower_inode->i_mutex); + return rc; +} + +/** + * write_zeros + * @file: The ecryptfs file + * @index: The index in which we are writing + * @start: The position after the last block of data + * @num_zeros: The number of zeros to write + * + * Write a specified number of zero's to a page. + * + * (start + num_zeros) must be less than or equal to PAGE_CACHE_SIZE + */ +static +int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros) +{ + int rc = 0; + struct page *tmp_page; + + tmp_page = ecryptfs_get1page(file, index); + if (IS_ERR(tmp_page)) { + ecryptfs_printk(KERN_ERR, "Error getting page at index " + "[0x%.16x]\n", index); + rc = PTR_ERR(tmp_page); + goto out; + } + kmap(tmp_page); + rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros); + if (rc) { + ecryptfs_printk(KERN_ERR, "Error preparing to write zero's " + "to remainder of page at index [0x%.16x]\n", + index); + kunmap(tmp_page); + page_cache_release(tmp_page); + goto out; + } + memset(((char *)page_address(tmp_page) + start), 0, num_zeros); + rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros); + if (rc < 0) { + ecryptfs_printk(KERN_ERR, "Error attempting to write zero's " + "to remainder of page at index [0x%.16x]\n", + index); + kunmap(tmp_page); + page_cache_release(tmp_page); + goto out; + } + rc = 0; + kunmap(tmp_page); + page_cache_release(tmp_page); +out: + return rc; +} + +static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) +{ + int rc = 0; + struct inode *inode; + struct inode *lower_inode; + + inode = (struct inode *)mapping->host; + lower_inode = ecryptfs_inode_to_lower(inode); + if (lower_inode->i_mapping->a_ops->bmap) + rc = lower_inode->i_mapping->a_ops->bmap(lower_inode->i_mapping, + block); + return rc; +} + +static void ecryptfs_sync_page(struct page *page) +{ + struct inode *inode; + struct inode *lower_inode; + struct page *lower_page; + + inode = page->mapping->host; + lower_inode = ecryptfs_inode_to_lower(inode); + /* NOTE: Recently swapped with grab_cache_page(), since + * sync_page() just makes sure that pending I/O gets done. */ + lower_page = find_lock_page(lower_inode->i_mapping, page->index); + if (!lower_page) { + ecryptfs_printk(KERN_DEBUG, "find_lock_page failed\n"); + return; + } + lower_page->mapping->a_ops->sync_page(lower_page); + ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16x]\n", + lower_page->index); + unlock_page(lower_page); + page_cache_release(lower_page); +} + +struct address_space_operations ecryptfs_aops = { + .writepage = ecryptfs_writepage, + .readpage = ecryptfs_readpage, + .prepare_write = ecryptfs_prepare_write, + .commit_write = ecryptfs_commit_write, + .bmap = ecryptfs_bmap, + .sync_page = ecryptfs_sync_page, +}; diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c new file mode 100644 index 00000000000..c337c0410fb --- /dev/null +++ b/fs/ecryptfs/super.c @@ -0,0 +1,198 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 1997-2003 Erez Zadok + * Copyright (C) 2001-2003 Stony Brook University + * Copyright (C) 2004-2006 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> + * Michael C. Thompson <mcthomps@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/key.h> +#include <linux/seq_file.h> +#include <linux/crypto.h> +#include "ecryptfs_kernel.h" + +struct kmem_cache *ecryptfs_inode_info_cache; + +/** + * ecryptfs_alloc_inode - allocate an ecryptfs inode + * @sb: Pointer to the ecryptfs super block + * + * Called to bring an inode into existence. + * + * Only handle allocation, setting up structures should be done in + * ecryptfs_read_inode. This is because the kernel, between now and + * then, will 0 out the private data pointer. + * + * Returns a pointer to a newly allocated inode, NULL otherwise + */ +static struct inode *ecryptfs_alloc_inode(struct super_block *sb) +{ + struct ecryptfs_inode_info *ecryptfs_inode; + struct inode *inode = NULL; + + ecryptfs_inode = kmem_cache_alloc(ecryptfs_inode_info_cache, + SLAB_KERNEL); + if (unlikely(!ecryptfs_inode)) + goto out; + ecryptfs_init_crypt_stat(&ecryptfs_inode->crypt_stat); + inode = &ecryptfs_inode->vfs_inode; +out: + return inode; +} + +/** + * ecryptfs_destroy_inode + * @inode: The ecryptfs inode + * + * This is used during the final destruction of the inode. + * All allocation of memory related to the inode, including allocated + * memory in the crypt_stat struct, will be released here. + * There should be no chance that this deallocation will be missed. + */ +static void ecryptfs_destroy_inode(struct inode *inode) +{ + struct ecryptfs_inode_info *inode_info; + + inode_info = ecryptfs_inode_to_private(inode); + ecryptfs_destruct_crypt_stat(&inode_info->crypt_stat); + kmem_cache_free(ecryptfs_inode_info_cache, inode_info); +} + +/** + * ecryptfs_init_inode + * @inode: The ecryptfs inode + * + * Set up the ecryptfs inode. + */ +void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode) +{ + ecryptfs_set_inode_lower(inode, lower_inode); + inode->i_ino = lower_inode->i_ino; + inode->i_version++; + inode->i_op = &ecryptfs_main_iops; + inode->i_fop = &ecryptfs_main_fops; + inode->i_mapping->a_ops = &ecryptfs_aops; +} + +/** + * ecryptfs_put_super + * @sb: Pointer to the ecryptfs super block + * + * Final actions when unmounting a file system. + * This will handle deallocation and release of our private data. + */ +static void ecryptfs_put_super(struct super_block *sb) +{ + struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + + ecryptfs_destruct_mount_crypt_stat(&sb_info->mount_crypt_stat); + kmem_cache_free(ecryptfs_sb_info_cache, sb_info); + ecryptfs_set_superblock_private(sb, NULL); +} + +/** + * ecryptfs_statfs + * @sb: The ecryptfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. Currently, we let this pass right through + * to the lower filesystem and take no action ourselves. + */ +static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + return vfs_statfs(ecryptfs_dentry_to_lower(dentry), buf); +} + +/** + * ecryptfs_clear_inode + * @inode - The ecryptfs inode + * + * Called by iput() when the inode reference count reached zero + * and the inode is not hashed anywhere. Used to clear anything + * that needs to be, before the inode is completely destroyed and put + * on the inode free list. We use this to drop out reference to the + * lower inode. + */ +static void ecryptfs_clear_inode(struct inode *inode) +{ + iput(ecryptfs_inode_to_lower(inode)); +} + +/** + * ecryptfs_umount_begin + * + * Called in do_umount(). + */ +static void ecryptfs_umount_begin(struct vfsmount *vfsmnt, int flags) +{ + struct vfsmount *lower_mnt = + ecryptfs_dentry_to_lower_mnt(vfsmnt->mnt_sb->s_root); + struct super_block *lower_sb; + + mntput(lower_mnt); + lower_sb = lower_mnt->mnt_sb; + if (lower_sb->s_op->umount_begin) + lower_sb->s_op->umount_begin(lower_mnt, flags); +} + +/** + * ecryptfs_show_options + * + * Prints the directory we are currently mounted over. + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + struct dentry *lower_root_dentry = ecryptfs_dentry_to_lower(sb->s_root); + struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(sb->s_root); + char *tmp_page; + char *path; + int rc = 0; + + tmp_page = (char *)__get_free_page(GFP_KERNEL); + if (!tmp_page) { + rc = -ENOMEM; + goto out; + } + path = d_path(lower_root_dentry, lower_mnt, tmp_page, PAGE_SIZE); + if (IS_ERR(path)) { + rc = PTR_ERR(path); + goto out; + } + seq_printf(m, ",dir=%s", path); + free_page((unsigned long)tmp_page); +out: + return rc; +} + +struct super_operations ecryptfs_sops = { + .alloc_inode = ecryptfs_alloc_inode, + .destroy_inode = ecryptfs_destroy_inode, + .drop_inode = generic_delete_inode, + .put_super = ecryptfs_put_super, + .statfs = ecryptfs_statfs, + .remount_fs = NULL, + .clear_inode = ecryptfs_clear_inode, + .umount_begin = ecryptfs_umount_begin, + .show_options = ecryptfs_show_options +}; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8d544334bcd..557d5b614fa 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -720,9 +720,10 @@ static int ep_getfd(int *efd, struct inode **einode, struct file **efile, /* Allocates an inode from the eventpoll file system */ inode = ep_eventpoll_inode(); - error = PTR_ERR(inode); - if (IS_ERR(inode)) + if (IS_ERR(inode)) { + error = PTR_ERR(inode); goto eexit_2; + } /* Allocates a free descriptor to plug the file onto */ error = get_unused_fd(); diff --git a/fs/exec.c b/fs/exec.c index 6270f8f20a6..d993ea1a81a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1318,7 +1318,7 @@ static void format_corename(char *corename, const char *pattern, long signr) case 'h': down_read(&uts_sem); rc = snprintf(out_ptr, out_end - out_ptr, - "%s", system_utsname.nodename); + "%s", utsname()->nodename); up_read(&uts_sem); if (rc > out_end - out_ptr) goto out; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 4c39009350f..93e77c3d249 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -315,7 +315,7 @@ struct getdents_callback { * the name matching the specified inode number. */ static int filldir_one(void * __buf, const char * name, int len, - loff_t pos, ino_t ino, unsigned int d_type) + loff_t pos, u64 ino, unsigned int d_type) { struct getdents_callback *buf = __buf; int result = 0; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 3e50a416628..69c439f4438 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -648,7 +648,7 @@ static int fat_readdir(struct file *filp, void *dirent, filldir_t filldir) } static int fat_ioctl_filldir(void *__buf, const char *name, int name_len, - loff_t offset, ino_t ino, unsigned int d_type) + loff_t offset, u64 ino, unsigned int d_type) { struct fat_ioctl_filldir_callback *buf = __buf; struct dirent __user *d1 = buf->dirent; diff --git a/fs/fcntl.c b/fs/fcntl.c index d35cbc6bc11..e4f26165f12 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -250,19 +250,22 @@ static int setfl(int fd, struct file * filp, unsigned long arg) return error; } -static void f_modown(struct file *filp, unsigned long pid, +static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, uid_t uid, uid_t euid, int force) { write_lock_irq(&filp->f_owner.lock); if (force || !filp->f_owner.pid) { - filp->f_owner.pid = pid; + put_pid(filp->f_owner.pid); + filp->f_owner.pid = get_pid(pid); + filp->f_owner.pid_type = type; filp->f_owner.uid = uid; filp->f_owner.euid = euid; } write_unlock_irq(&filp->f_owner.lock); } -int f_setown(struct file *filp, unsigned long arg, int force) +int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, + int force) { int err; @@ -270,15 +273,44 @@ int f_setown(struct file *filp, unsigned long arg, int force) if (err) return err; - f_modown(filp, arg, current->uid, current->euid, force); + f_modown(filp, pid, type, current->uid, current->euid, force); return 0; } +EXPORT_SYMBOL(__f_setown); +int f_setown(struct file *filp, unsigned long arg, int force) +{ + enum pid_type type; + struct pid *pid; + int who = arg; + int result; + type = PIDTYPE_PID; + if (who < 0) { + type = PIDTYPE_PGID; + who = -who; + } + rcu_read_lock(); + pid = find_pid(who); + result = __f_setown(filp, pid, type, force); + rcu_read_unlock(); + return result; +} EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { - f_modown(filp, 0, 0, 0, 1); + f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1); +} + +pid_t f_getown(struct file *filp) +{ + pid_t pid; + read_lock(&filp->f_owner.lock); + pid = pid_nr(filp->f_owner.pid); + if (filp->f_owner.pid_type == PIDTYPE_PGID) + pid = -pid; + read_unlock(&filp->f_owner.lock); + return pid; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, @@ -319,7 +351,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, * current syscall conventions, the only way * to fix this will be in libc. */ - err = filp->f_owner.pid; + err = f_getown(filp); force_successful_syscall_return(); break; case F_SETOWN: @@ -470,24 +502,19 @@ static void send_sigio_to_task(struct task_struct *p, void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; - int pid; + enum pid_type type; + struct pid *pid; read_lock(&fown->lock); + type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; read_lock(&tasklist_lock); - if (pid > 0) { - p = find_task_by_pid(pid); - if (p) { - send_sigio_to_task(p, fown, fd, band); - } - } else { - do_each_task_pid(-pid, PIDTYPE_PGID, p) { - send_sigio_to_task(p, fown, fd, band); - } while_each_task_pid(-pid, PIDTYPE_PGID, p); - } + do_each_pid_task(pid, type, p) { + send_sigio_to_task(p, fown, fd, band); + } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: read_unlock(&fown->lock); @@ -503,9 +530,12 @@ static void send_sigurg_to_task(struct task_struct *p, int send_sigurg(struct fown_struct *fown) { struct task_struct *p; - int pid, ret = 0; + enum pid_type type; + struct pid *pid; + int ret = 0; read_lock(&fown->lock); + type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; @@ -513,16 +543,9 @@ int send_sigurg(struct fown_struct *fown) ret = 1; read_lock(&tasklist_lock); - if (pid > 0) { - p = find_task_by_pid(pid); - if (p) { - send_sigurg_to_task(p, fown); - } - } else { - do_each_task_pid(-pid, PIDTYPE_PGID, p) { - send_sigurg_to_task(p, fown); - } while_each_task_pid(-pid, PIDTYPE_PGID, p); - } + do_each_pid_task(pid, type, p) { + send_sigurg_to_task(p, fown); + } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: read_unlock(&fown->lock); diff --git a/fs/file_table.c b/fs/file_table.c index bc35a40417d..24f25a057d9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -174,6 +174,7 @@ void fastcall __fput(struct file *file) fops_put(file->f_op); if (file->f_mode & FMODE_WRITE) put_write_access(inode); + put_pid(file->f_owner.pid); file_kill(file); file->f_dentry = NULL; file->f_vfsmnt = NULL; diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index ae783066fc3..1528a6fd029 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -1,5 +1,5 @@ /* - * linux/fs/hfs/part_tbl.c + * linux/fs/hfsplus/part_tbl.c * * Copyright (C) 1996-1997 Paul H. Hargrove * This file may be distributed under the terms of the GNU General Public License. diff --git a/fs/inode.c b/fs/inode.c index ada7643104e..bf6bec4e54f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -657,7 +657,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he return inode; } -static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +static unsigned long hash(struct super_block *sb, unsigned long hashval) { unsigned long tmp; @@ -1003,7 +1003,7 @@ void generic_delete_inode(struct inode *inode) list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); - inode->i_state|=I_FREEING; + inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1210,13 +1210,15 @@ void file_update_time(struct file *file) return; now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) + if (!timespec_equal(&inode->i_mtime, &now)) { + inode->i_mtime = now; sync_it = 1; - inode->i_mtime = now; + } - if (!timespec_equal(&inode->i_ctime, &now)) + if (!timespec_equal(&inode->i_ctime, &now)) { + inode->i_ctime = now; sync_it = 1; - inode->i_ctime = now; + } if (sync_it) mark_inode_dirty_sync(inode); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 32a8caf0c41..10be51290a2 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -1,5 +1,5 @@ /* - * linux/fs/commit.c + * linux/fs/jbd/commit.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 7af6099c911..c518dd8fe60 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1,5 +1,5 @@ /* - * linux/fs/journal.c + * linux/fs/jbd/journal.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 1998 * diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index e2281300979..4d84bdc8829 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -5,16 +5,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -183,7 +183,7 @@ cleanup: posix_acl_release(acl); } else inode->i_mode &= ~current->fs->umask; - + JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h index ab7cd0567c9..79494c4f2b1 100644 --- a/fs/jfs/endian24.h +++ b/fs/jfs/endian24.h @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2001 + * Copyright (C) International Business Machines Corp., 2001 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 976e90dc2d1..34181b8f5a0 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -108,7 +108,7 @@ const struct file_operations jfs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .sendfile = generic_file_sendfile, + .sendfile = generic_file_sendfile, .fsync = jfs_fsync, .release = jfs_release, .ioctl = jfs_ioctl, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index a8cc169235d..f5719117edf 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -33,7 +33,7 @@ void jfs_read_inode(struct inode *inode) { - if (diRead(inode)) { + if (diRead(inode)) { make_bad_inode(inode); return; } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index a76293767c7..455fa429204 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2002 + * Copyright (C) International Business Machines Corp., 2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_ACL diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h index 7f3e9ac454f..79c61805bd3 100644 --- a/fs/jfs/jfs_btree.h +++ b/fs/jfs/jfs_btree.h @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_BTREE diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 81f0e514c49..9c5d59632aa 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h index 9f2572aea56..40b20111383 100644 --- a/fs/jfs/jfs_dinode.h +++ b/fs/jfs/jfs_dinode.h @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2001 + * Copyright (C) International Business Machines Corp., 2000-2001 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_DINODE diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index f05ebb62918..23546c8fd48 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -30,28 +30,28 @@ * * the working state of the block allocation map is accessed in * two directions: - * + * * 1) allocation and free requests that start at the dmap * level and move up through the dmap control pages (i.e. * the vast majority of requests). - * - * 2) allocation requests that start at dmap control page + * + * 2) allocation requests that start at dmap control page * level and work down towards the dmaps. - * - * the serialization scheme used here is as follows. * - * requests which start at the bottom are serialized against each - * other through buffers and each requests holds onto its buffers - * as it works it way up from a single dmap to the required level + * the serialization scheme used here is as follows. + * + * requests which start at the bottom are serialized against each + * other through buffers and each requests holds onto its buffers + * as it works it way up from a single dmap to the required level * of dmap control page. * requests that start at the top are serialized against each other * and request that start from the bottom by the multiple read/single * write inode lock of the bmap inode. requests starting at the top * take this lock in write mode while request starting at the bottom * take the lock in read mode. a single top-down request may proceed - * exclusively while multiple bottoms-up requests may proceed - * simultaneously (under the protection of busy buffers). - * + * exclusively while multiple bottoms-up requests may proceed + * simultaneously (under the protection of busy buffers). + * * in addition to information found in dmaps and dmap control pages, * the working state of the block allocation map also includes read/ * write information maintained in the bmap descriptor (i.e. total @@ -59,7 +59,7 @@ * a single exclusive lock (BMAP_LOCK) is used to guard this information * in the face of multiple-bottoms up requests. * (lock ordering: IREAD_LOCK, BMAP_LOCK); - * + * * accesses to the persistent state of the block allocation map (limited * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. */ @@ -120,7 +120,7 @@ static int dbGetL2AGSize(s64 nblocks); /* * buddy table * - * table used for determining buddy sizes within characters of + * table used for determining buddy sizes within characters of * dmap bitmap words. the characters themselves serve as indexes * into the table, with the table elements yielding the maximum * binary buddy of free bits within the character. @@ -146,7 +146,7 @@ static const s8 budtab[256] = { /* - * NAME: dbMount() + * NAME: dbMount() * * FUNCTION: initializate the block allocation map. * @@ -223,12 +223,12 @@ int dbMount(struct inode *ipbmap) /* - * NAME: dbUnmount() + * NAME: dbUnmount() * * FUNCTION: terminate the block allocation map in preparation for * file system unmount. * - * the in-core bmap descriptor is written to disk and + * the in-core bmap descriptor is written to disk and * the memory for this descriptor is freed. * * PARAMETERS: @@ -311,7 +311,7 @@ int dbSync(struct inode *ipbmap) /* - * NAME: dbFree() + * NAME: dbFree() * * FUNCTION: free the specified block range from the working block * allocation map. @@ -397,7 +397,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks) * * FUNCTION: update the allocation state (free or allocate) of the * specified block range in the persistent block allocation map. - * + * * the blocks will be updated in the persistent map one * dmap at a time. * @@ -475,7 +475,7 @@ dbUpdatePMap(struct inode *ipbmap, /* update the bits of the dmap words. the first and last * words may only have a subset of their bits updated. if * this is the case, we'll work against that word (i.e. - * partial first and/or last) only in a single pass. a + * partial first and/or last) only in a single pass. a * single pass will also be used to update all words that * are to have all their bits updated. */ @@ -662,11 +662,11 @@ unlock: * the block allocation policy uses hints and a multi-step * approach. * - * for allocation requests smaller than the number of blocks + * for allocation requests smaller than the number of blocks * per dmap, we first try to allocate the new blocks * immediately following the hint. if these blocks are not * available, we try to allocate blocks near the hint. if - * no blocks near the hint are available, we next try to + * no blocks near the hint are available, we next try to * allocate within the same dmap as contains the hint. * * if no blocks are available in the dmap or the allocation @@ -713,7 +713,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) #endif /* _STILL_TO_PORT */ /* get the log2 number of blocks to be allocated. - * if the number of blocks is not a log2 multiple, + * if the number of blocks is not a log2 multiple, * it will be rounded up to the next log2 multiple. */ l2nb = BLKSTOL2(nblocks); @@ -906,7 +906,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) * validate extent request: * * note: defragfs policy: - * max 64 blocks will be moved. + * max 64 blocks will be moved. * allocation request size must be satisfied from a single dmap. */ if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { @@ -1333,7 +1333,7 @@ dbAllocNear(struct bmap * bmp, * or two sub-trees, depending on the allocation group size. * we search the top nodes of these subtrees left to right for * sufficient free space. if sufficient free space is found, - * the subtree is searched to find the leftmost leaf that + * the subtree is searched to find the leftmost leaf that * has free space. once we have made it to the leaf, we * move the search to the next lower level dmap control page * corresponding to this leaf. we continue down the dmap control @@ -1398,7 +1398,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * that fully describes the allocation group since the allocation * group is already fully described by a dmap. in this case, we * just call dbAllocCtl() to search the dmap tree and allocate the - * required space if available. + * required space if available. * * if the allocation group is completely free, dbAllocCtl() is * also called to allocate the required space. this is done for @@ -1450,7 +1450,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); - /* dmap control page trees fan-out by 4 and a single allocation + /* dmap control page trees fan-out by 4 and a single allocation * group may be described by 1 or 2 subtrees within the ag level * dmap control page, depending upon the ag size. examine the ag's * subtrees for sufficient free space, starting with the leftmost @@ -1633,7 +1633,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) /* starting at the specified dmap control page level and block * number, search down the dmap control levels for the starting - * block number of a dmap page that contains or starts off + * block number of a dmap page that contains or starts off * sufficient free blocks. */ for (lev = level, b = *blkno; lev >= 0; lev--) { @@ -1677,7 +1677,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) } /* adjust the block number to reflect the location within - * the dmap control page (i.e. the leaf) at which free + * the dmap control page (i.e. the leaf) at which free * space was found. */ b += (((s64) leafidx) << budmin); @@ -1700,12 +1700,12 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) * NAME: dbAllocCtl() * * FUNCTION: attempt to allocate a specified number of contiguous - * blocks starting within a specific dmap. - * + * blocks starting within a specific dmap. + * * this routine is called by higher level routines that search * the dmap control pages above the actual dmaps for contiguous * free space. the result of successful searches by these - * routines are the starting block numbers within dmaps, with + * routines are the starting block numbers within dmaps, with * the dmaps themselves containing the desired contiguous free * space or starting a contiguous free space of desired size * that is made up of the blocks of one or more dmaps. these @@ -1872,14 +1872,14 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) * * FUNCTION: attempt to allocate a specified number of contiguous blocks * from a specified dmap. - * + * * this routine checks if the contiguous blocks are available. * if so, nblocks of blocks are allocated; otherwise, ENOSPC is * returned. * * PARAMETERS: * mp - pointer to bmap descriptor - * dp - pointer to dmap to attempt to allocate blocks from. + * dp - pointer to dmap to attempt to allocate blocks from. * l2nb - log2 number of contiguous block desired. * nblocks - actual number of contiguous block desired. * results - on successful return, set to the starting block number @@ -1890,7 +1890,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) * -ENOSPC - insufficient disk resources * -EIO - i/o error * - * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or + * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; */ static int @@ -2032,7 +2032,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, /* root changed. bubble the change up to the dmap control pages. * if the adjustment of the upper level control pages fails, - * backout the deallocation. + * backout the deallocation. */ if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; @@ -2245,7 +2245,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * words (i.e. partial first and/or last) on an individual basis * (a single pass), freeing the bits of interest by hand and updating * the leaf corresponding to the dmap word. a single pass will be used - * for all dmap words fully contained within the specified range. + * for all dmap words fully contained within the specified range. * within this pass, the bits of all fully contained dmap words will * be marked as free in a single shot and the leaves will be updated. a * single leaf may describe the free space of multiple dmap words, @@ -2267,7 +2267,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, */ if (nb < DBWORD) { /* free (zero) the appropriate bits within this - * dmap word. + * dmap word. */ dp->wmap[word] &= cpu_to_le32(~(ONES << (DBWORD - nb) @@ -2327,7 +2327,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, BMAP_LOCK(bmp); - /* update the free count for the allocation group and + /* update the free count for the allocation group and * map. */ agno = blkno >> bmp->db_agl2size; @@ -2378,7 +2378,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, * or deallocation resulted in the root change. this range * is respresented by a single leaf of the current dmapctl * and the leaf will be updated with this value, possibly - * causing a binary buddy system within the leaves to be + * causing a binary buddy system within the leaves to be * split or joined. the update may also cause the dmapctl's * dmtree to be updated. * @@ -2590,7 +2590,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) } } - /* adjust the dmap tree to reflect the specified leaf's new + /* adjust the dmap tree to reflect the specified leaf's new * value. */ dbAdjTree(tp, leafno, newval); @@ -2638,7 +2638,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno) /* the back split is accomplished by iteratively finding the leaf * that starts the buddy system that contains the specified leaf and * splitting that system in two. this iteration continues until - * the specified leaf becomes the start of a buddy system. + * the specified leaf becomes the start of a buddy system. * * determine maximum possible l2 size for the specified leaf. */ @@ -2853,7 +2853,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) * NAME: dbFindLeaf() * * FUNCTION: search a dmtree_t for sufficient free blocks, returning - * the index of a leaf describing the free blocks if + * the index of a leaf describing the free blocks if * sufficient free blocks are found. * * the search starts at the top of the dmtree_t tree and @@ -2869,7 +2869,7 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval) * * RETURN VALUES: * 0 - success - * -ENOSPC - insufficient free blocks. + * -ENOSPC - insufficient free blocks. */ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) { @@ -3090,7 +3090,7 @@ static int blkstol2(s64 nb) /* - * NAME: dbAllocBottomUp() + * NAME: dbAllocBottomUp() * * FUNCTION: alloc the specified block range from the working block * allocation map. @@ -3241,7 +3241,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, BMAP_LOCK(bmp); /* if this allocation group is completely free, - * update the highest active allocation group number + * update the highest active allocation group number * if this allocation group is the new max. */ agno = blkno >> bmp->db_agl2size; @@ -3273,7 +3273,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, * NAME: dbExtendFS() * * FUNCTION: extend bmap from blkno for nblocks; - * dbExtendFS() updates bmap ready for dbAllocBottomUp(); + * dbExtendFS() updates bmap ready for dbAllocBottomUp(); * * L2 * | @@ -3284,7 +3284,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm * - * <---old---><----------------------------extend-----------------------> + * <---old---><----------------------------extend-----------------------> */ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) { @@ -3330,7 +3330,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; /* - * reconfigure db_agfree[] + * reconfigure db_agfree[] * from old AG configuration to new AG configuration; * * coalesce contiguous k (newAGSize/oldAGSize) AGs; @@ -3491,7 +3491,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) } /* for each dmap in a L0 */ /* - * build current L0 page from its leaves, and + * build current L0 page from its leaves, and * initialize corresponding parent L1 leaf */ *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); @@ -3515,7 +3515,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) } /* for each L0 in a L1 */ /* - * build current L1 page from its leaves, and + * build current L1 page from its leaves, and * initialize corresponding parent L2 leaf */ *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); @@ -3570,7 +3570,7 @@ void dbFinalizeBmap(struct inode *ipbmap) * finalize bmap control page */ //finalize: - /* + /* * compute db_agpref: preferred ag to allocate from * (the leftmost ag with average free space in it); */ @@ -3614,9 +3614,9 @@ void dbFinalizeBmap(struct inode *ipbmap) /* * compute db_aglevel, db_agheigth, db_width, db_agstart: - * an ag is covered in aglevel dmapctl summary tree, - * at agheight level height (from leaf) with agwidth number of nodes - * each, which starts at agstart index node of the smmary tree node + * an ag is covered in aglevel dmapctl summary tree, + * at agheight level height (from leaf) with agwidth number of nodes + * each, which starts at agstart index node of the smmary tree node * array; */ bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); @@ -3635,13 +3635,13 @@ void dbFinalizeBmap(struct inode *ipbmap) /* * NAME: dbInitDmap()/ujfs_idmap_page() - * + * * FUNCTION: initialize working/persistent bitmap of the dmap page * for the specified number of blocks: - * + * * at entry, the bitmaps had been initialized as free (ZEROS); - * The number of blocks will only account for the actually - * existing blocks. Blocks which don't actually exist in + * The number of blocks will only account for the actually + * existing blocks. Blocks which don't actually exist in * the aggregate will be marked as allocated (ONES); * * PARAMETERS: @@ -3677,7 +3677,7 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) /* * free the bits corresponding to the block range (ZEROS): - * note: not all bits of the first and last words may be contained + * note: not all bits of the first and last words may be contained * within the block range. */ for (r = nblocks; r > 0; r -= nb, blkno += nb) { @@ -3709,7 +3709,7 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) } /* - * mark bits following the range to be freed (non-existing + * mark bits following the range to be freed (non-existing * blocks) as allocated (ONES) */ @@ -3741,11 +3741,11 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) /* * NAME: dbInitDmapTree()/ujfs_complete_dmap() - * + * * FUNCTION: initialize summary tree of the specified dmap: * * at entry, bitmap of the dmap has been initialized; - * + * * PARAMETERS: * dp - dmap to complete * blkno - starting block number for this dmap @@ -3769,7 +3769,7 @@ static int dbInitDmapTree(struct dmap * dp) /* init each leaf from corresponding wmap word: * note: leaf is set to NOFREE(-1) if all blocks of corresponding - * bitmap word are allocated. + * bitmap word are allocated. */ cp = tp->stree + le32_to_cpu(tp->leafidx); for (i = 0; i < LPERDMAP; i++) @@ -3782,10 +3782,10 @@ static int dbInitDmapTree(struct dmap * dp) /* * NAME: dbInitTree()/ujfs_adjtree() - * + * * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. * - * at entry, the leaves of the tree has been initialized + * at entry, the leaves of the tree has been initialized * from corresponding bitmap word or root of summary tree * of the child control page; * configure binary buddy system at the leaf level, then @@ -3813,15 +3813,15 @@ static int dbInitTree(struct dmaptree * dtp) /* * configure the leaf levevl into binary buddy system * - * Try to combine buddies starting with a buddy size of 1 - * (i.e. two leaves). At a buddy size of 1 two buddy leaves - * can be combined if both buddies have a maximum free of l2min; - * the combination will result in the left-most buddy leaf having - * a maximum free of l2min+1. - * After processing all buddies for a given size, process buddies - * at the next higher buddy size (i.e. current size * 2) and - * the next maximum free (current free + 1). - * This continues until the maximum possible buddy combination + * Try to combine buddies starting with a buddy size of 1 + * (i.e. two leaves). At a buddy size of 1 two buddy leaves + * can be combined if both buddies have a maximum free of l2min; + * the combination will result in the left-most buddy leaf having + * a maximum free of l2min+1. + * After processing all buddies for a given size, process buddies + * at the next higher buddy size (i.e. current size * 2) and + * the next maximum free (current free + 1). + * This continues until the maximum possible buddy combination * yields maximum free. */ for (l2free = dtp->budmin, bsize = 1; l2free < l2max; @@ -3845,10 +3845,10 @@ static int dbInitTree(struct dmaptree * dtp) * bubble summary information of leaves up the tree. * * Starting at the leaf node level, the four nodes described by - * the higher level parent node are compared for a maximum free and - * this maximum becomes the value of the parent node. - * when all lower level nodes are processed in this fashion then - * move up to the next level (parent becomes a lower level node) and + * the higher level parent node are compared for a maximum free and + * this maximum becomes the value of the parent node. + * when all lower level nodes are processed in this fashion then + * move up to the next level (parent becomes a lower level node) and * continue the process for that level. */ for (child = le32_to_cpu(dtp->leafidx), @@ -3857,7 +3857,7 @@ static int dbInitTree(struct dmaptree * dtp) /* get index of 1st node of parent level */ parent = (child - 1) >> 2; - /* set the value of the parent node as the maximum + /* set the value of the parent node as the maximum * of the four nodes of the current level. */ for (i = 0, cp = tp + child, cp1 = tp + parent; @@ -3885,8 +3885,8 @@ static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) dcp->budmin = L2BPERDMAP + L2LPERCTL * level; /* - * initialize the leaves of current level that were not covered - * by the specified input block range (i.e. the leaves have no + * initialize the leaves of current level that were not covered + * by the specified input block range (i.e. the leaves have no * low level dmapctl or dmap). */ cp = &dcp->stree[CTLLEAFIND + i]; @@ -3900,9 +3900,9 @@ static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) /* * NAME: dbGetL2AGSize()/ujfs_getagl2size() - * + * * FUNCTION: Determine log2(allocation group size) from aggregate size - * + * * PARAMETERS: * nblocks - Number of blocks in aggregate * @@ -3935,8 +3935,8 @@ static int dbGetL2AGSize(s64 nblocks) /* * NAME: dbMapFileSizeToMapSize() - * - * FUNCTION: compute number of blocks the block allocation map file + * + * FUNCTION: compute number of blocks the block allocation map file * can cover from the map file size; * * RETURNS: Number of blocks which can be covered by this block map file; @@ -3968,7 +3968,7 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap) npages = nblocks >> JFS_SBI(sb)->l2nbperpage; level = BMAPPGTOLEV(npages); - /* At each level, accumulate the number of dmap pages covered by + /* At each level, accumulate the number of dmap pages covered by * the number of full child levels below it; * repeat for the last incomplete child level. */ @@ -3990,7 +3990,7 @@ s64 dbMapFileSizeToMapSize(struct inode * ipbmap) npages--; } - /* convert the number of dmaps into the number of blocks + /* convert the number of dmaps into the number of blocks * which can be covered by the dmaps; */ nblocks = ndmaps << L2BPERDMAP; diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 8b14cc8e022..45ea454c74b 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_DMAP @@ -27,7 +27,7 @@ #define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */ #define DBWORD 32 /* # of blks covered by a map word */ #define L2DBWORD 5 /* l2 # of blks covered by a mword */ -#define BUDMIN L2DBWORD /* max free string in a map word */ +#define BUDMIN L2DBWORD /* max free string in a map word */ #define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */ #define L2BPERDMAP 13 /* l2 num of blks per dmap */ #define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */ @@ -57,7 +57,7 @@ #define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */ -/* +/* * determine the maximum free string for four (lower level) nodes * of the tree. */ @@ -122,7 +122,7 @@ static __inline signed char TREEMAX(signed char *cp) #define BLKTOCTL(b,s,l) \ (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) -/* +/* * convert aggregate map size to the zero origin dmapctl level of the * top dmapctl. */ @@ -192,13 +192,13 @@ typedef union dmtree { /* macros for accessing fields within dmtree */ #define dmt_nleafs t1.nleafs -#define dmt_l2nleafs t1.l2nleafs -#define dmt_leafidx t1.leafidx -#define dmt_height t1.height -#define dmt_budmin t1.budmin -#define dmt_stree t1.stree +#define dmt_l2nleafs t1.l2nleafs +#define dmt_leafidx t1.leafidx +#define dmt_height t1.height +#define dmt_budmin t1.budmin +#define dmt_stree t1.stree -/* +/* * on-disk aggregate disk allocation map descriptor. */ struct dbmap_disk { @@ -237,7 +237,7 @@ struct dbmap { s64 dn_agsize; /* num of blks per alloc group */ signed char dn_maxfreebud; /* max free buddy system */ }; /* - 4096 - */ -/* +/* * in-memory aggregate disk allocation map descriptor. */ struct bmap { diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 6c3f0831984..ecb2216d881 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -78,7 +78,7 @@ * * case-insensitive search: * - * fold search key; + * fold search key; * * case-insensitive search of B-tree: * for internal entry, router key is already folded; @@ -93,7 +93,7 @@ * else * return no match; * - * serialization: + * serialization: * target directory inode lock is being held on entry/exit * of all main directory service routines. * @@ -925,7 +925,7 @@ int dtInsert(tid_t tid, struct inode *ip, * * return: 0 - success; * errno - failure; - * leaf page unpinned; + * leaf page unpinned; */ static int dtSplitUp(tid_t tid, struct inode *ip, struct dtsplit * split, struct btstack * btstack) @@ -3767,7 +3767,7 @@ static int ciCompare(struct component_name * key, /* search key */ * across page boundary * * return: non-zero on error - * + * */ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, int ri, struct component_name * key, int flag) @@ -3780,13 +3780,13 @@ static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, lkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_KERNEL); if (lkey.name == NULL) - return -ENOSPC; + return -ENOMEM; rkey.name = (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_KERNEL); if (rkey.name == NULL) { kfree(lkey.name); - return -ENOSPC; + return -ENOMEM; } /* get left and right key */ diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h index 13e4fdf0772..af8513f7864 100644 --- a/fs/jfs/jfs_dtree.h +++ b/fs/jfs/jfs_dtree.h @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_DTREE @@ -80,7 +80,7 @@ struct idtentry { /* * leaf node entry head/only segment * - * For legacy filesystems, name contains 13 wchars -- no index field + * For legacy filesystems, name contains 13 wchars -- no index field */ struct ldtentry { __le32 inumber; /* 4: 4-byte aligned */ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 933b7457bfb..a35bdca6a80 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -125,7 +125,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) } /* allocate the disk blocks for the extent. initially, extBalloc() - * will try to allocate disk blocks for the requested size (xlen). + * will try to allocate disk blocks for the requested size (xlen). * if this fails (xlen contiguous free blocks not avaliable), it'll * try to allocate a smaller number of blocks (producing a smaller * extent), with this smaller number of blocks consisting of the @@ -150,7 +150,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) /* determine the value of the extent flag */ xflag = abnr ? XAD_NOTRECORDED : 0; - /* if we can extend the hint extent to cover the current request, + /* if we can extend the hint extent to cover the current request, * extend it. otherwise, insert a new extent to * cover the current request. */ @@ -159,7 +159,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) else rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0); - /* if the extend or insert failed, + /* if the extend or insert failed, * free the newly allocated blocks and return the error. */ if (rc) { @@ -235,7 +235,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) xoff = offsetXAD(xp); /* if the extend page is abnr and if the request is for - * the extent to be allocated and recorded, + * the extent to be allocated and recorded, * make the page allocated and recorded. */ if ((xp->flag & XAD_NOTRECORDED) && !abnr) { @@ -397,7 +397,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) return (rc); - /* check if not extent exists for the previous page. + /* check if not extent exists for the previous page. * this is possible for sparse files. */ if (xadl.nxad == 0) { @@ -410,7 +410,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) */ xp->flag &= XAD_NOTRECORDED; - if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { + if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { jfs_error(ip->i_sb, "extHint: corrupt xtree"); return -EIO; } @@ -492,7 +492,7 @@ int extFill(struct inode *ip, xad_t * xp) * FUNCTION: allocate disk blocks to form an extent. * * initially, we will try to allocate disk blocks for the - * requested size (nblocks). if this fails (nblocks + * requested size (nblocks). if this fails (nblocks * contiguous free blocks not avaliable), we'll try to allocate * a smaller number of blocks (producing a smaller extent), with * this smaller number of blocks consisting of the requested @@ -500,7 +500,7 @@ int extFill(struct inode *ip, xad_t * xp) * number (i.e. 16 -> 8). we'll continue to round down and * retry the allocation until the number of blocks to allocate * is smaller than the number of blocks per page. - * + * * PARAMETERS: * ip - the inode of the file. * hint - disk block number to be used as an allocation hint. @@ -509,7 +509,7 @@ int extFill(struct inode *ip, xad_t * xp) * exit, this value is set to the number of blocks actually * allocated. * blkno - pointer to a block address that is filled in on successful - * return with the starting block number of the newly + * return with the starting block number of the newly * allocated block range. * * RETURN VALUES: @@ -530,7 +530,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* get the number of blocks to initially attempt to allocate. * we'll first try the number of blocks requested unless this * number is greater than the maximum number of contiguous free - * blocks in the map. in that case, we'll start off with the + * blocks in the map. in that case, we'll start off with the * maximum free. */ max = (s64) 1 << bmp->db_maxfreebud; @@ -582,19 +582,19 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * * FUNCTION: attempt to extend an extent's allocation. * - * initially, we will try to extend the extent's allocation - * in place. if this fails, we'll try to move the extent - * to a new set of blocks. if moving the extent, we initially + * Initially, we will try to extend the extent's allocation + * in place. If this fails, we'll try to move the extent + * to a new set of blocks. If moving the extent, we initially * will try to allocate disk blocks for the requested size - * (nnew). if this fails (new contiguous free blocks not - * avaliable), we'll try to allocate a smaller number of + * (newnblks). if this fails (new contiguous free blocks not + * avaliable), we'll try to allocate a smaller number of * blocks (producing a smaller extent), with this smaller * number of blocks consisting of the requested number of * blocks rounded down to the next smaller power of 2 - * number (i.e. 16 -> 8). we'll continue to round down and + * number (i.e. 16 -> 8). We'll continue to round down and * retry the allocation until the number of blocks to allocate * is smaller than the number of blocks per page. - * + * * PARAMETERS: * ip - the inode of the file. * blkno - starting block number of the extents current allocation. @@ -625,7 +625,7 @@ extBrealloc(struct inode *ip, return (rc); } - /* in place extension not possible. + /* in place extension not possible. * try to move the extent to a new set of blocks. */ return (extBalloc(ip, blkno, newnblks, newblkno)); diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h index 3a7f3f22e98..b567e12c52d 100644 --- a/fs/jfs/jfs_extent.h +++ b/fs/jfs/jfs_extent.h @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2001 + * Copyright (C) International Business Machines Corp., 2000-2001 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_EXTENT diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h index 72a5588faec..9901928668c 100644 --- a/fs/jfs/jfs_filsys.h +++ b/fs/jfs/jfs_filsys.h @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_FILSYS @@ -21,9 +21,9 @@ /* * jfs_filsys.h * - * file system (implementation-dependent) constants + * file system (implementation-dependent) constants * - * refer to <limits.h> for system wide implementation-dependent constants + * refer to <limits.h> for system wide implementation-dependent constants */ /* @@ -49,7 +49,7 @@ #define JFS_DFS 0x20000000 /* DCE DFS LFS support */ -#define JFS_LINUX 0x10000000 /* Linux support */ +#define JFS_LINUX 0x10000000 /* Linux support */ /* case-sensitive name/directory support */ /* directory option */ @@ -59,7 +59,7 @@ #define JFS_COMMIT 0x00000f00 /* commit option mask */ #define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ #define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ -#define JFS_TMPFS 0x00000400 /* temporary file system - +#define JFS_TMPFS 0x00000400 /* temporary file system - * do not log/commit: */ @@ -196,7 +196,7 @@ * followed by 1st extent of map */ #define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1)) - /* + /* * 1st extent of aggregate inode table */ #define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE) @@ -270,13 +270,13 @@ */ #define FM_CLEAN 0x00000000 /* file system is unmounted and clean */ #define FM_MOUNT 0x00000001 /* file system is mounted cleanly */ -#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean - * when mounted or +#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean + * when mounted or * commit failure occurred while being mounted: - * fsck() must be run to repair + * fsck() must be run to repair */ #define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed: - * fsck() must be run to repair + * fsck() must be run to repair */ #define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index a45ee248958..489a3d63002 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -98,7 +98,7 @@ static void copy_to_dinode(struct dinode *, struct inode *); * FUNCTION: initialize the incore inode map control structures for * a fileset or aggregate init time. * - * the inode map's control structure (dinomap) is + * the inode map's control structure (dinomap) is * brought in from disk and placed in virtual memory. * * PARAMETERS: @@ -107,7 +107,7 @@ static void copy_to_dinode(struct dinode *, struct inode *); * RETURN VALUES: * 0 - success * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * -EIO - i/o error. */ int diMount(struct inode *ipimap) { @@ -191,7 +191,7 @@ int diMount(struct inode *ipimap) * RETURN VALUES: * 0 - success * -ENOMEM - insufficient free virtual memory. - * -EIO - i/o error. + * -EIO - i/o error. */ int diUnmount(struct inode *ipimap, int mounterror) { @@ -281,7 +281,7 @@ int diSync(struct inode *ipimap) * on entry, the specifed incore inode should itself * specify the disk inode number corresponding to the * incore inode (i.e. i_number should be initialized). - * + * * this routine handles incore inode initialization for * both "special" and "regular" inodes. special inodes * are those required early in the mount process and @@ -289,7 +289,7 @@ int diSync(struct inode *ipimap) * is not yet initialized. these "special" inodes are * identified by a NULL inode map inode pointer and are * actually initialized by a call to diReadSpecial(). - * + * * for regular inodes, the iag describing the disk inode * is read from disk to determine the inode extent address * for the disk inode. with the inode extent address in @@ -302,9 +302,9 @@ int diSync(struct inode *ipimap) * * RETURN VALUES: * 0 - success - * -EIO - i/o error. + * -EIO - i/o error. * -ENOMEM - insufficient memory - * + * */ int diRead(struct inode *ip) { @@ -586,14 +586,14 @@ void diFreeSpecial(struct inode *ip) * page of the extent that contains the disk inode is * read and the disk inode portion of the incore inode * is copied to the disk inode. - * + * * PARAMETERS: * tid - transacation id * ip - pointer to incore inode to be written to the inode extent. * * RETURN VALUES: * 0 - success - * -EIO - i/o error. + * -EIO - i/o error. */ int diWrite(tid_t tid, struct inode *ip) { @@ -676,11 +676,11 @@ int diWrite(tid_t tid, struct inode *ip) * copy btree root from in-memory inode to on-disk inode * * (tlock is taken from inline B+-tree root in in-memory - * inode when the B+-tree root is updated, which is pointed + * inode when the B+-tree root is updated, which is pointed * by jfs_ip->blid as well as being on tx tlock list) * - * further processing of btree root is based on the copy - * in in-memory inode, where txLog() will log from, and, + * further processing of btree root is based on the copy + * in in-memory inode, where txLog() will log from, and, * for xtree root, txUpdateMap() will update map and reset * XAD_NEW bit; */ @@ -824,7 +824,7 @@ int diWrite(tid_t tid, struct inode *ip) memcpy(&dp->di_DASD, &ip->i_DASD, sizeof(struct dasd)); #endif /* _JFS_FASTDASD */ - /* release the buffer holding the updated on-disk inode. + /* release the buffer holding the updated on-disk inode. * the buffer will be later written by commit processing. */ write_metapage(mp); @@ -842,7 +842,7 @@ int diWrite(tid_t tid, struct inode *ip) * if the inode to be freed represents the first (only) * free inode within the iag, the iag will be placed on * the ag free inode list. - * + * * freeing the inode will cause the inode extent to be * freed if the inode is the only allocated inode within * the extent. in this case all the disk resource backing @@ -865,11 +865,11 @@ int diWrite(tid_t tid, struct inode *ip) * any updates and are held until all updates are complete. * * PARAMETERS: - * ip - inode to be freed. + * ip - inode to be freed. * * RETURN VALUES: * 0 - success - * -EIO - i/o error. + * -EIO - i/o error. */ int diFree(struct inode *ip) { @@ -898,7 +898,7 @@ int diFree(struct inode *ip) */ iagno = INOTOIAG(inum); - /* make sure that the iag is contained within + /* make sure that the iag is contained within * the map. */ if (iagno >= imap->im_nextiag) { @@ -1013,7 +1013,7 @@ int diFree(struct inode *ip) /* update the free inode summary map for the extent if * freeing the inode means the extent will now have free - * inodes (i.e., the inode being freed is the first free + * inodes (i.e., the inode being freed is the first free * inode of extent), */ if (iagp->wmap[extno] == cpu_to_le32(ONES)) { @@ -1204,9 +1204,9 @@ int diFree(struct inode *ip) iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); } - /* update the inode extent address and working map + /* update the inode extent address and working map * to reflect the free extent. - * the permanent map should have been updated already + * the permanent map should have been updated already * for the inode being freed. */ if (iagp->pmap[extno] != 0) { @@ -1218,7 +1218,7 @@ int diFree(struct inode *ip) /* update the free extent and free inode summary maps * to reflect the freed extent. - * the inode summary map is marked to indicate no inodes + * the inode summary map is marked to indicate no inodes * available for the freed extent. */ sword = extno >> L2EXTSPERSUM; @@ -1255,17 +1255,17 @@ int diFree(struct inode *ip) * start transaction to update block allocation map * for the inode extent freed; * - * N.B. AG_LOCK is released and iag will be released below, and + * N.B. AG_LOCK is released and iag will be released below, and * other thread may allocate inode from/reusing the ixad freed - * BUT with new/different backing inode extent from the extent - * to be freed by the transaction; + * BUT with new/different backing inode extent from the extent + * to be freed by the transaction; */ tid = txBegin(ipimap->i_sb, COMMIT_FORCE); mutex_lock(&JFS_IP(ipimap)->commit_mutex); - /* acquire tlock of the iag page of the freed ixad + /* acquire tlock of the iag page of the freed ixad * to force the page NOHOMEOK (even though no data is - * logged from the iag page) until NOREDOPAGE|FREEXTENT log + * logged from the iag page) until NOREDOPAGE|FREEXTENT log * for the free of the extent is committed; * write FREEXTENT|NOREDOPAGE log record * N.B. linelock is overlaid as freed extent descriptor; @@ -1284,8 +1284,8 @@ int diFree(struct inode *ip) * logredo needs the IAG number and IAG extent index in order * to ensure that the IMap is consistent. The least disruptive * way to pass these values through to the transaction manager - * is in the iplist array. - * + * is in the iplist array. + * * It's not pretty, but it works. */ iplist[1] = (struct inode *) (size_t)iagno; @@ -1340,18 +1340,18 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) /* * NAME: diAlloc(pip,dir,ip) * - * FUNCTION: allocate a disk inode from the inode working map + * FUNCTION: allocate a disk inode from the inode working map * for a fileset or aggregate. * * PARAMETERS: - * pip - pointer to incore inode for the parent inode. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. */ int diAlloc(struct inode *pip, bool dir, struct inode *ip) { @@ -1372,7 +1372,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) JFS_IP(ip)->ipimap = ipimap; JFS_IP(ip)->fileset = FILESYSTEM_I; - /* for a directory, the allocation policy is to start + /* for a directory, the allocation policy is to start * at the ag level using the preferred ag. */ if (dir) { @@ -1435,7 +1435,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) /* * try to allocate from the IAG */ - /* check if the inode may be allocated from the iag + /* check if the inode may be allocated from the iag * (i.e. the inode has free inodes or new extent can be added). */ if (iagp->nfreeinos || addext) { @@ -1490,7 +1490,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) * hint or, if appropriate (i.e. addext is true), allocate * an extent of free inodes at or following the extent * containing the hint. - * + * * the free inode and free extent summary maps are used * here, so determine the starting summary map position * and the number of words we'll have to examine. again, @@ -1641,7 +1641,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) * inodes should be added for the allocation group, with * the current request satisfied from this extent. if this * is the case, an attempt will be made to do just that. if - * this attempt fails or it has been determined that a new + * this attempt fails or it has been determined that a new * extent should not be added, an attempt is made to satisfy * the request by allocating an existing (backed) free inode * from the allocation group. @@ -1649,24 +1649,24 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) * PRE CONDITION: Already have the AG lock for this AG. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group to allocate from. - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to the new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. */ static int diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) { int rc, addext, numfree, numinos; - /* get the number of free and the number of backed disk + /* get the number of free and the number of backed disk * inodes currently within the ag. */ numfree = imap->im_agctl[agno].numfree; @@ -1719,17 +1719,17 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) * specified primary group. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - primary allocation group (to avoid). - * dir - 'true' if the new disk inode is for a directory. - * ip - pointer to a new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. */ static int diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) @@ -1738,7 +1738,7 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; - /* try to allocate from the ags following agno up to + /* try to allocate from the ags following agno up to * the maximum ag number. */ for (ag = agno + 1; ag <= maxag; ag++) { @@ -1780,21 +1780,21 @@ diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) * * allocation occurs from the first iag on the list using * the iag's free inode summary map to find the leftmost - * free inode in the iag. - * + * free inode in the iag. + * * PRE CONDITION: Already have AG lock for this AG. - * + * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. */ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) { @@ -1867,7 +1867,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) return -EIO; } - /* compute the inode number within the iag. + /* compute the inode number within the iag. */ ino = (extno << L2INOSPEREXT) + rem; @@ -1892,17 +1892,17 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) /* * NAME: diAllocExt(imap,agno,ip) * - * FUNCTION: add a new extent of free inodes to an iag, allocating - * an inode from this extent to satisfy the current allocation - * request. - * + * FUNCTION: add a new extent of free inodes to an iag, allocating + * an inode from this extent to satisfy the current allocation + * request. + * * this routine first tries to find an existing iag with free * extents through the ag free extent list. if list is not * empty, the head of the list will be selected as the home * of the new extent of free inodes. otherwise (the list is * empty), a new iag will be allocated for the ag to contain * the extent. - * + * * once an iag has been selected, the free extent summary map * is used to locate a free extent within the iag and diNewExt() * is called to initialize the extent, with initialization @@ -1910,16 +1910,16 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) * for the purpose of satisfying this request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * agno - allocation group number. - * ip - pointer to new inode to be filled in on successful return + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. */ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) { @@ -2012,7 +2012,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) /* * NAME: diAllocBit(imap,iagp,ino) * - * FUNCTION: allocate a backed inode from an iag. + * FUNCTION: allocate a backed inode from an iag. * * this routine performs the mechanics of allocating a * specified inode from a backed extent. @@ -2025,19 +2025,19 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) * in the face of updates to multiple buffers. under this * approach, all required buffers are obtained before making * any updates and are held all are updates are complete. - * + * * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * ino - inode number to be allocated within the iag. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. */ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) { @@ -2172,19 +2172,19 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) * buffers. under this approach, all required buffers are * obtained before making any updates and are held until all * updates are complete. - * + * * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on * this AG. Must have read lock on imap inode. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagp - pointer to iag. - * extno - extent number. + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. */ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) { @@ -2432,34 +2432,34 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) /* * NAME: diNewIAG(imap,iagnop,agno) * - * FUNCTION: allocate a new iag for an allocation group. - * - * first tries to allocate the iag from the inode map - * iagfree list: - * if the list has free iags, the head of the list is removed + * FUNCTION: allocate a new iag for an allocation group. + * + * first tries to allocate the iag from the inode map + * iagfree list: + * if the list has free iags, the head of the list is removed * and returned to satisfy the request. * if the inode map's iag free list is empty, the inode map * is extended to hold a new iag. this new iag is initialized * and returned to satisfy the request. * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagnop - pointer to an iag number set with the number of the + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the * newly allocated iag upon successful return. - * agno - allocation group number. + * agno - allocation group number. * bpp - Buffer pointer to be filled in with new IAG's buffer * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. - * -EIO - i/o error. + * -EIO - i/o error. * - * serialization: + * serialization: * AG lock held on entry/exit; * write lock on the map is held inside; * read lock on the map is held on successful completion; * - * note: new iag transaction: + * note: new iag transaction: * . synchronously write iag; * . write log of xtree and inode of imap; * . commit; @@ -2494,7 +2494,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) /* acquire the free iag lock */ IAGFREE_LOCK(imap); - /* if there are any iags on the inode map free iag list, + /* if there are any iags on the inode map free iag list, * allocate the iag from the head of the list. */ if (imap->im_freeiag >= 0) { @@ -2618,8 +2618,8 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) flush_metapage(mp); /* - * txCommit(COMMIT_FORCE) will synchronously write address - * index pages and inode after commit in careful update order + * txCommit(COMMIT_FORCE) will synchronously write address + * index pages and inode after commit in careful update order * of address index pages (right to left, bottom up); */ iplist[0] = ipimap; @@ -2678,11 +2678,11 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) * * FUNCTION: get the buffer for the specified iag within a fileset * or aggregate inode map. - * + * * PARAMETERS: - * imap - pointer to inode map control structure. - * iagno - iag number. - * bpp - point to buffer pointer to be filled in on successful + * imap - pointer to inode map control structure. + * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful * exit. * * SERIALIZATION: @@ -2692,7 +2692,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) * * RETURN VALUES: * 0 - success. - * -EIO - i/o error. + * -EIO - i/o error. */ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) { @@ -2718,8 +2718,8 @@ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) * the specified bit position. * * PARAMETERS: - * word - word to be examined. - * start - starting bit position. + * word - word to be examined. + * start - starting bit position. * * RETURN VALUES: * bit position of first free bit in the word or 32 if @@ -2740,10 +2740,10 @@ static int diFindFree(u32 word, int start) /* * NAME: diUpdatePMap() - * - * FUNCTION: Update the persistent map in an IAG for the allocation or + * + * FUNCTION: Update the persistent map in an IAG for the allocation or * freeing of the specified inode. - * + * * PRE CONDITIONS: Working map has already been updated for allocate. * * PARAMETERS: @@ -2752,7 +2752,7 @@ static int diFindFree(u32 word, int start) * is_free - If 'true' indicates inode should be marked freed, otherwise * indicates inode should be marked allocated. * - * RETURN VALUES: + * RETURN VALUES: * 0 for success */ int @@ -2793,7 +2793,7 @@ diUpdatePMap(struct inode *ipimap, extno = ino >> L2INOSPEREXT; bitno = ino & (INOSPEREXT - 1); mask = HIGHORDER >> bitno; - /* + /* * mark the inode free in persistent map: */ if (is_free) { @@ -2803,7 +2803,7 @@ diUpdatePMap(struct inode *ipimap, * of last reference release; */ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { - jfs_error(ipimap->i_sb, + jfs_error(ipimap->i_sb, "diUpdatePMap: inode %ld not marked as " "allocated in wmap!", inum); } @@ -2877,8 +2877,8 @@ diUpdatePMap(struct inode *ipimap, * diExtendFS() * * function: update imap for extendfs(); - * - * note: AG size has been increased s.t. each k old contiguous AGs are + * + * note: AG size has been increased s.t. each k old contiguous AGs are * coalesced into a new AG; */ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) @@ -2897,7 +2897,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) atomic_read(&imap->im_numfree)); /* - * reconstruct imap + * reconstruct imap * * coalesce contiguous k (newAGSize/oldAGSize) AGs; * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; @@ -2931,7 +2931,7 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) } /* leave free iag in the free iag list */ - if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { release_metapage(bp); continue; } diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h index e3b7db47db6..4f9c346ed49 100644 --- a/fs/jfs/jfs_imap.h +++ b/fs/jfs/jfs_imap.h @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_IMAP @@ -45,13 +45,13 @@ /* get the starting block number of the 4K page of an inode extent * that contains ino. */ -#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \ +#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \ ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg))) /* * inode allocation map: - * - * inode allocation map consists of + * + * inode allocation map consists of * . the inode map control page and * . inode allocation group pages (per 4096 inodes) * which are addressed by standard JFS xtree. diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 54d73716ca8..94005584445 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -4,18 +4,18 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ + */ #ifndef _H_JFS_INCORE #define _H_JFS_INCORE diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index bffaca9ae3a..4c67ed97682 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -61,7 +61,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) inode = new_inode(sb); if (!inode) { jfs_warn("ialloc: new_inode returned NULL!"); - return inode; + return ERR_PTR(-ENOMEM); } jfs_inode = JFS_IP(inode); @@ -69,9 +69,10 @@ struct inode *ialloc(struct inode *parent, umode_t mode) rc = diAlloc(parent, S_ISDIR(mode), inode); if (rc) { jfs_warn("ialloc: diAlloc returned %d!", rc); - make_bad_inode(inode); + if (rc == -EIO) + make_bad_inode(inode); iput(inode); - return NULL; + return ERR_PTR(rc); } inode->i_uid = current->fsuid; @@ -97,7 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) inode->i_flags |= S_NOQUOTA; inode->i_nlink = 0; iput(inode); - return NULL; + return ERR_PTR(-EDQUOT); } inode->i_mode = mode; diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 1fc48df670c..0d06ccfaff0 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_INODE diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h index 70ac9f7d1e0..7d78e83d7c4 100644 --- a/fs/jfs/jfs_lock.h +++ b/fs/jfs/jfs_lock.h @@ -1,19 +1,19 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2001 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2001 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_LOCK diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 3315f0b1fbc..b89c9aba046 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -337,7 +337,7 @@ int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * PARAMETER: cd - commit descriptor * * RETURN: end-of-log address - * + * * serialization: LOG_LOCK() held on entry/exit */ static int @@ -554,7 +554,7 @@ lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * PARAMETER: log * * RETURN: 0 - * + * * serialization: LOG_LOCK() held on entry/exit */ static int lmNextPage(struct jfs_log * log) @@ -656,7 +656,7 @@ static int lmNextPage(struct jfs_log * log) * page number - redrive pageout of the page at the head of * pageout queue until full page has been written. * - * RETURN: + * RETURN: * * NOTE: * LOGGC_LOCK serializes log group commit queue, and @@ -920,10 +920,10 @@ static void lmPostGC(struct lbuf * bp) * this code is called again. * * PARAMETERS: log - log structure - * hard_sync - 1 to force all metadata to be written + * hard_sync - 1 to force all metadata to be written * * RETURN: 0 - * + * * serialization: LOG_LOCK() held on entry/exit */ static int lmLogSync(struct jfs_log * log, int hard_sync) @@ -1052,7 +1052,7 @@ static int lmLogSync(struct jfs_log * log, int hard_sync) * FUNCTION: write log SYNCPT record for specified log * * PARAMETERS: log - log structure - * hard_sync - set to 1 to force metadata to be written + * hard_sync - set to 1 to force metadata to be written */ void jfs_syncpt(struct jfs_log *log, int hard_sync) { LOG_LOCK(log); @@ -1067,7 +1067,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) * insert filesystem in the active list of the log. * * PARAMETER: ipmnt - file system mount inode - * iplog - log inode (out) + * iplog - log inode (out) * * RETURN: * @@ -1082,7 +1082,7 @@ int lmLogOpen(struct super_block *sb) if (sbi->flag & JFS_NOINTEGRITY) return open_dummy_log(sb); - + if (sbi->mntflag & JFS_INLINELOG) return open_inline_log(sb); @@ -1131,7 +1131,7 @@ int lmLogOpen(struct super_block *sb) log->bdev = bdev; memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); - + /* * initialize log: */ @@ -1253,13 +1253,13 @@ static int open_dummy_log(struct super_block *sb) * initialize the log from log superblock. * set the log state in the superblock to LOGMOUNT and * write SYNCPT log record. - * + * * PARAMETER: log - log structure * * RETURN: 0 - if ok * -EINVAL - bad log magic number or superblock dirty * error returned from logwait() - * + * * serialization: single first open thread */ int lmLogInit(struct jfs_log * log) @@ -1297,7 +1297,7 @@ int lmLogInit(struct jfs_log * log) if (!test_bit(log_INLINELOG, &log->flag)) log->l2bsize = L2LOGPSIZE; - + /* check for disabled journaling to disk */ if (log->no_integrity) { /* @@ -1651,7 +1651,7 @@ void jfs_flush_journal(struct jfs_log *log, int wait) * PARAMETER: log - log inode * * RETURN: 0 - success - * + * * serialization: single last close thread */ int lmLogShutdown(struct jfs_log * log) @@ -1677,7 +1677,7 @@ int lmLogShutdown(struct jfs_log * log) lrd.type = cpu_to_le16(LOG_SYNCPT); lrd.length = 0; lrd.log.syncpt.sync = 0; - + lsn = lmWriteRecord(log, NULL, &lrd, NULL); bp = log->bp; lp = (struct logpage *) bp->l_ldata; @@ -1703,7 +1703,7 @@ int lmLogShutdown(struct jfs_log * log) jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", lsn, log->page, log->eor); - out: + out: /* * shutdown per log i/o */ @@ -1769,7 +1769,7 @@ static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, lbmFree(bpsuper); return -EIO; } - + } /* diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 8c6909b8001..a53fb17ea21 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_LOGMGR @@ -35,19 +35,19 @@ /* * log logical volume * - * a log is used to make the commit operation on journalled + * a log is used to make the commit operation on journalled * files within the same logical volume group atomic. * a log is implemented with a logical volume. - * there is one log per logical volume group. + * there is one log per logical volume group. * * block 0 of the log logical volume is not used (ipl etc). * block 1 contains a log "superblock" and is used by logFormat(), - * lmLogInit(), lmLogShutdown(), and logRedo() to record status - * of the log but is not otherwise used during normal processing. + * lmLogInit(), lmLogShutdown(), and logRedo() to record status + * of the log but is not otherwise used during normal processing. * blocks 2 - (N-1) are used to contain log records. * - * when a volume group is varied-on-line, logRedo() must have - * been executed before the file systems (logical volumes) in + * when a volume group is varied-on-line, logRedo() must have + * been executed before the file systems (logical volumes) in * the volume group can be mounted. */ /* @@ -97,26 +97,26 @@ struct logsuper { * log logical page * * (this comment should be rewritten !) - * the header and trailer structures (h,t) will normally have + * the header and trailer structures (h,t) will normally have * the same page and eor value. - * An exception to this occurs when a complete page write is not + * An exception to this occurs when a complete page write is not * accomplished on a power failure. Since the hardware may "split write" - * sectors in the page, any out of order sequence may occur during powerfail + * sectors in the page, any out of order sequence may occur during powerfail * and needs to be recognized during log replay. The xor value is * an "exclusive or" of all log words in the page up to eor. This * 32 bit eor is stored with the top 16 bits in the header and the * bottom 16 bits in the trailer. logredo can easily recognize pages - * that were not completed by reconstructing this eor and checking + * that were not completed by reconstructing this eor and checking * the log page. * - * Previous versions of the operating system did not allow split - * writes and detected partially written records in logredo by - * ordering the updates to the header, trailer, and the move of data - * into the logdata area. The order: (1) data is moved (2) header - * is updated (3) trailer is updated. In logredo, when the header - * differed from the trailer, the header and trailer were reconciled - * as follows: if h.page != t.page they were set to the smaller of - * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) + * Previous versions of the operating system did not allow split + * writes and detected partially written records in logredo by + * ordering the updates to the header, trailer, and the move of data + * into the logdata area. The order: (1) data is moved (2) header + * is updated (3) trailer is updated. In logredo, when the header + * differed from the trailer, the header and trailer were reconciled + * as follows: if h.page != t.page they were set to the smaller of + * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) * h.eor != t.eor they were set to the smaller of their two values. */ struct logpage { @@ -147,20 +147,20 @@ struct logpage { * in a page, pages are written to temporary paging space if * if they must be written to disk before commit, and i/o is * scheduled for modified pages to their home location after - * the log records containing the after values and the commit + * the log records containing the after values and the commit * record is written to the log on disk, undo discards the copy * in main-memory.) * - * a log record consists of a data area of variable length followed by + * a log record consists of a data area of variable length followed by * a descriptor of fixed size LOGRDSIZE bytes. - * the data area is rounded up to an integral number of 4-bytes and + * the data area is rounded up to an integral number of 4-bytes and * must be no longer than LOGPSIZE. - * the descriptor is of size of multiple of 4-bytes and aligned on a - * 4-byte boundary. + * the descriptor is of size of multiple of 4-bytes and aligned on a + * 4-byte boundary. * records are packed one after the other in the data area of log pages. - * (sometimes a DUMMY record is inserted so that at least one record ends + * (sometimes a DUMMY record is inserted so that at least one record ends * on every page or the longest record is placed on at most two pages). - * the field eor in page header/trailer points to the byte following + * the field eor in page header/trailer points to the byte following * the last record on a page. */ @@ -270,11 +270,11 @@ struct lrd { /* * NOREDOINOEXT: the inode extent is freed * - * do not apply after-image records which precede this - * record in the log with the any of the 4 page block - * numbers in this inode extent. - * - * NOTE: The fileset and pxd fields MUST remain in + * do not apply after-image records which precede this + * record in the log with the any of the 4 page block + * numbers in this inode extent. + * + * NOTE: The fileset and pxd fields MUST remain in * the same fields in the REDOPAGE record format. * */ @@ -319,12 +319,10 @@ struct lrd { * do not apply records which precede this record in the log * with the same inode number. * - * NOREDILE must be the first to be written at commit + * NOREDOFILE must be the first to be written at commit * (last to be read in logredo()) - it prevents * replay of preceding updates of all preceding generations - * of the inumber esp. the on-disk inode itself, - * but does NOT prevent - * replay of the + * of the inumber esp. the on-disk inode itself. */ struct { __le32 fileset; /* 4: fileset number */ @@ -332,7 +330,7 @@ struct lrd { } noredofile; /* - * ? NEWPAGE: + * ? NEWPAGE: * * metadata type dependent */ @@ -464,7 +462,7 @@ struct lbuf { s64 l_blkno; /* 8: log page block number */ caddr_t l_ldata; /* 4: data page */ struct page *l_page; /* The page itself */ - uint l_offset; /* Offset of l_ldata within the page */ + uint l_offset; /* Offset of l_ldata within the page */ wait_queue_head_t l_ioevent; /* 4: i/o done event */ }; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index f5afc129d6b..0cccd1c39d7 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -461,7 +461,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) goto add_failed; if (!bio->bi_size) goto dump_bio; - + submit_bio(WRITE, bio); } if (redirty) @@ -648,7 +648,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, jfs_err("logical_size = %d, size = %d", mp->logical_size, size); dump_stack(); - goto unlock; + goto unlock; } mp->count++; lock_metapage(mp); @@ -658,7 +658,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, "__get_metapage: using a " "discarded metapage"); discard_metapage(mp); - goto unlock; + goto unlock; } clear_bit(META_discard, &mp->flag); } diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h index 01a5a455e01..d94f8d9e87d 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_METAPAGE @@ -33,7 +33,7 @@ struct metapage { unsigned long flag; /* See Below */ unsigned long count; /* Reference count */ void *data; /* Data pointer */ - sector_t index; /* block address of page */ + sector_t index; /* block address of page */ wait_queue_head_t wait; /* implementation */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 032d111bc33..4dd47983489 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -21,18 +21,18 @@ * * note: file system in transition to aggregate/fileset: * - * file system mount is interpreted as the mount of aggregate, - * if not already mounted, and mount of the single/only fileset in + * file system mount is interpreted as the mount of aggregate, + * if not already mounted, and mount of the single/only fileset in * the aggregate; * * a file system/aggregate is represented by an internal inode * (aka mount inode) initialized with aggregate superblock; - * each vfs represents a fileset, and points to its "fileset inode + * each vfs represents a fileset, and points to its "fileset inode * allocation map inode" (aka fileset inode): - * (an aggregate itself is structured recursively as a filset: - * an internal vfs is constructed and points to its "fileset inode - * allocation map inode" (aka aggregate inode) where each inode - * represents a fileset inode) so that inode number is mapped to + * (an aggregate itself is structured recursively as a filset: + * an internal vfs is constructed and points to its "fileset inode + * allocation map inode" (aka aggregate inode) where each inode + * represents a fileset inode) so that inode number is mapped to * on-disk inode in uniform way at both aggregate and fileset level; * * each vnode/inode of a fileset is linked to its vfs (to facilitate @@ -41,7 +41,7 @@ * per aggregate information, e.g., block size, etc.) as well as * its file set inode. * - * aggregate + * aggregate * ipmnt * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap; * fileset vfs -> vp(1) <-> ... <-> vp(n) <->vproot; @@ -88,7 +88,7 @@ int jfs_mount(struct super_block *sb) struct inode *ipbmap = NULL; /* - * read/validate superblock + * read/validate superblock * (initialize mount inode from the superblock) */ if ((rc = chkSuper(sb))) { @@ -238,7 +238,7 @@ int jfs_mount(struct super_block *sb) */ int jfs_mount_rw(struct super_block *sb, int remount) { - struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_sb_info *sbi = JFS_SBI(sb); int rc; /* @@ -291,7 +291,7 @@ int jfs_mount_rw(struct super_block *sb, int remount) /* * chkSuper() * - * validate the superblock of the file system to be mounted and + * validate the superblock of the file system to be mounted and * get the file system parameters. * * returns @@ -426,7 +426,7 @@ int updateSuper(struct super_block *sb, uint state) jfs_err("updateSuper: bad state"); } else if (sbi->state == FM_DIRTY) return 0; - + if ((rc = readSuper(sb, &bh))) return rc; @@ -486,9 +486,9 @@ int readSuper(struct super_block *sb, struct buffer_head **bpp) * for this file system past this point in log. * it is harmless if mount fails. * - * note: MOUNT record is at aggregate level, not at fileset level, + * note: MOUNT record is at aggregate level, not at fileset level, * since log records of previous mounts of a fileset - * (e.g., AFTER record of extent allocation) have to be processed + * (e.g., AFTER record of extent allocation) have to be processed * to update block allocation map at aggregate level. */ static int logMOUNT(struct super_block *sb) diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h index 682cf1a68a1..884fc21ab8e 100644 --- a/fs/jfs/jfs_superblock.h +++ b/fs/jfs/jfs_superblock.h @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_SUPERBLOCK @@ -21,14 +21,14 @@ /* * make the magic number something a human could read */ -#define JFS_MAGIC "JFS1" /* Magic word */ +#define JFS_MAGIC "JFS1" /* Magic word */ #define JFS_VERSION 2 /* Version number: Version 2 */ #define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */ -/* - * aggregate superblock +/* + * aggregate superblock * * The name superblock is too close to super_block, so the name has been * changed to jfs_superblock. The utilities are still using the old name. @@ -40,7 +40,7 @@ struct jfs_superblock { __le64 s_size; /* 8: aggregate size in hardware/LVM blocks; * VFS: number of blocks */ - __le32 s_bsize; /* 4: aggregate block size in bytes; + __le32 s_bsize; /* 4: aggregate block size in bytes; * VFS: fragment size */ __le16 s_l2bsize; /* 2: log2 of s_bsize */ @@ -54,7 +54,7 @@ struct jfs_superblock { __le32 s_flag; /* 4: aggregate attributes: * see jfs_filsys.h */ - __le32 s_state; /* 4: mount/unmount/recovery state: + __le32 s_state; /* 4: mount/unmount/recovery state: * see jfs_filsys.h */ __le32 s_compress; /* 4: > 0 if data compression */ @@ -75,11 +75,11 @@ struct jfs_superblock { struct timestruc_t s_time; /* 8: time last updated */ __le32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for - * the fsck service log. + * the fsck service log. * N.B. These blocks are divided among the * versions kept. This is not a per * version size. - * N.B. These blocks are included in the + * N.B. These blocks are included in the * length field of s_fsckpxd. */ s8 s_fscklog; /* 1: which fsck service log is most recent @@ -87,7 +87,7 @@ struct jfs_superblock { * 1 => the first one * 2 => the 2nd one */ - char s_fpack[11]; /* 11: file system volume name + char s_fpack[11]; /* 11: file system volume name * N.B. This must be 11 bytes to * conform with the OS/2 BootSector * requirements diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index ebfa6c061d7..81f6f04af19 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -2026,8 +2026,6 @@ static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, * truncate entry XAD[twm == next - 1]: */ if (twm == next - 1) { - struct pxd_lock *pxdlock; - /* format a maplock for txUpdateMap() to update bmap * to free truncated delta extent of the truncated * entry XAD[next - 1]; diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h index 0e4dc4514c4..7863cf21afc 100644 --- a/fs/jfs/jfs_txnmgr.h +++ b/fs/jfs/jfs_txnmgr.h @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_TXNMGR @@ -179,7 +179,7 @@ struct linelock { /* (8) */ struct lv lv[20]; /* 40: */ -}; /* (48) */ +}; /* (48) */ #define dt_lock linelock @@ -211,8 +211,8 @@ struct xtlock { * at tlock.lock/linelock: watch for alignment; * N.B. next field may be set by linelock, and should not * be modified by maplock; - * N.B. index of the first pxdlock specifies index of next - * free maplock (i.e., number of maplock) in the tlock; + * N.B. index of the first pxdlock specifies index of next + * free maplock (i.e., number of maplock) in the tlock; */ struct maplock { lid_t next; /* 2: */ diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 21eaf7ac0fc..a386f48c73f 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -22,8 +22,8 @@ * note: file system in transition to aggregate/fileset: * (ref. jfs_mount.c) * - * file system unmount is interpreted as mount of the single/only - * fileset in the aggregate and, if unmount of the last fileset, + * file system unmount is interpreted as mount of the single/only + * fileset in the aggregate and, if unmount of the last fileset, * as unmount of the aggerate; */ @@ -60,13 +60,13 @@ int jfs_umount(struct super_block *sb) jfs_info("UnMount JFS: sb:0x%p", sb); /* - * update superblock and close log + * update superblock and close log * * if mounted read-write and log based recovery was enabled */ if ((log = sbi->log)) /* - * Wait for outstanding transactions to be written to log: + * Wait for outstanding transactions to be written to log: */ jfs_flush_journal(log, 2); @@ -112,17 +112,17 @@ int jfs_umount(struct super_block *sb) /* * ensure all file system file pages are propagated to their - * home blocks on disk (and their in-memory buffer pages are + * home blocks on disk (and their in-memory buffer pages are * invalidated) BEFORE updating file system superblock state - * (to signify file system is unmounted cleanly, and thus in - * consistent state) and log superblock active file system + * (to signify file system is unmounted cleanly, and thus in + * consistent state) and log superblock active file system * list (to signify skip logredo()). */ if (log) { /* log = NULL if read-only mount */ updateSuper(sb, FM_CLEAN); /* - * close log: + * close log: * * remove file system from log active file system list. */ @@ -142,7 +142,7 @@ int jfs_umount_rw(struct super_block *sb) return 0; /* - * close log: + * close log: * * remove file system from log active file system list. */ diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c index f327decfb15..c7de6f5bbef 100644 --- a/fs/jfs/jfs_unicode.c +++ b/fs/jfs/jfs_unicode.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -57,8 +57,8 @@ int jfs_strfromUCS_le(char *to, const __le16 * from, warn--; warn_again--; printk(KERN_ERR - "non-latin1 character 0x%x found in JFS file name\n", - le16_to_cpu(from[i])); + "non-latin1 character 0x%x found in JFS file name\n", + le16_to_cpu(from[i])); printk(KERN_ERR "mount with iocharset=utf8 to access\n"); } @@ -124,7 +124,7 @@ int get_UCSname(struct component_name * uniName, struct dentry *dentry) kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS); if (uniName->name == NULL) - return -ENOSPC; + return -ENOMEM; uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name, length, nls_tab); diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h index 69e25ebe87a..3fbb3a22559 100644 --- a/fs/jfs/jfs_unicode.h +++ b/fs/jfs/jfs_unicode.h @@ -1,19 +1,19 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_UNICODE diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c index 4ab185d2630..cfe50666d31 100644 --- a/fs/jfs/jfs_uniupr.c +++ b/fs/jfs/jfs_uniupr.c @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index 25e9990bccd..88b6cc535bf 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index c92307d3a57..e98eb03e531 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* @@ -2428,7 +2428,7 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p); * return: */ int xtAppend(tid_t tid, /* transaction id */ - struct inode *ip, int xflag, s64 xoff, s32 maxblocks, + struct inode *ip, int xflag, s64 xoff, s32 maxblocks, s32 * xlenp, /* (in/out) */ s64 * xaddrp, /* (in/out) */ int flag) @@ -2499,7 +2499,7 @@ int xtAppend(tid_t tid, /* transaction id */ pxdlist.maxnpxd = pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; nblocks = JFS_SBI(ip->i_sb)->nbperpage; - for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { + for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, nblocks); @@ -2514,7 +2514,7 @@ int xtAppend(tid_t tid, /* transaction id */ goto out; } - xlen = min(xlen, maxblocks); + xlen = min(xlen, maxblocks); /* * allocate data extent requested diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index af668a80b40..164f6f2b101 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -1,18 +1,18 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _H_JFS_XTREE diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b8d16a6aa88..a6a8c16c872 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -41,7 +41,7 @@ static s64 commitZeroLink(tid_t, struct inode *); /* * NAME: free_ea_wmap(inode) * - * FUNCTION: free uncommitted extended attributes from working map + * FUNCTION: free uncommitted extended attributes from working map * */ static inline void free_ea_wmap(struct inode *inode) @@ -62,7 +62,7 @@ static inline void free_ea_wmap(struct inode *inode) * FUNCTION: create a regular file in the parent directory <dip> * with name = <from dentry> and mode = <mode> * - * PARAMETER: dip - parent directory vnode + * PARAMETER: dip - parent directory vnode * dentry - dentry of new file * mode - create mode (rwxrwxrwx). * nd- nd struct @@ -97,8 +97,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, * begin the transaction before we search the directory. */ ip = ialloc(dip, mode); - if (ip == NULL) { - rc = -ENOSPC; + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); goto out2; } @@ -190,7 +190,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, * FUNCTION: create a child directory in the parent directory <dip> * with name = <from dentry> and mode = <mode> * - * PARAMETER: dip - parent directory vnode + * PARAMETER: dip - parent directory vnode * dentry - dentry of child directory * mode - create mode (rwxrwxrwx). * @@ -231,8 +231,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) * begin the transaction before we search the directory. */ ip = ialloc(dip, S_IFDIR | mode); - if (ip == NULL) { - rc = -ENOSPC; + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); goto out2; } @@ -324,7 +324,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) * * FUNCTION: remove a link to child directory * - * PARAMETER: dip - parent inode + * PARAMETER: dip - parent inode * dentry - child directory dentry * * RETURN: -EINVAL - if name is . or .. @@ -332,10 +332,10 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) * errors from subroutines * * note: - * if other threads have the directory open when the last link - * is removed, the "." and ".." entries, if present, are removed before - * rmdir() returns and no new entries may be created in the directory, - * but the directory is not removed until the last reference to + * if other threads have the directory open when the last link + * is removed, the "." and ".." entries, if present, are removed before + * rmdir() returns and no new entries may be created in the directory, + * but the directory is not removed until the last reference to * the directory is released (cf.unlink() of regular file). */ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) @@ -446,11 +446,11 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) /* * NAME: jfs_unlink(dip, dentry) * - * FUNCTION: remove a link to object <vp> named by <name> + * FUNCTION: remove a link to object <vp> named by <name> * from parent directory <dvp> * - * PARAMETER: dip - inode of parent directory - * dentry - dentry of object to be removed + * PARAMETER: dip - inode of parent directory + * dentry - dentry of object to be removed * * RETURN: errors from subroutines * @@ -598,7 +598,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) * * FUNCTION: for non-directory, called by jfs_remove(), * truncate a regular file, directory or symbolic - * link to zero length. return 0 if type is not + * link to zero length. return 0 if type is not * one of these. * * if the file is currently associated with a VM segment @@ -608,7 +608,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) * map by ctrunc1. * if there is no VM segment on entry, the resources are * freed in both work and permanent map. - * (? for temporary file - memory object is cached even + * (? for temporary file - memory object is cached even * after no reference: * reference count > 0 - ) * @@ -662,7 +662,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip) /* * free xtree/data (truncate to zero length): - * free xtree/data pages from cache if COMMIT_PWMAP, + * free xtree/data pages from cache if COMMIT_PWMAP, * free xtree/data blocks from persistent block map, and * free xtree/data blocks from working block map if COMMIT_PWMAP; */ @@ -677,7 +677,7 @@ static s64 commitZeroLink(tid_t tid, struct inode *ip) * NAME: jfs_free_zero_link() * * FUNCTION: for non-directory, called by iClose(), - * free resources of a file from cache and WORKING map + * free resources of a file from cache and WORKING map * for a file previously committed with zero link count * while associated with a pager object, * @@ -762,7 +762,7 @@ void jfs_free_zero_link(struct inode *ip) * FUNCTION: create a link to <vp> by the name = <name> * in the parent directory <dvp> * - * PARAMETER: vp - target object + * PARAMETER: vp - target object * dvp - parent directory of new link * name - name of new link to target object * crp - credential @@ -858,8 +858,8 @@ static int jfs_link(struct dentry *old_dentry, * in directory <dip> * * PARAMETER: dip - parent directory vnode - * dentry - dentry of symbolic link - * name - the path name of the existing object + * dentry - dentry of symbolic link + * name - the path name of the existing object * that will be the source of the link * * RETURN: errors from subroutines @@ -906,8 +906,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, * (iAlloc() returns new, locked inode) */ ip = ialloc(dip, S_IFLNK | 0777); - if (ip == NULL) { - rc = -ENOSPC; + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); goto out2; } @@ -926,7 +926,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, tblk->u.ixpxd = JFS_IP(ip)->ixpxd; /* fix symlink access permission - * (dir_create() ANDs in the u.u_cmask, + * (dir_create() ANDs in the u.u_cmask, * but symlinks really need to be 777 access) */ ip->i_mode |= 0777; @@ -967,7 +967,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, ip->i_mapping->a_ops = &jfs_aops; /* - * even though the data of symlink object (source + * even though the data of symlink object (source * path name) is treated as non-journaled user data, * it is read/written thru buffer cache for performance. */ @@ -978,7 +978,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, xlen = xsize >> JFS_SBI(sb)->l2bsize; if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) { txAbort(tid, 0); - rc = -ENOSPC; goto out3; } extent = xaddr; @@ -1176,7 +1175,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* free block resources */ if ((new_size = commitZeroLink(tid, new_ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ - rc = new_size; + rc = new_size; goto out4; } tblk = tid_to_tblock(tid); @@ -1292,7 +1291,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_size = xtTruncate_pmap(tid, new_ip, new_size); if (new_size < 0) { txAbort(tid, 1); - rc = new_size; + rc = new_size; } else rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); txEnd(tid); @@ -1350,8 +1349,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, goto out; ip = ialloc(dir, mode); - if (ip == NULL) { - rc = -ENOSPC; + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); goto out1; } jfs_ip = JFS_IP(ip); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 45180361871..79d625f3f73 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 143bcd1d5ea..9c1c6e0e633 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -82,7 +82,7 @@ static void jfs_handle_error(struct super_block *sb) "as read-only\n", sb->s_id); sb->s_flags |= MS_RDONLY; - } + } /* nothing is done for continue beyond marking the superblock dirty */ } @@ -422,7 +422,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL); if (!sbi) - return -ENOSPC; + return -ENOMEM; sb->s_fs_info = sbi; sbi->sb = sb; sbi->uid = sbi->gid = sbi->umask = -1; @@ -775,7 +775,7 @@ static int __init init_jfs_fs(void) int rc; jfs_inode_cachep = - kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, + kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, init_once, NULL); if (jfs_inode_cachep == NULL) diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 16477b3835e..cee43f36f51 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -3,16 +3,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 7a10e192896..4c7985ebca9 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -4,16 +4,16 @@ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software + * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ @@ -57,7 +57,7 @@ * * 0 4 4 + EA_SIZE(ea1) * +------------+-------------------+--------------------+----- - * | Overall EA | First FEA Element | Second FEA Element | ..... + * | Overall EA | First FEA Element | Second FEA Element | ..... * | List Size | | | * +------------+-------------------+--------------------+----- * @@ -155,9 +155,9 @@ static void ea_release(struct inode *inode, struct ea_buffer *ea_buf); /* * NAME: ea_write_inline - * + * * FUNCTION: Attempt to write an EA inline if area is available - * + * * PRE CONDITIONS: * Already verified that the specified EA is small enough to fit inline * @@ -216,10 +216,10 @@ static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist, /* * NAME: ea_write - * + * * FUNCTION: Write an EA for an inode - * - * PRE CONDITIONS: EA has been verified + * + * PRE CONDITIONS: EA has been verified * * PARAMETERS: * ip - Inode pointer @@ -340,9 +340,9 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, /* * NAME: ea_read_inline - * + * * FUNCTION: Read an inlined EA into user's buffer - * + * * PARAMETERS: * ip - Inode pointer * ealist - Pointer to buffer to fill in with EA @@ -372,9 +372,9 @@ static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist) /* * NAME: ea_read - * + * * FUNCTION: copy EA data into user's buffer - * + * * PARAMETERS: * ip - Inode pointer * ealist - Pointer to buffer to fill in with EA @@ -406,7 +406,7 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) return -EIO; } - /* + /* * Figure out how many blocks were allocated when this EA list was * originally written to disk. */ @@ -443,14 +443,14 @@ static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) /* * NAME: ea_get - * + * * FUNCTION: Returns buffer containing existing extended attributes. * The size of the buffer will be the larger of the existing * attributes size, or min_size. * * The buffer, which may be inlined in the inode or in the - * page cache must be release by calling ea_release or ea_put - * + * page cache must be release by calling ea_release or ea_put + * * PARAMETERS: * inode - Inode pointer * ea_buf - Structure to be populated with ealist and its metadata @@ -1054,7 +1054,7 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) /* compute required size of list */ for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { - if (can_list(ea)) + if (can_list(ea)) size += name_size(ea) + 1; } @@ -1069,7 +1069,7 @@ ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) /* Copy attribute names to buffer */ buffer = data; for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { - if (can_list(ea)) { + if (can_list(ea)) { int namelen = copy_name(buffer, ea); buffer += namelen + 1; } diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index f95cc3f3c42..e8c7765419e 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -144,42 +144,12 @@ u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) */ /* - * Someone has sent us an SM_NOTIFY. Ensure we bind to the new port number, - * that we mark locks for reclaiming, and that we bump the pseudo NSM state. - */ -static void nlmclnt_prepare_reclaim(struct nlm_host *host) -{ - down_write(&host->h_rwsem); - host->h_monitored = 0; - host->h_state++; - host->h_nextrebind = 0; - nlm_rebind_host(host); - - /* - * Mark the locks for reclaiming. - */ - list_splice_init(&host->h_granted, &host->h_reclaim); - - dprintk("NLM: reclaiming locks for host %s\n", host->h_name); -} - -static void nlmclnt_finish_reclaim(struct nlm_host *host) -{ - host->h_reclaiming = 0; - up_write(&host->h_rwsem); - dprintk("NLM: done reclaiming locks for host %s", host->h_name); -} - -/* * Reclaim all locks on server host. We do this by spawning a separate * reclaimer thread. */ void -nlmclnt_recovery(struct nlm_host *host, u32 newstate) +nlmclnt_recovery(struct nlm_host *host) { - if (host->h_nsmstate == newstate) - return; - host->h_nsmstate = newstate; if (!host->h_reclaiming++) { nlm_get_host(host); __module_get(THIS_MODULE); @@ -199,18 +169,30 @@ reclaimer(void *ptr) daemonize("%s-reclaim", host->h_name); allow_signal(SIGKILL); + down_write(&host->h_rwsem); + /* This one ensures that our parent doesn't terminate while the * reclaim is in progress */ lock_kernel(); - lockd_up(); + lockd_up(0); /* note: this cannot fail as lockd is already running */ + + dprintk("lockd: reclaiming locks for host %s", host->h_name); - nlmclnt_prepare_reclaim(host); - /* First, reclaim all locks that have been marked. */ restart: nsmstate = host->h_nsmstate; + + /* Force a portmap getport - the peer's lockd will + * most likely end up on a different port. + */ + host->h_nextrebind = jiffies; + nlm_rebind_host(host); + + /* First, reclaim all locks that have been granted. */ + list_splice_init(&host->h_granted, &host->h_reclaim); list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { list_del_init(&fl->fl_u.nfs_fl.list); + /* Why are we leaking memory here? --okir */ if (signalled()) continue; if (nlmclnt_reclaim(host, fl) != 0) @@ -218,11 +200,13 @@ restart: list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); if (host->h_nsmstate != nsmstate) { /* Argh! The server rebooted again! */ - list_splice_init(&host->h_granted, &host->h_reclaim); goto restart; } } - nlmclnt_finish_reclaim(host); + + host->h_reclaiming = 0; + up_write(&host->h_rwsem); + dprintk("NLM: done reclaiming locks for host %s", host->h_name); /* Now, wake up all processes that sleep on a blocked lock */ list_for_each_entry(block, &nlm_blocked, b_list) { diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 271e2165fff..3d84f600b63 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -36,14 +36,14 @@ static const struct rpc_call_ops nlmclnt_cancel_ops; /* * Cookie counter for NLM requests */ -static u32 nlm_cookie = 0x1234; +static atomic_t nlm_cookie = ATOMIC_INIT(0x1234); -static inline void nlmclnt_next_cookie(struct nlm_cookie *c) +void nlmclnt_next_cookie(struct nlm_cookie *c) { - memcpy(c->data, &nlm_cookie, 4); - memset(c->data+4, 0, 4); + u32 cookie = atomic_inc_return(&nlm_cookie); + + memcpy(c->data, &cookie, 4); c->len=4; - nlm_cookie++; } static struct nlm_lockowner *nlm_get_lockowner(struct nlm_lockowner *lockowner) @@ -129,11 +129,11 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) nlmclnt_next_cookie(&argp->cookie); argp->state = nsm_local_state; memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); - lock->caller = system_utsname.nodename; + lock->caller = utsname()->nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", (unsigned int)fl->fl_u.nfs_fl.owner->pid, - system_utsname.nodename); + utsname()->nodename); lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; @@ -153,6 +153,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) { struct rpc_clnt *client = NFS_CLIENT(inode); struct sockaddr_in addr; + struct nfs_server *nfssrv = NFS_SERVER(inode); struct nlm_host *host; struct nlm_rqst *call; sigset_t oldset; @@ -166,7 +167,9 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) } rpc_peeraddr(client, (struct sockaddr *) &addr, sizeof(addr)); - host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers); + host = nlmclnt_lookup_host(&addr, client->cl_xprt->prot, vers, + nfssrv->nfs_client->cl_hostname, + strlen(nfssrv->nfs_client->cl_hostname)); if (host == NULL) return -ENOLCK; @@ -499,7 +502,7 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) unsigned char fl_flags = fl->fl_flags; int status = -ENOLCK; - if (!host->h_monitored && nsm_monitor(host) < 0) { + if (nsm_monitor(host) < 0) { printk(KERN_NOTICE "lockd: failed to monitor %s\n", host->h_name); goto out; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index a0d0b58ce7a..fb24a973034 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -27,46 +27,60 @@ #define NLM_HOST_EXPIRE ((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ) #define NLM_HOST_COLLECT ((nrhosts > NLM_HOST_MAX)? 120 * HZ : 60 * HZ) -static struct nlm_host * nlm_hosts[NLM_HOST_NRHASH]; +static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; static unsigned long next_gc; static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); +static struct nsm_handle * __nsm_find(const struct sockaddr_in *, + const char *, int, int); /* * Find an NLM server handle in the cache. If there is none, create it. */ struct nlm_host * -nlmclnt_lookup_host(struct sockaddr_in *sin, int proto, int version) +nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, + const char *hostname, int hostname_len) { - return nlm_lookup_host(0, sin, proto, version); + return nlm_lookup_host(0, sin, proto, version, + hostname, hostname_len); } /* * Find an NLM client handle in the cache. If there is none, create it. */ struct nlm_host * -nlmsvc_lookup_host(struct svc_rqst *rqstp) +nlmsvc_lookup_host(struct svc_rqst *rqstp, + const char *hostname, int hostname_len) { return nlm_lookup_host(1, &rqstp->rq_addr, - rqstp->rq_prot, rqstp->rq_vers); + rqstp->rq_prot, rqstp->rq_vers, + hostname, hostname_len); } /* * Common host lookup routine for server & client */ struct nlm_host * -nlm_lookup_host(int server, struct sockaddr_in *sin, - int proto, int version) +nlm_lookup_host(int server, const struct sockaddr_in *sin, + int proto, int version, + const char *hostname, + int hostname_len) { - struct nlm_host *host, **hp; - u32 addr; + struct hlist_head *chain; + struct hlist_node *pos; + struct nlm_host *host; + struct nsm_handle *nsm = NULL; int hash; - dprintk("lockd: nlm_lookup_host(%08x, p=%d, v=%d)\n", - (unsigned)(sin? ntohl(sin->sin_addr.s_addr) : 0), proto, version); + dprintk("lockd: nlm_lookup_host(%u.%u.%u.%u, p=%d, v=%d, my role=%s, name=%.*s)\n", + NIPQUAD(sin->sin_addr.s_addr), proto, version, + server? "server" : "client", + hostname_len, + hostname? hostname : "<none>"); + hash = NLM_ADDRHASH(sin->sin_addr.s_addr); @@ -76,7 +90,22 @@ nlm_lookup_host(int server, struct sockaddr_in *sin, if (time_after_eq(jiffies, next_gc)) nlm_gc_hosts(); - for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) { + /* We may keep several nlm_host objects for a peer, because each + * nlm_host is identified by + * (address, protocol, version, server/client) + * We could probably simplify this a little by putting all those + * different NLM rpc_clients into one single nlm_host object. + * This would allow us to have one nlm_host per address. + */ + chain = &nlm_hosts[hash]; + hlist_for_each_entry(host, pos, chain, h_hash) { + if (!nlm_cmp_addr(&host->h_addr, sin)) + continue; + + /* See if we have an NSM handle for this client */ + if (!nsm) + nsm = host->h_nsmhandle; + if (host->h_proto != proto) continue; if (host->h_version != version) @@ -84,28 +113,30 @@ nlm_lookup_host(int server, struct sockaddr_in *sin, if (host->h_server != server) continue; - if (nlm_cmp_addr(&host->h_addr, sin)) { - if (hp != nlm_hosts + hash) { - *hp = host->h_next; - host->h_next = nlm_hosts[hash]; - nlm_hosts[hash] = host; - } - nlm_get_host(host); - mutex_unlock(&nlm_host_mutex); - return host; - } - } + /* Move to head of hash chain. */ + hlist_del(&host->h_hash); + hlist_add_head(&host->h_hash, chain); - /* Ooops, no host found, create it */ - dprintk("lockd: creating host entry\n"); + nlm_get_host(host); + goto out; + } + if (nsm) + atomic_inc(&nsm->sm_count); - host = kzalloc(sizeof(*host), GFP_KERNEL); - if (!host) - goto nohost; + host = NULL; - addr = sin->sin_addr.s_addr; - sprintf(host->h_name, "%u.%u.%u.%u", NIPQUAD(addr)); + /* Sadly, the host isn't in our hash table yet. See if + * we have an NSM handle for it. If not, create one. + */ + if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) + goto out; + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) { + nsm_release(nsm); + goto out; + } + host->h_name = nsm->sm_name; host->h_addr = *sin; host->h_addr.sin_port = 0; /* ouch! */ host->h_version = version; @@ -119,9 +150,9 @@ nlm_lookup_host(int server, struct sockaddr_in *sin, init_rwsem(&host->h_rwsem); host->h_state = 0; /* pseudo NSM state */ host->h_nsmstate = 0; /* real NSM state */ + host->h_nsmhandle = nsm; host->h_server = server; - host->h_next = nlm_hosts[hash]; - nlm_hosts[hash] = host; + hlist_add_head(&host->h_hash, chain); INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); INIT_LIST_HEAD(&host->h_granted); @@ -130,35 +161,39 @@ nlm_lookup_host(int server, struct sockaddr_in *sin, if (++nrhosts > NLM_HOST_MAX) next_gc = 0; -nohost: +out: mutex_unlock(&nlm_host_mutex); return host; } -struct nlm_host * -nlm_find_client(void) +/* + * Destroy a host + */ +static void +nlm_destroy_host(struct nlm_host *host) { - /* find a nlm_host for a client for which h_killed == 0. - * and return it + struct rpc_clnt *clnt; + + BUG_ON(!list_empty(&host->h_lockowners)); + BUG_ON(atomic_read(&host->h_count)); + + /* + * Release NSM handle and unmonitor host. */ - int hash; - mutex_lock(&nlm_host_mutex); - for (hash = 0 ; hash < NLM_HOST_NRHASH; hash++) { - struct nlm_host *host, **hp; - for (hp = &nlm_hosts[hash]; (host = *hp) != 0; hp = &host->h_next) { - if (host->h_server && - host->h_killed == 0) { - nlm_get_host(host); - mutex_unlock(&nlm_host_mutex); - return host; - } + nsm_unmonitor(host); + + if ((clnt = host->h_rpcclnt) != NULL) { + if (atomic_read(&clnt->cl_users)) { + printk(KERN_WARNING + "lockd: active RPC handle\n"); + clnt->cl_dead = 1; + } else { + rpc_destroy_client(host->h_rpcclnt); } } - mutex_unlock(&nlm_host_mutex); - return NULL; + kfree(host); } - /* * Create the NLM RPC client for an NLM peer */ @@ -260,22 +295,82 @@ void nlm_release_host(struct nlm_host *host) } /* + * We were notified that the host indicated by address &sin + * has rebooted. + * Release all resources held by that peer. + */ +void nlm_host_rebooted(const struct sockaddr_in *sin, + const char *hostname, int hostname_len, + u32 new_state) +{ + struct hlist_head *chain; + struct hlist_node *pos; + struct nsm_handle *nsm; + struct nlm_host *host; + + dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", + hostname, NIPQUAD(sin->sin_addr)); + + /* Find the NSM handle for this peer */ + if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) + return; + + /* When reclaiming locks on this peer, make sure that + * we set up a new notification */ + nsm->sm_monitored = 0; + + /* Mark all hosts tied to this NSM state as having rebooted. + * We run the loop repeatedly, because we drop the host table + * lock for this. + * To avoid processing a host several times, we match the nsmstate. + */ +again: mutex_lock(&nlm_host_mutex); + for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { + hlist_for_each_entry(host, pos, chain, h_hash) { + if (host->h_nsmhandle == nsm + && host->h_nsmstate != new_state) { + host->h_nsmstate = new_state; + host->h_state++; + + nlm_get_host(host); + mutex_unlock(&nlm_host_mutex); + + if (host->h_server) { + /* We're server for this guy, just ditch + * all the locks he held. */ + nlmsvc_free_host_resources(host); + } else { + /* He's the server, initiate lock recovery. */ + nlmclnt_recovery(host); + } + + nlm_release_host(host); + goto again; + } + } + } + + mutex_unlock(&nlm_host_mutex); +} + +/* * Shut down the hosts module. * Note that this routine is called only at server shutdown time. */ void nlm_shutdown_hosts(void) { + struct hlist_head *chain; + struct hlist_node *pos; struct nlm_host *host; - int i; dprintk("lockd: shutting down host module\n"); mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ dprintk("lockd: nuking all hosts...\n"); - for (i = 0; i < NLM_HOST_NRHASH; i++) { - for (host = nlm_hosts[i]; host; host = host->h_next) + for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { + hlist_for_each_entry(host, pos, chain, h_hash) host->h_expires = jiffies - 1; } @@ -287,8 +382,8 @@ nlm_shutdown_hosts(void) if (nrhosts) { printk(KERN_WARNING "lockd: couldn't shutdown host module!\n"); dprintk("lockd: %d hosts left:\n", nrhosts); - for (i = 0; i < NLM_HOST_NRHASH; i++) { - for (host = nlm_hosts[i]; host; host = host->h_next) { + for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { + hlist_for_each_entry(host, pos, chain, h_hash) { dprintk(" %s (cnt %d use %d exp %ld)\n", host->h_name, atomic_read(&host->h_count), host->h_inuse, host->h_expires); @@ -305,45 +400,32 @@ nlm_shutdown_hosts(void) static void nlm_gc_hosts(void) { - struct nlm_host **q, *host; - struct rpc_clnt *clnt; - int i; + struct hlist_head *chain; + struct hlist_node *pos, *next; + struct nlm_host *host; dprintk("lockd: host garbage collection\n"); - for (i = 0; i < NLM_HOST_NRHASH; i++) { - for (host = nlm_hosts[i]; host; host = host->h_next) + for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { + hlist_for_each_entry(host, pos, chain, h_hash) host->h_inuse = 0; } /* Mark all hosts that hold locks, blocks or shares */ nlmsvc_mark_resources(); - for (i = 0; i < NLM_HOST_NRHASH; i++) { - q = &nlm_hosts[i]; - while ((host = *q) != NULL) { + for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { + hlist_for_each_entry_safe(host, pos, next, chain, h_hash) { if (atomic_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s (cnt %d use %d exp %ld)\n", host->h_name, atomic_read(&host->h_count), host->h_inuse, host->h_expires); - q = &host->h_next; continue; } dprintk("lockd: delete host %s\n", host->h_name); - *q = host->h_next; - /* Don't unmonitor hosts that have been invalidated */ - if (host->h_monitored && !host->h_killed) - nsm_unmonitor(host); - if ((clnt = host->h_rpcclnt) != NULL) { - if (atomic_read(&clnt->cl_users)) { - printk(KERN_WARNING - "lockd: active RPC handle\n"); - clnt->cl_dead = 1; - } else { - rpc_destroy_client(host->h_rpcclnt); - } - } - kfree(host); + hlist_del_init(&host->h_hash); + + nlm_destroy_host(host); nrhosts--; } } @@ -351,3 +433,88 @@ nlm_gc_hosts(void) next_gc = jiffies + NLM_HOST_COLLECT; } + +/* + * Manage NSM handles + */ +static LIST_HEAD(nsm_handles); +static DEFINE_MUTEX(nsm_mutex); + +static struct nsm_handle * +__nsm_find(const struct sockaddr_in *sin, + const char *hostname, int hostname_len, + int create) +{ + struct nsm_handle *nsm = NULL; + struct list_head *pos; + + if (!sin) + return NULL; + + if (hostname && memchr(hostname, '/', hostname_len) != NULL) { + if (printk_ratelimit()) { + printk(KERN_WARNING "Invalid hostname \"%.*s\" " + "in NFS lock request\n", + hostname_len, hostname); + } + return NULL; + } + + mutex_lock(&nsm_mutex); + list_for_each(pos, &nsm_handles) { + nsm = list_entry(pos, struct nsm_handle, sm_link); + + if (hostname && nsm_use_hostnames) { + if (strlen(nsm->sm_name) != hostname_len + || memcmp(nsm->sm_name, hostname, hostname_len)) + continue; + } else if (!nlm_cmp_addr(&nsm->sm_addr, sin)) + continue; + atomic_inc(&nsm->sm_count); + goto out; + } + + if (!create) { + nsm = NULL; + goto out; + } + + nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL); + if (nsm != NULL) { + nsm->sm_addr = *sin; + nsm->sm_name = (char *) (nsm + 1); + memcpy(nsm->sm_name, hostname, hostname_len); + nsm->sm_name[hostname_len] = '\0'; + atomic_set(&nsm->sm_count, 1); + + list_add(&nsm->sm_link, &nsm_handles); + } + +out: + mutex_unlock(&nsm_mutex); + return nsm; +} + +struct nsm_handle * +nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len) +{ + return __nsm_find(sin, hostname, hostname_len, 1); +} + +/* + * Release an NSM handle + */ +void +nsm_release(struct nsm_handle *nsm) +{ + if (!nsm) + return; + if (atomic_dec_and_test(&nsm->sm_count)) { + mutex_lock(&nsm_mutex); + if (atomic_read(&nsm->sm_count) == 0) { + list_del(&nsm->sm_link); + kfree(nsm); + } + mutex_unlock(&nsm_mutex); + } +} diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 5954dcb497e..e0179f8c327 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -24,13 +24,13 @@ static struct rpc_program nsm_program; /* * Local NSM state */ -u32 nsm_local_state; +int nsm_local_state; /* * Common procedure for SM_MON/SM_UNMON calls */ static int -nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res) +nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) { struct rpc_clnt *clnt; int status; @@ -46,10 +46,11 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res) goto out; } - args.addr = host->h_addr.sin_addr.s_addr; - args.proto= (host->h_proto<<1) | host->h_server; + memset(&args, 0, sizeof(args)); + args.mon_name = nsm->sm_name; + args.addr = nsm->sm_addr.sin_addr.s_addr; args.prog = NLM_PROGRAM; - args.vers = host->h_version; + args.vers = 3; args.proc = NLMPROC_NSM_NOTIFY; memset(res, 0, sizeof(*res)); @@ -70,17 +71,22 @@ nsm_mon_unmon(struct nlm_host *host, u32 proc, struct nsm_res *res) int nsm_monitor(struct nlm_host *host) { + struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; int status; dprintk("lockd: nsm_monitor(%s)\n", host->h_name); + BUG_ON(nsm == NULL); - status = nsm_mon_unmon(host, SM_MON, &res); + if (nsm->sm_monitored) + return 0; + + status = nsm_mon_unmon(nsm, SM_MON, &res); if (status < 0 || res.status != 0) printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); else - host->h_monitored = 1; + nsm->sm_monitored = 1; return status; } @@ -90,16 +96,26 @@ nsm_monitor(struct nlm_host *host) int nsm_unmonitor(struct nlm_host *host) { + struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; - int status; - - dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); - - status = nsm_mon_unmon(host, SM_UNMON, &res); - if (status < 0) - printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", host->h_name); - else - host->h_monitored = 0; + int status = 0; + + if (nsm == NULL) + return 0; + host->h_nsmhandle = NULL; + + if (atomic_read(&nsm->sm_count) == 1 + && nsm->sm_monitored && !nsm->sm_sticky) { + dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); + + status = nsm_mon_unmon(nsm, SM_UNMON, &res); + if (status < 0) + printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", + host->h_name); + else + nsm->sm_monitored = 0; + } + nsm_release(nsm); return status; } @@ -135,7 +151,7 @@ nsm_create(void) static u32 * xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) { - char buffer[20]; + char buffer[20], *name; /* * Use the dotted-quad IP address of the remote host as @@ -143,9 +159,14 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) * hostname first for whatever remote hostname it receives, * so this works alright. */ - sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); - if (!(p = xdr_encode_string(p, buffer)) - || !(p = xdr_encode_string(p, system_utsname.nodename))) + if (nsm_use_hostnames) { + name = argp->mon_name; + } else { + sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); + name = buffer; + } + if (!(p = xdr_encode_string(p, name)) + || !(p = xdr_encode_string(p, utsname()->nodename))) return ERR_PTR(-EIO); *p++ = htonl(argp->prog); *p++ = htonl(argp->vers); @@ -160,9 +181,11 @@ xdr_encode_mon(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) p = xdr_encode_common(rqstp, p, argp); if (IS_ERR(p)) return PTR_ERR(p); + + /* Surprise - there may even be room for an IPv6 address now */ *p++ = argp->addr; - *p++ = argp->vers; - *p++ = argp->proto; + *p++ = 0; + *p++ = 0; *p++ = 0; rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); return 0; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9a991b52c64..634139232aa 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -31,7 +31,9 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> +#include <net/ip.h> #include <linux/lockd/lockd.h> +#include <linux/lockd/sm_inter.h> #include <linux/nfs.h> #define NLMDBG_FACILITY NLMDBG_SVC @@ -46,6 +48,7 @@ EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static pid_t nlmsvc_pid; +static struct svc_serv *nlmsvc_serv; int nlmsvc_grace_period; unsigned long nlmsvc_timeout; @@ -59,6 +62,7 @@ static DECLARE_WAIT_QUEUE_HEAD(lockd_exit); static unsigned long nlm_grace_period; static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; +int nsm_use_hostnames = 0; /* * Constants needed for the sysctl interface. @@ -96,7 +100,6 @@ static inline void clear_grace_period(void) static void lockd(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; int err = 0; unsigned long grace_period_expire; @@ -112,6 +115,7 @@ lockd(struct svc_rqst *rqstp) * Let our maker know we're running. */ nlmsvc_pid = current->pid; + nlmsvc_serv = rqstp->rq_server; complete(&lockd_start_done); daemonize("lockd"); @@ -161,7 +165,7 @@ lockd(struct svc_rqst *rqstp) * Find a socket with data available and call its * recvfrom routine. */ - err = svc_recv(serv, rqstp, timeout); + err = svc_recv(rqstp, timeout); if (err == -EAGAIN || err == -EINTR) continue; if (err < 0) { @@ -174,7 +178,7 @@ lockd(struct svc_rqst *rqstp) dprintk("lockd: request from %08x\n", (unsigned)ntohl(rqstp->rq_addr.sin_addr.s_addr)); - svc_process(serv, rqstp); + svc_process(rqstp); } @@ -189,6 +193,7 @@ lockd(struct svc_rqst *rqstp) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); nlmsvc_pid = 0; + nlmsvc_serv = NULL; } else printk(KERN_DEBUG "lockd: new process, skipping host shutdown\n"); @@ -205,54 +210,77 @@ lockd(struct svc_rqst *rqstp) module_put_and_exit(0); } + +static int find_socket(struct svc_serv *serv, int proto) +{ + struct svc_sock *svsk; + int found = 0; + list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + if (svsk->sk_sk->sk_protocol == proto) { + found = 1; + break; + } + return found; +} + +static int make_socks(struct svc_serv *serv, int proto) +{ + /* Make any sockets that are needed but not present. + * If nlm_udpport or nlm_tcpport were set as module + * options, make those sockets unconditionally + */ + static int warned; + int err = 0; + if (proto == IPPROTO_UDP || nlm_udpport) + if (!find_socket(serv, IPPROTO_UDP)) + err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport); + if (err == 0 && (proto == IPPROTO_TCP || nlm_tcpport)) + if (!find_socket(serv, IPPROTO_TCP)) + err= svc_makesock(serv, IPPROTO_TCP, nlm_tcpport); + if (!err) + warned = 0; + else if (warned++ == 0) + printk(KERN_WARNING + "lockd_up: makesock failed, error=%d\n", err); + return err; +} + /* * Bring up the lockd process if it's not already up. */ int -lockd_up(void) +lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ { - static int warned; struct svc_serv * serv; int error = 0; mutex_lock(&nlmsvc_mutex); /* - * Unconditionally increment the user count ... this is - * the number of clients who _want_ a lockd process. - */ - nlmsvc_users++; - /* * Check whether we're already up and running. */ - if (nlmsvc_pid) + if (nlmsvc_pid) { + if (proto) + error = make_socks(nlmsvc_serv, proto); goto out; + } /* * Sanity check: if there's no pid, * we should be the first user ... */ - if (nlmsvc_users > 1) + if (nlmsvc_users) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; } - if ((error = svc_makesock(serv, IPPROTO_UDP, nlm_udpport)) < 0 -#ifdef CONFIG_NFSD_TCP - || (error = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport)) < 0 -#endif - ) { - if (warned++ == 0) - printk(KERN_WARNING - "lockd_up: makesock failed, error=%d\n", error); + if ((error = make_socks(serv, proto)) < 0) goto destroy_and_out; - } - warned = 0; /* * Create the kernel thread and wait for it to start. @@ -272,6 +300,8 @@ lockd_up(void) destroy_and_out: svc_destroy(serv); out: + if (!error) + nlmsvc_users++; mutex_unlock(&nlmsvc_mutex); return error; } @@ -367,6 +397,22 @@ static ctl_table nlm_sysctls[] = { .extra1 = (int *) &nlm_port_min, .extra2 = (int *) &nlm_port_max, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nsm_use_hostnames", + .data = &nsm_use_hostnames, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nsm_local_state", + .data = &nsm_local_state, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; @@ -455,6 +501,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int, &nlm_udpport, 0644); module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); +module_param(nsm_use_hostnames, bool, 0644); /* * Initialising and terminating the module. diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index a2dd9ccb9b3..fa370f6eb07 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -38,8 +38,8 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return nlm_lck_denied_nolocks; /* Obtain host handle */ - if (!(host = nlmsvc_lookup_host(rqstp)) - || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0)) + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) goto no_locks; *hostp = host; @@ -260,7 +260,9 @@ static int nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *a struct nlm_rqst *call; int stat; - host = nlmsvc_lookup_host(rqstp); + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); if (host == NULL) return rpc_system_err; @@ -420,10 +422,6 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { struct sockaddr_in saddr = rqstp->rq_addr; - int vers = argp->vers; - int prot = argp->proto >> 1; - - struct nlm_host *host; dprintk("lockd: SM_NOTIFY called\n"); if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) @@ -438,21 +436,10 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, /* Obtain the host pointer for this NFS server and try to * reclaim all locks we hold on this server. */ + memset(&saddr, 0, sizeof(saddr)); saddr.sin_addr.s_addr = argp->addr; + nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); - if ((argp->proto & 1)==0) { - if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) { - nlmclnt_recovery(host, argp->state); - nlm_release_host(host); - } - } else { - /* If we run on an NFS server, delete all locks held by the client */ - - if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) { - nlmsvc_free_host_resources(host); - nlm_release_host(host); - } - } return rpc_success; } @@ -468,7 +455,7 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, dprintk("lockd: GRANTED_RES called\n"); - nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); + nlmsvc_grant_reply(&argp->cookie, argp->status); return rpc_success; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c9d419703cf..814c6064c9e 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -40,7 +40,7 @@ static void nlmsvc_release_block(struct nlm_block *block); static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); -static int nlmsvc_remove_block(struct nlm_block *block); +static void nlmsvc_remove_block(struct nlm_block *block); static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock); static void nlmsvc_freegrantargs(struct nlm_rqst *call); @@ -49,7 +49,7 @@ static const struct rpc_call_ops nlmsvc_grant_ops; /* * The list of blocked locks to retry */ -static struct nlm_block * nlm_blocked; +static LIST_HEAD(nlm_blocked); /* * Insert a blocked lock into the global list @@ -57,48 +57,44 @@ static struct nlm_block * nlm_blocked; static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when) { - struct nlm_block **bp, *b; + struct nlm_block *b; + struct list_head *pos; dprintk("lockd: nlmsvc_insert_block(%p, %ld)\n", block, when); - kref_get(&block->b_count); - if (block->b_queued) - nlmsvc_remove_block(block); - bp = &nlm_blocked; + if (list_empty(&block->b_list)) { + kref_get(&block->b_count); + } else { + list_del_init(&block->b_list); + } + + pos = &nlm_blocked; if (when != NLM_NEVER) { if ((when += jiffies) == NLM_NEVER) when ++; - while ((b = *bp) && time_before_eq(b->b_when,when) && b->b_when != NLM_NEVER) - bp = &b->b_next; - } else - while ((b = *bp) != 0) - bp = &b->b_next; + list_for_each(pos, &nlm_blocked) { + b = list_entry(pos, struct nlm_block, b_list); + if (time_after(b->b_when,when) || b->b_when == NLM_NEVER) + break; + } + /* On normal exit from the loop, pos == &nlm_blocked, + * so we will be adding to the end of the list - good + */ + } - block->b_queued = 1; + list_add_tail(&block->b_list, pos); block->b_when = when; - block->b_next = b; - *bp = block; } /* * Remove a block from the global list */ -static int +static inline void nlmsvc_remove_block(struct nlm_block *block) { - struct nlm_block **bp, *b; - - if (!block->b_queued) - return 1; - for (bp = &nlm_blocked; (b = *bp) != 0; bp = &b->b_next) { - if (b == block) { - *bp = block->b_next; - block->b_queued = 0; - nlmsvc_release_block(block); - return 1; - } + if (!list_empty(&block->b_list)) { + list_del_init(&block->b_list); + nlmsvc_release_block(block); } - - return 0; } /* @@ -107,14 +103,14 @@ nlmsvc_remove_block(struct nlm_block *block) static struct nlm_block * nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock) { - struct nlm_block **head, *block; + struct nlm_block *block; struct file_lock *fl; dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n", file, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, lock->fl.fl_type); - for (head = &nlm_blocked; (block = *head) != 0; head = &block->b_next) { + list_for_each_entry(block, &nlm_blocked, b_list) { fl = &block->b_call->a_args.lock.fl; dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n", block->b_file, fl->fl_pid, @@ -143,20 +139,20 @@ static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b) * Find a block with a given NLM cookie. */ static inline struct nlm_block * -nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin) +nlmsvc_find_block(struct nlm_cookie *cookie) { struct nlm_block *block; - for (block = nlm_blocked; block; block = block->b_next) { - dprintk("cookie: head of blocked queue %p, block %p\n", - nlm_blocked, block); - if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie) - && nlm_cmp_addr(sin, &block->b_host->h_addr)) - break; + list_for_each_entry(block, &nlm_blocked, b_list) { + if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie)) + goto found; } - if (block != NULL) - kref_get(&block->b_count); + return NULL; + +found: + dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block); + kref_get(&block->b_count); return block; } @@ -169,6 +165,11 @@ nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin) * request, but (as I found out later) that's because some implementations * do just this. Never mind the standards comittees, they support our * logging industries. + * + * 10 years later: I hope we can safely ignore these old and broken + * clients by now. Let's fix this so we can uniquely identify an incoming + * GRANTED_RES message by cookie, without having to rely on the client's IP + * address. --okir */ static inline struct nlm_block * nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, @@ -179,7 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_rqst *call = NULL; /* Create host handle for callback */ - host = nlmsvc_lookup_host(rqstp); + host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len); if (host == NULL) return NULL; @@ -192,6 +193,8 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, if (block == NULL) goto failed; kref_init(&block->b_count); + INIT_LIST_HEAD(&block->b_list); + INIT_LIST_HEAD(&block->b_flist); if (!nlmsvc_setgrantargs(call, lock)) goto failed_free; @@ -199,7 +202,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, /* Set notifier function for VFS, and init args */ call->a_args.lock.fl.fl_flags |= FL_SLEEP; call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations; - call->a_args.cookie = *cookie; /* see above */ + nlmclnt_next_cookie(&call->a_args.cookie); dprintk("lockd: created block %p...\n", block); @@ -210,8 +213,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_file *file, file->f_count++; /* Add to file's list of blocks */ - block->b_fnext = file->f_blocks; - file->f_blocks = block; + list_add(&block->b_flist, &file->f_blocks); /* Set up RPC arguments for callback */ block->b_call = call; @@ -248,19 +250,13 @@ static void nlmsvc_free_block(struct kref *kref) { struct nlm_block *block = container_of(kref, struct nlm_block, b_count); struct nlm_file *file = block->b_file; - struct nlm_block **bp; dprintk("lockd: freeing block %p...\n", block); - down(&file->f_sema); /* Remove block from file's list of blocks */ - for (bp = &file->f_blocks; *bp; bp = &(*bp)->b_fnext) { - if (*bp == block) { - *bp = block->b_fnext; - break; - } - } - up(&file->f_sema); + mutex_lock(&file->f_mutex); + list_del_init(&block->b_flist); + mutex_unlock(&file->f_mutex); nlmsvc_freegrantargs(block->b_call); nlm_release_call(block->b_call); @@ -274,47 +270,32 @@ static void nlmsvc_release_block(struct nlm_block *block) kref_put(&block->b_count, nlmsvc_free_block); } -static void nlmsvc_act_mark(struct nlm_host *host, struct nlm_file *file) -{ - struct nlm_block *block; - - down(&file->f_sema); - for (block = file->f_blocks; block != NULL; block = block->b_fnext) - block->b_host->h_inuse = 1; - up(&file->f_sema); -} - -static void nlmsvc_act_unlock(struct nlm_host *host, struct nlm_file *file) +/* + * Loop over all blocks and delete blocks held by + * a matching host. + */ +void nlmsvc_traverse_blocks(struct nlm_host *host, + struct nlm_file *file, + nlm_host_match_fn_t match) { - struct nlm_block *block; + struct nlm_block *block, *next; restart: - down(&file->f_sema); - for (block = file->f_blocks; block != NULL; block = block->b_fnext) { - if (host != NULL && host != block->b_host) + mutex_lock(&file->f_mutex); + list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) { + if (!match(block->b_host, host)) continue; - if (!block->b_queued) + /* Do not destroy blocks that are not on + * the global retry list - why? */ + if (list_empty(&block->b_list)) continue; kref_get(&block->b_count); - up(&file->f_sema); + mutex_unlock(&file->f_mutex); nlmsvc_unlink_block(block); nlmsvc_release_block(block); goto restart; } - up(&file->f_sema); -} - -/* - * Loop over all blocks and perform the action specified. - * (NLM_ACT_CHECK handled by nlmsvc_inspect_file). - */ -void -nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action) -{ - if (action == NLM_ACT_MARK) - nlmsvc_act_mark(host, file); - else - nlmsvc_act_unlock(host, file); + mutex_unlock(&file->f_mutex); } /* @@ -325,7 +306,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) { locks_copy_lock(&call->a_args.lock.fl, &lock->fl); memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); - call->a_args.lock.caller = system_utsname.nodename; + call->a_args.lock.caller = utsname()->nodename; call->a_args.lock.oh.len = lock->oh.len; /* set default data area */ @@ -373,7 +354,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, lock->fl.fl_flags &= ~FL_SLEEP; again: /* Lock file against concurrent access */ - down(&file->f_sema); + mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) */ block = nlmsvc_lookup_block(file, lock); if (block == NULL) { @@ -411,10 +392,10 @@ again: /* If we don't have a block, create and initialize it. Then * retry because we may have slept in kmalloc. */ - /* We have to release f_sema as nlmsvc_create_block may try to + /* We have to release f_mutex as nlmsvc_create_block may try to * to claim it while doing host garbage collection */ if (newblock == NULL) { - up(&file->f_sema); + mutex_unlock(&file->f_mutex); dprintk("lockd: blocking on this lock (allocating).\n"); if (!(newblock = nlmsvc_create_block(rqstp, file, lock, cookie))) return nlm_lck_denied_nolocks; @@ -424,7 +405,7 @@ again: /* Append to list of blocked */ nlmsvc_insert_block(newblock, NLM_NEVER); out: - up(&file->f_sema); + mutex_unlock(&file->f_mutex); nlmsvc_release_block(newblock); nlmsvc_release_block(block); dprintk("lockd: nlmsvc_lock returned %u\n", ret); @@ -451,6 +432,7 @@ nlmsvc_testlock(struct nlm_file *file, struct nlm_lock *lock, (long long)conflock->fl.fl_start, (long long)conflock->fl.fl_end); conflock->caller = "somehost"; /* FIXME */ + conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ conflock->svid = conflock->fl.fl_pid; return nlm_lck_denied; @@ -507,9 +489,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); - down(&file->f_sema); + mutex_lock(&file->f_mutex); block = nlmsvc_lookup_block(file, lock); - up(&file->f_sema); + mutex_unlock(&file->f_mutex); if (block != NULL) { status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); @@ -527,10 +509,10 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) static void nlmsvc_notify_blocked(struct file_lock *fl) { - struct nlm_block **bp, *block; + struct nlm_block *block; dprintk("lockd: VFS unblock notification for block %p\n", fl); - for (bp = &nlm_blocked; (block = *bp) != 0; bp = &block->b_next) { + list_for_each_entry(block, &nlm_blocked, b_list) { if (nlm_compare_locks(&block->b_call->a_args.lock.fl, fl)) { nlmsvc_insert_block(block, 0); svc_wake_up(block->b_daemon); @@ -663,17 +645,14 @@ static const struct rpc_call_ops nlmsvc_grant_ops = { * block. */ void -nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status) +nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status) { struct nlm_block *block; - struct nlm_file *file; - dprintk("grant_reply: looking for cookie %x, host (%08x), s=%d \n", - *(unsigned int *)(cookie->data), - ntohl(rqstp->rq_addr.sin_addr.s_addr), status); - if (!(block = nlmsvc_find_block(cookie, &rqstp->rq_addr))) + dprintk("grant_reply: looking for cookie %x, s=%d \n", + *(unsigned int *)(cookie->data), status); + if (!(block = nlmsvc_find_block(cookie))) return; - file = block->b_file; if (block) { if (status == NLM_LCK_DENIED_GRACE_PERIOD) { @@ -696,16 +675,19 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status unsigned long nlmsvc_retry_blocked(void) { - struct nlm_block *block; + unsigned long timeout = MAX_SCHEDULE_TIMEOUT; + struct nlm_block *block; + + while (!list_empty(&nlm_blocked)) { + block = list_entry(nlm_blocked.next, struct nlm_block, b_list); - dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", - nlm_blocked, - nlm_blocked? nlm_blocked->b_when : 0); - while ((block = nlm_blocked) != 0) { if (block->b_when == NLM_NEVER) break; - if (time_after(block->b_when,jiffies)) + if (time_after(block->b_when,jiffies)) { + timeout = block->b_when - jiffies; break; + } + dprintk("nlmsvc_retry_blocked(%p, when=%ld)\n", block, block->b_when); kref_get(&block->b_count); @@ -713,8 +695,5 @@ nlmsvc_retry_blocked(void) nlmsvc_release_block(block); } - if ((block = nlm_blocked) && block->b_when != NLM_NEVER) - return (block->b_when - jiffies); - - return MAX_SCHEDULE_TIMEOUT; + return timeout; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index dbb66a3b5cd..75b2c81bcb9 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -66,8 +66,8 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, return nlm_lck_denied_nolocks; /* Obtain host handle */ - if (!(host = nlmsvc_lookup_host(rqstp)) - || (argp->monitor && !host->h_monitored && nsm_monitor(host) < 0)) + if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len)) + || (argp->monitor && nsm_monitor(host) < 0)) goto no_locks; *hostp = host; @@ -287,7 +287,9 @@ static int nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args *ar struct nlm_rqst *call; int stat; - host = nlmsvc_lookup_host(rqstp); + host = nlmsvc_lookup_host(rqstp, + argp->lock.caller, + argp->lock.len); if (host == NULL) return rpc_system_err; @@ -449,9 +451,6 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { struct sockaddr_in saddr = rqstp->rq_addr; - int vers = argp->vers; - int prot = argp->proto >> 1; - struct nlm_host *host; dprintk("lockd: SM_NOTIFY called\n"); if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) @@ -466,19 +465,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, /* Obtain the host pointer for this NFS server and try to * reclaim all locks we hold on this server. */ + memset(&saddr, 0, sizeof(saddr)); saddr.sin_addr.s_addr = argp->addr; - if ((argp->proto & 1)==0) { - if ((host = nlmclnt_lookup_host(&saddr, prot, vers)) != NULL) { - nlmclnt_recovery(host, argp->state); - nlm_release_host(host); - } - } else { - /* If we run on an NFS server, delete all locks held by the client */ - if ((host = nlm_lookup_host(1, &saddr, prot, vers)) != NULL) { - nlmsvc_free_host_resources(host); - nlm_release_host(host); - } - } + nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); return rpc_success; } @@ -495,7 +484,7 @@ nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, dprintk("lockd: GRANTED_RES called\n"); - nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); + nlmsvc_grant_reply(&argp->cookie, argp->status); return rpc_success; } diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 27288c83da9..b9926ce8782 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -85,24 +85,20 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, } /* - * Traverse all shares for a given file (and host). - * NLM_ACT_CHECK is handled by nlmsvc_inspect_file. + * Traverse all shares for a given file, and delete + * those owned by the given (type of) host */ -void -nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, int action) +void nlmsvc_traverse_shares(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) { struct nlm_share *share, **shpp; shpp = &file->f_shares; while ((share = *shpp) != NULL) { - if (action == NLM_ACT_MARK) - share->s_host->h_inuse = 1; - else if (action == NLM_ACT_UNLOCK) { - if (host == NULL || host == share->s_host) { - *shpp = share->s_next; - kfree(share); - continue; - } + if (match(share->s_host, host)) { + *shpp = share->s_next; + kfree(share); + continue; } shpp = &share->s_next; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index a92dd98f840..514f5f20701 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -25,9 +25,9 @@ /* * Global file hash table */ -#define FILE_HASH_BITS 5 +#define FILE_HASH_BITS 7 #define FILE_NRHASH (1<<FILE_HASH_BITS) -static struct nlm_file * nlm_files[FILE_NRHASH]; +static struct hlist_head nlm_files[FILE_NRHASH]; static DEFINE_MUTEX(nlm_file_mutex); #ifdef NFSD_DEBUG @@ -82,6 +82,7 @@ u32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, struct nfs_fh *f) { + struct hlist_node *pos; struct nlm_file *file; unsigned int hash; u32 nfserr; @@ -93,7 +94,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, /* Lock file table */ mutex_lock(&nlm_file_mutex); - for (file = nlm_files[hash]; file; file = file->f_next) + hlist_for_each_entry(file, pos, &nlm_files[hash], f_list) if (!nfs_compare_fh(&file->f_handle, f)) goto found; @@ -105,8 +106,9 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, goto out_unlock; memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); - file->f_hash = hash; - init_MUTEX(&file->f_sema); + mutex_init(&file->f_mutex); + INIT_HLIST_NODE(&file->f_list); + INIT_LIST_HEAD(&file->f_blocks); /* Open the file. Note that this must not sleep for too long, else * we would lock up lockd:-) So no NFS re-exports, folks. @@ -115,12 +117,11 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, * the file. */ if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { - dprintk("lockd: open failed (nfserr %d)\n", ntohl(nfserr)); + dprintk("lockd: open failed (error %d)\n", nfserr); goto out_free; } - file->f_next = nlm_files[hash]; - nlm_files[hash] = file; + hlist_add_head(&file->f_list, &nlm_files[hash]); found: dprintk("lockd: found file %p (count %d)\n", file, file->f_count); @@ -149,22 +150,14 @@ out_free: static inline void nlm_delete_file(struct nlm_file *file) { - struct nlm_file **fp, *f; - nlm_debug_print_file("closing file", file); - - fp = nlm_files + file->f_hash; - while ((f = *fp) != NULL) { - if (f == file) { - *fp = file->f_next; - nlmsvc_ops->fclose(file->f_file); - kfree(file); - return; - } - fp = &f->f_next; + if (!hlist_unhashed(&file->f_list)) { + hlist_del(&file->f_list); + nlmsvc_ops->fclose(file->f_file); + kfree(file); + } else { + printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); } - - printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); } /* @@ -172,7 +165,8 @@ nlm_delete_file(struct nlm_file *file) * action. */ static int -nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, int action) +nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, + nlm_host_match_fn_t match) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; @@ -186,17 +180,11 @@ again: /* update current lock count */ file->f_locks++; + lockhost = (struct nlm_host *) fl->fl_owner; - if (action == NLM_ACT_MARK) - lockhost->h_inuse = 1; - else if (action == NLM_ACT_CHECK) - return 1; - else if (action == NLM_ACT_UNLOCK) { + if (match(lockhost, host)) { struct file_lock lock = *fl; - if (host && lockhost != host) - continue; - lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -213,53 +201,66 @@ again: } /* - * Operate on a single file + * Inspect a single file */ static inline int -nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, int action) +nlm_inspect_file(struct nlm_host *host, struct nlm_file *file, nlm_host_match_fn_t match) { - if (action == NLM_ACT_CHECK) { - /* Fast path for mark and sweep garbage collection */ - if (file->f_count || file->f_blocks || file->f_shares) + nlmsvc_traverse_blocks(host, file, match); + nlmsvc_traverse_shares(host, file, match); + return nlm_traverse_locks(host, file, match); +} + +/* + * Quick check whether there are still any locks, blocks or + * shares on a given file. + */ +static inline int +nlm_file_inuse(struct nlm_file *file) +{ + struct inode *inode = nlmsvc_file_inode(file); + struct file_lock *fl; + + if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) + return 1; + + for (fl = inode->i_flock; fl; fl = fl->fl_next) { + if (fl->fl_lmops == &nlmsvc_lock_operations) return 1; - } else { - nlmsvc_traverse_blocks(host, file, action); - nlmsvc_traverse_shares(host, file, action); } - return nlm_traverse_locks(host, file, action); + file->f_locks = 0; + return 0; } /* * Loop over all files in the file table. */ static int -nlm_traverse_files(struct nlm_host *host, int action) +nlm_traverse_files(struct nlm_host *host, nlm_host_match_fn_t match) { - struct nlm_file *file, **fp; + struct hlist_node *pos, *next; + struct nlm_file *file; int i, ret = 0; mutex_lock(&nlm_file_mutex); for (i = 0; i < FILE_NRHASH; i++) { - fp = nlm_files + i; - while ((file = *fp) != NULL) { + hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) { file->f_count++; mutex_unlock(&nlm_file_mutex); /* Traverse locks, blocks and shares of this file * and update file->f_locks count */ - if (nlm_inspect_file(host, file, action)) + if (nlm_inspect_file(host, file, match)) ret = 1; mutex_lock(&nlm_file_mutex); file->f_count--; /* No more references to this file. Let go of it. */ - if (!file->f_blocks && !file->f_locks + if (list_empty(&file->f_blocks) && !file->f_locks && !file->f_shares && !file->f_count) { - *fp = file->f_next; + hlist_del(&file->f_list); nlmsvc_ops->fclose(file->f_file); kfree(file); - } else { - fp = &file->f_next; } } } @@ -286,23 +287,54 @@ nlm_release_file(struct nlm_file *file) mutex_lock(&nlm_file_mutex); /* If there are no more locks etc, delete the file */ - if(--file->f_count == 0) { - if(!nlm_inspect_file(NULL, file, NLM_ACT_CHECK)) - nlm_delete_file(file); - } + if (--file->f_count == 0 && !nlm_file_inuse(file)) + nlm_delete_file(file); mutex_unlock(&nlm_file_mutex); } /* + * Helpers function for resource traversal + * + * nlmsvc_mark_host: + * used by the garbage collector; simply sets h_inuse. + * Always returns 0. + * + * nlmsvc_same_host: + * returns 1 iff the two hosts match. Used to release + * all resources bound to a specific host. + * + * nlmsvc_is_client: + * returns 1 iff the host is a client. + * Used by nlmsvc_invalidate_all + */ +static int +nlmsvc_mark_host(struct nlm_host *host, struct nlm_host *dummy) +{ + host->h_inuse = 1; + return 0; +} + +static int +nlmsvc_same_host(struct nlm_host *host, struct nlm_host *other) +{ + return host == other; +} + +static int +nlmsvc_is_client(struct nlm_host *host, struct nlm_host *dummy) +{ + return host->h_server; +} + +/* * Mark all hosts that still hold resources */ void nlmsvc_mark_resources(void) { dprintk("lockd: nlmsvc_mark_resources\n"); - - nlm_traverse_files(NULL, NLM_ACT_MARK); + nlm_traverse_files(NULL, nlmsvc_mark_host); } /* @@ -313,23 +345,25 @@ nlmsvc_free_host_resources(struct nlm_host *host) { dprintk("lockd: nlmsvc_free_host_resources\n"); - if (nlm_traverse_files(host, NLM_ACT_UNLOCK)) + if (nlm_traverse_files(host, nlmsvc_same_host)) { printk(KERN_WARNING - "lockd: couldn't remove all locks held by %s", + "lockd: couldn't remove all locks held by %s\n", host->h_name); + BUG(); + } } /* - * delete all hosts structs for clients + * Remove all locks held for clients */ void nlmsvc_invalidate_all(void) { - struct nlm_host *host; - while ((host = nlm_find_client()) != NULL) { - nlmsvc_free_host_resources(host); - host->h_expires = 0; - host->h_killed = 1; - nlm_release_host(host); - } + /* Release all locks held by NFS clients. + * Previously, the code would call + * nlmsvc_free_host_resources for each client in + * turn, which is about as inefficient as it gets. + * Now we just do it once in nlm_traverse_files. + */ + nlm_traverse_files(NULL, nlmsvc_is_client); } diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 033ea4ac2c3..61c46facf25 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -515,7 +515,7 @@ nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) */ #define NLM_void_sz 0 #define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) -#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(system_utsname.nodename)) +#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(utsname()->nodename)) #define NLM_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ) /* #define NLM_owner_sz 1+XDR_QUADLEN(NLM_MAXOWNER) */ #define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE) diff --git a/fs/locks.c b/fs/locks.c index 21dfadfca2b..e0b6a80649a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1514,7 +1514,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) goto out_unlock; } - error = f_setown(filp, current->pid, 0); + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); out_unlock: unlock_kernel(); return error; diff --git a/fs/namespace.c b/fs/namespace.c index 66d921e14fe..55442a6cf22 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -133,7 +133,7 @@ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) static inline int check_mnt(struct vfsmount *mnt) { - return mnt->mnt_namespace == current->namespace; + return mnt->mnt_namespace == current->nsproxy->namespace; } static void touch_namespace(struct namespace *ns) @@ -830,7 +830,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_nd) { detach_mnt(source_mnt, parent_nd); attach_mnt(source_mnt, nd); - touch_namespace(current->namespace); + touch_namespace(current->nsproxy->namespace); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1441,7 +1441,7 @@ dput_out: */ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) { - struct namespace *namespace = tsk->namespace; + struct namespace *namespace = tsk->nsproxy->namespace; struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; struct vfsmount *p, *q; @@ -1508,7 +1508,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) int copy_namespace(int flags, struct task_struct *tsk) { - struct namespace *namespace = tsk->namespace; + struct namespace *namespace = tsk->nsproxy->namespace; struct namespace *new_ns; int err = 0; @@ -1531,7 +1531,7 @@ int copy_namespace(int flags, struct task_struct *tsk) goto out; } - tsk->namespace = new_ns; + tsk->nsproxy->namespace = new_ns; out: put_namespace(namespace); @@ -1754,7 +1754,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */ attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */ - touch_namespace(current->namespace); + touch_namespace(current->nsproxy->namespace); spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); @@ -1780,7 +1780,6 @@ static void __init init_mount_tree(void) { struct vfsmount *mnt; struct namespace *namespace; - struct task_struct *g, *p; mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) @@ -1796,13 +1795,8 @@ static void __init init_mount_tree(void) namespace->root = mnt; mnt->mnt_namespace = namespace; - init_task.namespace = namespace; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - get_namespace(namespace); - p->namespace = namespace; - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + init_task.nsproxy->namespace = namespace; + get_namespace(namespace); set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); set_fs_root(current->fs, namespace->root, namespace->root->mnt_root); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a3ee11364db..7933e2e99db 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -58,7 +58,6 @@ module_param_call(callback_tcpport, param_set_port, param_get_int, */ static void nfs_callback_svc(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; int err; __module_get(THIS_MODULE); @@ -80,7 +79,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) /* * Listen for a request on the socket */ - err = svc_recv(serv, rqstp, MAX_SCHEDULE_TIMEOUT); + err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); if (err == -EAGAIN || err == -EINTR) continue; if (err < 0) { @@ -91,7 +90,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) } dprintk("%s: request from %u.%u.%u.%u\n", __FUNCTION__, NIPQUAD(rqstp->rq_addr.sin_addr.s_addr)); - svc_process(serv, rqstp); + svc_process(rqstp); } svc_exit_thread(rqstp); @@ -116,7 +115,7 @@ int nfs_callback_up(void) goto out; init_completion(&nfs_callback_info.started); init_completion(&nfs_callback_info.stopped); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ec1938d4b81..8106f3b29e4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -460,7 +460,8 @@ static int nfs_start_lockd(struct nfs_server *server) goto out; if (server->flags & NFS_MOUNT_NONLM) goto out; - error = lockd_up(); + error = lockd_up((server->flags & NFS_MOUNT_TCP) ? + IPPROTO_TCP : IPPROTO_UDP); if (error < 0) server->flags |= NFS_MOUNT_NONLM; else diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index c0a754ecdee..1d656a64519 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -312,7 +312,7 @@ static int __init root_nfs_name(char *name) /* Override them by options set on kernel command-line */ root_nfs_parse(name, buf); - cp = system_utsname.nodename; + cp = utsname()->nodename; if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); return -1; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 01bc68c628a..e13fa23bd10 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -319,12 +319,25 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old) static struct cache_head *export_table[EXPORT_HASHMAX]; +static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc) +{ + int i; + + for (i = 0; i < fsloc->locations_count; i++) { + kfree(fsloc->locations[i].path); + kfree(fsloc->locations[i].hosts); + } + kfree(fsloc->locations); +} + static void svc_export_put(struct kref *ref) { struct svc_export *exp = container_of(ref, struct svc_export, h.ref); dput(exp->ex_dentry); mntput(exp->ex_mnt); auth_domain_put(exp->ex_client); + kfree(exp->ex_path); + nfsd4_fslocs_free(&exp->ex_fslocs); kfree(exp); } @@ -370,7 +383,7 @@ static int check_export(struct inode *inode, int flags) */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(flags & NFSEXP_FSID)) { - dprintk("exp_export: export of non-dev fs without fsid"); + dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; } if (!inode->i_sb->s_export_op) { @@ -386,6 +399,69 @@ static int check_export(struct inode *inode, int flags) } +#ifdef CONFIG_NFSD_V4 + +static int +fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) +{ + int len; + int migrated, i, err; + + len = qword_get(mesg, buf, PAGE_SIZE); + if (len != 5 || memcmp(buf, "fsloc", 5)) + return 0; + + /* listsize */ + err = get_int(mesg, &fsloc->locations_count); + if (err) + return err; + if (fsloc->locations_count > MAX_FS_LOCATIONS) + return -EINVAL; + if (fsloc->locations_count == 0) + return 0; + + fsloc->locations = kzalloc(fsloc->locations_count + * sizeof(struct nfsd4_fs_location), GFP_KERNEL); + if (!fsloc->locations) + return -ENOMEM; + for (i=0; i < fsloc->locations_count; i++) { + /* colon separated host list */ + err = -EINVAL; + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].hosts = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].hosts) + goto out_free_all; + err = -EINVAL; + /* slash separated path component list */ + len = qword_get(mesg, buf, PAGE_SIZE); + if (len <= 0) + goto out_free_all; + err = -ENOMEM; + fsloc->locations[i].path = kstrdup(buf, GFP_KERNEL); + if (!fsloc->locations[i].path) + goto out_free_all; + } + /* migrated */ + err = get_int(mesg, &migrated); + if (err) + goto out_free_all; + err = -EINVAL; + if (migrated < 0 || migrated > 1) + goto out_free_all; + fsloc->migrated = migrated; + return 0; +out_free_all: + nfsd4_fslocs_free(fsloc); + return err; +} + +#else /* CONFIG_NFSD_V4 */ +static inline int fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc) { return 0; } +#endif + static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) { /* client path expiry [flags anonuid anongid fsid] */ @@ -398,6 +474,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) int an_int; nd.dentry = NULL; + exp.ex_path = NULL; if (mesg[mlen-1] != '\n') return -EINVAL; @@ -428,6 +505,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.ex_mnt = nd.mnt; exp.ex_dentry = nd.dentry; + exp.ex_path = kstrdup(buf, GFP_KERNEL); + err = -ENOMEM; + if (!exp.ex_path) + goto out; /* expiry */ err = -EINVAL; @@ -435,6 +516,11 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (exp.h.expiry_time == 0) goto out; + /* fs locations */ + exp.ex_fslocs.locations = NULL; + exp.ex_fslocs.locations_count = 0; + exp.ex_fslocs.migrated = 0; + /* flags */ err = get_int(&mesg, &an_int); if (err == -ENOENT) @@ -460,6 +546,10 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) err = check_export(nd.dentry->d_inode, exp.ex_flags); if (err) goto out; + + err = fsloc_parse(&mesg, buf, &exp.ex_fslocs); + if (err) + goto out; } expp = svc_export_lookup(&exp); @@ -473,6 +563,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) else exp_put(expp); out: + kfree(exp.ex_path); if (nd.dentry) path_release(&nd); out_no_path: @@ -482,7 +573,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) return err; } -static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong); +static void exp_flags(struct seq_file *m, int flag, int fsid, + uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fslocs); static int svc_export_show(struct seq_file *m, struct cache_detail *cd, @@ -501,8 +593,8 @@ static int svc_export_show(struct seq_file *m, seq_putc(m, '('); if (test_bit(CACHE_VALID, &h->flags) && !test_bit(CACHE_NEGATIVE, &h->flags)) - exp_flags(m, exp->ex_flags, exp->ex_fsid, - exp->ex_anon_uid, exp->ex_anon_gid); + exp_flags(m, exp->ex_flags, exp->ex_fsid, + exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs); seq_puts(m, ")\n"); return 0; } @@ -524,6 +616,10 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_client = item->ex_client; new->ex_dentry = dget(item->ex_dentry); new->ex_mnt = mntget(item->ex_mnt); + new->ex_path = NULL; + new->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = 0; } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -535,6 +631,14 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; + new->ex_path = item->ex_path; + item->ex_path = NULL; + new->ex_fslocs.locations = item->ex_fslocs.locations; + item->ex_fslocs.locations = NULL; + new->ex_fslocs.locations_count = item->ex_fslocs.locations_count; + item->ex_fslocs.locations_count = 0; + new->ex_fslocs.migrated = item->ex_fslocs.migrated; + item->ex_fslocs.migrated = 0; } static struct cache_head *svc_export_alloc(void) @@ -1048,36 +1152,28 @@ int exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, struct cache_req *creq) { - struct svc_expkey *fsid_key; struct svc_export *exp; int rv; u32 fsidv[2]; mk_fsid_v1(fsidv, 0); - fsid_key = exp_find_key(clp, 1, fsidv, creq); - if (IS_ERR(fsid_key) && PTR_ERR(fsid_key) == -EAGAIN) + exp = exp_find(clp, 1, fsidv, creq); + if (IS_ERR(exp) && PTR_ERR(exp) == -EAGAIN) return nfserr_dropit; - if (!fsid_key || IS_ERR(fsid_key)) - return nfserr_perm; - - exp = exp_get_by_name(clp, fsid_key->ek_mnt, fsid_key->ek_dentry, creq); if (exp == NULL) - rv = nfserr_perm; + return nfserr_perm; else if (IS_ERR(exp)) - rv = nfserrno(PTR_ERR(exp)); - else { - rv = fh_compose(fhp, exp, - fsid_key->ek_dentry, NULL); - exp_put(exp); - } - cache_put(&fsid_key->h, &svc_expkey_cache); + return nfserrno(PTR_ERR(exp)); + rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); + exp_put(exp); return rv; } /* Iterator */ static void *e_start(struct seq_file *m, loff_t *pos) + __acquires(svc_export_cache.hash_lock) { loff_t n = *pos; unsigned hash, export; @@ -1086,7 +1182,7 @@ static void *e_start(struct seq_file *m, loff_t *pos) exp_readlock(); read_lock(&svc_export_cache.hash_lock); if (!n--) - return (void *)1; + return SEQ_START_TOKEN; hash = n >> 32; export = n & ((1LL<<32) - 1); @@ -1110,7 +1206,7 @@ static void *e_next(struct seq_file *m, void *p, loff_t *pos) struct cache_head *ch = p; int hash = (*pos >> 32); - if (p == (void *)1) + if (p == SEQ_START_TOKEN) hash = 0; else if (ch->next == NULL) { hash++; @@ -1131,6 +1227,7 @@ static void *e_next(struct seq_file *m, void *p, loff_t *pos) } static void e_stop(struct seq_file *m, void *p) + __releases(svc_export_cache.hash_lock) { read_unlock(&svc_export_cache.hash_lock); exp_readunlock(); @@ -1156,7 +1253,8 @@ static struct flags { { 0, {"", ""}} }; -static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t anong) +static void exp_flags(struct seq_file *m, int flag, int fsid, + uid_t anonu, uid_t anong, struct nfsd4_fs_locations *fsloc) { int first = 0; struct flags *flg; @@ -1172,21 +1270,34 @@ static void exp_flags(struct seq_file *m, int flag, int fsid, uid_t anonu, uid_t seq_printf(m, "%sanonuid=%d", first++?",":"", anonu); if (anong != (gid_t)-2 && anong != (0x10000-2)) seq_printf(m, "%sanongid=%d", first++?",":"", anong); + if (fsloc && fsloc->locations_count > 0) { + char *loctype = (fsloc->migrated) ? "refer" : "replicas"; + int i; + + seq_printf(m, "%s%s=", first++?",":"", loctype); + seq_escape(m, fsloc->locations[0].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[0].hosts, ",;@ \t\n\\"); + for (i = 1; i < fsloc->locations_count; i++) { + seq_putc(m, ';'); + seq_escape(m, fsloc->locations[i].path, ",;@ \t\n\\"); + seq_putc(m, '@'); + seq_escape(m, fsloc->locations[i].hosts, ",;@ \t\n\\"); + } + } } static int e_show(struct seq_file *m, void *p) { struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); - svc_client *clp; - if (p == (void *)1) { + if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } - clp = exp->ex_client; cache_get(&exp->h); if (cache_check(&svc_export_cache, &exp->h, NULL)) return 0; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index fc95c4df669..9187755661d 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -1,5 +1,5 @@ /* - * linux/fs/nfsd/nfsacl.c + * linux/fs/nfsd/nfs2acl.c * * Process version 2 NFSACL requests. * @@ -241,7 +241,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p, rqstp->rq_res.page_len = w; while (w > 0) { - if (!svc_take_res_page(rqstp)) + if (!rqstp->rq_respages[rqstp->rq_resused++]) return 0; w -= PAGE_SIZE; } @@ -333,4 +333,5 @@ struct svc_version nfsd_acl_version2 = { .vs_proc = nfsd_acl_procedures2, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 1, }; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 16e10c170ae..d4bdc00c116 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -185,7 +185,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p, rqstp->rq_res.page_len = w; while (w > 0) { - if (!svc_take_res_page(rqstp)) + if (!rqstp->rq_respages[rqstp->rq_resused++]) return 0; w -= PAGE_SIZE; } @@ -263,5 +263,6 @@ struct svc_version nfsd_acl_version3 = { .vs_proc = nfsd_acl_procedures3, .vs_dispatch = nfsd_dispatch, .vs_xdrsize = NFS3_SVC_XDRSIZE, + .vs_hidden = 1, }; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index f61142afea4..a5ebc7dbb38 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -160,6 +160,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, struct nfsd3_readres *resp) { int nfserr; + u32 max_blocksize = svc_max_payload(rqstp); dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", SVCFH_fmt(&argp->fh), @@ -172,15 +173,15 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, */ resp->count = argp->count; - if (NFSSVC_MAXBLKSIZE < resp->count) - resp->count = NFSSVC_MAXBLKSIZE; + if (max_blocksize < resp->count) + resp->count = max_blocksize; svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); nfserr = nfsd_read(rqstp, &resp->fh, NULL, argp->offset, - argp->vec, argp->vlen, + rqstp->rq_vec, argp->vlen, &resp->count); if (nfserr == 0) { struct inode *inode = resp->fh.fh_dentry->d_inode; @@ -210,7 +211,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, resp->committed = argp->stable; nfserr = nfsd_write(rqstp, &resp->fh, NULL, argp->offset, - argp->vec, argp->vlen, + rqstp->rq_vec, argp->vlen, argp->len, &resp->committed); resp->count = argp->count; @@ -538,15 +539,16 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, struct nfsd3_fsinfores *resp) { int nfserr; + u32 max_blocksize = svc_max_payload(rqstp); dprintk("nfsd: FSINFO(3) %s\n", SVCFH_fmt(&argp->fh)); - resp->f_rtmax = NFSSVC_MAXBLKSIZE; - resp->f_rtpref = NFSSVC_MAXBLKSIZE; + resp->f_rtmax = max_blocksize; + resp->f_rtpref = max_blocksize; resp->f_rtmult = PAGE_SIZE; - resp->f_wtmax = NFSSVC_MAXBLKSIZE; - resp->f_wtpref = NFSSVC_MAXBLKSIZE; + resp->f_wtmax = max_blocksize; + resp->f_wtpref = max_blocksize; resp->f_wtmult = PAGE_SIZE; resp->f_dtpref = PAGE_SIZE; resp->f_maxfilesize = ~(u32) 0; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 243d94b9653..247d518248b 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -330,6 +330,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p, { unsigned int len; int v,pn; + u32 max_blocksize = svc_max_payload(rqstp); if (!(p = decode_fh(p, &args->fh)) || !(p = xdr_decode_hyper(p, &args->offset))) @@ -337,17 +338,16 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p, len = args->count = ntohl(*p++); - if (len > NFSSVC_MAXBLKSIZE) - len = NFSSVC_MAXBLKSIZE; + if (len > max_blocksize) + len = max_blocksize; /* set up the kvec */ v=0; while (len > 0) { - pn = rqstp->rq_resused; - svc_take_page(rqstp); - args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]); - args->vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; - len -= args->vec[v].iov_len; + pn = rqstp->rq_resused++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; + len -= rqstp->rq_vec[v].iov_len; v++; } args->vlen = v; @@ -359,6 +359,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p, struct nfsd3_writeargs *args) { unsigned int len, v, hdr; + u32 max_blocksize = svc_max_payload(rqstp); if (!(p = decode_fh(p, &args->fh)) || !(p = xdr_decode_hyper(p, &args->offset))) @@ -373,22 +374,22 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p, rqstp->rq_arg.len - hdr < len) return 0; - args->vec[0].iov_base = (void*)p; - args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; - if (len > NFSSVC_MAXBLKSIZE) - len = NFSSVC_MAXBLKSIZE; + if (len > max_blocksize) + len = max_blocksize; v= 0; - while (len > args->vec[v].iov_len) { - len -= args->vec[v].iov_len; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; v++; - args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]); - args->vec[v].iov_len = PAGE_SIZE; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; } - args->vec[v].iov_len = len; + rqstp->rq_vec[v].iov_len = len; args->vlen = v+1; - return args->count == args->len && args->vec[0].iov_len > 0; + return args->count == args->len && rqstp->rq_vec[0].iov_len > 0; } int @@ -446,11 +447,11 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, u32 *p, * This page appears in the rq_res.pages list, but as pages_len is always * 0, it won't get in the way */ - svc_take_page(rqstp); len = ntohl(*p++); if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) return 0; - args->tname = new = page_address(rqstp->rq_respages[rqstp->rq_resused-1]); + args->tname = new = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); args->tlen = len; /* first copy and check from the first page */ old = (char*)p; @@ -522,8 +523,8 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p, { if (!(p = decode_fh(p, &args->fh))) return 0; - svc_take_page(rqstp); - args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]); + args->buffer = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); return xdr_argsize_check(rqstp, p); } @@ -554,8 +555,8 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p, if (args->count > PAGE_SIZE) args->count = PAGE_SIZE; - svc_take_page(rqstp); - args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]); + args->buffer = + page_address(rqstp->rq_respages[rqstp->rq_resused++]); return xdr_argsize_check(rqstp, p); } @@ -565,6 +566,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p, struct nfsd3_readdirargs *args) { int len, pn; + u32 max_blocksize = svc_max_payload(rqstp); if (!(p = decode_fh(p, &args->fh))) return 0; @@ -573,13 +575,12 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, u32 *p, args->dircount = ntohl(*p++); args->count = ntohl(*p++); - len = (args->count > NFSSVC_MAXBLKSIZE) ? NFSSVC_MAXBLKSIZE : + len = (args->count > max_blocksize) ? max_blocksize : args->count; args->count = len; while (len > 0) { - pn = rqstp->rq_resused; - svc_take_page(rqstp); + pn = rqstp->rq_resused++; if (!args->buffer) args->buffer = page_address(rqstp->rq_respages[pn]); len -= PAGE_SIZE; @@ -668,7 +669,6 @@ nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p, rqstp->rq_res.page_len = resp->len; if (resp->len & 3) { /* need to pad the tail */ - rqstp->rq_restailpage = 0; rqstp->rq_res.tail[0].iov_base = p; *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); @@ -693,7 +693,6 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p, rqstp->rq_res.page_len = resp->count; if (resp->count & 3) { /* need to pad the tail */ - rqstp->rq_restailpage = 0; rqstp->rq_res.tail[0].iov_base = p; *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); @@ -768,7 +767,6 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p, rqstp->rq_res.page_len = (resp->count) << 2; /* add the 'tail' to the end of the 'head' page - page 0. */ - rqstp->rq_restailpage = 0; rqstp->rq_res.tail[0].iov_base = p; *p++ = 0; /* no more entries */ *p++ = htonl(resp->common.err == nfserr_eof); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index edb107e61b9..5d94555cdc8 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -63,6 +63,8 @@ #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ | NFS4_ACE_DIRECTORY_INHERIT_ACE | NFS4_ACE_INHERIT_ONLY_ACE) +#define NFS4_SUPPORTED_FLAGS (NFS4_INHERITANCE_FLAGS | NFS4_ACE_IDENTIFIER_GROUP) + #define MASK_EQUAL(mask1, mask2) \ ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) @@ -96,24 +98,26 @@ deny_mask(u32 allow_mask, unsigned int flags) /* XXX: modify functions to return NFS errors; they're only ever * used by nfs code, after all.... */ -static int -mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) +/* We only map from NFSv4 to POSIX ACLs when setting ACLs, when we err on the + * side of being more restrictive, so the mode bit mapping below is + * pessimistic. An optimistic version would be needed to handle DENY's, + * but we espect to coalesce all ALLOWs and DENYs before mapping to mode + * bits. */ + +static void +low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) { - u32 ignore = 0; + u32 write_mode = NFS4_WRITE_MODE; - if (!(flags & NFS4_ACL_DIR)) - ignore |= NFS4_ACE_DELETE_CHILD; /* ignore it */ - perm |= ignore; + if (flags & NFS4_ACL_DIR) + write_mode |= NFS4_ACE_DELETE_CHILD; *mode = 0; if ((perm & NFS4_READ_MODE) == NFS4_READ_MODE) *mode |= ACL_READ; - if ((perm & NFS4_WRITE_MODE) == NFS4_WRITE_MODE) + if ((perm & write_mode) == write_mode) *mode |= ACL_WRITE; if ((perm & NFS4_EXECUTE_MODE) == NFS4_EXECUTE_MODE) *mode |= ACL_EXECUTE; - if (!MASK_EQUAL(perm, ignore|mask_from_posix(*mode, flags))) - return -EINVAL; - return 0; } struct ace_container { @@ -338,38 +342,6 @@ sort_pacl(struct posix_acl *pacl) return; } -static int -write_pace(struct nfs4_ace *ace, struct posix_acl *pacl, - struct posix_acl_entry **pace, short tag, unsigned int flags) -{ - struct posix_acl_entry *this = *pace; - - if (*pace == pacl->a_entries + pacl->a_count) - return -EINVAL; /* fell off the end */ - (*pace)++; - this->e_tag = tag; - if (tag == ACL_USER_OBJ) - flags |= NFS4_ACL_OWNER; - if (mode_from_nfs4(ace->access_mask, &this->e_perm, flags)) - return -EINVAL; - this->e_id = (tag == ACL_USER || tag == ACL_GROUP ? - ace->who : ACL_UNDEFINED_ID); - return 0; -} - -static struct nfs4_ace * -get_next_v4_ace(struct list_head **p, struct list_head *head) -{ - struct nfs4_ace *ace; - - *p = (*p)->next; - if (*p == head) - return NULL; - ace = list_entry(*p, struct nfs4_ace, l_ace); - - return ace; -} - int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, struct posix_acl **dpacl, unsigned int flags) @@ -385,42 +357,23 @@ nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl, goto out; error = nfs4_acl_split(acl, dacl); - if (error < 0) + if (error) goto out_acl; - if (pacl != NULL) { - if (acl->naces == 0) { - error = -ENODATA; - goto try_dpacl; - } - - *pacl = _nfsv4_to_posix_one(acl, flags); - if (IS_ERR(*pacl)) { - error = PTR_ERR(*pacl); - *pacl = NULL; - goto out_acl; - } + *pacl = _nfsv4_to_posix_one(acl, flags); + if (IS_ERR(*pacl)) { + error = PTR_ERR(*pacl); + *pacl = NULL; + goto out_acl; } -try_dpacl: - if (dpacl != NULL) { - if (dacl->naces == 0) { - if (pacl == NULL || *pacl == NULL) - error = -ENODATA; - goto out_acl; - } - - error = 0; - *dpacl = _nfsv4_to_posix_one(dacl, flags); - if (IS_ERR(*dpacl)) { - error = PTR_ERR(*dpacl); - *dpacl = NULL; - goto out_acl; - } + *dpacl = _nfsv4_to_posix_one(dacl, flags); + if (IS_ERR(*dpacl)) { + error = PTR_ERR(*dpacl); + *dpacl = NULL; } - out_acl: - if (error && pacl) { + if (error) { posix_acl_release(*pacl); *pacl = NULL; } @@ -429,349 +382,311 @@ out: return error; } +/* + * While processing the NFSv4 ACE, this maintains bitmasks representing + * which permission bits have been allowed and which denied to a given + * entity: */ +struct posix_ace_state { + u32 allow; + u32 deny; +}; + +struct posix_user_ace_state { + uid_t uid; + struct posix_ace_state perms; +}; + +struct posix_ace_state_array { + int n; + struct posix_user_ace_state aces[]; +}; + +/* + * While processing the NFSv4 ACE, this maintains the partial permissions + * calculated so far: */ + +struct posix_acl_state { + struct posix_ace_state owner; + struct posix_ace_state group; + struct posix_ace_state other; + struct posix_ace_state everyone; + struct posix_ace_state mask; /* Deny unused in this case */ + struct posix_ace_state_array *users; + struct posix_ace_state_array *groups; +}; + static int -same_who(struct nfs4_ace *a, struct nfs4_ace *b) +init_state(struct posix_acl_state *state, int cnt) { - return a->whotype == b->whotype && - (a->whotype != NFS4_ACL_WHO_NAMED || a->who == b->who); + int alloc; + + memset(state, 0, sizeof(struct posix_acl_state)); + /* + * In the worst case, each individual acl could be for a distinct + * named user or group, but we don't no which, so we allocate + * enough space for either: + */ + alloc = sizeof(struct posix_ace_state_array) + + cnt*sizeof(struct posix_ace_state); + state->users = kzalloc(alloc, GFP_KERNEL); + if (!state->users) + return -ENOMEM; + state->groups = kzalloc(alloc, GFP_KERNEL); + if (!state->groups) { + kfree(state->users); + return -ENOMEM; + } + return 0; } -static int -complementary_ace_pair(struct nfs4_ace *allow, struct nfs4_ace *deny, - unsigned int flags) -{ - int ignore = 0; - if (!(flags & NFS4_ACL_DIR)) - ignore |= NFS4_ACE_DELETE_CHILD; - return MASK_EQUAL(ignore|deny_mask(allow->access_mask, flags), - ignore|deny->access_mask) && - allow->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && - deny->type == NFS4_ACE_ACCESS_DENIED_ACE_TYPE && - allow->flag == deny->flag && - same_who(allow, deny); +static void +free_state(struct posix_acl_state *state) { + kfree(state->users); + kfree(state->groups); } -static inline int -user_obj_from_v4(struct nfs4_acl *n4acl, struct list_head **p, - struct posix_acl *pacl, struct posix_acl_entry **pace, - unsigned int flags) +static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_state *astate) { - int error = -EINVAL; - struct nfs4_ace *ace, *ace2; - - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - if (ace2type(ace) != ACL_USER_OBJ) - goto out; - error = write_pace(ace, pacl, pace, ACL_USER_OBJ, flags); - if (error < 0) - goto out; - error = -EINVAL; - ace2 = get_next_v4_ace(p, &n4acl->ace_head); - if (ace2 == NULL) - goto out; - if (!complementary_ace_pair(ace, ace2, flags)) - goto out; - error = 0; -out: - return error; + state->mask.allow |= astate->allow; } -static inline int -users_from_v4(struct nfs4_acl *n4acl, struct list_head **p, - struct nfs4_ace **mask_ace, - struct posix_acl *pacl, struct posix_acl_entry **pace, - unsigned int flags) -{ - int error = -EINVAL; - struct nfs4_ace *ace, *ace2; +/* + * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS, + * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate + * to traditional read/write/execute permissions. + * + * It's problematic to reject acls that use certain mode bits, because it + * places the burden on users to learn the rules about which bits one + * particular server sets, without giving the user a lot of help--we return an + * error that could mean any number of different things. To make matters + * worse, the problematic bits might be introduced by some application that's + * automatically mapping from some other acl model. + * + * So wherever possible we accept anything, possibly erring on the side of + * denying more permissions than necessary. + * + * However we do reject *explicit* DENY's of a few bits representing + * permissions we could never deny: + */ - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - while (ace2type(ace) == ACL_USER) { - if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) - goto out; - if (*mask_ace && - !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) - goto out; - *mask_ace = ace; - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) - goto out; - error = write_pace(ace, pacl, pace, ACL_USER, flags); - if (error < 0) - goto out; - error = -EINVAL; - ace2 = get_next_v4_ace(p, &n4acl->ace_head); - if (ace2 == NULL) - goto out; - if (!complementary_ace_pair(ace, ace2, flags)) - goto out; - if ((*mask_ace)->flag != ace2->flag || - !same_who(*mask_ace, ace2)) - goto out; - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - } - error = 0; -out: - return error; +static inline int check_deny(u32 mask, int isowner) +{ + if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL)) + return -EINVAL; + if (!isowner) + return 0; + if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)) + return -EINVAL; + return 0; } -static inline int -group_obj_and_groups_from_v4(struct nfs4_acl *n4acl, struct list_head **p, - struct nfs4_ace **mask_ace, - struct posix_acl *pacl, struct posix_acl_entry **pace, - unsigned int flags) +static struct posix_acl * +posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) { - int error = -EINVAL; - struct nfs4_ace *ace, *ace2; - struct ace_container *ac; - struct list_head group_l; - - INIT_LIST_HEAD(&group_l); - ace = list_entry(*p, struct nfs4_ace, l_ace); - - /* group owner (mask and allow aces) */ + struct posix_acl_entry *pace; + struct posix_acl *pacl; + int nace; + int i, error = 0; - if (pacl->a_count != 3) { - /* then the group owner should be preceded by mask */ - if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) - goto out; - if (*mask_ace && - !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) - goto out; - *mask_ace = ace; - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; + nace = 4 + state->users->n + state->groups->n; + pacl = posix_acl_alloc(nace, GFP_KERNEL); + if (!pacl) + return ERR_PTR(-ENOMEM); - if ((*mask_ace)->flag != ace->flag || !same_who(*mask_ace, ace)) - goto out; + pace = pacl->a_entries; + pace->e_tag = ACL_USER_OBJ; + error = check_deny(state->owner.deny, 1); + if (error) + goto out_err; + low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + + for (i=0; i < state->users->n; i++) { + pace++; + pace->e_tag = ACL_USER; + error = check_deny(state->users->aces[i].perms.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->users->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_id = state->users->aces[i].uid; + add_to_mask(state, &state->users->aces[i].perms); } - if (ace2type(ace) != ACL_GROUP_OBJ) - goto out; - - ac = kmalloc(sizeof(*ac), GFP_KERNEL); - error = -ENOMEM; - if (ac == NULL) - goto out; - ac->ace = ace; - list_add_tail(&ac->ace_l, &group_l); - - error = -EINVAL; - if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) - goto out; - - error = write_pace(ace, pacl, pace, ACL_GROUP_OBJ, flags); - if (error < 0) - goto out; - - error = -EINVAL; - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - - /* groups (mask and allow aces) */ - - while (ace2type(ace) == ACL_GROUP) { - if (*mask_ace == NULL) - goto out; - - if (ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE || - !MASK_EQUAL(ace->access_mask, (*mask_ace)->access_mask)) - goto out; - *mask_ace = ace; + pace++; + pace->e_tag = ACL_GROUP_OBJ; + error = check_deny(state->group.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; + add_to_mask(state, &state->group); + + for (i=0; i < state->groups->n; i++) { + pace++; + pace->e_tag = ACL_GROUP; + error = check_deny(state->groups->aces[i].perms.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->groups->aces[i].perms.allow, + &pace->e_perm, flags); + pace->e_id = state->groups->aces[i].uid; + add_to_mask(state, &state->groups->aces[i].perms); + } - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - ac = kmalloc(sizeof(*ac), GFP_KERNEL); - error = -ENOMEM; - if (ac == NULL) - goto out; - error = -EINVAL; - if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE || - !same_who(ace, *mask_ace)) - goto out; + pace++; + pace->e_tag = ACL_MASK; + low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; - ac->ace = ace; - list_add_tail(&ac->ace_l, &group_l); + pace++; + pace->e_tag = ACL_OTHER; + error = check_deny(state->other.deny, 0); + if (error) + goto out_err; + low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); + pace->e_id = ACL_UNDEFINED_ID; - error = write_pace(ace, pacl, pace, ACL_GROUP, flags); - if (error < 0) - goto out; - error = -EINVAL; - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - } + return pacl; +out_err: + posix_acl_release(pacl); + return ERR_PTR(error); +} - /* group owner (deny ace) */ +static inline void allow_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Allow all bits in the mask not already denied: */ + astate->allow |= mask & ~astate->deny; +} - if (ace2type(ace) != ACL_GROUP_OBJ) - goto out; - ac = list_entry(group_l.next, struct ace_container, ace_l); - ace2 = ac->ace; - if (!complementary_ace_pair(ace2, ace, flags)) - goto out; - list_del(group_l.next); - kfree(ac); +static inline void deny_bits(struct posix_ace_state *astate, u32 mask) +{ + /* Deny all bits in the mask not already allowed: */ + astate->deny |= mask & ~astate->allow; +} - /* groups (deny aces) */ +static int find_uid(struct posix_acl_state *state, struct posix_ace_state_array *a, uid_t uid) +{ + int i; - while (!list_empty(&group_l)) { - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - if (ace2type(ace) != ACL_GROUP) - goto out; - ac = list_entry(group_l.next, struct ace_container, ace_l); - ace2 = ac->ace; - if (!complementary_ace_pair(ace2, ace, flags)) - goto out; - list_del(group_l.next); - kfree(ac); - } + for (i = 0; i < a->n; i++) + if (a->aces[i].uid == uid) + return i; + /* Not found: */ + a->n++; + a->aces[i].uid = uid; + a->aces[i].perms.allow = state->everyone.allow; + a->aces[i].perms.deny = state->everyone.deny; - ace = get_next_v4_ace(p, &n4acl->ace_head); - if (ace == NULL) - goto out; - if (ace2type(ace) != ACL_OTHER) - goto out; - error = 0; -out: - while (!list_empty(&group_l)) { - ac = list_entry(group_l.next, struct ace_container, ace_l); - list_del(group_l.next); - kfree(ac); - } - return error; + return i; } -static inline int -mask_from_v4(struct nfs4_acl *n4acl, struct list_head **p, - struct nfs4_ace **mask_ace, - struct posix_acl *pacl, struct posix_acl_entry **pace, - unsigned int flags) +static void deny_bits_array(struct posix_ace_state_array *a, u32 mask) { - int error = -EINVAL; - struct nfs4_ace *ace; + int i; - ace = list_entry(*p, struct nfs4_ace, l_ace); - if (pacl->a_count != 3) { - if (*mask_ace == NULL) - goto out; - (*mask_ace)->access_mask = deny_mask((*mask_ace)->access_mask, flags); - write_pace(*mask_ace, pacl, pace, ACL_MASK, flags); - } - error = 0; -out: - return error; + for (i=0; i < a->n; i++) + deny_bits(&a->aces[i].perms, mask); } -static inline int -other_from_v4(struct nfs4_acl *n4acl, struct list_head **p, - struct posix_acl *pacl, struct posix_acl_entry **pace, - unsigned int flags) +static void allow_bits_array(struct posix_ace_state_array *a, u32 mask) { - int error = -EINVAL; - struct nfs4_ace *ace, *ace2; + int i; - ace = list_entry(*p, struct nfs4_ace, l_ace); - if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) - goto out; - error = write_pace(ace, pacl, pace, ACL_OTHER, flags); - if (error < 0) - goto out; - error = -EINVAL; - ace2 = get_next_v4_ace(p, &n4acl->ace_head); - if (ace2 == NULL) - goto out; - if (!complementary_ace_pair(ace, ace2, flags)) - goto out; - error = 0; -out: - return error; + for (i=0; i < a->n; i++) + allow_bits(&a->aces[i].perms, mask); } -static int -calculate_posix_ace_count(struct nfs4_acl *n4acl) +static void process_one_v4_ace(struct posix_acl_state *state, + struct nfs4_ace *ace) { - if (n4acl->naces == 6) /* owner, owner group, and other only */ - return 3; - else { /* Otherwise there must be a mask entry. */ - /* Also, the remaining entries are for named users and - * groups, and come in threes (mask, allow, deny): */ - if (n4acl->naces < 7) - return -EINVAL; - if ((n4acl->naces - 7) % 3) - return -EINVAL; - return 4 + (n4acl->naces - 7)/3; + u32 mask = ace->access_mask; + int i; + + switch (ace2type(ace)) { + case ACL_USER_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + } else { + deny_bits(&state->owner, mask); + } + break; + case ACL_USER: + i = find_uid(state, state->users, ace->who); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->users->aces[i].perms, mask); + } else { + deny_bits(&state->users->aces[i].perms, mask); + mask = state->users->aces[i].perms.deny; + deny_bits(&state->owner, mask); + } + break; + case ACL_GROUP_OBJ: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->group, mask); + } else { + deny_bits(&state->group, mask); + mask = state->group.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_GROUP: + i = find_uid(state, state->groups, ace->who); + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->groups->aces[i].perms, mask); + } else { + deny_bits(&state->groups->aces[i].perms, mask); + mask = state->groups->aces[i].perms.deny; + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } + break; + case ACL_OTHER: + if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) { + allow_bits(&state->owner, mask); + allow_bits(&state->group, mask); + allow_bits(&state->other, mask); + allow_bits(&state->everyone, mask); + allow_bits_array(state->users, mask); + allow_bits_array(state->groups, mask); + } else { + deny_bits(&state->owner, mask); + deny_bits(&state->group, mask); + deny_bits(&state->other, mask); + deny_bits(&state->everyone, mask); + deny_bits_array(state->users, mask); + deny_bits_array(state->groups, mask); + } } } - static struct posix_acl * _nfsv4_to_posix_one(struct nfs4_acl *n4acl, unsigned int flags) { + struct posix_acl_state state; struct posix_acl *pacl; - int error = -EINVAL, nace = 0; - struct list_head *p; - struct nfs4_ace *mask_ace = NULL; - struct posix_acl_entry *pace; - - nace = calculate_posix_ace_count(n4acl); - if (nace < 0) - goto out_err; - - pacl = posix_acl_alloc(nace, GFP_KERNEL); - error = -ENOMEM; - if (pacl == NULL) - goto out_err; - - pace = &pacl->a_entries[0]; - p = &n4acl->ace_head; - - error = user_obj_from_v4(n4acl, &p, pacl, &pace, flags); - if (error) - goto out_acl; - - error = users_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags); - if (error) - goto out_acl; + struct nfs4_ace *ace; + int ret; - error = group_obj_and_groups_from_v4(n4acl, &p, &mask_ace, pacl, &pace, - flags); - if (error) - goto out_acl; + ret = init_state(&state, n4acl->naces); + if (ret) + return ERR_PTR(ret); - error = mask_from_v4(n4acl, &p, &mask_ace, pacl, &pace, flags); - if (error) - goto out_acl; - error = other_from_v4(n4acl, &p, pacl, &pace, flags); - if (error) - goto out_acl; + list_for_each_entry(ace, &n4acl->ace_head, l_ace) + process_one_v4_ace(&state, ace); - error = -EINVAL; - if (p->next != &n4acl->ace_head) - goto out_acl; - if (pace != pacl->a_entries + pacl->a_count) - goto out_acl; + pacl = posix_state_to_acl(&state, flags); - sort_pacl(pacl); + free_state(&state); - return pacl; -out_acl: - posix_acl_release(pacl); -out_err: - pacl = ERR_PTR(error); + if (!IS_ERR(pacl)) + sort_pacl(pacl); return pacl; } @@ -785,22 +700,41 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) list_for_each_safe(h, n, &acl->ace_head) { ace = list_entry(h, struct nfs4_ace, l_ace); - if ((ace->flag & NFS4_INHERITANCE_FLAGS) - != NFS4_INHERITANCE_FLAGS) - continue; + if (ace->type != NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE && + ace->type != NFS4_ACE_ACCESS_DENIED_ACE_TYPE) + return -EINVAL; - error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who); - if (error < 0) - goto out; + if (ace->flag & ~NFS4_SUPPORTED_FLAGS) + return -EINVAL; - list_del(h); - kfree(ace); - acl->naces--; + switch (ace->flag & NFS4_INHERITANCE_FLAGS) { + case 0: + /* Leave this ace in the effective acl: */ + continue; + case NFS4_INHERITANCE_FLAGS: + /* Add this ace to the default acl and remove it + * from the effective acl: */ + error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, + ace->access_mask, ace->whotype, ace->who); + if (error) + return error; + list_del(h); + kfree(ace); + acl->naces--; + break; + case NFS4_INHERITANCE_FLAGS & ~NFS4_ACE_INHERIT_ONLY_ACE: + /* Add this ace to the default, but leave it in + * the effective acl as well: */ + error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, + ace->access_mask, ace->whotype, ace->who); + if (error) + return error; + break; + default: + return -EINVAL; + } } - -out: - return error; + return 0; } static short @@ -930,23 +864,6 @@ nfs4_acl_write_who(int who, char *p) return -1; } -static inline int -match_who(struct nfs4_ace *ace, uid_t owner, gid_t group, uid_t who) -{ - switch (ace->whotype) { - case NFS4_ACL_WHO_NAMED: - return who == ace->who; - case NFS4_ACL_WHO_OWNER: - return who == owner; - case NFS4_ACL_WHO_GROUP: - return who == group; - case NFS4_ACL_WHO_EVERYONE: - return 1; - default: - return 0; - } -} - EXPORT_SYMBOL(nfs4_acl_new); EXPORT_SYMBOL(nfs4_acl_free); EXPORT_SYMBOL(nfs4_acl_add_ace); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 8583d99ee74..f6ca9fb3fc6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -131,7 +131,7 @@ xdr_error: \ #define READ_BUF(nbytes) do { \ p = xdr_inline_decode(xdr, nbytes); \ if (!p) { \ - dprintk("NFSD: %s: reply buffer overflowed in line %d.", \ + dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \ __FUNCTION__, __LINE__); \ return -EIO; \ } \ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ee4eff27aed..8333db12cac 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -600,7 +600,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); nfs4_unlock_state(); if (status) { - dprintk("NFSD: nfsd4_setattr: couldn't process stateid!"); + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; } } @@ -646,7 +646,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_writ *p++ = nfssvc_boot.tv_usec; status = nfsd_write(rqstp, current_fh, filp, write->wr_offset, - write->wr_vec, write->wr_vlen, write->wr_buflen, + rqstp->rq_vec, write->wr_vlen, write->wr_buflen, &write->wr_how_written); if (filp) fput(filp); @@ -802,13 +802,29 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * SETCLIENTID_CONFIRM, PUTFH and PUTROOTFH * require a valid current filehandle */ - if ((!current_fh->fh_dentry) && - !((op->opnum == OP_PUTFH) || (op->opnum == OP_PUTROOTFH) || - (op->opnum == OP_SETCLIENTID) || - (op->opnum == OP_SETCLIENTID_CONFIRM) || - (op->opnum == OP_RENEW) || (op->opnum == OP_RESTOREFH) || - (op->opnum == OP_RELEASE_LOCKOWNER))) { - op->status = nfserr_nofilehandle; + if (!current_fh->fh_dentry) { + if (!((op->opnum == OP_PUTFH) || + (op->opnum == OP_PUTROOTFH) || + (op->opnum == OP_SETCLIENTID) || + (op->opnum == OP_SETCLIENTID_CONFIRM) || + (op->opnum == OP_RENEW) || + (op->opnum == OP_RESTOREFH) || + (op->opnum == OP_RELEASE_LOCKOWNER))) { + op->status = nfserr_nofilehandle; + goto encode_op; + } + } + /* Check must be done at start of each operation, except + * for GETATTR and ops not listed as returning NFS4ERR_MOVED + */ + else if (current_fh->fh_export->ex_fslocs.migrated && + !((op->opnum == OP_GETATTR) || + (op->opnum == OP_PUTROOTFH) || + (op->opnum == OP_PUTPUBFH) || + (op->opnum == OP_RENEW) || + (op->opnum == OP_SETCLIENTID) || + (op->opnum == OP_RELEASE_LOCKOWNER))) { + op->status = nfserr_moved; goto encode_op; } switch (op->opnum) { diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e35d7e52fde..1cbd2e4ee12 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -184,7 +184,7 @@ struct dentry_list_arg { static int nfsd4_build_dentrylist(void *arg, const char *name, int namlen, - loff_t offset, ino_t ino, unsigned int d_type) + loff_t offset, u64 ino, unsigned int d_type) { struct dentry_list_arg *dla = arg; struct list_head *dentries = &dla->dentries; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5446a0861d1..41fc241b729 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -60,6 +60,14 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR +/* + * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing + * directory in order to indicate to the client that a filesystem boundary is present + * We use a fixed fsid for a referral + */ +#define NFS4_REFERRAL_FSID_MAJOR 0x8000000ULL +#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL + static int check_filename(char *str, int len, int err) { @@ -198,8 +206,7 @@ static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes) p = new; memcpy(p, argp->tmp, nbytes); } else { - if (p != argp->tmpp) - BUG(); + BUG_ON(p != argp->tmpp); argp->tmpp = NULL; } if (defer_free(argp, kfree, p)) { @@ -927,26 +934,26 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) printk(KERN_NOTICE "xdr error! (%s:%d)\n", __FILE__, __LINE__); goto xdr_error; } - write->wr_vec[0].iov_base = p; - write->wr_vec[0].iov_len = avail; + argp->rqstp->rq_vec[0].iov_base = p; + argp->rqstp->rq_vec[0].iov_len = avail; v = 0; len = write->wr_buflen; - while (len > write->wr_vec[v].iov_len) { - len -= write->wr_vec[v].iov_len; + while (len > argp->rqstp->rq_vec[v].iov_len) { + len -= argp->rqstp->rq_vec[v].iov_len; v++; - write->wr_vec[v].iov_base = page_address(argp->pagelist[0]); + argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen >= PAGE_SIZE) { - write->wr_vec[v].iov_len = PAGE_SIZE; + argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE; argp->pagelen -= PAGE_SIZE; } else { - write->wr_vec[v].iov_len = argp->pagelen; + argp->rqstp->rq_vec[v].iov_len = argp->pagelen; argp->pagelen -= len; } } - argp->end = (u32*) (write->wr_vec[v].iov_base + write->wr_vec[v].iov_len); - argp->p = (u32*) (write->wr_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); - write->wr_vec[v].iov_len = len; + argp->end = (u32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len); + argp->p = (u32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2)); + argp->rqstp->rq_vec[v].iov_len = len; write->wr_vlen = v+1; DECODE_TAIL; @@ -1224,6 +1231,119 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) stateowner->so_replay.rp_buflen); \ } } while (0); +/* Encode as an array of strings the string given with components + * seperated @sep. + */ +static int nfsd4_encode_components(char sep, char *components, + u32 **pp, int *buflen) +{ + u32 *p = *pp; + u32 *countp = p; + int strlen, count=0; + char *str, *end; + + dprintk("nfsd4_encode_components(%s)\n", components); + if ((*buflen -= 4) < 0) + return nfserr_resource; + WRITE32(0); /* We will fill this in with @count later */ + end = str = components; + while (*end) { + for (; *end && (*end != sep); end++) + ; /* Point to end of component */ + strlen = end - str; + if (strlen) { + if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0) + return nfserr_resource; + WRITE32(strlen); + WRITEMEM(str, strlen); + count++; + } + else + end++; + str = end; + } + *pp = p; + p = countp; + WRITE32(count); + return 0; +} + +/* + * encode a location element of a fs_locations structure + */ +static int nfsd4_encode_fs_location4(struct nfsd4_fs_location *location, + u32 **pp, int *buflen) +{ + int status; + u32 *p = *pp; + + status = nfsd4_encode_components(':', location->hosts, &p, buflen); + if (status) + return status; + status = nfsd4_encode_components('/', location->path, &p, buflen); + if (status) + return status; + *pp = p; + return 0; +} + +/* + * Return the path to an export point in the pseudo filesystem namespace + * Returned string is safe to use as long as the caller holds a reference + * to @exp. + */ +static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp) +{ + struct svc_fh tmp_fh; + char *path, *rootpath; + int stat; + + fh_init(&tmp_fh, NFS4_FHSIZE); + stat = exp_pseudoroot(rqstp->rq_client, &tmp_fh, &rqstp->rq_chandle); + if (stat) + return ERR_PTR(stat); + rootpath = tmp_fh.fh_export->ex_path; + + path = exp->ex_path; + + if (strncmp(path, rootpath, strlen(rootpath))) { + printk("nfsd: fs_locations failed;" + "%s is not contained in %s\n", path, rootpath); + return ERR_PTR(-EOPNOTSUPP); + } + + return path + strlen(rootpath); +} + +/* + * encode a fs_locations structure + */ +static int nfsd4_encode_fs_locations(struct svc_rqst *rqstp, + struct svc_export *exp, + u32 **pp, int *buflen) +{ + int status, i; + u32 *p = *pp; + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; + char *root = nfsd4_path(rqstp, exp); + + if (IS_ERR(root)) + return PTR_ERR(root); + status = nfsd4_encode_components('/', root, &p, buflen); + if (status) + return status; + if ((*buflen -= 4) < 0) + return nfserr_resource; + WRITE32(fslocs->locations_count); + for (i=0; i<fslocs->locations_count; i++) { + status = nfsd4_encode_fs_location4(&fslocs->locations[i], + &p, buflen); + if (status) + return status; + } + *pp = p; + return 0; +} static u32 nfs4_ftypes[16] = { NF4BAD, NF4FIFO, NF4CHR, NF4BAD, @@ -1273,6 +1393,25 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, int whotype, uid_t id, int group, return nfsd4_encode_name(rqstp, whotype, id, group, p, buflen); } +#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ + FATTR4_WORD0_RDATTR_ERROR) +#define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID + +static int fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +{ + /* As per referral draft: */ + if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || + *bmval1 & ~WORD1_ABSENT_FS_ATTRS) { + if (*bmval0 & FATTR4_WORD0_RDATTR_ERROR || + *bmval0 & FATTR4_WORD0_FS_LOCATIONS) + *rdattr_err = NFSERR_MOVED; + else + return nfserr_moved; + } + *bmval0 &= WORD0_ABSENT_FS_ATTRS; + *bmval1 &= WORD1_ABSENT_FS_ATTRS; + return 0; +} /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle @@ -1295,6 +1434,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, u32 *attrlenp; u32 dummy; u64 dummy64; + u32 rdattr_err = 0; u32 *p = buffer; int status; int aclsupport = 0; @@ -1304,6 +1444,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); + if (exp->ex_fslocs.migrated) { + status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + if (status) + goto out; + } + status = vfs_getattr(exp->ex_mnt, dentry, &stat); if (status) goto out_nfserr; @@ -1335,6 +1481,11 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, goto out_nfserr; } } + if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { + if (exp->ex_fslocs.locations == NULL) { + bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS; + } + } if ((buflen -= 16) < 0) goto out_resource; @@ -1344,12 +1495,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, attrlenp = p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; if ((buflen -= 12) < 0) goto out_resource; + if (!aclsupport) + word0 &= ~FATTR4_WORD0_ACL; + if (!exp->ex_fslocs.locations) + word0 &= ~FATTR4_WORD0_FS_LOCATIONS; WRITE32(2); - WRITE32(aclsupport ? - NFSD_SUPPORTED_ATTRS_WORD0 : - NFSD_SUPPORTED_ATTRS_WORD0 & ~FATTR4_WORD0_ACL); + WRITE32(word0); WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); } if (bmval0 & FATTR4_WORD0_TYPE) { @@ -1403,7 +1557,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_FSID) { if ((buflen -= 16) < 0) goto out_resource; - if (is_fsid(fhp, rqstp->rq_reffh)) { + if (exp->ex_fslocs.migrated) { + WRITE64(NFS4_REFERRAL_FSID_MAJOR); + WRITE64(NFS4_REFERRAL_FSID_MINOR); + } else if (is_fsid(fhp, rqstp->rq_reffh)) { WRITE64((u64)exp->ex_fsid); WRITE64((u64)0); } else { @@ -1426,7 +1583,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { if ((buflen -= 4) < 0) goto out_resource; - WRITE32(0); + WRITE32(rdattr_err); } if (bmval0 & FATTR4_WORD0_ACL) { struct nfs4_ace *ace; @@ -1514,6 +1671,13 @@ out_acl: goto out_resource; WRITE64((u64) statfs.f_files); } + if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { + status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen); + if (status == nfserr_resource) + goto out_resource; + if (status) + goto out; + } if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { if ((buflen -= 4) < 0) goto out_resource; @@ -1537,12 +1701,12 @@ out_acl: if (bmval0 & FATTR4_WORD0_MAXREAD) { if ((buflen -= 8) < 0) goto out_resource; - WRITE64((u64) NFSSVC_MAXBLKSIZE); + WRITE64((u64) svc_max_payload(rqstp)); } if (bmval0 & FATTR4_WORD0_MAXWRITE) { if ((buflen -= 8) < 0) goto out_resource; - WRITE64((u64) NFSSVC_MAXBLKSIZE); + WRITE64((u64) svc_max_payload(rqstp)); } if (bmval1 & FATTR4_WORD1_MODE) { if ((buflen -= 4) < 0) @@ -1846,7 +2010,6 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_ge nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, resp->p, &buflen, getattr->ga_bmval, resp->rqstp); - if (!nfserr) resp->p += buflen; return nfserr; @@ -2040,7 +2203,8 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, int nfserr, struct n } static int -nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read *read) +nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, + struct nfsd4_read *read) { u32 eof; int v, pn; @@ -2055,31 +2219,33 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read RESERVE_SPACE(8); /* eof flag and byte count */ - maxcount = NFSSVC_MAXBLKSIZE; + maxcount = svc_max_payload(resp->rqstp); if (maxcount > read->rd_length) maxcount = read->rd_length; len = maxcount; v = 0; while (len > 0) { - pn = resp->rqstp->rq_resused; - svc_take_page(resp->rqstp); - read->rd_iov[v].iov_base = page_address(resp->rqstp->rq_respages[pn]); - read->rd_iov[v].iov_len = len < PAGE_SIZE ? len : PAGE_SIZE; + pn = resp->rqstp->rq_resused++; + resp->rqstp->rq_vec[v].iov_base = + page_address(resp->rqstp->rq_respages[pn]); + resp->rqstp->rq_vec[v].iov_len = + len < PAGE_SIZE ? len : PAGE_SIZE; v++; len -= PAGE_SIZE; } read->rd_vlen = v; nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, - read->rd_offset, read->rd_iov, read->rd_vlen, + read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); if (nfserr == nfserr_symlink) nfserr = nfserr_inval; if (nfserr) return nfserr; - eof = (read->rd_offset + maxcount >= read->rd_fhp->fh_dentry->d_inode->i_size); + eof = (read->rd_offset + maxcount >= + read->rd_fhp->fh_dentry->d_inode->i_size); WRITE32(eof); WRITE32(maxcount); @@ -2089,7 +2255,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read resp->xbuf->page_len = maxcount; /* Use rest of head for padding and remaining ops: */ - resp->rqstp->rq_restailpage = 0; resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; if (maxcount&3) { @@ -2114,8 +2279,7 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r if (resp->xbuf->page_len) return nfserr_resource; - svc_take_page(resp->rqstp); - page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); + page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); maxcount = PAGE_SIZE; RESERVE_SPACE(4); @@ -2139,7 +2303,6 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r resp->xbuf->page_len = maxcount; /* Use rest of head for padding and remaining ops: */ - resp->rqstp->rq_restailpage = 0; resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; if (maxcount&3) { @@ -2190,8 +2353,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re goto err_no_verf; } - svc_take_page(resp->rqstp); - page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); + page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); readdir->common.err = 0; readdir->buflen = maxcount; readdir->buffer = page; @@ -2216,10 +2378,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re p = readdir->buffer; *p++ = 0; /* no more entries */ *p++ = htonl(readdir->common.err == nfserr_eof); - resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); + resp->xbuf->page_len = ((char*)p) - (char*)page_address( + resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); /* Use rest of head for padding and remaining ops: */ - resp->rqstp->rq_restailpage = 0; resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7046ac9cf97..39aed901514 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -23,10 +23,14 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/ctype.h> #include <linux/nfs.h> #include <linux/nfsd_idmap.h> +#include <linux/lockd/bind.h> #include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svcsock.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/cache.h> #include <linux/nfsd/xdr.h> @@ -35,8 +39,6 @@ #include <asm/uaccess.h> -unsigned int nfsd_versbits = ~0; - /* * We have a single directory with 9 nodes in it. */ @@ -52,7 +54,10 @@ enum { NFSD_List, NFSD_Fh, NFSD_Threads, + NFSD_Pool_Threads, NFSD_Versions, + NFSD_Ports, + NFSD_MaxBlkSize, /* * The below MUST come last. Otherwise we leave a hole in nfsd_files[] * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops @@ -75,7 +80,10 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size); static ssize_t write_getfs(struct file *file, char *buf, size_t size); static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_threads(struct file *file, char *buf, size_t size); +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); +static ssize_t write_ports(struct file *file, char *buf, size_t size); +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); @@ -91,7 +99,10 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Getfs] = write_getfs, [NFSD_Fh] = write_filehandle, [NFSD_Threads] = write_threads, + [NFSD_Pool_Threads] = write_pool_threads, [NFSD_Versions] = write_versions, + [NFSD_Ports] = write_ports, + [NFSD_MaxBlkSize] = write_maxblksize, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_RecoveryDir] = write_recoverydir, @@ -358,6 +369,72 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return strlen(buf); } +extern int nfsd_nrpools(void); +extern int nfsd_get_nrthreads(int n, int *); +extern int nfsd_set_nrthreads(int n, int *); + +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) +{ + /* if size > 0, look for an array of number of threads per node + * and apply them then write out number of threads per node as reply + */ + char *mesg = buf; + int i; + int rv; + int len; + int npools = nfsd_nrpools(); + int *nthreads; + + if (npools == 0) { + /* + * NFS is shut down. The admin can start it by + * writing to the threads file but NOT the pool_threads + * file, sorry. Report zero threads. + */ + strcpy(buf, "0\n"); + return strlen(buf); + } + + nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + if (nthreads == NULL) + return -ENOMEM; + + if (size > 0) { + for (i = 0; i < npools; i++) { + rv = get_int(&mesg, &nthreads[i]); + if (rv == -ENOENT) + break; /* fewer numbers than pools */ + if (rv) + goto out_free; /* syntax error */ + rv = -EINVAL; + if (nthreads[i] < 0) + goto out_free; + } + rv = nfsd_set_nrthreads(i, nthreads); + if (rv) + goto out_free; + } + + rv = nfsd_get_nrthreads(npools, nthreads); + if (rv) + goto out_free; + + mesg = buf; + size = SIMPLE_TRANSACTION_LIMIT; + for (i = 0; i < npools && size > 0; i++) { + snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); + len = strlen(mesg); + size -= len; + mesg += len; + } + + return (mesg-buf); + +out_free: + kfree(nthreads); + return rv; +} + static ssize_t write_versions(struct file *file, char *buf, size_t size) { /* @@ -372,6 +449,10 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) if (size>0) { if (nfsd_serv) + /* Cannot change versions without updating + * nfsd_serv->sv_xdrsize, and reallocing + * rq_argp and rq_resp + */ return -EBUSY; if (buf[size-1] != '\n') return -EINVAL; @@ -390,10 +471,7 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) case 2: case 3: case 4: - if (sign != '-') - NFSCTL_VERSET(nfsd_versbits, num); - else - NFSCTL_VERUNSET(nfsd_versbits, num); + nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET); break; default: return -EINVAL; @@ -404,16 +482,15 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) /* If all get turned off, turn them back on, as * having no versions is BAD */ - if ((nfsd_versbits & NFSCTL_VERALL)==0) - nfsd_versbits = NFSCTL_VERALL; + nfsd_reset_versions(); } /* Now write current state into reply buffer */ len = 0; sep = ""; for (num=2 ; num <= 4 ; num++) - if (NFSCTL_VERISSET(NFSCTL_VERALL, num)) { + if (nfsd_vers(num, NFSD_AVAIL)) { len += sprintf(buf+len, "%s%c%d", sep, - NFSCTL_VERISSET(nfsd_versbits, num)?'+':'-', + nfsd_vers(num, NFSD_TEST)?'+':'-', num); sep = " "; } @@ -421,6 +498,95 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) return len; } +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + if (size == 0) { + int len = 0; + lock_kernel(); + if (nfsd_serv) + len = svc_sock_names(buf, nfsd_serv, NULL); + unlock_kernel(); + return len; + } + /* Either a single 'fd' number is written, in which + * case it must be for a socket of a supported family/protocol, + * and we use it as an nfsd socket, or + * A '-' followed by the 'name' of a socket in which case + * we close the socket. + */ + if (isdigit(buf[0])) { + char *mesg = buf; + int fd; + int err; + err = get_int(&mesg, &fd); + if (err) + return -EINVAL; + if (fd < 0) + return -EINVAL; + err = nfsd_create_serv(); + if (!err) { + int proto = 0; + err = svc_addsock(nfsd_serv, fd, buf, &proto); + if (err >= 0) { + err = lockd_up(proto); + if (err < 0) + svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); + } + /* Decrease the count, but don't shutdown the + * the service + */ + lock_kernel(); + nfsd_serv->sv_nrthreads--; + unlock_kernel(); + } + return err < 0 ? err : 0; + } + if (buf[0] == '-') { + char *toclose = kstrdup(buf+1, GFP_KERNEL); + int len = 0; + if (!toclose) + return -ENOMEM; + lock_kernel(); + if (nfsd_serv) + len = svc_sock_names(buf, nfsd_serv, toclose); + unlock_kernel(); + if (len >= 0) + lockd_down(); + kfree(toclose); + return len; + } + return -EINVAL; +} + +int nfsd_max_blksize; + +static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) +{ + char *mesg = buf; + if (size > 0) { + int bsize; + int rv = get_int(&mesg, &bsize); + if (rv) + return rv; + /* force bsize into allowed range and + * required alignment. + */ + if (bsize < 1024) + bsize = 1024; + if (bsize > NFSSVC_MAXBLKSIZE) + bsize = NFSSVC_MAXBLKSIZE; + bsize &= ~(1024-1); + lock_kernel(); + if (nfsd_serv && nfsd_serv->sv_nrthreads) { + unlock_kernel(); + return -EBUSY; + } + nfsd_max_blksize = bsize; + unlock_kernel(); + } + return sprintf(buf, "%d\n", nfsd_max_blksize); +} + #ifdef CONFIG_NFSD_V4 extern time_t nfs4_leasetime(void); @@ -483,7 +649,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 06cd0db0f32..9ee1dab5d44 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -146,20 +146,20 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, * status, 17 words for fattr, and 1 word for the byte count. */ - if (NFSSVC_MAXBLKSIZE < argp->count) { + if (NFSSVC_MAXBLKSIZE_V2 < argp->count) { printk(KERN_NOTICE "oversized read request from %u.%u.%u.%u:%d (%d bytes)\n", NIPQUAD(rqstp->rq_addr.sin_addr.s_addr), ntohs(rqstp->rq_addr.sin_port), argp->count); - argp->count = NFSSVC_MAXBLKSIZE; + argp->count = NFSSVC_MAXBLKSIZE_V2; } svc_reserve(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, argp->offset, - argp->vec, argp->vlen, + rqstp->rq_vec, argp->vlen, &resp->count); if (nfserr) return nfserr; @@ -185,7 +185,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, argp->offset, - argp->vec, argp->vlen, + rqstp->rq_vec, argp->vlen, argp->len, &stable); return nfsd_return_attrs(nfserr, resp); @@ -225,7 +225,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; - fh_lock(dirfhp); + fh_lock_nested(dirfhp, I_MUTEX_PARENT); dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); if (IS_ERR(dchild)) { nfserr = nfserrno(PTR_ERR(dchild)); @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ec1decf29ba..6fa6340a5fb 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -57,12 +57,6 @@ static atomic_t nfsd_busy; static unsigned long nfsd_last_call; static DEFINE_SPINLOCK(nfsd_call_lock); -struct nfsd_list { - struct list_head list; - struct task_struct *task; -}; -static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list); - #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -117,6 +111,32 @@ struct svc_program nfsd_program = { }; +int nfsd_vers(int vers, enum vers_op change) +{ + if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) + return -1; + switch(change) { + case NFSD_SET: + nfsd_versions[vers] = nfsd_version[vers]; + break; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_version[vers] = nfsd_acl_version[vers]; +#endif + case NFSD_CLEAR: + nfsd_versions[vers] = NULL; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_version[vers] = NULL; +#endif + break; + case NFSD_TEST: + return nfsd_versions[vers] != NULL; + case NFSD_AVAIL: + return nfsd_version[vers] != NULL; + } + return 0; +} /* * Maximum number of nfsd processes */ @@ -130,16 +150,192 @@ int nfsd_nrthreads(void) return nfsd_serv->sv_nrthreads; } +static int killsig; /* signal that was used to kill last nfsd */ +static void nfsd_last_thread(struct svc_serv *serv) +{ + /* When last nfsd thread exits we need to do some clean-up */ + struct svc_sock *svsk; + list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + lockd_down(); + nfsd_serv = NULL; + nfsd_racache_shutdown(); + nfs4_state_shutdown(); + + printk(KERN_WARNING "nfsd: last server has exited\n"); + if (killsig != SIG_NOCLEAN) { + printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); + nfsd_export_flush(); + } +} + +void nfsd_reset_versions(void) +{ + int found_one = 0; + int i; + + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { + if (nfsd_program.pg_vers[i]) + found_one = 1; + } + + if (!found_one) { + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) + nfsd_program.pg_vers[i] = nfsd_version[i]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; +#endif + } +} + +int nfsd_create_serv(void) +{ + int err = 0; + lock_kernel(); + if (nfsd_serv) { + svc_get(nfsd_serv); + unlock_kernel(); + return 0; + } + if (nfsd_max_blksize == 0) { + /* choose a suitable default */ + struct sysinfo i; + si_meminfo(&i); + /* Aim for 1/4096 of memory per thread + * This gives 1MB on 4Gig machines + * But only uses 32K on 128M machines. + * Bottom out at 8K on 32M and smaller. + * Of course, this is only a default. + */ + nfsd_max_blksize = NFSSVC_MAXBLKSIZE; + i.totalram <<= PAGE_SHIFT - 12; + while (nfsd_max_blksize > i.totalram && + nfsd_max_blksize >= 8*1024*2) + nfsd_max_blksize /= 2; + } + + atomic_set(&nfsd_busy, 0); + nfsd_serv = svc_create_pooled(&nfsd_program, + NFSD_BUFSIZE - NFSSVC_MAXBLKSIZE + nfsd_max_blksize, + nfsd_last_thread, + nfsd, SIG_NOCLEAN, THIS_MODULE); + if (nfsd_serv == NULL) + err = -ENOMEM; + unlock_kernel(); + do_gettimeofday(&nfssvc_boot); /* record boot time */ + return err; +} + +static int nfsd_init_socks(int port) +{ + int error; + if (!list_empty(&nfsd_serv->sv_permsocks)) + return 0; + + error = lockd_up(IPPROTO_UDP); + if (error >= 0) { + error = svc_makesock(nfsd_serv, IPPROTO_UDP, port); + if (error < 0) + lockd_down(); + } + if (error < 0) + return error; + +#ifdef CONFIG_NFSD_TCP + error = lockd_up(IPPROTO_TCP); + if (error >= 0) { + error = svc_makesock(nfsd_serv, IPPROTO_TCP, port); + if (error < 0) + lockd_down(); + } + if (error < 0) + return error; +#endif + return 0; +} + +int nfsd_nrpools(void) +{ + if (nfsd_serv == NULL) + return 0; + else + return nfsd_serv->sv_nrpools; +} + +int nfsd_get_nrthreads(int n, int *nthreads) +{ + int i = 0; + + if (nfsd_serv != NULL) { + for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) + nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; + } + + return 0; +} + +int nfsd_set_nrthreads(int n, int *nthreads) +{ + int i = 0; + int tot = 0; + int err = 0; + + if (nfsd_serv == NULL || n <= 0) + return 0; + + if (n > nfsd_serv->sv_nrpools) + n = nfsd_serv->sv_nrpools; + + /* enforce a global maximum number of threads */ + tot = 0; + for (i = 0; i < n; i++) { + if (nthreads[i] > NFSD_MAXSERVS) + nthreads[i] = NFSD_MAXSERVS; + tot += nthreads[i]; + } + if (tot > NFSD_MAXSERVS) { + /* total too large: scale down requested numbers */ + for (i = 0; i < n && tot > 0; i++) { + int new = nthreads[i] * NFSD_MAXSERVS / tot; + tot -= (nthreads[i] - new); + nthreads[i] = new; + } + for (i = 0; i < n && tot > 0; i++) { + nthreads[i]--; + tot--; + } + } + + /* + * There must always be a thread in pool 0; the admin + * can't shut down NFS completely using pool_threads. + */ + if (nthreads[0] == 0) + nthreads[0] = 1; + + /* apply the new numbers */ + lock_kernel(); + svc_get(nfsd_serv); + for (i = 0; i < n; i++) { + err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], + nthreads[i]); + if (err) + break; + } + svc_destroy(nfsd_serv); + unlock_kernel(); + + return err; +} + int nfsd_svc(unsigned short port, int nrservs) { int error; - int none_left, found_one, i; - struct list_head *victim; lock_kernel(); - dprintk("nfsd: creating service: vers 0x%x\n", - nfsd_versbits); + dprintk("nfsd: creating service\n"); error = -EINVAL; if (nrservs <= 0) nrservs = 0; @@ -153,91 +349,20 @@ nfsd_svc(unsigned short port, int nrservs) error = nfs4_state_start(); if (error<0) goto out; - if (!nfsd_serv) { - /* - * Use the nfsd_ctlbits to define which - * versions that will be advertised. - * If nfsd_ctlbits doesn't list any version, - * export them all. - */ - found_one = 0; - - for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { - if (NFSCTL_VERISSET(nfsd_versbits, i)) { - nfsd_program.pg_vers[i] = nfsd_version[i]; - found_one = 1; - } else - nfsd_program.pg_vers[i] = NULL; - } - if (!found_one) { - for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) - nfsd_program.pg_vers[i] = nfsd_version[i]; - } + nfsd_reset_versions(); + error = nfsd_create_serv(); -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) - found_one = 0; - - for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { - if (NFSCTL_VERISSET(nfsd_versbits, i)) { - nfsd_acl_program.pg_vers[i] = - nfsd_acl_version[i]; - found_one = 1; - } else - nfsd_acl_program.pg_vers[i] = NULL; - } - - if (!found_one) { - for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) - nfsd_acl_program.pg_vers[i] = - nfsd_acl_version[i]; - } -#endif - - atomic_set(&nfsd_busy, 0); - error = -ENOMEM; - nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE); - if (nfsd_serv == NULL) - goto out; - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port); - if (error < 0) - goto failure; + if (error) + goto out; + error = nfsd_init_socks(port); + if (error) + goto failure; -#ifdef CONFIG_NFSD_TCP - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port); - if (error < 0) - goto failure; -#endif - do_gettimeofday(&nfssvc_boot); /* record boot time */ - } else - nfsd_serv->sv_nrthreads++; - nrservs -= (nfsd_serv->sv_nrthreads-1); - while (nrservs > 0) { - nrservs--; - __module_get(THIS_MODULE); - error = svc_create_thread(nfsd, nfsd_serv); - if (error < 0) { - module_put(THIS_MODULE); - break; - } - } - victim = nfsd_list.next; - while (nrservs < 0 && victim != &nfsd_list) { - struct nfsd_list *nl = - list_entry(victim,struct nfsd_list, list); - victim = victim->next; - send_sig(SIG_NOCLEAN, nl->task, 1); - nrservs++; - } + error = svc_set_num_threads(nfsd_serv, NULL, nrservs); failure: - none_left = (nfsd_serv->sv_nrthreads == 1); svc_destroy(nfsd_serv); /* Release server */ - if (none_left) { - nfsd_serv = NULL; - nfsd_racache_shutdown(); - nfs4_state_shutdown(); - } out: unlock_kernel(); return error; @@ -270,10 +395,8 @@ update_thread_usage(int busy_threads) static void nfsd(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; struct fs_struct *fsp; int err; - struct nfsd_list me; sigset_t shutdown_mask, allowed_mask; /* Lock module and set up kernel thread */ @@ -297,10 +420,7 @@ nfsd(struct svc_rqst *rqstp) nfsdstats.th_cnt++; - lockd_up(); /* start lockd */ - - me.task = current; - list_add(&me.list, &nfsd_list); + rqstp->rq_task = current; unlock_kernel(); @@ -322,8 +442,7 @@ nfsd(struct svc_rqst *rqstp) * Find a socket with data available and call its * recvfrom routine. */ - while ((err = svc_recv(serv, rqstp, - 60*60*HZ)) == -EAGAIN) + while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) ; if (err < 0) break; @@ -336,7 +455,7 @@ nfsd(struct svc_rqst *rqstp) /* Process request with signals blocked. */ sigprocmask(SIG_SETMASK, &allowed_mask, NULL); - svc_process(serv, rqstp); + svc_process(rqstp); /* Unlock export hash tables */ exp_readunlock(); @@ -353,29 +472,13 @@ nfsd(struct svc_rqst *rqstp) if (sigismember(¤t->pending.signal, signo) && !sigismember(¤t->blocked, signo)) break; - err = signo; + killsig = signo; } - /* Clear signals before calling lockd_down() and svc_exit_thread() */ + /* Clear signals before calling svc_exit_thread() */ flush_signals(current); lock_kernel(); - /* Release lockd */ - lockd_down(); - - /* Check if this is last thread */ - if (serv->sv_nrthreads==1) { - - printk(KERN_WARNING "nfsd: last server has exited\n"); - if (err != SIG_NOCLEAN) { - printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); - nfsd_export_flush(); - } - nfsd_serv = NULL; - nfsd_racache_shutdown(); /* release read-ahead cache */ - nfs4_state_shutdown(); - } - list_del(&me.list); nfsdstats.th_cnt --; out: diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index e3a0797dd56..1135c0d1455 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -1,5 +1,5 @@ /* - * linux/fs/nfsd/xdr.c + * linux/fs/nfsd/nfsxdr.c * * XDR support for nfsd * @@ -254,19 +254,18 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p, len = args->count = ntohl(*p++); p++; /* totalcount - unused */ - if (len > NFSSVC_MAXBLKSIZE) - len = NFSSVC_MAXBLKSIZE; + if (len > NFSSVC_MAXBLKSIZE_V2) + len = NFSSVC_MAXBLKSIZE_V2; /* set up somewhere to store response. * We take pages, put them on reslist and include in iovec */ v=0; while (len > 0) { - pn=rqstp->rq_resused; - svc_take_page(rqstp); - args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]); - args->vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; - len -= args->vec[v].iov_len; + pn = rqstp->rq_resused++; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; + len -= rqstp->rq_vec[v].iov_len; v++; } args->vlen = v; @@ -286,21 +285,21 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p, args->offset = ntohl(*p++); /* offset */ p++; /* totalcount */ len = args->len = ntohl(*p++); - args->vec[0].iov_base = (void*)p; - args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - + rqstp->rq_vec[0].iov_base = (void*)p; + rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - (((void*)p) - rqstp->rq_arg.head[0].iov_base); - if (len > NFSSVC_MAXBLKSIZE) - len = NFSSVC_MAXBLKSIZE; + if (len > NFSSVC_MAXBLKSIZE_V2) + len = NFSSVC_MAXBLKSIZE_V2; v = 0; - while (len > args->vec[v].iov_len) { - len -= args->vec[v].iov_len; + while (len > rqstp->rq_vec[v].iov_len) { + len -= rqstp->rq_vec[v].iov_len; v++; - args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]); - args->vec[v].iov_len = PAGE_SIZE; + rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_pages[v]); + rqstp->rq_vec[v].iov_len = PAGE_SIZE; } - args->vec[v].iov_len = len; + rqstp->rq_vec[v].iov_len = len; args->vlen = v+1; - return args->vec[0].iov_len > 0; + return rqstp->rq_vec[0].iov_len > 0; } int @@ -333,8 +332,7 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_readlinka { if (!(p = decode_fh(p, &args->fh))) return 0; - svc_take_page(rqstp); - args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]); + args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); return xdr_argsize_check(rqstp, p); } @@ -375,8 +373,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, u32 *p, if (args->count > PAGE_SIZE) args->count = PAGE_SIZE; - svc_take_page(rqstp); - args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused-1]); + args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); return xdr_argsize_check(rqstp, p); } @@ -416,7 +413,6 @@ nfssvc_encode_readlinkres(struct svc_rqst *rqstp, u32 *p, rqstp->rq_res.page_len = resp->len; if (resp->len & 3) { /* need to pad the tail */ - rqstp->rq_restailpage = 0; rqstp->rq_res.tail[0].iov_base = p; *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3); @@ -436,7 +432,6 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p, rqstp->rq_res.page_len = resp->count; if (resp->count & 3) { /* need to pad the tail */ - rqstp->rq_restailpage = 0; rqstp->rq_res.tail[0].iov_base = p; *p = 0; rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); @@ -463,7 +458,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, u32 *p, { struct kstatfs *stat = &resp->stats; - *p++ = htonl(NFSSVC_MAXBLKSIZE); /* max transfer size */ + *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */ *p++ = htonl(stat->f_bsize); *p++ = htonl(stat->f_blocks); *p++ = htonl(stat->f_bfree); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c9e3b5a8fe0..1141bd29e4e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -54,6 +54,7 @@ #include <linux/nfsd_idmap.h> #include <linux/security.h> #endif /* CONFIG_NFSD_V4 */ +#include <linux/jhash.h> #include <asm/uaccess.h> @@ -81,10 +82,19 @@ struct raparms { dev_t p_dev; int p_set; struct file_ra_state p_ra; + unsigned int p_hindex; }; +struct raparm_hbucket { + struct raparms *pb_head; + spinlock_t pb_lock; +} ____cacheline_aligned_in_smp; + static struct raparms * raparml; -static struct raparms * raparm_cache; +#define RAPARM_HASH_BITS 4 +#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) +#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) +static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed @@ -437,13 +447,11 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, } else if (error < 0) goto out_nfserr; - if (pacl) { - error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); - if (error < 0) - goto out_nfserr; - } + error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); + if (error < 0) + goto out_nfserr; - if (dpacl) { + if (S_ISDIR(inode->i_mode)) { error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); if (error < 0) goto out_nfserr; @@ -743,16 +751,20 @@ nfsd_sync_dir(struct dentry *dp) * Obtain the readahead parameters for the file * specified by (dev, ino). */ -static DEFINE_SPINLOCK(ra_lock); static inline struct raparms * nfsd_get_raparms(dev_t dev, ino_t ino) { struct raparms *ra, **rap, **frap = NULL; int depth = 0; + unsigned int hash; + struct raparm_hbucket *rab; - spin_lock(&ra_lock); - for (rap = &raparm_cache; (ra = *rap); rap = &ra->p_next) { + hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; + rab = &raparm_hash[hash]; + + spin_lock(&rab->pb_lock); + for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { if (ra->p_ino == ino && ra->p_dev == dev) goto found; depth++; @@ -761,7 +773,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino) } depth = nfsdstats.ra_size*11/10; if (!frap) { - spin_unlock(&ra_lock); + spin_unlock(&rab->pb_lock); return NULL; } rap = frap; @@ -769,15 +781,16 @@ nfsd_get_raparms(dev_t dev, ino_t ino) ra->p_dev = dev; ra->p_ino = ino; ra->p_set = 0; + ra->p_hindex = hash; found: - if (rap != &raparm_cache) { + if (rap != &rab->pb_head) { *rap = ra->p_next; - ra->p_next = raparm_cache; - raparm_cache = ra; + ra->p_next = rab->pb_head; + rab->pb_head = ra; } ra->p_count++; nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&ra_lock); + spin_unlock(&rab->pb_lock); return ra; } @@ -791,22 +804,26 @@ nfsd_read_actor(read_descriptor_t *desc, struct page *page, unsigned long offset { unsigned long count = desc->count; struct svc_rqst *rqstp = desc->arg.data; + struct page **pp = rqstp->rq_respages + rqstp->rq_resused; if (size > count) size = count; if (rqstp->rq_res.page_len == 0) { get_page(page); - rqstp->rq_respages[rqstp->rq_resused++] = page; + put_page(*pp); + *pp = page; + rqstp->rq_resused++; rqstp->rq_res.page_base = offset; rqstp->rq_res.page_len = size; - } else if (page != rqstp->rq_respages[rqstp->rq_resused-1]) { + } else if (page != pp[-1]) { get_page(page); - rqstp->rq_respages[rqstp->rq_resused++] = page; + put_page(*pp); + *pp = page; + rqstp->rq_resused++; rqstp->rq_res.page_len += size; - } else { + } else rqstp->rq_res.page_len += size; - } desc->count = count - size; desc->written += size; @@ -837,7 +854,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, file->f_ra = ra->p_ra; if (file->f_op->sendfile && rqstp->rq_sendfile_ok) { - svc_pushback_unused_pages(rqstp); + rqstp->rq_resused = 1; err = file->f_op->sendfile(file, &offset, *count, nfsd_read_actor, rqstp); } else { @@ -849,11 +866,12 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, /* Write back readahead params */ if (ra) { - spin_lock(&ra_lock); + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + spin_lock(&rab->pb_lock); ra->p_ra = file->f_ra; ra->p_set = 1; ra->p_count--; - spin_unlock(&ra_lock); + spin_unlock(&rab->pb_lock); } if (err >= 0) { @@ -1114,7 +1132,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ if (!resfhp->fh_dentry) { /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ - fh_lock(fhp); + fh_lock_nested(fhp, I_MUTEX_PARENT); dchild = lookup_one_len(fname, dentry, flen); err = PTR_ERR(dchild); if (IS_ERR(dchild)) @@ -1240,7 +1258,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notdir; if(!dirp->i_op || !dirp->i_op->lookup) goto out; - fh_lock(fhp); + fh_lock_nested(fhp, I_MUTEX_PARENT); /* * Compose the response file handle. @@ -1494,7 +1512,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; - fh_lock(ffhp); + fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; dirp = ddir->d_inode; @@ -1644,7 +1662,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (err) goto out; - fh_lock(fhp); + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; dirp = dentry->d_inode; @@ -1829,11 +1847,11 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) void nfsd_racache_shutdown(void) { - if (!raparm_cache) + if (!raparml) return; dprintk("nfsd: freeing readahead buffers.\n"); kfree(raparml); - raparm_cache = raparml = NULL; + raparml = NULL; } /* * Initialize readahead param cache @@ -1842,19 +1860,31 @@ int nfsd_racache_init(int cache_size) { int i; + int j = 0; + int nperbucket; + - if (raparm_cache) + if (raparml) return 0; + if (cache_size < 2*RAPARM_HASH_SIZE) + cache_size = 2*RAPARM_HASH_SIZE; raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL); if (raparml != NULL) { dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); + for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { + raparm_hash[i].pb_head = NULL; + spin_lock_init(&raparm_hash[i].pb_lock); + } + nperbucket = cache_size >> RAPARM_HASH_BITS; memset(raparml, 0, sizeof(struct raparms) * cache_size); for (i = 0; i < cache_size - 1; i++) { - raparml[i].p_next = raparml + i + 1; + if (i % nperbucket == 0) + raparm_hash[j++].pb_head = raparml + i; + if (i % nperbucket < nperbucket-1) + raparml[i].p_next = raparml + i + 1; } - raparm_cache = raparml; } else { printk(KERN_WARNING "nfsd: Could not allocate memory read-ahead cache.\n"); diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c index b83381c07ad..6993faea28a 100644 --- a/fs/nls/nls_ascii.c +++ b/fs/nls/nls_ascii.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_ascii.c + * linux/fs/nls/nls_ascii.c * * Charset ascii translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 9de6b495f11..7dfdab98729 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_base.c + * linux/fs/nls/nls_base.c * * Native language support--charsets and unicode translations. * By Gordon Chaffee 1996, 1997 @@ -163,8 +163,6 @@ int register_nls(struct nls_table * nls) { struct nls_table ** tmp = &tables; - if (!nls) - return -EINVAL; if (nls->next) return -EBUSY; diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c index 32e78cf9518..570aa69846a 100644 --- a/fs/nls/nls_cp1250.c +++ b/fs/nls/nls_cp1250.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp1250.c + * linux/fs/nls/nls_cp1250.c * * Charset cp1250 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c index cb41c8ae448..f114afa069d 100644 --- a/fs/nls/nls_cp1251.c +++ b/fs/nls/nls_cp1251.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp1251.c + * linux/fs/nls/nls_cp1251.c * * Charset cp1251 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c index efdeefee534..e57f2cbf5bc 100644 --- a/fs/nls/nls_cp1255.c +++ b/fs/nls/nls_cp1255.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp1255.c + * linux/fs/nls/nls_cp1255.c * * Charset cp1255 translation tables. * The Unicode to charset table has only exact mappings. diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c index 5c4a1cd685d..d41930ce4a4 100644 --- a/fs/nls/nls_cp437.c +++ b/fs/nls/nls_cp437.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp437.c + * linux/fs/nls/nls_cp437.c * * Charset cp437 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c index e8b3ca8462e..d21f8790aa1 100644 --- a/fs/nls/nls_cp737.c +++ b/fs/nls/nls_cp737.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp737.c + * linux/fs/nls/nls_cp737.c * * Charset cp737 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c index bdb290ea523..c97714c38a9 100644 --- a/fs/nls/nls_cp775.c +++ b/fs/nls/nls_cp775.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp775.c + * linux/fs/nls/nls_cp775.c * * Charset cp775 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c index 25deaa4c864..843b7d975ba 100644 --- a/fs/nls/nls_cp850.c +++ b/fs/nls/nls_cp850.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp850.c + * linux/fs/nls/nls_cp850.c * * Charset cp850 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c index b822a7b6b97..83cfd844d5c 100644 --- a/fs/nls/nls_cp852.c +++ b/fs/nls/nls_cp852.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp852.c + * linux/fs/nls/nls_cp852.c * * Charset cp852 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c index e8641b7a8b2..9190b7b574f 100644 --- a/fs/nls/nls_cp855.c +++ b/fs/nls/nls_cp855.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp855.c + * linux/fs/nls/nls_cp855.c * * Charset cp855 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c index 7ba589ef8cc..ef3d36db808 100644 --- a/fs/nls/nls_cp857.c +++ b/fs/nls/nls_cp857.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp857.c + * linux/fs/nls/nls_cp857.c * * Charset cp857 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c index 3b9e49ce8c8..7e2fb664589 100644 --- a/fs/nls/nls_cp860.c +++ b/fs/nls/nls_cp860.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp860.c + * linux/fs/nls/nls_cp860.c * * Charset cp860 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c index 959ff64ee97..66d8d808ccf 100644 --- a/fs/nls/nls_cp861.c +++ b/fs/nls/nls_cp861.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp861.c + * linux/fs/nls/nls_cp861.c * * Charset cp861 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c index b96928f5a02..360ba388485 100644 --- a/fs/nls/nls_cp862.c +++ b/fs/nls/nls_cp862.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp862.c + * linux/fs/nls/nls_cp862.c * * Charset cp862 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c index baa6e0eab1d..656a93113e3 100644 --- a/fs/nls/nls_cp863.c +++ b/fs/nls/nls_cp863.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp863.c + * linux/fs/nls/nls_cp863.c * * Charset cp863 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c index f4dabb037df..01ca7309753 100644 --- a/fs/nls/nls_cp864.c +++ b/fs/nls/nls_cp864.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp864.c + * linux/fs/nls/nls_cp864.c * * Charset cp864 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c index 4caeafae32c..5ba6ee13e10 100644 --- a/fs/nls/nls_cp865.c +++ b/fs/nls/nls_cp865.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp865.c + * linux/fs/nls/nls_cp865.c * * Charset cp865 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c index f2b4a9a293f..c5f82221c9f 100644 --- a/fs/nls/nls_cp866.c +++ b/fs/nls/nls_cp866.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp866.c + * linux/fs/nls/nls_cp866.c * * Charset cp866 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c index 12b436f4eca..8d4015124d1 100644 --- a/fs/nls/nls_cp869.c +++ b/fs/nls/nls_cp869.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp869.c + * linux/fs/nls/nls_cp869.c * * Charset cp869 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c index b5766a01703..df042052c2d 100644 --- a/fs/nls/nls_cp874.c +++ b/fs/nls/nls_cp874.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp874.c + * linux/fs/nls/nls_cp874.c * * Charset cp874 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c index 2c1a17cdcd2..2a9ccf3bc7e 100644 --- a/fs/nls/nls_cp932.c +++ b/fs/nls/nls_cp932.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp932.c + * linux/fs/nls/nls_cp932.c * * Charset cp932 translation tables. * This translation table was generated automatically, the diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c index ef4cef464ab..046fde8170e 100644 --- a/fs/nls/nls_cp936.c +++ b/fs/nls/nls_cp936.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp936.c + * linux/fs/nls/nls_cp936.c * * Charset cp936 translation tables. * This translation table was generated automatically, the diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c index 4351ae21d89..92ae19372f0 100644 --- a/fs/nls/nls_cp949.c +++ b/fs/nls/nls_cp949.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp949.c + * linux/fs/nls/nls_cp949.c * * Charset cp949 translation tables. * This translation table was generated automatically, the diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c index 8167a285887..5665945fb88 100644 --- a/fs/nls/nls_cp950.c +++ b/fs/nls/nls_cp950.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_cp950.c + * linux/fs/nls/nls_cp950.c * * Charset cp950 translation tables. * This translation table was generated automatically, the diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c index 06640c3e402..73293511578 100644 --- a/fs/nls/nls_euc-jp.c +++ b/fs/nls/nls_euc-jp.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_euc-jp.c + * linux/fs/nls/nls_euc-jp.c * * Added `OSF/JVC Recommended Code Set Conversion Specification * between Japanese EUC and Shift-JIS' support: <hirofumi@mail.parknet.co.jp> diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c index 70a2c195672..2483c3c6c1c 100644 --- a/fs/nls/nls_iso8859-1.c +++ b/fs/nls/nls_iso8859-1.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-1.c + * linux/fs/nls/nls_iso8859-1.c * * Charset iso8859-1 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c index 4547035f21a..7b8721d7436 100644 --- a/fs/nls/nls_iso8859-13.c +++ b/fs/nls/nls_iso8859-13.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-13.c + * linux/fs/nls/nls_iso8859-13.c * * Charset iso8859-13 translation tables. * The Unicode to charset table has only exact mappings. diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c index 13628d0dd3a..2e895e638db 100644 --- a/fs/nls/nls_iso8859-14.c +++ b/fs/nls/nls_iso8859-14.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-14.c + * linux/fs/nls/nls_iso8859-14.c * * Charset iso8859-14 translation tables. * diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c index 88b924bf7e1..5c91592779f 100644 --- a/fs/nls/nls_iso8859-15.c +++ b/fs/nls/nls_iso8859-15.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-15.c + * linux/fs/nls/nls_iso8859-15.c * * Charset iso8859-15 translation tables. * The Unicode to charset table has only exact mappings. diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c index 372528a6c40..892d38fe953 100644 --- a/fs/nls/nls_iso8859-2.c +++ b/fs/nls/nls_iso8859-2.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-2.c + * linux/fs/nls/nls_iso8859-2.c * * Charset iso8859-2 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c index 81b45a23436..49317bcdb4b 100644 --- a/fs/nls/nls_iso8859-3.c +++ b/fs/nls/nls_iso8859-3.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-3.c + * linux/fs/nls/nls_iso8859-3.c * * Charset iso8859-3 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c index 101b87f5a49..9f3b9368c2c 100644 --- a/fs/nls/nls_iso8859-4.c +++ b/fs/nls/nls_iso8859-4.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-4.c + * linux/fs/nls/nls_iso8859-4.c * * Charset iso8859-4 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c index 83b0084de5e..001a2bb132c 100644 --- a/fs/nls/nls_iso8859-5.c +++ b/fs/nls/nls_iso8859-5.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-5.c + * linux/fs/nls/nls_iso8859-5.c * * Charset iso8859-5 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c index 0c519d65f55..8cec03d6608 100644 --- a/fs/nls/nls_iso8859-6.c +++ b/fs/nls/nls_iso8859-6.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-6.c + * linux/fs/nls/nls_iso8859-6.c * * Charset iso8859-6 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c index bd0854625ac..1be707d5ac3 100644 --- a/fs/nls/nls_iso8859-7.c +++ b/fs/nls/nls_iso8859-7.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-7.c + * linux/fs/nls/nls_iso8859-7.c * * Charset iso8859-7 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c index 988eff791c0..8c0146f7383 100644 --- a/fs/nls/nls_iso8859-9.c +++ b/fs/nls/nls_iso8859-9.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_iso8859-9.c + * linux/fs/nls/nls_iso8859-9.c * * Charset iso8859-9 translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c index 0ad22c24979..fefbe080726 100644 --- a/fs/nls/nls_koi8-r.c +++ b/fs/nls/nls_koi8-r.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_koi8-r.c + * linux/fs/nls/nls_koi8-r.c * * Charset koi8-r translation tables. * Generated automatically from the Unicode and charset diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c index 5db83efe27c..e7bc1d75c78 100644 --- a/fs/nls/nls_koi8-ru.c +++ b/fs/nls/nls_koi8-ru.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_koi8-ru.c + * linux/fs/nls/nls_koi8-ru.c * * Charset koi8-ru translation based on charset koi8-u. * The Unicode to charset table has only exact mappings. diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c index 9d30fd61cf4..015070211f2 100644 --- a/fs/nls/nls_koi8-u.c +++ b/fs/nls/nls_koi8-u.c @@ -1,5 +1,5 @@ /* - * linux/fs/nls_koi8-u.c + * linux/fs/nls/nls_koi8-u.c * * Charset koi8-u translation tables. * The Unicode to charset table has only exact mappings. diff --git a/fs/proc/array.c b/fs/proc/array.c index c0e554971df..25e917fb473 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -162,7 +162,7 @@ static inline char * task_state(struct task_struct *p, char *buffer) int g; struct fdtable *fdt = NULL; - read_lock(&tasklist_lock); + rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" "SleepAVG:\t%lu%%\n" @@ -174,14 +174,13 @@ static inline char * task_state(struct task_struct *p, char *buffer) "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), (p->sleep_avg/1024)*100/(1020000000/1024), - p->tgid, - p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, - pid_alive(p) && p->ptrace ? p->parent->pid : 0, + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); - read_unlock(&tasklist_lock); + task_lock(p); - rcu_read_lock(); if (p->files) fdt = files_fdtable(p->files); buffer += sprintf(buffer, @@ -244,6 +243,7 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, static inline char * task_sig(struct task_struct *p, char *buffer) { + unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; unsigned long qsize = 0; @@ -255,10 +255,8 @@ static inline char * task_sig(struct task_struct *p, char *buffer) sigemptyset(&ignored); sigemptyset(&caught); - /* Gather all the data with the appropriate locks held */ - read_lock(&tasklist_lock); - if (p->sighand) { - spin_lock_irq(&p->sighand->siglock); + rcu_read_lock(); + if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; @@ -266,9 +264,9 @@ static inline char * task_sig(struct task_struct *p, char *buffer) num_threads = atomic_read(&p->signal->count); qsize = atomic_read(&p->user->sigpending); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; - spin_unlock_irq(&p->sighand->siglock); + unlock_task_sighand(p, &flags); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); buffer += sprintf(buffer, "Threads:\t%d\n", num_threads); buffer += sprintf(buffer, "SigQ:\t%lu/%lu\n", qsize, qlim); @@ -322,7 +320,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigset_t sigign, sigcatch; char state; int res; - pid_t ppid, pgid = -1, sid = -1; + pid_t ppid = 0, pgid = -1, sid = -1; int num_threads = 0; struct mm_struct *mm; unsigned long long start_time; @@ -330,8 +328,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; - struct task_struct *t; char tcomm[sizeof(task->comm)]; + unsigned long flags; state = *get_task_state(task); vsize = eip = esp = 0; @@ -349,15 +347,33 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) cutime = cstime = utime = stime = cputime_zero; mutex_lock(&tty_mutex); - read_lock(&tasklist_lock); - if (task->sighand) { - spin_lock_irq(&task->sighand->siglock); - num_threads = atomic_read(&task->signal->count); + rcu_read_lock(); + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + struct tty_struct *tty = sig->tty; + + if (tty) { + /* + * sig->tty is not stable, but tty_mutex + * protects us from release_dev(tty) + */ + barrier(); + tty_pgrp = tty->pgrp; + tty_nr = new_encode_dev(tty_devnum(tty)); + } + + num_threads = atomic_read(&sig->count); collect_sigign_sigcatch(task, &sigign, &sigcatch); + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; + /* add up live thread stats at the group level */ if (whole) { - t = task; + struct task_struct *t = task; do { min_flt += t->min_flt; maj_flt += t->maj_flt; @@ -365,31 +381,20 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) stime = cputime_add(stime, t->stime); t = next_thread(t); } while (t != task); - } - spin_unlock_irq(&task->sighand->siglock); - } - if (task->signal) { - if (task->signal->tty) { - tty_pgrp = task->signal->tty->pgrp; - tty_nr = new_encode_dev(tty_devnum(task->signal->tty)); + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + utime = cputime_add(utime, sig->utime); + stime = cputime_add(stime, sig->stime); } + + sid = sig->session; pgid = process_group(task); - sid = task->signal->session; - cmin_flt = task->signal->cmin_flt; - cmaj_flt = task->signal->cmaj_flt; - cutime = task->signal->cutime; - cstime = task->signal->cstime; - rsslim = task->signal->rlim[RLIMIT_RSS].rlim_cur; - if (whole) { - min_flt += task->signal->min_flt; - maj_flt += task->signal->maj_flt; - utime = cputime_add(utime, task->signal->utime); - stime = cputime_add(stime, task->signal->stime); - } + ppid = rcu_dereference(task->real_parent)->tgid; + + unlock_task_sighand(task, &flags); } - ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; - read_unlock(&tasklist_lock); + rcu_read_unlock(); mutex_unlock(&tty_mutex); if (!whole || num_threads<2) diff --git a/fs/proc/base.c b/fs/proc/base.c index 89c20d9d50b..82da55b5cff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -71,6 +71,7 @@ #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> +#include <linux/nsproxy.h> #include "internal.h" /* NOTE: @@ -83,262 +84,44 @@ * in /proc for a task before it execs a suid executable. */ -/* - * For hysterical raisins we keep the same inumbers as in the old procfs. - * Feel free to change the macro below - just keep the range distinct from - * inumbers of the rest of procfs (currently those are in 0x0000--0xffff). - * As soon as we'll get a separate superblock we will be able to forget - * about magical ranges too. - */ - -#define fake_ino(pid,ino) (((pid)<<16)|(ino)) - -enum pid_directory_inos { - PROC_TGID_INO = 2, - PROC_TGID_TASK, - PROC_TGID_STATUS, - PROC_TGID_MEM, -#ifdef CONFIG_SECCOMP - PROC_TGID_SECCOMP, -#endif - PROC_TGID_CWD, - PROC_TGID_ROOT, - PROC_TGID_EXE, - PROC_TGID_FD, - PROC_TGID_ENVIRON, - PROC_TGID_AUXV, - PROC_TGID_CMDLINE, - PROC_TGID_STAT, - PROC_TGID_STATM, - PROC_TGID_MAPS, - PROC_TGID_NUMA_MAPS, - PROC_TGID_MOUNTS, - PROC_TGID_MOUNTSTATS, - PROC_TGID_WCHAN, -#ifdef CONFIG_MMU - PROC_TGID_SMAPS, -#endif -#ifdef CONFIG_SCHEDSTATS - PROC_TGID_SCHEDSTAT, -#endif -#ifdef CONFIG_CPUSETS - PROC_TGID_CPUSET, -#endif -#ifdef CONFIG_SECURITY - PROC_TGID_ATTR, - PROC_TGID_ATTR_CURRENT, - PROC_TGID_ATTR_PREV, - PROC_TGID_ATTR_EXEC, - PROC_TGID_ATTR_FSCREATE, - PROC_TGID_ATTR_KEYCREATE, - PROC_TGID_ATTR_SOCKCREATE, -#endif -#ifdef CONFIG_AUDITSYSCALL - PROC_TGID_LOGINUID, -#endif - PROC_TGID_OOM_SCORE, - PROC_TGID_OOM_ADJUST, - PROC_TID_INO, - PROC_TID_STATUS, - PROC_TID_MEM, -#ifdef CONFIG_SECCOMP - PROC_TID_SECCOMP, -#endif - PROC_TID_CWD, - PROC_TID_ROOT, - PROC_TID_EXE, - PROC_TID_FD, - PROC_TID_ENVIRON, - PROC_TID_AUXV, - PROC_TID_CMDLINE, - PROC_TID_STAT, - PROC_TID_STATM, - PROC_TID_MAPS, - PROC_TID_NUMA_MAPS, - PROC_TID_MOUNTS, - PROC_TID_MOUNTSTATS, - PROC_TID_WCHAN, -#ifdef CONFIG_MMU - PROC_TID_SMAPS, -#endif -#ifdef CONFIG_SCHEDSTATS - PROC_TID_SCHEDSTAT, -#endif -#ifdef CONFIG_CPUSETS - PROC_TID_CPUSET, -#endif -#ifdef CONFIG_SECURITY - PROC_TID_ATTR, - PROC_TID_ATTR_CURRENT, - PROC_TID_ATTR_PREV, - PROC_TID_ATTR_EXEC, - PROC_TID_ATTR_FSCREATE, - PROC_TID_ATTR_KEYCREATE, - PROC_TID_ATTR_SOCKCREATE, -#endif -#ifdef CONFIG_AUDITSYSCALL - PROC_TID_LOGINUID, -#endif - PROC_TID_OOM_SCORE, - PROC_TID_OOM_ADJUST, - - /* Add new entries before this */ - PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ -}; /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 10 struct pid_entry { - int type; int len; char *name; mode_t mode; + struct inode_operations *iop; + struct file_operations *fop; + union proc_op op; }; -#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)} - -static struct pid_entry tgid_base_stuff[] = { - E(PROC_TGID_TASK, "task", S_IFDIR|S_IRUGO|S_IXUGO), - E(PROC_TGID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR), - E(PROC_TGID_ENVIRON, "environ", S_IFREG|S_IRUSR), - E(PROC_TGID_AUXV, "auxv", S_IFREG|S_IRUSR), - E(PROC_TGID_STATUS, "status", S_IFREG|S_IRUGO), - E(PROC_TGID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), - E(PROC_TGID_STAT, "stat", S_IFREG|S_IRUGO), - E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), - E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), -#ifdef CONFIG_NUMA - E(PROC_TGID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), -#endif - E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), -#ifdef CONFIG_SECCOMP - E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), -#endif - E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), - E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), - E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), - E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO), - E(PROC_TGID_MOUNTSTATS, "mountstats", S_IFREG|S_IRUSR), -#ifdef CONFIG_MMU - E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SECURITY - E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), -#endif -#ifdef CONFIG_KALLSYMS - E(PROC_TGID_WCHAN, "wchan", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SCHEDSTATS - E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_CPUSETS - E(PROC_TGID_CPUSET, "cpuset", S_IFREG|S_IRUGO), -#endif - E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO), - E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR), -#ifdef CONFIG_AUDITSYSCALL - E(PROC_TGID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), -#endif - {0,0,NULL,0} -}; -static struct pid_entry tid_base_stuff[] = { - E(PROC_TID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR), - E(PROC_TID_ENVIRON, "environ", S_IFREG|S_IRUSR), - E(PROC_TID_AUXV, "auxv", S_IFREG|S_IRUSR), - E(PROC_TID_STATUS, "status", S_IFREG|S_IRUGO), - E(PROC_TID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), - E(PROC_TID_STAT, "stat", S_IFREG|S_IRUGO), - E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), - E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), -#ifdef CONFIG_NUMA - E(PROC_TID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), -#endif - E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), -#ifdef CONFIG_SECCOMP - E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), -#endif - E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), - E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), - E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO), - E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO), -#ifdef CONFIG_MMU - E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SECURITY - E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), -#endif -#ifdef CONFIG_KALLSYMS - E(PROC_TID_WCHAN, "wchan", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SCHEDSTATS - E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_CPUSETS - E(PROC_TID_CPUSET, "cpuset", S_IFREG|S_IRUGO), -#endif - E(PROC_TID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO), - E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR), -#ifdef CONFIG_AUDITSYSCALL - E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), -#endif - {0,0,NULL,0} -}; - -#ifdef CONFIG_SECURITY -static struct pid_entry tgid_attr_stuff[] = { - E(PROC_TGID_ATTR_CURRENT, "current", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), - E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), - {0,0,NULL,0} -}; -static struct pid_entry tid_attr_stuff[] = { - E(PROC_TID_ATTR_CURRENT, "current", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), - E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), - {0,0,NULL,0} -}; -#endif - -#undef E - -static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) -{ - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; - struct file *file; - int fd = proc_fd(inode); - - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - *mnt = mntget(file->f_vfsmnt); - *dentry = dget(file->f_dentry); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; - } - spin_unlock(&files->file_lock); - put_files_struct(files); - } - return -ENOENT; +#define NOD(NAME, MODE, IOP, FOP, OP) { \ + .len = sizeof(NAME) - 1, \ + .name = (NAME), \ + .mode = MODE, \ + .iop = IOP, \ + .fop = FOP, \ + .op = OP, \ } +#define DIR(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFDIR|(MODE)), \ + &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ + {} ) +#define LNK(NAME, OTYPE) \ + NOD(NAME, (S_IFLNK|S_IRWXUGO), \ + &proc_pid_link_inode_operations, NULL, \ + { .proc_get_link = &proc_##OTYPE##_link } ) +#define REG(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, \ + &proc_##OTYPE##_operations, {}) +#define INF(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_info_file_operations, \ + { .proc_read = &proc_##OTYPE } ) + static struct fs_struct *get_fs_struct(struct task_struct *task) { struct fs_struct *fs; @@ -587,7 +370,7 @@ static int mounts_open(struct inode *inode, struct file *file) if (task) { task_lock(task); - namespace = task->namespace; + namespace = task->nsproxy->namespace; if (namespace) get_namespace(namespace); task_unlock(task); @@ -658,7 +441,7 @@ static int mountstats_open(struct inode *inode, struct file *file) if (task) { task_lock(task); - namespace = task->namespace; + namespace = task->nsproxy->namespace; if (namespace) get_namespace(namespace); task_unlock(task); @@ -1137,143 +920,6 @@ static struct inode_operations proc_pid_link_inode_operations = { .setattr = proc_setattr, }; -static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) -{ - struct dentry *dentry = filp->f_dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *p = get_proc_task(inode); - unsigned int fd, tid, ino; - int retval; - char buf[PROC_NUMBUF]; - struct files_struct * files; - struct fdtable *fdt; - - retval = -ENOENT; - if (!p) - goto out_no_task; - retval = 0; - tid = p->pid; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - fdt = files_fdtable(files); - for (fd = filp->f_pos-2; - fd < fdt->max_fds; - fd++, filp->f_pos++) { - unsigned int i,j; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); - - j = PROC_NUMBUF; - i = fd; - do { - j--; - buf[j] = '0' + (i % 10); - i /= 10; - } while (i); - - ino = fake_ino(tid, PROC_TID_FD_DIR + fd); - if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) { - rcu_read_lock(); - break; - } - rcu_read_lock(); - } - rcu_read_unlock(); - put_files_struct(files); - } -out: - put_task_struct(p); -out_no_task: - return retval; -} - -static int proc_pident_readdir(struct file *filp, - void *dirent, filldir_t filldir, - struct pid_entry *ents, unsigned int nents) -{ - int i; - int pid; - struct dentry *dentry = filp->f_dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - struct pid_entry *p; - ino_t ino; - int ret; - - ret = -ENOENT; - if (!task) - goto out; - - ret = 0; - pid = task->pid; - put_task_struct(task); - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= nents) { - ret = 1; - goto out; - } - p = ents + i; - while (p->name) { - if (filldir(dirent, p->name, p->len, filp->f_pos, - fake_ino(pid, p->type), p->mode >> 12) < 0) - goto out; - filp->f_pos++; - p++; - } - } - - ret = 1; -out: - return ret; -} - -static int proc_tgid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) -{ - return proc_pident_readdir(filp,dirent,filldir, - tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); -} - -static int proc_tid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) -{ - return proc_pident_readdir(filp,dirent,filldir, - tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); -} /* building an inode */ @@ -1293,13 +939,13 @@ static int task_dumpable(struct task_struct *task) } -static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, int ino) +static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; /* We need a new inode */ - + inode = new_inode(sb); if (!inode) goto out; @@ -1307,13 +953,12 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_ino = fake_ino(task->pid, ino); inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. */ - ei->pid = get_pid(task->pids[PIDTYPE_PID].pid); + ei->pid = get_task_pid(task, PIDTYPE_PID); if (!ei->pid) goto out_unlock; @@ -1333,6 +978,27 @@ out_unlock: return NULL; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + /* dentry stuff */ /* @@ -1372,25 +1038,130 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +static int pid_delete_dentry(struct dentry * dentry) { - struct inode *inode = dentry->d_inode; - struct task_struct *task; - generic_fillattr(inode, stat); + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +static struct dentry_operations pid_dentry_operations = +{ + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + +/* Lookups */ + +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, struct task_struct *, void *); + +/* + * Fill a directory entry. + * + * If possible create the dcache entry and derive our inode number and + * file type from dcache entry. + * + * Since all of the proc inode numbers are dynamically generated, the inode + * numbers do not exist until the inode is cache. This means creating the + * the dcache entry in readdir is necessary to keep the inode numbers + * reported by readdir in sync with the inode numbers reported + * by stat. + */ +static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + char *name, int len, + instantiate_t instantiate, struct task_struct *task, void *ptr) +{ + struct dentry *child, *dir = filp->f_dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = name; + qname.len = len; + qname.hash = full_name_hash(name, len); + + child = d_lookup(dir, &qname); + if (!child) { + struct dentry *new; + new = d_alloc(dir, &qname); + if (new) { + child = instantiate(dir->d_inode, new, task, ptr); + if (child) + dput(new); + else + child = new; + } + } + if (!child || IS_ERR(child) || !child->d_inode) + goto end_instantiate; + inode = child->d_inode; + if (inode) { + ino = inode->i_ino; + type = inode->i_mode >> 12; + } + dput(child); +end_instantiate: + if (!ino) + ino = find_inode_number(dir, &qname); + if (!ino) + ino = 1; + return filldir(dirent, name, len, filp->f_pos, ino, type); +} + +static unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + +static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +{ + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; + struct file *file; + int fd = proc_fd(inode); - rcu_read_lock(); - stat->uid = 0; - stat->gid = 0; - task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || - task_dumpable(task)) { - stat->uid = task->euid; - stat->gid = task->egid; + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + *mnt = mntget(file->f_vfsmnt); + *dentry = dget(file->f_dentry); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; } + spin_unlock(&files->file_lock); + put_files_struct(files); } - rcu_read_unlock(); - return 0; + return -ENOENT; } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) @@ -1428,75 +1199,30 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_delete_dentry(struct dentry * dentry) -{ - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. - */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} - static struct dentry_operations tid_fd_dentry_operations = { .d_revalidate = tid_fd_revalidate, .d_delete = pid_delete_dentry, }; -static struct dentry_operations pid_dentry_operations = -{ - .d_revalidate = pid_revalidate, - .d_delete = pid_delete_dentry, -}; - -/* Lookups */ - -static unsigned name_to_int(struct dentry *dentry) -{ - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - -/* SMP-safe */ -static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_fd_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) { - struct task_struct *task = get_proc_task(dir); - unsigned fd = name_to_int(dentry); - struct dentry *result = ERR_PTR(-ENOENT); - struct file * file; - struct files_struct * files; - struct inode *inode; - struct proc_inode *ei; - - if (!task) - goto out_no_task; - if (fd == ~0U) - goto out; + unsigned fd = *(unsigned *)ptr; + struct file *file; + struct files_struct *files; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); - inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); + inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; files = get_files_struct(task); if (!files) - goto out_unlock; + goto out_iput; inode->i_mode = S_IFLNK; /* @@ -1506,13 +1232,14 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) - goto out_unlock2; + goto out_unlock; if (file->f_mode & 1) inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; spin_unlock(&files->file_lock); put_files_struct(files); + inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; @@ -1520,34 +1247,106 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, NULL)) - result = NULL; -out: - put_task_struct(task); -out_no_task: - return result; + error = NULL; -out_unlock2: + out: + return error; +out_unlock: spin_unlock(&files->file_lock); put_files_struct(files); -out_unlock: +out_iput: iput(inode); goto out; } -static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); -static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + struct task_struct *task = get_proc_task(dir); + unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = proc_fd_instantiate(dir, dentry, task, &fd); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_fd_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int fd) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", fd); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_fd_instantiate, task, &fd); +} + +static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + unsigned int fd, tid, ino; + int retval; + struct files_struct * files; + struct fdtable *fdt; + + retval = -ENOENT; + if (!p) + goto out_no_task; + retval = 0; + tid = p->pid; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) + goto out; + rcu_read_lock(); + fdt = files_fdtable(files); + for (fd = filp->f_pos-2; + fd < fdt->max_fds; + fd++, filp->f_pos++) { + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + if (proc_fd_fill_cache(filp, dirent, filldir, p, fd) < 0) { + rcu_read_lock(); + break; + } + rcu_read_lock(); + } + rcu_read_unlock(); + put_files_struct(files); + } +out: + put_task_struct(p); +out_no_task: + return retval; +} static struct file_operations proc_fd_operations = { .read = generic_read_dir, .readdir = proc_readfd, }; -static struct file_operations proc_task_operations = { - .read = generic_read_dir, - .readdir = proc_task_readdir, -}; - /* * proc directories can do almost nothing.. */ @@ -1556,11 +1355,137 @@ static struct inode_operations proc_fd_inode_operations = { .setattr = proc_setattr, }; -static struct inode_operations proc_task_inode_operations = { - .lookup = proc_task_lookup, - .getattr = proc_task_getattr, - .setattr = proc_setattr, -}; +static struct dentry *proc_pident_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) +{ + struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-EINVAL); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; /* Use getattr to fix if necessary */ + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + dentry->d_op = &pid_dentry_operations; + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_pident_lookup(struct inode *dir, + struct dentry *dentry, + struct pid_entry *ents, + unsigned int nents) +{ + struct inode *inode; + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + inode = NULL; + + if (!task) + goto out_no_task; + + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc/<tgid>/ without very good reasons. + */ + last = &ents[nents - 1]; + for (p = ents; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_pident_instantiate(dir, dentry, task, p); +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_pident_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_pident_instantiate, task, p); +} + +static int proc_pident_readdir(struct file *filp, + void *dirent, filldir_t filldir, + struct pid_entry *ents, unsigned int nents) +{ + int i; + int pid; + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + struct pid_entry *p, *last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = 0; + pid = task->pid; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= nents) { + ret = 1; + goto out; + } + p = ents + i; + last = &ents[nents - 1]; + while (p <= last) { + if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) + goto out; + filp->f_pos++; + p++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} #ifdef CONFIG_SECURITY static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, @@ -1581,8 +1506,8 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!(page = __get_free_page(GFP_KERNEL))) goto out; - length = security_getprocattr(task, - (char*)file->f_dentry->d_name.name, + length = security_getprocattr(task, + (char*)file->f_dentry->d_name.name, (void*)page, count); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); @@ -1595,17 +1520,17 @@ out_no_task: static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) -{ +{ struct inode * inode = file->f_dentry->d_inode; - char *page; - ssize_t length; + char *page; + ssize_t length; struct task_struct *task = get_proc_task(inode); length = -ESRCH; if (!task) goto out_no_task; - if (count > PAGE_SIZE) - count = PAGE_SIZE; + if (count > PAGE_SIZE) + count = PAGE_SIZE; /* No partial writes. */ length = -EINVAL; @@ -1613,16 +1538,16 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, goto out; length = -ENOMEM; - page = (char*)__get_free_page(GFP_USER); - if (!page) + page = (char*)__get_free_page(GFP_USER); + if (!page) goto out; - length = -EFAULT; - if (copy_from_user(page, buf, count)) + length = -EFAULT; + if (copy_from_user(page, buf, count)) goto out_free; - length = security_setprocattr(task, - (char*)file->f_dentry->d_name.name, + length = security_setprocattr(task, + (char*)file->f_dentry->d_name.name, (void*)page, count); out_free: free_page((unsigned long) page); @@ -1630,330 +1555,263 @@ out: put_task_struct(task); out_no_task: return length; -} +} static struct file_operations proc_pid_attr_operations = { .read = proc_pid_attr_read, .write = proc_pid_attr_write, }; -static struct file_operations proc_tid_attr_operations; -static struct inode_operations proc_tid_attr_inode_operations; -static struct file_operations proc_tgid_attr_operations; -static struct inode_operations proc_tgid_attr_inode_operations; +static struct pid_entry attr_dir_stuff[] = { + REG("current", S_IRUGO|S_IWUGO, pid_attr), + REG("prev", S_IRUGO, pid_attr), + REG("exec", S_IRUGO|S_IWUGO, pid_attr), + REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), + REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), + REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), +}; + +static int proc_attr_dir_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); +} + +static struct file_operations proc_attr_dir_operations = { + .read = generic_read_dir, + .readdir = proc_attr_dir_readdir, +}; + +static struct dentry *proc_attr_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return proc_pident_lookup(dir, dentry, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static struct inode_operations proc_attr_dir_inode_operations = { + .lookup = proc_attr_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + #endif -/* SMP-safe */ -static struct dentry *proc_pident_lookup(struct inode *dir, - struct dentry *dentry, - struct pid_entry *ents) +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + char tmp[PROC_NUMBUF]; + sprintf(tmp, "%d", current->tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { + char tmp[PROC_NUMBUF]; + sprintf(tmp, "%d", current->tgid); + return ERR_PTR(vfs_follow_link(nd,tmp)); +} + +static struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, +}; + +/* + * proc base + * + * These are the directory entries in the root directory of /proc + * that properly belong to the /proc filesystem, as they describe + * describe something that is process related. + */ +static struct pid_entry proc_base_stuff[] = { + NOD("self", S_IFLNK|S_IRWXUGO, + &proc_self_inode_operations, NULL, {}), +}; + +/* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. + */ +static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + if (task) { + put_task_struct(task); + return 1; + } + d_drop(dentry); + return 0; +} + +static struct dentry_operations proc_base_dentry_operations = +{ + .d_revalidate = proc_base_revalidate, + .d_delete = pid_delete_dentry, +}; + +static struct dentry *proc_base_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) +{ + struct pid_entry *p = ptr; struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-EINVAL); + + /* Allocate the inode */ + error = ERR_PTR(-ENOMEM); + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + + /* Initialize the inode */ + ei = PROC_I(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + /* + * grab the reference to the task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_iput; + + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; + if (S_ISLNK(inode->i_mode)) + inode->i_size = 64; + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + dentry->d_op = &proc_base_dentry_operations; + d_add(dentry, inode); + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) +{ struct dentry *error; struct task_struct *task = get_proc_task(dir); - struct pid_entry *p; - struct proc_inode *ei; + struct pid_entry *p, *last; error = ERR_PTR(-ENOENT); - inode = NULL; if (!task) goto out_no_task; - for (p = ents; p->name; p++) { + /* Lookup the directory entry */ + last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; + for (p = proc_base_stuff; p <= last; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) break; } - if (!p->name) + if (p > last) goto out; - error = ERR_PTR(-EINVAL); - inode = proc_pid_make_inode(dir->i_sb, task, p->type); - if (!inode) - goto out; + error = proc_base_instantiate(dir, dentry, task, p); - ei = PROC_I(inode); - inode->i_mode = p->mode; - /* - * Yes, it does not scale. And it should not. Don't add - * new entries into /proc/<tgid>/ without very good reasons. - */ - switch(p->type) { - case PROC_TGID_TASK: - inode->i_nlink = 2; - inode->i_op = &proc_task_inode_operations; - inode->i_fop = &proc_task_operations; - break; - case PROC_TID_FD: - case PROC_TGID_FD: - inode->i_nlink = 2; - inode->i_op = &proc_fd_inode_operations; - inode->i_fop = &proc_fd_operations; - break; - case PROC_TID_EXE: - case PROC_TGID_EXE: - inode->i_op = &proc_pid_link_inode_operations; - ei->op.proc_get_link = proc_exe_link; - break; - case PROC_TID_CWD: - case PROC_TGID_CWD: - inode->i_op = &proc_pid_link_inode_operations; - ei->op.proc_get_link = proc_cwd_link; - break; - case PROC_TID_ROOT: - case PROC_TGID_ROOT: - inode->i_op = &proc_pid_link_inode_operations; - ei->op.proc_get_link = proc_root_link; - break; - case PROC_TID_ENVIRON: - case PROC_TGID_ENVIRON: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_environ; - break; - case PROC_TID_AUXV: - case PROC_TGID_AUXV: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_auxv; - break; - case PROC_TID_STATUS: - case PROC_TGID_STATUS: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_status; - break; - case PROC_TID_STAT: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_tid_stat; - break; - case PROC_TGID_STAT: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_tgid_stat; - break; - case PROC_TID_CMDLINE: - case PROC_TGID_CMDLINE: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_cmdline; - break; - case PROC_TID_STATM: - case PROC_TGID_STATM: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_statm; - break; - case PROC_TID_MAPS: - case PROC_TGID_MAPS: - inode->i_fop = &proc_maps_operations; - break; +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_base_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_base_instantiate, task, p); +} + +/* + * Thread groups + */ +static struct file_operations proc_task_operations; +static struct inode_operations proc_task_inode_operations; + +static struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, task), + DIR("fd", S_IRUSR|S_IXUSR, fd), + INF("environ", S_IRUSR, pid_environ), + INF("auxv", S_IRUSR, pid_auxv), + INF("status", S_IRUGO, pid_status), + INF("cmdline", S_IRUGO, pid_cmdline), + INF("stat", S_IRUGO, tgid_stat), + INF("statm", S_IRUGO, pid_statm), + REG("maps", S_IRUGO, maps), #ifdef CONFIG_NUMA - case PROC_TID_NUMA_MAPS: - case PROC_TGID_NUMA_MAPS: - inode->i_fop = &proc_numa_maps_operations; - break; + REG("numa_maps", S_IRUGO, numa_maps), #endif - case PROC_TID_MEM: - case PROC_TGID_MEM: - inode->i_fop = &proc_mem_operations; - break; + REG("mem", S_IRUSR|S_IWUSR, mem), #ifdef CONFIG_SECCOMP - case PROC_TID_SECCOMP: - case PROC_TGID_SECCOMP: - inode->i_fop = &proc_seccomp_operations; - break; -#endif /* CONFIG_SECCOMP */ - case PROC_TID_MOUNTS: - case PROC_TGID_MOUNTS: - inode->i_fop = &proc_mounts_operations; - break; + REG("seccomp", S_IRUSR|S_IWUSR, seccomp), +#endif + LNK("cwd", cwd), + LNK("root", root), + LNK("exe", exe), + REG("mounts", S_IRUGO, mounts), + REG("mountstats", S_IRUSR, mountstats), #ifdef CONFIG_MMU - case PROC_TID_SMAPS: - case PROC_TGID_SMAPS: - inode->i_fop = &proc_smaps_operations; - break; + REG("smaps", S_IRUGO, smaps), #endif - case PROC_TID_MOUNTSTATS: - case PROC_TGID_MOUNTSTATS: - inode->i_fop = &proc_mountstats_operations; - break; #ifdef CONFIG_SECURITY - case PROC_TID_ATTR: - inode->i_nlink = 2; - inode->i_op = &proc_tid_attr_inode_operations; - inode->i_fop = &proc_tid_attr_operations; - break; - case PROC_TGID_ATTR: - inode->i_nlink = 2; - inode->i_op = &proc_tgid_attr_inode_operations; - inode->i_fop = &proc_tgid_attr_operations; - break; - case PROC_TID_ATTR_CURRENT: - case PROC_TGID_ATTR_CURRENT: - case PROC_TID_ATTR_PREV: - case PROC_TGID_ATTR_PREV: - case PROC_TID_ATTR_EXEC: - case PROC_TGID_ATTR_EXEC: - case PROC_TID_ATTR_FSCREATE: - case PROC_TGID_ATTR_FSCREATE: - case PROC_TID_ATTR_KEYCREATE: - case PROC_TGID_ATTR_KEYCREATE: - case PROC_TID_ATTR_SOCKCREATE: - case PROC_TGID_ATTR_SOCKCREATE: - inode->i_fop = &proc_pid_attr_operations; - break; + DIR("attr", S_IRUGO|S_IXUGO, attr_dir), #endif #ifdef CONFIG_KALLSYMS - case PROC_TID_WCHAN: - case PROC_TGID_WCHAN: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_wchan; - break; + INF("wchan", S_IRUGO, pid_wchan), #endif #ifdef CONFIG_SCHEDSTATS - case PROC_TID_SCHEDSTAT: - case PROC_TGID_SCHEDSTAT: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_schedstat; - break; + INF("schedstat", S_IRUGO, pid_schedstat), #endif #ifdef CONFIG_CPUSETS - case PROC_TID_CPUSET: - case PROC_TGID_CPUSET: - inode->i_fop = &proc_cpuset_operations; - break; + REG("cpuset", S_IRUGO, cpuset), #endif - case PROC_TID_OOM_SCORE: - case PROC_TGID_OOM_SCORE: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_oom_score; - break; - case PROC_TID_OOM_ADJUST: - case PROC_TGID_OOM_ADJUST: - inode->i_fop = &proc_oom_adjust_operations; - break; + INF("oom_score", S_IRUGO, oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL - case PROC_TID_LOGINUID: - case PROC_TGID_LOGINUID: - inode->i_fop = &proc_loginuid_operations; - break; + REG("loginuid", S_IWUSR|S_IRUGO, loginuid), #endif - default: - printk("procfs: impossible type (%d)",p->type); - iput(inode); - error = ERR_PTR(-EINVAL); - goto out; - } - dentry->d_op = &pid_dentry_operations; - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - error = NULL; -out: - put_task_struct(task); -out_no_task: - return error; -} - -static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ - return proc_pident_lookup(dir, dentry, tgid_base_stuff); -} - -static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ - return proc_pident_lookup(dir, dentry, tid_base_stuff); -} - -static struct file_operations proc_tgid_base_operations = { - .read = generic_read_dir, - .readdir = proc_tgid_base_readdir, }; -static struct file_operations proc_tid_base_operations = { - .read = generic_read_dir, - .readdir = proc_tid_base_readdir, -}; - -static struct inode_operations proc_tgid_base_inode_operations = { - .lookup = proc_tgid_base_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -static struct inode_operations proc_tid_base_inode_operations = { - .lookup = proc_tid_base_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -#ifdef CONFIG_SECURITY -static int proc_tgid_attr_readdir(struct file * filp, - void * dirent, filldir_t filldir) -{ - return proc_pident_readdir(filp,dirent,filldir, - tgid_attr_stuff,ARRAY_SIZE(tgid_attr_stuff)); -} - -static int proc_tid_attr_readdir(struct file * filp, +static int proc_tgid_base_readdir(struct file * filp, void * dirent, filldir_t filldir) { return proc_pident_readdir(filp,dirent,filldir, - tid_attr_stuff,ARRAY_SIZE(tid_attr_stuff)); + tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); } -static struct file_operations proc_tgid_attr_operations = { - .read = generic_read_dir, - .readdir = proc_tgid_attr_readdir, -}; - -static struct file_operations proc_tid_attr_operations = { +static struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, - .readdir = proc_tid_attr_readdir, + .readdir = proc_tgid_base_readdir, }; -static struct dentry *proc_tgid_attr_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - return proc_pident_lookup(dir, dentry, tgid_attr_stuff); -} - -static struct dentry *proc_tid_attr_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - return proc_pident_lookup(dir, dentry, tid_attr_stuff); +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } -static struct inode_operations proc_tgid_attr_inode_operations = { - .lookup = proc_tgid_attr_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -static struct inode_operations proc_tid_attr_inode_operations = { - .lookup = proc_tid_attr_lookup, +static struct inode_operations proc_tgid_base_inode_operations = { + .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; -#endif - -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); - return ERR_PTR(vfs_follow_link(nd,tmp)); -} - -static struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, -}; /** * proc_flush_task - Remove dcache entries for @task from the /proc dcache. @@ -2022,54 +1880,23 @@ out: return; } -/* SMP-safe */ -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, struct task_struct *task, void *ptr) { - struct dentry *result = ERR_PTR(-ENOENT); - struct task_struct *task; + struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; - struct proc_inode *ei; - unsigned tgid; - - if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { - inode = new_inode(dir->i_sb); - if (!inode) - return ERR_PTR(-ENOMEM); - ei = PROC_I(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_ino = fake_ino(0, PROC_TGID_INO); - ei->pde = NULL; - inode->i_mode = S_IFLNK|S_IRWXUGO; - inode->i_uid = inode->i_gid = 0; - inode->i_size = 64; - inode->i_op = &proc_self_inode_operations; - d_add(dentry, inode); - return NULL; - } - tgid = name_to_int(dentry); - if (tgid == ~0U) - goto out; - rcu_read_lock(); - task = find_task_by_pid(tgid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - if (!task) - goto out; - - inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); + inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) - goto out_put_task; + goto out; inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; -#ifdef CONFIG_SECURITY - inode->i_nlink = 5; -#else inode->i_nlink = 4; +#ifdef CONFIG_SECURITY + inode->i_nlink += 1; #endif dentry->d_op = &pid_dentry_operations; @@ -2077,179 +1904,251 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, NULL)) - result = NULL; - -out_put_task: - put_task_struct(task); + error = NULL; out: - return result; + return error; } -/* SMP-safe */ -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = get_proc_task(dir); - struct inode *inode; - unsigned tid; + unsigned tgid; - if (!leader) - goto out_no_task; + result = proc_base_lookup(dir, dentry); + if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) + goto out; - tid = name_to_int(dentry); - if (tid == ~0U) + tgid = name_to_int(dentry); + if (tgid == ~0U) goto out; rcu_read_lock(); - task = find_task_by_pid(tid); + task = find_task_by_pid(tgid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; - if (leader->tgid != task->tgid) - goto out_drop_task; - - inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_INO); - - - if (!inode) - goto out_drop_task; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; - inode->i_op = &proc_tid_base_inode_operations; - inode->i_fop = &proc_tid_base_operations; - inode->i_flags|=S_IMMUTABLE; -#ifdef CONFIG_SECURITY - inode->i_nlink = 4; -#else - inode->i_nlink = 3; -#endif - - dentry->d_op = &pid_dentry_operations; - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - result = NULL; -out_drop_task: + result = proc_pid_instantiate(dir, dentry, task, NULL); put_task_struct(task); out: - put_task_struct(leader); -out_no_task: return result; } /* - * Find the first tgid to return to user space. - * - * Usually this is just whatever follows &init_task, but if the users - * buffer was too small to hold the full list or there was a seek into - * the middle of the directory we have more work to do. - * - * In the case of a short read we start with find_task_by_pid. + * Find the first task with tgid >= tgid * - * In the case of a seek we start with &init_task and walk nr - * threads past it. */ -static struct task_struct *first_tgid(int tgid, unsigned int nr) +static struct task_struct *next_tgid(unsigned int tgid) { - struct task_struct *pos; - rcu_read_lock(); - if (tgid && nr) { - pos = find_task_by_pid(tgid); - if (pos && thread_group_leader(pos)) - goto found; - } - /* If nr exceeds the number of processes get out quickly */ - pos = NULL; - if (nr && nr >= nr_processes()) - goto done; + struct task_struct *task; + struct pid *pid; - /* If we haven't found our starting place yet start with - * the init_task and walk nr tasks forward. - */ - for (pos = next_task(&init_task); nr > 0; --nr) { - pos = next_task(pos); - if (pos == &init_task) { - pos = NULL; - goto done; - } + rcu_read_lock(); +retry: + task = NULL; + pid = find_ge_pid(tgid); + if (pid) { + tgid = pid->nr + 1; + task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. Testing for task + * being a thread_group_leader is the obvious thing + * todo but there is a window when it fails, due to + * the pid transfer logic in de_thread. + * + * So we perform the straight forward test of seeing + * if the pid we have found is the pid of a thread + * group leader, and don't worry if the task we have + * found doesn't happen to be a thread group leader. + * As we don't care in the case of readdir. + */ + if (!task || !has_group_leader_pid(task)) + goto retry; + get_task_struct(task); } -found: - get_task_struct(pos); -done: rcu_read_unlock(); - return pos; + return task; } -/* - * Find the next task in the task list. - * Return NULL if we loop or there is any error. - * - * The reference to the input task_struct is released. - */ -static struct task_struct *next_tgid(struct task_struct *start) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) + +static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int tgid) { - struct task_struct *pos; - rcu_read_lock(); - pos = start; - if (pid_alive(start)) - pos = next_task(start); - if (pid_alive(pos) && (pos != &init_task)) { - get_task_struct(pos); - goto done; - } - pos = NULL; -done: - rcu_read_unlock(); - put_task_struct(start); - return pos; + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", tgid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_pid_instantiate, task, NULL); } /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; + struct task_struct *reaper = get_proc_task(filp->f_dentry->d_inode); struct task_struct *task; int tgid; - if (!nr) { - ino_t ino = fake_ino(0,PROC_TGID_INO); - if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) - return 0; - filp->f_pos++; - nr++; + if (!reaper) + goto out_no_task; + + for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { + struct pid_entry *p = &proc_base_stuff[nr]; + if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) + goto out; } - nr -= 1; - /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. - */ - tgid = filp->f_version; - filp->f_version = 0; - for (task = first_tgid(tgid, nr); + tgid = filp->f_pos - TGID_OFFSET; + for (task = next_tgid(tgid); task; - task = next_tgid(task), filp->f_pos++) { - int len; - ino_t ino; + put_task_struct(task), task = next_tgid(tgid + 1)) { tgid = task->pid; - len = snprintf(buf, sizeof(buf), "%d", tgid); - ino = fake_ino(tgid, PROC_TGID_INO); - if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { - /* returning this tgid failed, save it as the first - * pid for the next readir call */ - filp->f_version = tgid; + filp->f_pos = tgid + TGID_OFFSET; + if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) { put_task_struct(task); - break; + goto out; } } + filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; +out: + put_task_struct(reaper); +out_no_task: return 0; } /* + * Tasks + */ +static struct pid_entry tid_base_stuff[] = { + DIR("fd", S_IRUSR|S_IXUSR, fd), + INF("environ", S_IRUSR, pid_environ), + INF("auxv", S_IRUSR, pid_auxv), + INF("status", S_IRUGO, pid_status), + INF("cmdline", S_IRUGO, pid_cmdline), + INF("stat", S_IRUGO, tid_stat), + INF("statm", S_IRUGO, pid_statm), + REG("maps", S_IRUGO, maps), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, numa_maps), +#endif + REG("mem", S_IRUSR|S_IWUSR, mem), +#ifdef CONFIG_SECCOMP + REG("seccomp", S_IRUSR|S_IWUSR, seccomp), +#endif + LNK("cwd", cwd), + LNK("root", root), + LNK("exe", exe), + REG("mounts", S_IRUGO, mounts), +#ifdef CONFIG_MMU + REG("smaps", S_IRUGO, smaps), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, attr_dir), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, pid_wchan), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, pid_schedstat), +#endif +#ifdef CONFIG_CPUSETS + REG("cpuset", S_IRUGO, cpuset), +#endif + INF("oom_score", S_IRUGO, oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, loginuid), +#endif +}; + +static int proc_tid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); +} + +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static struct file_operations proc_tid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tid_base_readdir, +}; + +static struct inode_operations proc_tid_base_inode_operations = { + .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +static struct dentry *proc_task_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + inode = proc_pid_make_inode(dir->i_sb, task); + + if (!inode) + goto out; + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tid_base_inode_operations; + inode->i_fop = &proc_tid_base_operations; + inode->i_flags|=S_IMMUTABLE; + inode->i_nlink = 3; +#ifdef CONFIG_SECURITY + inode->i_nlink += 1; +#endif + + dentry->d_op = &pid_dentry_operations; + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result = ERR_PTR(-ENOENT); + struct task_struct *task; + struct task_struct *leader = get_proc_task(dir); + unsigned tid; + + if (!leader) + goto out_no_task; + + tid = name_to_int(dentry); + if (tid == ~0U) + goto out; + + rcu_read_lock(); + task = find_task_by_pid(tid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + if (leader->tgid != task->tgid) + goto out_drop_task; + + result = proc_task_instantiate(dir, dentry, task, NULL); +out_drop_task: + put_task_struct(task); +out: + put_task_struct(leader); +out_no_task: + return result; +} + +/* * Find the first tid of a thread group to return to user space. * * Usually this is just the thread group leader, but if the users @@ -2318,10 +2217,18 @@ static struct task_struct *next_tid(struct task_struct *start) return pos; } +static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int tid) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", tid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_task_instantiate, task, NULL); +} + /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - char buf[PROC_NUMBUF]; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; struct task_struct *leader = get_proc_task(inode); @@ -2358,11 +2265,8 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi for (task = first_tid(leader, tid, pos - 2); task; task = next_tid(task), pos++) { - int len; tid = task->pid; - len = snprintf(buf, sizeof(buf), "%d", tid); - ino = fake_ino(tid, PROC_TID_INO); - if (filldir(dirent, buf, len, pos, ino, DT_DIR < 0)) { + if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { /* returning this tgid failed, save it as the first * pid for the next readir call */ filp->f_version = tid; @@ -2392,3 +2296,14 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct return 0; } + +static struct inode_operations proc_task_inode_operations = { + .lookup = proc_task_lookup, + .getattr = proc_task_getattr, + .setattr = proc_setattr, +}; + +static struct file_operations proc_task_operations = { + .read = generic_read_dir, + .readdir = proc_task_readdir, +}; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 66bc425f2f3..8d88e58ed5c 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -45,6 +45,7 @@ #include <linux/sysrq.h> #include <linux/vmalloc.h> #include <linux/crash_dump.h> +#include <linux/pspace.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -91,7 +92,7 @@ static int loadavg_read_proc(char *page, char **start, off_t off, LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, last_pid); + nr_running(), nr_threads, init_pspace.last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } diff --git a/fs/proc/root.c b/fs/proc/root.c index 8901c65caca..ffe66c38488 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/smp_lock.h> +#include <linux/mount.h> #include "internal.h" @@ -28,6 +29,17 @@ struct proc_dir_entry *proc_sys_root; static int proc_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { + if (proc_mnt) { + /* Seed the root directory with a pid so it doesn't need + * to be special in base.c. I would do this earlier but + * the only task alive when /proc is mounted the first time + * is the init_task and it doesn't have any pids. + */ + struct proc_inode *ei; + ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode); + if (!ei->pid) + ei->pid = find_get_pid(1); + } return get_sb_single(fs_type, flags, data, proc_fill_super, mnt); } diff --git a/fs/readdir.c b/fs/readdir.c index b6109329b60..bff3ee58e2f 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -69,20 +69,24 @@ struct readdir_callback { }; static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, - ino_t ino, unsigned int d_type) + u64 ino, unsigned int d_type) { struct readdir_callback * buf = (struct readdir_callback *) __buf; struct old_linux_dirent __user * dirent; + unsigned long d_ino; if (buf->result) return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + return -EOVERFLOW; buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(ino, &dirent->d_ino) || + if ( __put_user(d_ino, &dirent->d_ino) || __put_user(offset, &dirent->d_offset) || __put_user(namlen, &dirent->d_namlen) || __copy_to_user(dirent->d_name, name, namlen) || @@ -138,22 +142,26 @@ struct getdents_callback { }; static int filldir(void * __buf, const char * name, int namlen, loff_t offset, - ino_t ino, unsigned int d_type) + u64 ino, unsigned int d_type) { struct linux_dirent __user * dirent; struct getdents_callback * buf = (struct getdents_callback *) __buf; + unsigned long d_ino; int reclen = ROUND_UP(NAME_OFFSET(dirent) + namlen + 2); buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + return -EOVERFLOW; dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) goto efault; } dirent = buf->current_dir; - if (__put_user(ino, &dirent->d_ino)) + if (__put_user(d_ino, &dirent->d_ino)) goto efault; if (__put_user(reclen, &dirent->d_reclen)) goto efault; @@ -222,7 +230,7 @@ struct getdents_callback64 { }; static int filldir64(void * __buf, const char * name, int namlen, loff_t offset, - ino_t ino, unsigned int d_type) + u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent; struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 41f24369e47..c093642fb98 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -38,8 +38,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) int err; int jbegin_failure = 0; - if (!S_ISREG(inode->i_mode)) - BUG(); + BUG_ON(!S_ISREG(inode->i_mode)); /* fast out for when nothing needs to be done */ if ((atomic_read(&inode->i_count) > 1 || @@ -125,8 +124,7 @@ static int reiserfs_sync_file(struct file *p_s_filp, int n_err; int barrier_done; - if (!S_ISREG(p_s_inode->i_mode)) - BUG(); + BUG_ON(!S_ISREG(p_s_inode->i_mode)); n_err = sync_mapping_buffers(p_s_inode->i_mapping); reiserfs_write_lock(p_s_inode->i_sb); barrier_done = reiserfs_commit_for_inode(p_s_inode); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 7e5a2f5ebeb..9c69bcacad2 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1780,7 +1780,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, err = -EDQUOT; goto out_end_trans; } - if (!dir || !dir->i_nlink) { + if (!dir->i_nlink) { err = -EPERM; goto out_bad_inode; } diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index 7a88adbceef..b9b423b22a8 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -75,8 +75,7 @@ static int sd_create_vi(struct virtual_node *vn, static int sd_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { - if (start_skip || end_skip) - BUG(); + BUG_ON(start_skip || end_skip); return -1; } @@ -87,8 +86,7 @@ static int sd_check_right(struct virtual_item *vi, int free) static int sd_part_size(struct virtual_item *vi, int first, int count) { - if (count) - BUG(); + BUG_ON(count); return 0; } @@ -476,8 +474,7 @@ static int direntry_create_vi(struct virtual_node *vn, vi->vi_index = TYPE_DIRENTRY; - if (!(vi->vi_ih) || !vi->vi_item) - BUG(); + BUG_ON(!(vi->vi_ih) || !vi->vi_item); dir_u->flags = 0; if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) @@ -575,8 +572,7 @@ static int direntry_check_right(struct virtual_item *vi, int free) free -= dir_u->entry_sizes[i]; entries++; } - if (entries == dir_u->entry_count) - BUG(); + BUG_ON(entries == dir_u->entry_count); /* "." and ".." can not be separated from each other */ if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index e6b5ccf23f1..ad8cbc49883 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -718,8 +718,7 @@ static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, spinlock_t * lock, void (fn) (struct buffer_chunk *)) { int ret = 0; - if (chunk->nr >= CHUNK_SIZE) - BUG(); + BUG_ON(chunk->nr >= CHUNK_SIZE); chunk->bh[chunk->nr++] = bh; if (chunk->nr >= CHUNK_SIZE) { ret = 1; @@ -788,8 +787,7 @@ static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, /* buffer must be locked for __add_jh, should be able to have * two adds at the same time */ - if (bh->b_private) - BUG(); + BUG_ON(bh->b_private); jh->bh = bh; bh->b_private = jh; } @@ -2967,8 +2965,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, int retval; reiserfs_check_lock_depth(p_s_sb, "journal_begin"); - if (nblocks > journal->j_trans_max) - BUG(); + BUG_ON(nblocks > journal->j_trans_max); PROC_INFO_INC(p_s_sb, journal.journal_being); /* set here for journal_join */ @@ -3084,9 +3081,8 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct if (reiserfs_transaction_running(s)) { th = current->journal_info; th->t_refcount++; - if (th->t_refcount < 2) { - BUG(); - } + BUG_ON(th->t_refcount < 2); + return th; } th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); @@ -3126,9 +3122,7 @@ static int journal_join(struct reiserfs_transaction_handle *th, ** pointer */ th->t_handle_save = cur_th; - if (cur_th && cur_th->t_refcount > 1) { - BUG(); - } + BUG_ON(cur_th && cur_th->t_refcount > 1); return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN); } @@ -3141,9 +3135,7 @@ int journal_join_abort(struct reiserfs_transaction_handle *th, ** pointer */ th->t_handle_save = cur_th; - if (cur_th && cur_th->t_refcount > 1) { - BUG(); - } + BUG_ON(cur_th && cur_th->t_refcount > 1); return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT); } @@ -3178,8 +3170,7 @@ int journal_begin(struct reiserfs_transaction_handle *th, current->journal_info = th; } ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG); - if (current->journal_info != th) - BUG(); + BUG_ON(current->journal_info != th); /* I guess this boils down to being the reciprocal of clm-2100 above. * If do_journal_begin_r fails, we need to put it back, since journal_end @@ -3324,8 +3315,7 @@ int journal_end(struct reiserfs_transaction_handle *th, /* we aren't allowed to close a nested transaction on a different ** filesystem from the one in the task struct */ - if (cur_th->t_super != th->t_super) - BUG(); + BUG_ON(cur_th->t_super != th->t_super); if (th != cur_th) { memcpy(current->journal_info, th, sizeof(*th)); @@ -3444,9 +3434,7 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); /* you can sync while nested, very, very bad */ - if (th->t_refcount > 1) { - BUG(); - } + BUG_ON(th->t_refcount > 1); if (journal->j_len == 0) { reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1); @@ -3556,9 +3544,8 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, ** will be dealt with by next transaction that actually writes something, but should be taken ** care of in this trans */ - if (journal->j_len == 0) { - BUG(); - } + BUG_ON(journal->j_len == 0); + /* if wcount > 0, and we are called to with flush or commit_now, ** we wait on j_join_wait. We will wake up when the last writer has ** finished the transaction, and started it on its way to the disk. @@ -3592,9 +3579,8 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, unlock_journal(p_s_sb); } } - if (journal->j_trans_id == trans_id) { - BUG(); - } + BUG_ON(journal->j_trans_id == trans_id); + if (commit_now && journal_list_still_alive(p_s_sb, trans_id) && wait_on_commit) { @@ -4074,9 +4060,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, set_commit_trans_len(commit, journal->j_len); /* special check in case all buffers in the journal were marked for not logging */ - if (journal->j_len == 0) { - BUG(); - } + BUG_ON(journal->j_len == 0); /* we're about to dirty all the log blocks, mark the description block * dirty now too. Don't mark the commit block dirty until all the @@ -4173,8 +4157,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, journal, jl, &jl->j_tail_bh_list); lock_kernel(); } - if (!list_empty(&jl->j_tail_bh_list)) - BUG(); + BUG_ON(!list_empty(&jl->j_tail_bh_list)); up(&jl->j_commit_lock); /* honor the flush wishes from the caller, simple commits can diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 16e9cff8f15..abde1edc223 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -67,8 +67,7 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) { struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; - if (de->de_entry_num >= ih_entry_count(de->de_ih)) - BUG(); + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num); de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0); @@ -80,8 +79,7 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de) // what entry points to static inline void set_de_object_key(struct reiserfs_dir_entry *de) { - if (de->de_entry_num >= ih_entry_count(de->de_ih)) - BUG(); + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num])); de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num])); } @@ -90,8 +88,7 @@ static inline void store_de_entry_key(struct reiserfs_dir_entry *de) { struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num; - if (de->de_entry_num >= ih_entry_count(de->de_ih)) - BUG(); + BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih)); /* store key of the found entry */ de->de_entry_key.version = KEY_FORMAT_3_5; diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 8b9b1312713..5240abe1a70 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1476,9 +1476,7 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th, int n_block_size = p_s_sb->s_blocksize; int cut_bytes; BUG_ON(!th->t_trans_id); - - if (n_new_file_size != p_s_inode->i_size) - BUG(); + BUG_ON(n_new_file_size != p_s_inode->i_size); /* the page being sent in could be NULL if there was an i/o error ** reading in the last block. The user will hit problems trying to diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index d935fb9394e..7bdb0ed443e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -773,7 +773,7 @@ int reiserfs_xattr_del(struct inode *inode, const char *name) static int reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, - loff_t offset, ino_t ino, unsigned int d_type) + loff_t offset, u64 ino, unsigned int d_type) { struct dentry *xadir = (struct dentry *)buf; @@ -851,7 +851,7 @@ struct reiserfs_chown_buf { /* XXX: If there is a better way to do this, I'd love to hear about it */ static int reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen, - loff_t offset, ino_t ino, unsigned int d_type) + loff_t offset, u64 ino, unsigned int d_type) { struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; struct dentry *xafile, *xadir = chown_buf->xadir; @@ -1036,7 +1036,7 @@ struct reiserfs_listxattr_buf { static int reiserfs_listxattr_filler(void *buf, const char *name, int namelen, - loff_t offset, ino_t ino, unsigned int d_type) + loff_t offset, u64 ino, unsigned int d_type) { struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; int len = 0; diff --git a/fs/stat.c b/fs/stat.c index 60a31d5e596..bca07eb2003 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -140,6 +140,8 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta memset(&tmp, 0, sizeof(struct __old_kernel_stat)); tmp.st_dev = old_encode_dev(stat->dev); tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) @@ -210,6 +212,8 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) tmp.st_dev = new_encode_dev(stat->dev); #endif tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; tmp.st_mode = stat->mode; tmp.st_nlink = stat->nlink; if (tmp.st_nlink != stat->nlink) @@ -347,6 +351,8 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf) tmp.st_rdev = huge_encode_dev(stat->rdev); #endif tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; #ifdef STAT64_HAS_BROKEN_ST_INO tmp.__st_ino = stat->ino; #endif diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index cf3786625bf..146f1dedec8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -157,8 +157,8 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) if ((retval = fill_read_buffer(file->f_dentry,buffer))) goto out; } - pr_debug("%s: count = %d, ppos = %lld, buf = %s\n", - __FUNCTION__,count,*ppos,buffer->page); + pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", + __FUNCTION__, count, *ppos, buffer->page); retval = flush_read_buffer(buffer,buf,count,ppos); out: up(&buffer->sem); diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index 36fbeccdc72..c75f68361e3 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -53,8 +53,7 @@ cmn_err(register int level, char *fmt, ...) va_end(ap); spin_unlock_irqrestore(&xfs_err_lock,flags); - if (level == CE_PANIC) - BUG(); + BUG_ON(level == CE_PANIC); } void @@ -72,8 +71,7 @@ icmn_err(register int level, char *fmt, va_list ap) strcat(message, "\n"); spin_unlock_irqrestore(&xfs_err_lock,flags); printk("%s%s", err_level[level], message); - if (level == CE_PANIC) - BUG(); + BUG_ON(level == CE_PANIC); } void |