aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c4
-rw-r--r--fs/9p/v9fs_vfs.h6
-rw-r--r--fs/9p/vfs_addr.c5
-rw-r--r--fs/9p/vfs_dir.c60
-rw-r--r--fs/9p/vfs_file.c93
-rw-r--r--fs/9p/vfs_inode.c39
-rw-r--r--fs/9p/vfs_super.c6
-rw-r--r--fs/Kconfig607
-rw-r--r--fs/Kconfig.binfmt22
-rw-r--r--fs/Makefile2
-rw-r--r--fs/binfmt_elf.c37
-rw-r--r--fs/binfmt_elf_fdpic.c19
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/buffer.c3
-rw-r--r--fs/char_dev.c3
-rw-r--r--fs/cifs/Kconfig142
-rw-r--r--fs/cifs/file.c4
-rw-r--r--fs/coda/psdev.c5
-rw-r--r--fs/exec.c16
-rw-r--r--fs/ext2/Kconfig55
-rw-r--r--fs/ext3/Kconfig67
-rw-r--r--fs/ext3/balloc.c3
-rw-r--r--fs/ext3/dir.c30
-rw-r--r--fs/ext3/inode.c7
-rw-r--r--fs/ext3/resize.c3
-rw-r--r--fs/ext3/super.c16
-rw-r--r--fs/ext4/Kconfig79
-rw-r--r--fs/ext4/balloc.c12
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/ext4_sb.h3
-rw-r--r--fs/ext4/inode.c143
-rw-r--r--fs/ext4/mballoc.c263
-rw-r--r--fs/ext4/mballoc.h31
-rw-r--r--fs/ext4/super.c132
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/fuse/fuse_i.h5
-rw-r--r--fs/fuse/inode.c3
-rw-r--r--fs/hfsplus/extents.c3
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/jbd/Kconfig30
-rw-r--r--fs/jbd/commit.c10
-rw-r--r--fs/jbd/transaction.c16
-rw-r--r--fs/jbd2/Kconfig33
-rw-r--r--fs/jbd2/commit.c3
-rw-r--r--fs/jbd2/transaction.c1
-rw-r--r--fs/jffs2/Kconfig188
-rw-r--r--fs/jffs2/compr.c4
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/erase.c4
-rw-r--r--fs/jffs2/fs.c6
-rw-r--r--fs/jffs2/nodemgmt.c4
-rw-r--r--fs/jffs2/wbuf.c5
-rw-r--r--fs/nfs/callback.c19
-rw-r--r--fs/nfs/client.c5
-rw-r--r--fs/nfs/dir.c22
-rw-r--r--fs/nfs/file.c18
-rw-r--r--fs/nfs/inode.c183
-rw-r--r--fs/nfs/internal.h25
-rw-r--r--fs/nfs/mount_clnt.c3
-rw-r--r--fs/nfs/namespace.c7
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3proc.c20
-rw-r--r--fs/nfs/nfs4namespace.c105
-rw-r--r--fs/nfs/nfs4proc.c6
-rw-r--r--fs/nfs/proc.c10
-rw-r--r--fs/nfs/super.c130
-rw-r--r--fs/nfs/unlink.c5
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nls/nls_base.c21
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/partitions/check.c10
-rw-r--r--fs/proc/array.c8
-rw-r--r--fs/proc/proc_misc.c125
-rw-r--r--fs/proc/vmcore.c5
-rw-r--r--fs/ramfs/file-nommu.c4
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/seq_file.c29
-rw-r--r--fs/sysfs/bin.c42
-rw-r--r--fs/sysfs/dir.c24
-rw-r--r--fs/sysfs/file.c46
-rw-r--r--fs/sysfs/mount.c15
-rw-r--r--fs/sysfs/sysfs.h6
-rw-r--r--fs/ubifs/budget.c26
-rw-r--r--fs/ubifs/compress.c2
-rw-r--r--fs/ubifs/debug.c79
-rw-r--r--fs/ubifs/debug.h6
-rw-r--r--fs/ubifs/file.c260
-rw-r--r--fs/ubifs/find.c4
-rw-r--r--fs/ubifs/gc.c90
-rw-r--r--fs/ubifs/io.c12
-rw-r--r--fs/ubifs/key.h22
-rw-r--r--fs/ubifs/lprops.c34
-rw-r--r--fs/ubifs/lpt.c48
-rw-r--r--fs/ubifs/lpt_commit.c187
-rw-r--r--fs/ubifs/misc.h27
-rw-r--r--fs/ubifs/scan.c2
-rw-r--r--fs/ubifs/super.c109
-rw-r--r--fs/ubifs/tnc.c345
-rw-r--r--fs/ubifs/tnc_misc.c4
-rw-r--r--fs/ubifs/ubifs-media.h1
-rw-r--r--fs/ubifs/ubifs.h85
-rw-r--r--fs/ubifs/xattr.c2
102 files changed, 2929 insertions, 1563 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index c061c3f18e7..24eb01087b6 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -30,8 +30,8 @@
#include <linux/parser.h>
#include <linux/idr.h>
#include <net/9p/9p.h>
-#include <net/9p/transport.h>
#include <net/9p/client.h>
+#include <net/9p/transport.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
@@ -234,7 +234,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (!v9ses->clnt->dotu)
v9ses->flags &= ~V9FS_EXTENDED;
- v9ses->maxdata = v9ses->clnt->msize;
+ v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
/* for legacy mode, fall back to V9FS_ACCESS_ANY */
if (!v9fs_extended(v9ses) &&
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 57997fa14e6..c295ba786ed 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -46,9 +46,11 @@ extern struct dentry_operations v9fs_cached_dentry_operations;
struct inode *v9fs_get_inode(struct super_block *sb, int mode);
ino_t v9fs_qid2ino(struct p9_qid *qid);
-void v9fs_stat2inode(struct p9_stat *, struct inode *, struct super_block *);
+void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
-void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
+void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
void v9fs_dentry_release(struct dentry *);
int v9fs_uflags2omode(int uflags, int extended);
+
+ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 97d3aed5798..6fcb1e7095c 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,7 +38,6 @@
#include "v9fs.h"
#include "v9fs_vfs.h"
-#include "fid.h"
/**
* v9fs_vfs_readpage - read an entire page in from 9P
@@ -53,14 +52,12 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
int retval;
loff_t offset;
char *buffer;
- struct p9_fid *fid;
P9_DPRINTK(P9_DEBUG_VFS, "\n");
- fid = filp->private_data;
buffer = kmap(page);
offset = page_offset(page);
- retval = p9_client_readn(fid, buffer, offset, PAGE_CACHE_SIZE);
+ retval = v9fs_file_readn(filp, buffer, NULL, offset, PAGE_CACHE_SIZE);
if (retval < 0)
goto done;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index e298fe19409..873cd31baa4 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -45,7 +45,7 @@
*
*/
-static inline int dt_type(struct p9_stat *mistat)
+static inline int dt_type(struct p9_wstat *mistat)
{
unsigned long perm = mistat->mode;
int rettype = DT_REG;
@@ -69,32 +69,58 @@ static inline int dt_type(struct p9_stat *mistat)
static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
int over;
+ struct p9_wstat st;
+ int err;
struct p9_fid *fid;
- struct v9fs_session_info *v9ses;
- struct inode *inode;
- struct p9_stat *st;
+ int buflen;
+ char *statbuf;
+ int n, i = 0;
P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
- inode = filp->f_path.dentry->d_inode;
- v9ses = v9fs_inode2v9ses(inode);
fid = filp->private_data;
- while ((st = p9_client_dirread(fid, filp->f_pos)) != NULL) {
- if (IS_ERR(st))
- return PTR_ERR(st);
- over = filldir(dirent, st->name.str, st->name.len, filp->f_pos,
- v9fs_qid2ino(&st->qid), dt_type(st));
+ buflen = fid->clnt->msize - P9_IOHDRSZ;
+ statbuf = kmalloc(buflen, GFP_KERNEL);
+ if (!statbuf)
+ return -ENOMEM;
- if (over)
+ while (1) {
+ err = v9fs_file_readn(filp, statbuf, NULL, buflen,
+ fid->rdir_fpos);
+ if (err <= 0)
break;
- filp->f_pos += st->size;
- kfree(st);
- st = NULL;
+ n = err;
+ while (i < n) {
+ err = p9stat_read(statbuf + i, buflen-i, &st,
+ fid->clnt->dotu);
+ if (err) {
+ P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+ err = -EIO;
+ p9stat_free(&st);
+ goto free_and_exit;
+ }
+
+ i += st.size+2;
+ fid->rdir_fpos += st.size+2;
+
+ over = filldir(dirent, st.name, strlen(st.name),
+ filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
+
+ filp->f_pos += st.size+2;
+
+ p9stat_free(&st);
+
+ if (over) {
+ err = 0;
+ goto free_and_exit;
+ }
+ }
}
- kfree(st);
- return 0;
+free_and_exit:
+ kfree(statbuf);
+ return err;
}
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 52944d2249a..041c5269228 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -120,23 +120,72 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
}
/**
- * v9fs_file_read - read from a file
+ * v9fs_file_readn - read from a file
* @filp: file pointer to read
* @data: data buffer to read data into
+ * @udata: user data buffer to read data into
* @count: size of buffer
* @offset: offset at which to read data
*
*/
+
+ssize_t
+v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
+ u64 offset)
+{
+ int n, total;
+ struct p9_fid *fid = filp->private_data;
+
+ P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
+ (long long unsigned) offset, count);
+
+ n = 0;
+ total = 0;
+ do {
+ n = p9_client_read(fid, data, udata, offset, count);
+ if (n <= 0)
+ break;
+
+ if (data)
+ data += n;
+ if (udata)
+ udata += n;
+
+ offset += n;
+ count -= n;
+ total += n;
+ } while (count > 0 && n == (fid->clnt->msize - P9_IOHDRSZ));
+
+ if (n < 0)
+ total = n;
+
+ return total;
+}
+
+/**
+ * v9fs_file_read - read from a file
+ * @filp: file pointer to read
+ * @udata: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+
static ssize_t
-v9fs_file_read(struct file *filp, char __user * data, size_t count,
+v9fs_file_read(struct file *filp, char __user *udata, size_t count,
loff_t * offset)
{
int ret;
struct p9_fid *fid;
- P9_DPRINTK(P9_DEBUG_VFS, "\n");
+ P9_DPRINTK(P9_DEBUG_VFS, "count %d offset %lld\n", count, *offset);
fid = filp->private_data;
- ret = p9_client_uread(fid, data, *offset, count);
+
+ if (count > (fid->clnt->msize - P9_IOHDRSZ))
+ ret = v9fs_file_readn(filp, NULL, udata, count, *offset);
+ else
+ ret = p9_client_read(fid, NULL, udata, *offset, count);
+
if (ret > 0)
*offset += ret;
@@ -156,19 +205,38 @@ static ssize_t
v9fs_file_write(struct file *filp, const char __user * data,
size_t count, loff_t * offset)
{
- int ret;
+ int n, rsize, total = 0;
struct p9_fid *fid;
+ struct p9_client *clnt;
struct inode *inode = filp->f_path.dentry->d_inode;
+ int origin = *offset;
P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
(int)count, (int)*offset);
fid = filp->private_data;
- ret = p9_client_uwrite(fid, data, *offset, count);
- if (ret > 0) {
- invalidate_inode_pages2_range(inode->i_mapping, *offset,
- *offset+ret);
- *offset += ret;
+ clnt = fid->clnt;
+
+ rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+ rsize = clnt->msize - P9_IOHDRSZ;
+
+ do {
+ if (count < rsize)
+ rsize = count;
+
+ n = p9_client_write(fid, NULL, data+total, *offset+total,
+ rsize);
+ if (n <= 0)
+ break;
+ count -= n;
+ total += n;
+ } while (count > 0);
+
+ if (total > 0) {
+ invalidate_inode_pages2_range(inode->i_mapping, origin,
+ origin+total);
+ *offset += total;
}
if (*offset > inode->i_size) {
@@ -176,7 +244,10 @@ v9fs_file_write(struct file *filp, const char __user * data,
inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
}
- return ret;
+ if (n < 0)
+ return n;
+
+ return total;
}
static const struct file_operations v9fs_cached_file_operations = {
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e83aa5ebe86..8314d3f43b7 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -334,7 +334,7 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
{
int err, umode;
struct inode *ret;
- struct p9_stat *st;
+ struct p9_wstat *st;
ret = NULL;
st = p9_client_stat(fid);
@@ -417,6 +417,8 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
struct p9_fid *dfid, *ofid, *fid;
struct inode *inode;
+ P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+
err = 0;
ofid = NULL;
fid = NULL;
@@ -424,6 +426,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
dfid = v9fs_fid_clone(dentry->d_parent);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
+ P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
dfid = NULL;
goto error;
}
@@ -432,18 +435,22 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
ofid = p9_client_walk(dfid, 0, NULL, 1);
if (IS_ERR(ofid)) {
err = PTR_ERR(ofid);
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
ofid = NULL;
goto error;
}
err = p9_client_fcreate(ofid, name, perm, mode, extension);
- if (err < 0)
+ if (err < 0) {
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
goto error;
+ }
/* now walk from the parent so we can get unopened fid */
fid = p9_client_walk(dfid, 1, &name, 0);
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
+ P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
fid = NULL;
goto error;
} else
@@ -453,6 +460,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
+ P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
goto error;
}
@@ -734,7 +742,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
- struct p9_stat *st;
+ struct p9_wstat *st;
P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
err = -EPERM;
@@ -815,10 +823,9 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
*/
void
-v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
+v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
struct super_block *sb)
{
- int n;
char ext[32];
struct v9fs_session_info *v9ses = sb->s_fs_info;
@@ -842,11 +849,7 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
int major = -1;
int minor = -1;
- n = stat->extension.len;
- if (n > sizeof(ext)-1)
- n = sizeof(ext)-1;
- memmove(ext, stat->extension.str, n);
- ext[n] = 0;
+ strncpy(ext, stat->extension, sizeof(ext));
sscanf(ext, "%c %u %u", &type, &major, &minor);
switch (type) {
case 'c':
@@ -857,10 +860,11 @@ v9fs_stat2inode(struct p9_stat *stat, struct inode *inode,
break;
default:
P9_DPRINTK(P9_DEBUG_ERROR,
- "Unknown special type %c (%.*s)\n", type,
- stat->extension.len, stat->extension.str);
+ "Unknown special type %c %s\n", type,
+ stat->extension);
};
inode->i_rdev = MKDEV(major, minor);
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
} else
inode->i_rdev = 0;
@@ -904,7 +908,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
- struct p9_stat *st;
+ struct p9_wstat *st;
P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
retval = -EPERM;
@@ -926,15 +930,10 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
}
/* copy extension buffer into buffer */
- if (st->extension.len < buflen)
- buflen = st->extension.len + 1;
-
- memmove(buffer, st->extension.str, buflen - 1);
- buffer[buflen-1] = 0;
+ strncpy(buffer, st->extension, buflen);
P9_DPRINTK(P9_DEBUG_VFS,
- "%s -> %.*s (%s)\n", dentry->d_name.name, st->extension.len,
- st->extension.str, buffer);
+ "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
retval = buflen;
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bf59c396049..d6cb1a0ca72 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -111,7 +111,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
struct inode *inode = NULL;
struct dentry *root = NULL;
struct v9fs_session_info *v9ses = NULL;
- struct p9_stat *st = NULL;
+ struct p9_wstat *st = NULL;
int mode = S_IRWXUGO | S_ISVTX;
uid_t uid = current->fsuid;
gid_t gid = current->fsgid;
@@ -161,10 +161,14 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
sb->s_root = root;
root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
+
v9fs_stat2inode(st, root->d_inode, sb);
+
v9fs_fid_add(root, fid);
+ p9stat_free(st);
kfree(st);
+P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n");
return simple_set_mnt(mnt, sb);
release_sb:
diff --git a/fs/Kconfig b/fs/Kconfig
index 9e9d70c02a0..e46297f020c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -6,61 +6,9 @@ menu "File systems"
if BLOCK
-config EXT2_FS
- tristate "Second extended fs support"
- help
- Ext2 is a standard Linux file system for hard disks.
-
- To compile this file system support as a module, choose M here: the
- module will be called ext2.
-
- If unsure, say Y.
-
-config EXT2_FS_XATTR
- bool "Ext2 extended attributes"
- depends on EXT2_FS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
-config EXT2_FS_POSIX_ACL
- bool "Ext2 POSIX Access Control Lists"
- depends on EXT2_FS_XATTR
- select FS_POSIX_ACL
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the Posix ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config EXT2_FS_SECURITY
- bool "Ext2 Security Labels"
- depends on EXT2_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ext2 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
- bool "Ext2 execute in place support"
- depends on EXT2_FS && MMU
- help
- Execute in place can be used on memory-backed block devices. If you
- enable this option, you can select to mount block devices which are
- capable of this feature without using the page cache.
-
- If you do not use a block device that is capable of using this,
- or if unsure, say N.
+source "fs/ext2/Kconfig"
+source "fs/ext3/Kconfig"
+source "fs/ext4/Kconfig"
config FS_XIP
# execute in place
@@ -68,218 +16,8 @@ config FS_XIP
depends on EXT2_FS_XIP
default y
-config EXT3_FS
- tristate "Ext3 journalling file system support"
- select JBD
- help
- This is the journalling version of the Second extended file system
- (often called ext3), the de facto standard Linux file system
- (method to organize files on a storage device) for hard disks.
-
- The journalling code included in this driver means you do not have
- to run e2fsck (file system checker) on your file systems after a
- crash. The journal keeps track of any changes that were being made
- at the time the system crashed, and can ensure that your file system
- is consistent without the need for a lengthy check.
-
- Other than adding the journal to the file system, the on-disk format
- of ext3 is identical to ext2. It is possible to freely switch
- between using the ext3 driver and the ext2 driver, as long as the
- file system has been cleanly unmounted, or e2fsck is run on the file
- system.
-
- To add a journal on an existing ext2 file system or change the
- behavior of ext3 file systems, you can use the tune2fs utility ("man
- tune2fs"). To modify attributes of files and directories on ext3
- file systems, use chattr ("man chattr"). You need to be using
- e2fsprogs version 1.20 or later in order to create ext3 journals
- (available at <http://sourceforge.net/projects/e2fsprogs/>).
-
- To compile this file system support as a module, choose M here: the
- module will be called ext3.
-
-config EXT3_FS_XATTR
- bool "Ext3 extended attributes"
- depends on EXT3_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext3.
-
-config EXT3_FS_POSIX_ACL
- bool "Ext3 POSIX Access Control Lists"
- depends on EXT3_FS_XATTR
- select FS_POSIX_ACL
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the Posix ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config EXT3_FS_SECURITY
- bool "Ext3 Security Labels"
- depends on EXT3_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ext3 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
-
-config EXT4_FS
- tristate "The Extended 4 (ext4) filesystem"
- select JBD2
- select CRC16
- help
- This is the next generation of the ext3 filesystem.
-
- Unlike the change from ext2 filesystem to ext3 filesystem,
- the on-disk format of ext4 is not forwards compatible with
- ext3; it is based on extent maps and it supports 48-bit
- physical block numbers. The ext4 filesystem also supports delayed
- allocation, persistent preallocation, high resolution time stamps,
- and a number of other features to improve performance and speed
- up fsck time. For more information, please see the web pages at
- http://ext4.wiki.kernel.org.
-
- The ext4 filesystem will support mounting an ext3
- filesystem; while there will be some performance gains from
- the delayed allocation and inode table readahead, the best
- performance gains will require enabling ext4 features in the
- filesystem, or formating a new filesystem as an ext4
- filesystem initially.
-
- To compile this file system support as a module, choose M here. The
- module will be called ext4dev.
-
- If unsure, say N.
-
-config EXT4DEV_COMPAT
- bool "Enable ext4dev compatibility"
- depends on EXT4_FS
- help
- Starting with 2.6.28, the name of the ext4 filesystem was
- renamed from ext4dev to ext4. Unfortunately there are some
- legacy userspace programs (such as klibc's fstype) have
- "ext4dev" hardcoded.
-
- To enable backwards compatibility so that systems that are
- still expecting to mount ext4 filesystems using ext4dev,
- chose Y here. This feature will go away by 2.6.31, so
- please arrange to get your userspace programs fixed!
-
-config EXT4_FS_XATTR
- bool "Ext4 extended attributes"
- depends on EXT4_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext4.
-
-config EXT4_FS_POSIX_ACL
- bool "Ext4 POSIX Access Control Lists"
- depends on EXT4_FS_XATTR
- select FS_POSIX_ACL
- help
- POSIX Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the POSIX ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config EXT4_FS_SECURITY
- bool "Ext4 Security Labels"
- depends on EXT4_FS_XATTR
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the ext4 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
-
-config JBD
- tristate
- help
- This is a generic journalling layer for block devices. It is
- currently used by the ext3 file system, but it could also be
- used to add journal support to other file systems or block
- devices such as RAID or LVM.
-
- If you are using the ext3 file system, you need to say Y here.
- If you are not using ext3 then you will probably want to say N.
-
- To compile this device as a module, choose M here: the module will be
- called jbd. If you are compiling ext3 into the kernel, you
- cannot compile this code as a module.
-
-config JBD_DEBUG
- bool "JBD (ext3) debugging support"
- depends on JBD && DEBUG_FS
- help
- If you are using the ext3 journaled file system (or potentially any
- other file system/device using JBD), this option allows you to
- enable debugging output while the system is running, in order to
- help track down any problems you are having. By default the
- debugging output will be turned off.
-
- If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
- number between 1 and 5, the higher the number, the more debugging
- output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
-
-config JBD2
- tristate
- select CRC32
- help
- This is a generic journaling layer for block devices that support
- both 32-bit and 64-bit block numbers. It is currently used by
- the ext4 and OCFS2 filesystems, but it could also be used to add
- journal support to other file systems or block devices such
- as RAID or LVM.
-
- If you are using ext4 or OCFS2, you need to say Y here.
- If you are not using ext4 or OCFS2 then you will
- probably want to say N.
-
- To compile this device as a module, choose M here. The module will be
- called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
- you cannot compile this code as a module.
-
-config JBD2_DEBUG
- bool "JBD2 (ext4) debugging support"
- depends on JBD2 && DEBUG_FS
- help
- If you are using the ext4 journaled file system (or
- potentially any other filesystem/device using JBD2), this option
- allows you to enable debugging output while the system is running,
- in order to help track down any problems you are having.
- By default, the debugging output will be turned off.
-
- If you select Y here, then you will be able to turn on debugging
- with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
- number between 1 and 5. The higher the number, the more debugging
- output is generated. To turn debugging off again, do
- "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
+source "fs/jbd/Kconfig"
+source "fs/jbd2/Kconfig"
config FS_MBCACHE
# Meta block cache for Extended Attributes (ext2/ext3/ext4)
@@ -665,7 +403,7 @@ config AUTOFS4_FS
N here.
config FUSE_FS
- tristate "Filesystem in Userspace support"
+ tristate "FUSE (Filesystem in Userspace) support"
help
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
@@ -1168,195 +906,7 @@ config EFS_FS
To compile the EFS file system support as a module, choose M here: the
module will be called efs.
-config JFFS2_FS
- tristate "Journalling Flash File System v2 (JFFS2) support"
- select CRC32
- depends on MTD
- help
- JFFS2 is the second generation of the Journalling Flash File System
- for use on diskless embedded devices. It provides improved wear
- levelling, compression and support for hard links. You cannot use
- this on normal block devices, only on 'MTD' devices.
-
- Further information on the design and implementation of JFFS2 is
- available at <http://sources.redhat.com/jffs2/>.
-
-config JFFS2_FS_DEBUG
- int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
- depends on JFFS2_FS
- default "0"
- help
- This controls the amount of debugging messages produced by the JFFS2
- code. Set it to zero for use in production systems. For evaluation,
- testing and debugging, it's advisable to set it to one. This will
- enable a few assertions and will print debugging messages at the
- KERN_DEBUG loglevel, where they won't normally be visible. Level 2
- is unlikely to be useful - it enables extra debugging in certain
- areas which at one point needed debugging, but when the bugs were
- located and fixed, the detailed messages were relegated to level 2.
-
- If reporting bugs, please try to have available a full dump of the
- messages at debug level 1 while the misbehaviour was occurring.
-
-config JFFS2_FS_WRITEBUFFER
- bool "JFFS2 write-buffering support"
- depends on JFFS2_FS
- default y
- help
- This enables the write-buffering support in JFFS2.
-
- This functionality is required to support JFFS2 on the following
- types of flash devices:
- - NAND flash
- - NOR flash with transparent ECC
- - DataFlash
-
-config JFFS2_FS_WBUF_VERIFY
- bool "Verify JFFS2 write-buffer reads"
- depends on JFFS2_FS_WRITEBUFFER
- default n
- help
- This causes JFFS2 to read back every page written through the
- write-buffer, and check for errors.
-
-config JFFS2_SUMMARY
- bool "JFFS2 summary support (EXPERIMENTAL)"
- depends on JFFS2_FS && EXPERIMENTAL
- default n
- help
- This feature makes it possible to use summary information
- for faster filesystem mount.
-
- The summary information can be inserted into a filesystem image
- by the utility 'sumtool'.
-
- If unsure, say 'N'.
-
-config JFFS2_FS_XATTR
- bool "JFFS2 XATTR support (EXPERIMENTAL)"
- depends on JFFS2_FS && EXPERIMENTAL
- default n
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
-config JFFS2_FS_POSIX_ACL
- bool "JFFS2 POSIX Access Control Lists"
- depends on JFFS2_FS_XATTR
- default y
- select FS_POSIX_ACL
- help
- Posix Access Control Lists (ACLs) support permissions for users and
- groups beyond the owner/group/world scheme.
-
- To learn more about Access Control Lists, visit the Posix ACLs for
- Linux website <http://acl.bestbits.at/>.
-
- If you don't know what Access Control Lists are, say N
-
-config JFFS2_FS_SECURITY
- bool "JFFS2 Security Labels"
- depends on JFFS2_FS_XATTR
- default y
- help
- Security labels support alternative access control models
- implemented by security modules like SELinux. This option
- enables an extended attribute handler for file security
- labels in the jffs2 filesystem.
-
- If you are not using a security module that requires using
- extended attributes for file security labels, say N.
-
-config JFFS2_COMPRESSION_OPTIONS
- bool "Advanced compression options for JFFS2"
- depends on JFFS2_FS
- default n
- help
- Enabling this option allows you to explicitly choose which
- compression modules, if any, are enabled in JFFS2. Removing
- compressors can mean you cannot read existing file systems,
- and enabling experimental compressors can mean that you
- write a file system which cannot be read by a standard kernel.
-
- If unsure, you should _definitely_ say 'N'.
-
-config JFFS2_ZLIB
- bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
- select ZLIB_INFLATE
- select ZLIB_DEFLATE
- depends on JFFS2_FS
- default y
- help
- Zlib is designed to be a free, general-purpose, legally unencumbered,
- lossless data-compression library for use on virtually any computer
- hardware and operating system. See <http://www.gzip.org/zlib/> for
- further information.
-
- Say 'Y' if unsure.
-
-config JFFS2_LZO
- bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
- select LZO_COMPRESS
- select LZO_DECOMPRESS
- depends on JFFS2_FS
- default n
- help
- minilzo-based compression. Generally works better than Zlib.
-
- This feature was added in July, 2007. Say 'N' if you need
- compatibility with older bootloaders or kernels.
-
-config JFFS2_RTIME
- bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
- depends on JFFS2_FS
- default y
- help
- Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
-
-config JFFS2_RUBIN
- bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
- depends on JFFS2_FS
- default n
- help
- RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
-
-choice
- prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
- default JFFS2_CMODE_PRIORITY
- depends on JFFS2_FS
- help
- You can set here the default compression mode of JFFS2 from
- the available compression modes. Don't touch if unsure.
-
-config JFFS2_CMODE_NONE
- bool "no compression"
- help
- Uses no compression.
-
-config JFFS2_CMODE_PRIORITY
- bool "priority"
- help
- Tries the compressors in a predefined order and chooses the first
- successful one.
-
-config JFFS2_CMODE_SIZE
- bool "size (EXPERIMENTAL)"
- help
- Tries all compressors and chooses the one which has the smallest
- result.
-
-config JFFS2_CMODE_FAVOURLZO
- bool "Favour LZO"
- help
- Tries all compressors and chooses the one which has the smallest
- result but gives some preference to LZO (which has faster
- decompression) at the expense of size.
-
-endchoice
-
+source "fs/jffs2/Kconfig"
# UBIFS File system configuration
source "fs/ubifs/Kconfig"
@@ -1913,148 +1463,7 @@ config SMB_NLS_REMOTE
smbmount from samba 2.2.0 or later supports this.
-config CIFS
- tristate "CIFS support (advanced network filesystem, SMBFS successor)"
- depends on INET
- select NLS
- help
- This is the client VFS module for the Common Internet File System
- (CIFS) protocol which is the successor to the Server Message Block
- (SMB) protocol, the native file sharing mechanism for most early
- PC operating systems. The CIFS protocol is fully supported by
- file servers such as Windows 2000 (including Windows 2003, NT 4
- and Windows XP) as well by Samba (which provides excellent CIFS
- server support for Linux and many other operating systems). Limited
- support for OS/2 and Windows ME and similar servers is provided as
- well.
-
- The cifs module provides an advanced network file system
- client for mounting to CIFS compliant servers. It includes
- support for DFS (hierarchical name space), secure per-user
- session establishment via Kerberos or NTLM or NTLMv2,
- safe distributed caching (oplock), optional packet
- signing, Unicode and other internationalization improvements.
- If you need to mount to Samba or Windows from this machine, say Y.
-
-config CIFS_STATS
- bool "CIFS statistics"
- depends on CIFS
- help
- Enabling this option will cause statistics for each server share
- mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
-
-config CIFS_STATS2
- bool "Extended statistics"
- depends on CIFS_STATS
- help
- Enabling this option will allow more detailed statistics on SMB
- request timing to be displayed in /proc/fs/cifs/DebugData and also
- allow optional logging of slow responses to dmesg (depending on the
- value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
- These additional statistics may have a minor effect on performance
- and memory utilization.
-
- Unless you are a developer or are doing network performance analysis
- or tuning, say N.
-
-config CIFS_WEAK_PW_HASH
- bool "Support legacy servers which use weaker LANMAN security"
- depends on CIFS
- help
- Modern CIFS servers including Samba and most Windows versions
- (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
- security mechanisms. These hash the password more securely
- than the mechanisms used in the older LANMAN version of the
- SMB protocol but LANMAN based authentication is needed to
- establish sessions with some old SMB servers.
-
- Enabling this option allows the cifs module to mount to older
- LANMAN based servers such as OS/2 and Windows 95, but such
- mounts may be less secure than mounts using NTLM or more recent
- security mechanisms if you are on a public network. Unless you
- have a need to access old SMB servers (and are on a private
- network) you probably want to say N. Even if this support
- is enabled in the kernel build, LANMAN authentication will not be
- used automatically. At runtime LANMAN mounts are disabled but
- can be set to required (or optional) either in
- /proc/fs/cifs (see fs/cifs/README for more detail) or via an
- option on the mount command. This support is disabled by
- default in order to reduce the possibility of a downgrade
- attack.
-
- If unsure, say N.
-
-config CIFS_UPCALL
- bool "Kerberos/SPNEGO advanced session setup"
- depends on CIFS && KEYS
- help
- Enables an upcall mechanism for CIFS which accesses
- userspace helper utilities to provide SPNEGO packaged (RFC 4178)
- Kerberos tickets which are needed to mount to certain secure servers
- (for which more secure Kerberos authentication is required). If
- unsure, say N.
-
-config CIFS_XATTR
- bool "CIFS extended attributes"
- depends on CIFS
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details). CIFS maps the name of
- extended attributes beginning with the user namespace prefix
- to SMB/CIFS EAs. EAs are stored on Windows servers without the
- user namespace prefix, but their names are seen by Linux cifs clients
- prefaced by the user namespace prefix. The system namespace
- (used by some filesystems to store ACLs) is not supported at
- this time.
-
- If unsure, say N.
-
-config CIFS_POSIX
- bool "CIFS POSIX Extensions"
- depends on CIFS_XATTR
- help
- Enabling this option will cause the cifs client to attempt to
- negotiate a newer dialect with servers, such as Samba 3.0.5
- or later, that optionally can handle more POSIX like (rather
- than Windows like) file behavior. It also enables
- support for POSIX ACLs (getfacl and setfacl) to servers
- (such as Samba 3.10 and later) which can negotiate
- CIFS POSIX ACL support. If unsure, say N.
-
-config CIFS_DEBUG2
- bool "Enable additional CIFS debugging routines"
- depends on CIFS
- help
- Enabling this option adds a few more debugging routines
- to the cifs code which slightly increases the size of
- the cifs module and can cause additional logging of debug
- messages in some error paths, slowing performance. This
- option can be turned off unless you are debugging
- cifs problems. If unsure, say N.
-
-config CIFS_EXPERIMENTAL
- bool "CIFS Experimental Features (EXPERIMENTAL)"
- depends on CIFS && EXPERIMENTAL
- help
- Enables cifs features under testing. These features are
- experimental and currently include DFS support and directory
- change notification ie fcntl(F_DNOTIFY), as well as the upcall
- mechanism which will be used for Kerberos session negotiation
- and uid remapping. Some of these features also may depend on
- setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
- (which is disabled by default). See the file fs/cifs/README
- for more details. If unsure, say N.
-
-config CIFS_DFS_UPCALL
- bool "DFS feature support (EXPERIMENTAL)"
- depends on CIFS_EXPERIMENTAL
- depends on KEYS
- help
- Enables an upcall mechanism for CIFS which contacts userspace
- helper utilities to provide server name resolution (host names to
- IP addresses) which is needed for implicit mounts of DFS junction
- points. If unsure, say N.
+source "fs/cifs/Kconfig"
config NCP_FS
tristate "NCP file system support (to mount NetWare volumes)"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 801db134181..ce9fb3fbfae 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC
It is also possible to run FDPIC ELF binaries on MMU linux also.
+config CORE_DUMP_DEFAULT_ELF_HEADERS
+ bool "Write ELF core dumps with partial segments"
+ default n
+ depends on BINFMT_ELF
+ help
+ ELF core dump files describe each memory mapping of the crashed
+ process, and can contain or omit the memory contents of each one.
+ The contents of an unmodified text mapping are omitted by default.
+
+ For an unmodified text mapping of an ELF object, including just
+ the first page of the file in a core dump makes it possible to
+ identify the build ID bits in the file, without paying the i/o
+ cost and disk space to dump all the text. However, versions of
+ GDB before 6.7 are confused by ELF core dump files in this format.
+
+ The core dump behavior can be controlled per process using
+ the /proc/PID/coredump_filter pseudo-file; this setting is
+ inherited. See Documentation/filesystems/proc.txt for details.
+
+ This config option changes the default setting of coredump_filter
+ seen at boot time. If unsure, say N.
+
config BINFMT_FLAT
bool "Kernel support for flat binaries"
depends on !MMU && (!FRV || BROKEN)
diff --git a/fs/Makefile b/fs/Makefile
index d0c69f57e5b..2168c902d5c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4
obj-$(CONFIG_JBD) += jbd/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_EXT2_FS) += ext2/
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 655ed8d30a8..8fcfa398d35 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -683,7 +683,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
* switch really is going to happen - do this in
* flush_thread(). - akpm
*/
- SET_PERSONALITY(loc->elf_ex, 0);
+ SET_PERSONALITY(loc->elf_ex);
interpreter = open_exec(elf_interpreter);
retval = PTR_ERR(interpreter);
@@ -734,7 +734,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
goto out_free_dentry;
} else {
/* Executables without an interpreter also need a personality */
- SET_PERSONALITY(loc->elf_ex, 0);
+ SET_PERSONALITY(loc->elf_ex);
}
/* Flush all traces of the currently running executable */
@@ -748,7 +748,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
- SET_PERSONALITY(loc->elf_ex, 0);
+ SET_PERSONALITY(loc->elf_ex);
if (elf_read_implies_exec(loc->elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off)
static unsigned long vma_dump_size(struct vm_area_struct *vma,
unsigned long mm_flags)
{
+#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
+
/* The vma can be set up to tell us the answer directly. */
if (vma->vm_flags & VM_ALWAYSDUMP)
goto whole;
+ /* Hugetlb memory check */
+ if (vma->vm_flags & VM_HUGETLB) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+ goto whole;
+ }
+
/* Do not dump I/O mapped devices or special mappings */
if (vma->vm_flags & (VM_IO | VM_RESERVED))
return 0;
-#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
-
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
@@ -1333,20 +1341,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
prstatus->pr_pgrp = task_pgrp_vnr(p);
prstatus->pr_sid = task_session_vnr(p);
if (thread_group_leader(p)) {
+ struct task_cputime cputime;
+
/*
- * This is the record for the group leader. Add in the
- * cumulative times of previous dead threads. This total
- * won't include the time of each live thread whose state
- * is included in the core dump. The final total reported
- * to our parent process when it calls wait4 will include
- * those sums as well as the little bit more time it takes
- * this and each other thread to finish dying after the
- * core dump synchronization phase.
+ * This is the record for the group leader. It shows the
+ * group-wide total, not its individual thread total.
*/
- cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
- &prstatus->pr_utime);
- cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
- &prstatus->pr_stime);
+ thread_group_cputime(p, &cputime);
+ cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+ cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
cputime_to_timeval(p->utime, &prstatus->pr_utime);
cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 0e8367c5462..5b5424cb339 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1390,20 +1390,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
prstatus->pr_pgrp = task_pgrp_vnr(p);
prstatus->pr_sid = task_session_vnr(p);
if (thread_group_leader(p)) {
+ struct task_cputime cputime;
+
/*
- * This is the record for the group leader. Add in the
- * cumulative times of previous dead threads. This total
- * won't include the time of each live thread whose state
- * is included in the core dump. The final total reported
- * to our parent process when it calls wait4 will include
- * those sums as well as the little bit more time it takes
- * this and each other thread to finish dying after the
- * core dump synchronization phase.
+ * This is the record for the group leader. It shows the
+ * group-wide total, not its individual thread total.
*/
- cputime_to_timeval(cputime_add(p->utime, p->signal->utime),
- &prstatus->pr_utime);
- cputime_to_timeval(cputime_add(p->stime, p->signal->stime),
- &prstatus->pr_stime);
+ thread_group_cputime(p, &cputime);
+ cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
+ cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
} else {
cputime_to_timeval(p->utime, &prstatus->pr_utime);
cputime_to_timeval(p->stime, &prstatus->pr_stime);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d84f0469a01..218408eed1b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
/**
* lookup_bdev - lookup a struct block_device by name
- * @pathname: special file representing the block device
+ * @path: special file representing the block device
*
* Get a reference to the blockdevice at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error)
diff --git a/fs/buffer.c b/fs/buffer.c
index ac78d4c19b3..6569fda5cfe 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer);
void unlock_buffer(struct buffer_head *bh)
{
- smp_mb__before_clear_bit();
- clear_buffer_locked(bh);
+ clear_bit_unlock(BH_Lock, &bh->b_state);
smp_mb__after_clear_bit();
wake_up_bit(&bh->b_state, BH_Lock);
}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cb7cda3d78..262fa10e213 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -22,9 +22,6 @@
#include <linux/mutex.h>
#include <linux/backing-dev.h>
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
#include "internal.h"
/*
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
new file mode 100644
index 00000000000..341a98965bd
--- /dev/null
+++ b/fs/cifs/Kconfig
@@ -0,0 +1,142 @@
+config CIFS
+ tristate "CIFS support (advanced network filesystem, SMBFS successor)"
+ depends on INET
+ select NLS
+ help
+ This is the client VFS module for the Common Internet File System
+ (CIFS) protocol which is the successor to the Server Message Block
+ (SMB) protocol, the native file sharing mechanism for most early
+ PC operating systems. The CIFS protocol is fully supported by
+ file servers such as Windows 2000 (including Windows 2003, NT 4
+ and Windows XP) as well by Samba (which provides excellent CIFS
+ server support for Linux and many other operating systems). Limited
+ support for OS/2 and Windows ME and similar servers is provided as
+ well.
+
+ The cifs module provides an advanced network file system
+ client for mounting to CIFS compliant servers. It includes
+ support for DFS (hierarchical name space), secure per-user
+ session establishment via Kerberos or NTLM or NTLMv2,
+ safe distributed caching (oplock), optional packet
+ signing, Unicode and other internationalization improvements.
+ If you need to mount to Samba or Windows from this machine, say Y.
+
+config CIFS_STATS
+ bool "CIFS statistics"
+ depends on CIFS
+ help
+ Enabling this option will cause statistics for each server share
+ mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
+
+config CIFS_STATS2
+ bool "Extended statistics"
+ depends on CIFS_STATS
+ help
+ Enabling this option will allow more detailed statistics on SMB
+ request timing to be displayed in /proc/fs/cifs/DebugData and also
+ allow optional logging of slow responses to dmesg (depending on the
+ value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
+ These additional statistics may have a minor effect on performance
+ and memory utilization.
+
+ Unless you are a developer or are doing network performance analysis
+ or tuning, say N.
+
+config CIFS_WEAK_PW_HASH
+ bool "Support legacy servers which use weaker LANMAN security"
+ depends on CIFS
+ help
+ Modern CIFS servers including Samba and most Windows versions
+ (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
+ security mechanisms. These hash the password more securely
+ than the mechanisms used in the older LANMAN version of the
+ SMB protocol but LANMAN based authentication is needed to
+ establish sessions with some old SMB servers.
+
+ Enabling this option allows the cifs module to mount to older
+ LANMAN based servers such as OS/2 and Windows 95, but such
+ mounts may be less secure than mounts using NTLM or more recent
+ security mechanisms if you are on a public network. Unless you
+ have a need to access old SMB servers (and are on a private
+ network) you probably want to say N. Even if this support
+ is enabled in the kernel build, LANMAN authentication will not be
+ used automatically. At runtime LANMAN mounts are disabled but
+ can be set to required (or optional) either in
+ /proc/fs/cifs (see fs/cifs/README for more detail) or via an
+ option on the mount command. This support is disabled by
+ default in order to reduce the possibility of a downgrade
+ attack.
+
+ If unsure, say N.
+
+config CIFS_UPCALL
+ bool "Kerberos/SPNEGO advanced session setup"
+ depends on CIFS && KEYS
+ help
+ Enables an upcall mechanism for CIFS which accesses
+ userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+ Kerberos tickets which are needed to mount to certain secure servers
+ (for which more secure Kerberos authentication is required). If
+ unsure, say N.
+
+config CIFS_XATTR
+ bool "CIFS extended attributes"
+ depends on CIFS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details). CIFS maps the name of
+ extended attributes beginning with the user namespace prefix
+ to SMB/CIFS EAs. EAs are stored on Windows servers without the
+ user namespace prefix, but their names are seen by Linux cifs clients
+ prefaced by the user namespace prefix. The system namespace
+ (used by some filesystems to store ACLs) is not supported at
+ this time.
+
+ If unsure, say N.
+
+config CIFS_POSIX
+ bool "CIFS POSIX Extensions"
+ depends on CIFS_XATTR
+ help
+ Enabling this option will cause the cifs client to attempt to
+ negotiate a newer dialect with servers, such as Samba 3.0.5
+ or later, that optionally can handle more POSIX like (rather
+ than Windows like) file behavior. It also enables
+ support for POSIX ACLs (getfacl and setfacl) to servers
+ (such as Samba 3.10 and later) which can negotiate
+ CIFS POSIX ACL support. If unsure, say N.
+
+config CIFS_DEBUG2
+ bool "Enable additional CIFS debugging routines"
+ depends on CIFS
+ help
+ Enabling this option adds a few more debugging routines
+ to the cifs code which slightly increases the size of
+ the cifs module and can cause additional logging of debug
+ messages in some error paths, slowing performance. This
+ option can be turned off unless you are debugging
+ cifs problems. If unsure, say N.
+
+config CIFS_EXPERIMENTAL
+ bool "CIFS Experimental Features (EXPERIMENTAL)"
+ depends on CIFS && EXPERIMENTAL
+ help
+ Enables cifs features under testing. These features are
+ experimental and currently include DFS support and directory
+ change notification ie fcntl(F_DNOTIFY), as well as the upcall
+ mechanism which will be used for Kerberos session negotiation
+ and uid remapping. Some of these features also may depend on
+ setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
+ (which is disabled by default). See the file fs/cifs/README
+ for more details. If unsure, say N.
+
+config CIFS_DFS_UPCALL
+ bool "DFS feature support (EXPERIMENTAL)"
+ depends on CIFS_EXPERIMENTAL
+ depends on KEYS
+ help
+ Enables an upcall mechanism for CIFS which contacts userspace
+ helper utilities to provide server name resolution (host names to
+ IP addresses) which is needed for implicit mounts of DFS junction
+ points. If unsure, say N.
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c4a8a060512..62d8bd8f14c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
SetPageUptodate(page);
unlock_page(page);
if (!pagevec_add(plru_pvec, page))
- __pagevec_lru_add(plru_pvec);
+ __pagevec_lru_add_file(plru_pvec);
data += PAGE_CACHE_SIZE;
}
return;
@@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
bytes_read = 0;
}
- pagevec_lru_add(&lru_pvec);
+ pagevec_lru_add_file(&lru_pvec);
/* need to free smb_read_data buf before exit */
if (smb_read_data) {
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0d9b80ec689..cfd29da714d 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,9 +362,8 @@ static int init_coda_psdev(void)
goto out_chrdev;
}
for (i = 0; i < MAX_CODADEVS; i++)
- device_create_drvdata(coda_psdev_class, NULL,
- MKDEV(CODA_PSDEV_MAJOR, i),
- NULL, "cfs%d", i);
+ device_create(coda_psdev_class, NULL,
+ MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i);
coda_sysctl_init();
goto out;
diff --git a/fs/exec.c b/fs/exec.c
index cfb5656b2cd..4e834f16d9d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,15 +50,12 @@
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/tracehook.h>
+#include <linux/kmod.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
-#ifdef CONFIG_KMOD
-#include <linux/kmod.h>
-#endif
-
#ifdef __alpha__
/* for /sbin/loader handling in search_binary_handler() */
#include <linux/a.out.h>
@@ -1245,8 +1242,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
read_unlock(&binfmt_lock);
if (retval != -ENOEXEC || bprm->mm == NULL) {
break;
-#ifdef CONFIG_KMOD
- }else{
+#ifdef CONFIG_MODULES
+ } else {
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
if (printable(bprm->buf[0]) &&
printable(bprm->buf[1]) &&
@@ -1389,7 +1386,7 @@ EXPORT_SYMBOL(set_binfmt);
* name into corename, which must have space for at least
* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
*/
-static int format_corename(char *corename, int nr_threads, long signr)
+static int format_corename(char *corename, long signr)
{
const char *pat_ptr = core_pattern;
int ispipe = (*pat_ptr == '|');
@@ -1496,8 +1493,7 @@ static int format_corename(char *corename, int nr_threads, long signr)
* If core_pattern does not include a %p (as is the default)
* and core_uses_pid is set, then .%pid will be appended to
* the filename. Do not do this for piped commands. */
- if (!ispipe && !pid_in_pattern
- && (core_uses_pid || nr_threads)) {
+ if (!ispipe && !pid_in_pattern && core_uses_pid) {
rc = snprintf(out_ptr, out_end - out_ptr,
".%d", task_tgid_vnr(current));
if (rc > out_end - out_ptr)
@@ -1760,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
* uses lock_kernel()
*/
lock_kernel();
- ispipe = format_corename(corename, retval, signr);
+ ispipe = format_corename(corename, signr);
unlock_kernel();
/*
* Don't bother to check the RLIMIT_CORE value if core_pattern points
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
new file mode 100644
index 00000000000..14a6780fd03
--- /dev/null
+++ b/fs/ext2/Kconfig
@@ -0,0 +1,55 @@
+config EXT2_FS
+ tristate "Second extended fs support"
+ help
+ Ext2 is a standard Linux file system for hard disks.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called ext2.
+
+ If unsure, say Y.
+
+config EXT2_FS_XATTR
+ bool "Ext2 extended attributes"
+ depends on EXT2_FS
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config EXT2_FS_POSIX_ACL
+ bool "Ext2 POSIX Access Control Lists"
+ depends on EXT2_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the Posix ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config EXT2_FS_SECURITY
+ bool "Ext2 Security Labels"
+ depends on EXT2_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext2 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
+config EXT2_FS_XIP
+ bool "Ext2 execute in place support"
+ depends on EXT2_FS && MMU
+ help
+ Execute in place can be used on memory-backed block devices. If you
+ enable this option, you can select to mount block devices which are
+ capable of this feature without using the page cache.
+
+ If you do not use a block device that is capable of using this,
+ or if unsure, say N.
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
new file mode 100644
index 00000000000..8e0cfe44b0f
--- /dev/null
+++ b/fs/ext3/Kconfig
@@ -0,0 +1,67 @@
+config EXT3_FS
+ tristate "Ext3 journalling file system support"
+ select JBD
+ help
+ This is the journalling version of the Second extended file system
+ (often called ext3), the de facto standard Linux file system
+ (method to organize files on a storage device) for hard disks.
+
+ The journalling code included in this driver means you do not have
+ to run e2fsck (file system checker) on your file systems after a
+ crash. The journal keeps track of any changes that were being made
+ at the time the system crashed, and can ensure that your file system
+ is consistent without the need for a lengthy check.
+
+ Other than adding the journal to the file system, the on-disk format
+ of ext3 is identical to ext2. It is possible to freely switch
+ between using the ext3 driver and the ext2 driver, as long as the
+ file system has been cleanly unmounted, or e2fsck is run on the file
+ system.
+
+ To add a journal on an existing ext2 file system or change the
+ behavior of ext3 file systems, you can use the tune2fs utility ("man
+ tune2fs"). To modify attributes of files and directories on ext3
+ file systems, use chattr ("man chattr"). You need to be using
+ e2fsprogs version 1.20 or later in order to create ext3 journals
+ (available at <http://sourceforge.net/projects/e2fsprogs/>).
+
+ To compile this file system support as a module, choose M here: the
+ module will be called ext3.
+
+config EXT3_FS_XATTR
+ bool "Ext3 extended attributes"
+ depends on EXT3_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext3.
+
+config EXT3_FS_POSIX_ACL
+ bool "Ext3 POSIX Access Control Lists"
+ depends on EXT3_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the Posix ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config EXT3_FS_SECURITY
+ bool "Ext3 Security Labels"
+ depends on EXT3_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext3 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 92fd0338a6e..f5b57a2ca35 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1547,6 +1547,7 @@ retry_alloc:
* turn off reservation for this allocation
*/
if (my_rsv && (free_blocks < windowsz)
+ && (free_blocks > 0)
&& (rsv_is_empty(&my_rsv->rsv_window)))
my_rsv = NULL;
@@ -1585,7 +1586,7 @@ retry_alloc:
* free blocks is less than half of the reservation
* window size.
*/
- if (free_blocks <= (windowsz/2))
+ if (my_rsv && (free_blocks <= (windowsz/2)))
continue;
brelse(bitmap_bh);
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 2eea96ec78e..4c82531ea0a 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp,
int err;
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = 0;
+ int dir_has_error = 0;
sb = inode->i_sb;
@@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp,
* of recovering data when there's a bad sector
*/
if (!bh) {
- ext3_error (sb, "ext3_readdir",
- "directory #%lu contains a hole at offset %lu",
- inode->i_ino, (unsigned long)filp->f_pos);
+ if (!dir_has_error) {
+ ext3_error(sb, __func__, "directory #%lu "
+ "contains a hole at offset %lld",
+ inode->i_ino, filp->f_pos);
+ dir_has_error = 1;
+ }
/* corrupt size? Maybe no more blocks to read */
if (filp->f_pos > inode->i_blocks << 9)
break;
@@ -410,7 +414,7 @@ static int call_filldir(struct file * filp, void * dirent,
get_dtype(sb, fname->file_type));
if (error) {
filp->f_pos = curr_pos;
- info->extra_fname = fname->next;
+ info->extra_fname = fname;
return error;
}
fname = fname->next;
@@ -449,11 +453,21 @@ static int ext3_dx_readdir(struct file * filp,
* If there are any leftover names on the hash collision
* chain, return them first.
*/
- if (info->extra_fname &&
- call_filldir(filp, dirent, filldir, info->extra_fname))
- goto finished;
+ if (info->extra_fname) {
+ if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ goto finished;
- if (!info->curr_node)
+ info->extra_fname = NULL;
+ info->curr_node = rb_next(info->curr_node);
+ if (!info->curr_node) {
+ if (info->next_hash == ~0) {
+ filp->f_pos = EXT3_HTREE_EOF;
+ goto finished;
+ }
+ info->curr_hash = info->next_hash;
+ info->curr_minor_hash = 0;
+ }
+ } else if (!info->curr_node)
info->curr_node = rb_first(&info->root);
while (1) {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ebfec4d0148..f8424ad8997 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1186,6 +1186,13 @@ write_begin_failed:
ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
+ /*
+ * block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
}
if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 77278e947e9..78fdf383637 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
if (reserved_gdb || gdb_off == 0) {
if (!EXT3_HAS_COMPAT_FEATURE(sb,
- EXT3_FEATURE_COMPAT_RESIZE_INODE)){
+ EXT3_FEATURE_COMPAT_RESIZE_INODE)
+ || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
ext3_warning(sb, __func__,
"No reserved GDT blocks, can't resize");
return -EPERM;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 399a96a6c55..3a260af5544 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
seq_puts(seq, ",data=writeback");
+ if (test_opt(sb, DATA_ERR_ABORT))
+ seq_puts(seq, ",data_err=abort");
+
ext3_show_quota_options(seq, sb);
return 0;
@@ -754,6 +757,7 @@ enum {
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -796,6 +800,8 @@ static const match_table_t tokens = {
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_data_err_abort, "data_err=abort"},
+ {Opt_data_err_ignore, "data_err=ignore"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -1011,6 +1017,12 @@ static int parse_options (char *options, struct super_block *sb,
sbi->s_mount_opt |= data_opt;
}
break;
+ case Opt_data_err_abort:
+ set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
+ case Opt_data_err_ignore:
+ clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JFS_BARRIER;
else
journal->j_flags &= ~JFS_BARRIER;
+ if (test_opt(sb, DATA_ERR_ABORT))
+ journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+ else
+ journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
spin_unlock(&journal->j_state_lock);
}
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
new file mode 100644
index 00000000000..7505482a08f
--- /dev/null
+++ b/fs/ext4/Kconfig
@@ -0,0 +1,79 @@
+config EXT4_FS
+ tristate "The Extended 4 (ext4) filesystem"
+ select JBD2
+ select CRC16
+ help
+ This is the next generation of the ext3 filesystem.
+
+ Unlike the change from ext2 filesystem to ext3 filesystem,
+ the on-disk format of ext4 is not forwards compatible with
+ ext3; it is based on extent maps and it supports 48-bit
+ physical block numbers. The ext4 filesystem also supports delayed
+ allocation, persistent preallocation, high resolution time stamps,
+ and a number of other features to improve performance and speed
+ up fsck time. For more information, please see the web pages at
+ http://ext4.wiki.kernel.org.
+
+ The ext4 filesystem will support mounting an ext3
+ filesystem; while there will be some performance gains from
+ the delayed allocation and inode table readahead, the best
+ performance gains will require enabling ext4 features in the
+ filesystem, or formating a new filesystem as an ext4
+ filesystem initially.
+
+ To compile this file system support as a module, choose M here. The
+ module will be called ext4.
+
+ If unsure, say N.
+
+config EXT4DEV_COMPAT
+ bool "Enable ext4dev compatibility"
+ depends on EXT4_FS
+ help
+ Starting with 2.6.28, the name of the ext4 filesystem was
+ renamed from ext4dev to ext4. Unfortunately there are some
+ legacy userspace programs (such as klibc's fstype) have
+ "ext4dev" hardcoded.
+
+ To enable backwards compatibility so that systems that are
+ still expecting to mount ext4 filesystems using ext4dev,
+ chose Y here. This feature will go away by 2.6.31, so
+ please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+ bool "Ext4 extended attributes"
+ depends on EXT4_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+ You need this for POSIX ACL support on ext4.
+
+config EXT4_FS_POSIX_ACL
+ bool "Ext4 POSIX Access Control Lists"
+ depends on EXT4_FS_XATTR
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config EXT4_FS_SECURITY
+ bool "Ext4 Security Labels"
+ depends on EXT4_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the ext4 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index bd2ece22882..b9821be709b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
/* this isn't the right place to decide whether block is metadata
* inode.c/extents.c knows better, but for safety ... */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
- ext4_should_journal_data(inode))
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ metadata = 1;
+
+ /* We need to make sure we don't reuse
+ * block released untill the transaction commit.
+ * writeback mode have weak data consistency so
+ * don't force data as metadata when freeing block
+ * for writeback mode.
+ */
+ if (metadata == 0 && !ext4_should_writeback_data(inode))
metadata = 1;
sb = inode->i_sb;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6690a41cdd9..4880cc3e672 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,7 +511,6 @@ do { \
/*
* Mount flags
*/
-#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */
#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 6a0b40d4326..445fde603df 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -99,9 +99,6 @@ struct ext4_sb_info {
struct inode *s_buddy_cache;
long s_blocks_reserved;
spinlock_t s_reserve_lock;
- struct list_head s_active_transaction;
- struct list_head s_closed_transaction;
- struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
unsigned short *s_mb_offsets, *s_mb_maxs;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9b4ec9decfd..8dbf6953845 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
+ long pages_skipped;
BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1;
while (index <= end) {
- /* XXX: optimize tail */
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ /*
+ * We can use PAGECACHE_TAG_DIRTY lookup here because
+ * even though we have cleared the dirty flag on the page
+ * We still keep the page in the radix tree with tag
+ * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
+ * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
+ * which is called via the below writepage callback.
+ */
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index,
+ (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
- break;
- index++;
-
+ pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
- if (!err)
+ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+ /*
+ * have successfully written the page
+ * without skipping the same
+ */
mpd->pages_written++;
/*
* In error case, we have to continue because
@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
- long to_write;
int ret;
if (!mpd->get_block)
@@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0;
mpd->retval = 0;
- to_write = wbc->nr_to_write;
-
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-
/*
* Handle last extent of pages
*/
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
- }
- wbc->nr_to_write = to_write - mpd->pages_written;
+ mpd->io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd->pages_written;
return ret;
}
@@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ pgoff_t index;
+ int range_whole = 0;
handle_t *handle = NULL;
- loff_t range_start = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
+ int no_nrwrite_index_update;
+ long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
- long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
/*
@@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
- if (!wbc->range_cyclic)
- /*
- * If range_cyclic is not set force range_cont
- * and save the old writeback_index
- */
- wbc->range_cont = 1;
-
- range_start = wbc->range_start;
- pages_skipped = wbc->pages_skipped;
+ if (wbc->range_cyclic)
+ index = mapping->writeback_index;
+ else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
mpd.inode = mapping->host;
-restart_loop:
- to_write = wbc->nr_to_write;
- while (!ret && to_write > 0) {
+ /*
+ * we don't want write_cache_pages to update
+ * nr_to_write and writeback_index
+ */
+ no_nrwrite_index_update = wbc->no_nrwrite_index_update;
+ wbc->no_nrwrite_index_update = 1;
+ pages_skipped = wbc->pages_skipped;
+
+ while (!ret && wbc->nr_to_write > 0) {
/*
* we insert one extent at a time. So we need
@@ -2422,48 +2436,53 @@ restart_loop:
dump_stack();
goto out_writepages;
}
- to_write -= wbc->nr_to_write;
-
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);
ext4_journal_stop(handle);
- if (mpd.retval == -ENOSPC)
+ if (mpd.retval == -ENOSPC) {
+ /* commit the transaction which would
+ * free blocks released in the transaction
+ * and try again
+ */
jbd2_journal_force_commit_nested(sbi->s_journal);
-
- /* reset the retry count */
- if (ret == MPAGE_DA_EXTENT_TAIL) {
+ wbc->pages_skipped = pages_skipped;
+ ret = 0;
+ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
- to_write += wbc->nr_to_write;
+ pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
ret = 0;
- } else if (wbc->nr_to_write) {
+ } else if (wbc->nr_to_write)
/*
* There is no more writeout needed
* or we requested for a noblocking writeout
* and we found the device congested
*/
- to_write += wbc->nr_to_write;
break;
- }
- wbc->nr_to_write = to_write;
- }
-
- if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
- /* We skipped pages in this loop */
- wbc->range_start = range_start;
- wbc->nr_to_write = to_write +
- wbc->pages_skipped - pages_skipped;
- wbc->pages_skipped = pages_skipped;
- goto restart_loop;
}
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
+
+ /* Update index */
+ index += pages_written;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
+ * mode will write it back later
+ */
+ mapping->writeback_index = index;
out_writepages:
- wbc->nr_to_write = to_write - nr_to_writebump;
- wbc->range_start = range_start;
+ if (!no_nrwrite_index_update)
+ wbc->no_nrwrite_index_update = 0;
+ wbc->nr_to_write -= nr_to_writebump;
return ret;
}
@@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct inode *inode = &(ei->vfs_inode);
u64 i_blocks = inode->i_blocks;
struct super_block *sb = inode->i_sb;
- int err = 0;
if (i_blocks <= ~0U) {
/*
@@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
- } else if (i_blocks <= 0xffffffffffffULL) {
+ return 0;
+ }
+ if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ return -EFBIG;
+
+ if (i_blocks <= 0xffffffffffffULL) {
/*
* i_blocks can be represented in a 48 bit variable
* as multiple of 512 bytes
*/
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
- /* i_block is stored in the split 48 bit fields */
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
} else {
- /*
- * i_blocks should be represented in a 48 bit variable
- * as multiple of file system block size
- */
- err = ext4_update_rocompat_feature(handle, sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
- if (err)
- goto err_out;
ei->i_flags |= EXT4_HUGE_FILE_FL;
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
}
-err_out:
- return err;
+ return 0;
}
/*
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b580714f0d8..dfe17a13405 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;;
#ifdef DOUBLE_CHECK
{
@@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
}
spin_lock_init(&sbi->s_md_lock);
- INIT_LIST_HEAD(&sbi->s_active_transaction);
- INIT_LIST_HEAD(&sbi->s_closed_transaction);
- INIT_LIST_HEAD(&sbi->s_committed_transaction);
spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
@@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
+ sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
list_del(&pa->pa_group_list);
count++;
- kfree(pa);
+ kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
mb_debug("mballoc: %u PAs left\n", count);
@@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- /* release freed, non-committed blocks */
- spin_lock(&sbi->s_md_lock);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_committed_transaction);
- spin_unlock(&sbi->s_md_lock);
- ext4_mb_free_committed_blocks(sb);
-
if (sbi->s_group_info) {
for (i = 0; i < sbi->s_groups_count; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb)
return 0;
}
-static noinline_for_stack void
-ext4_mb_free_committed_blocks(struct super_block *sb)
+/*
+ * This function is called by the jbd2 layer once the commit has finished,
+ * so we know we can free the blocks that were released with that commit.
+ */
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err;
- int i;
- int count = 0;
- int count2 = 0;
- struct ext4_free_metadata *md;
+ struct super_block *sb = journal->j_private;
struct ext4_buddy e4b;
+ struct ext4_group_info *db;
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;
+ ext4_fsblk_t discard_block;
+ struct list_head *l, *ltmp;
- if (list_empty(&sbi->s_committed_transaction))
- return;
-
- /* there is committed blocks to be freed yet */
- do {
- /* get next array of blocks */
- md = NULL;
- spin_lock(&sbi->s_md_lock);
- if (!list_empty(&sbi->s_committed_transaction)) {
- md = list_entry(sbi->s_committed_transaction.next,
- struct ext4_free_metadata, list);
- list_del(&md->list);
- }
- spin_unlock(&sbi->s_md_lock);
-
- if (md == NULL)
- break;
+ list_for_each_safe(l, ltmp, &txn->t_private_list) {
+ entry = list_entry(l, struct ext4_free_data, list);
mb_debug("gonna free %u blocks in group %lu (0x%p):",
- md->num, md->group, md);
+ entry->count, entry->group, entry);
- err = ext4_mb_load_buddy(sb, md->group, &e4b);
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
+ db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
- count += md->num;
+ count += entry->count;
count2++;
- ext4_lock_group(sb, md->group);
- for (i = 0; i < md->num; i++) {
- mb_debug(" %u", md->blocks[i]);
- mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
+ ext4_lock_group(sb, entry->group);
+ /* Take it out of per group rb tree */
+ rb_erase(&entry->node, &(db->bb_free_root));
+ mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
+
+ if (!db->bb_free_root.rb_node) {
+ /* No more items in the per group rb tree
+ * balance refcounts from ext4_mb_free_metadata()
+ */
+ page_cache_release(e4b.bd_buddy_page);
+ page_cache_release(e4b.bd_bitmap_page);
}
- mb_debug("\n");
- ext4_unlock_group(sb, md->group);
-
- /* balance refcounts from ext4_mb_free_metadata() */
- page_cache_release(e4b.bd_buddy_page);
- page_cache_release(e4b.bd_bitmap_page);
-
- kfree(md);
+ ext4_unlock_group(sb, entry->group);
+ discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+ + entry->start_blk
+ + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+ trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
+ (unsigned long long) discard_block, entry->count);
+ sb_issue_discard(sb, discard_block, entry->count);
+
+ kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
-
- } while (md);
+ }
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
@@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
{
+#ifdef CONFIG_PROC_FS
mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct proc_dir_entry *proc;
@@ -2735,10 +2723,14 @@ err_out:
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
return -ENOMEM;
+#else
+ return 0;
+#endif
}
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
{
+#ifdef CONFIG_PROC_FS
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_proc == NULL)
@@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-
+#endif
return 0;
}
@@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
+
+ ext4_free_ext_cachep =
+ kmem_cache_create("ext4_free_block_extents",
+ sizeof(struct ext4_free_data),
+ 0, SLAB_RECLAIM_ACCOUNT, NULL);
+ if (ext4_free_ext_cachep == NULL) {
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ return -ENOMEM;
+ }
return 0;
}
@@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
}
@@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out1;
}
- ext4_mb_poll_new_transaction(sb, handle);
-
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
@@ -4384,35 +4385,20 @@ out1:
return block;
}
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
- handle_t *handle)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_last_transaction == handle->h_transaction->t_tid)
- return;
-
- /* new transaction! time to close last one and free blocks for
- * committed transaction. we know that only transaction can be
- * active, so previos transaction can be being logged and we
- * know that transaction before previous is known to be already
- * logged. this means that now we may free blocks freed in all
- * transactions before previous one. hope I'm clear enough ... */
- spin_lock(&sbi->s_md_lock);
- if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
- mb_debug("new transaction %lu, old %lu\n",
- (unsigned long) handle->h_transaction->t_tid,
- (unsigned long) sbi->s_last_transaction);
- list_splice_init(&sbi->s_closed_transaction,
- &sbi->s_committed_transaction);
- list_splice_init(&sbi->s_active_transaction,
- &sbi->s_closed_transaction);
- sbi->s_last_transaction = handle->h_transaction->t_tid;
- }
- spin_unlock(&sbi->s_md_lock);
-
- ext4_mb_free_committed_blocks(sb);
+/*
+ * We can merge two free data extents only if the physical blocks
+ * are contiguous, AND the extents were freed by the same transaction,
+ * AND the blocks are associated with the same group.
+ */
+static int can_merge(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ if ((entry1->t_tid == entry2->t_tid) &&
+ (entry1->group == entry2->group) &&
+ ((entry1->start_blk + entry1->count) == entry2->start_blk))
+ return 1;
+ return 0;
}
static noinline_for_stack int
@@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_free_metadata *md;
- int i;
+ struct ext4_free_data *entry, *new_entry;
+ struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_node *parent = NULL, *new_node;
+
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
+ new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+ new_entry->start_blk = block;
+ new_entry->group = group;
+ new_entry->count = count;
+ new_entry->t_tid = handle->h_transaction->t_tid;
+ new_node = &new_entry->node;
+
ext4_lock_group(sb, group);
- for (i = 0; i < count; i++) {
- md = db->bb_md_cur;
- if (md && db->bb_tid != handle->h_transaction->t_tid) {
- db->bb_md_cur = NULL;
- md = NULL;
+ if (!*n) {
+ /* first free block exent. We need to
+ protect buddy cache from being freed,
+ * otherwise we'll refresh it from
+ * on-disk bitmap and lose not-yet-available
+ * blocks */
+ page_cache_get(e4b->bd_buddy_page);
+ page_cache_get(e4b->bd_bitmap_page);
+ }
+ while (*n) {
+ parent = *n;
+ entry = rb_entry(parent, struct ext4_free_data, node);
+ if (block < entry->start_blk)
+ n = &(*n)->rb_left;
+ else if (block >= (entry->start_blk + entry->count))
+ n = &(*n)->rb_right;
+ else {
+ ext4_error(sb, __func__,
+ "Double free of blocks %d (%d %d)\n",
+ block, entry->start_blk, entry->count);
+ return 0;
}
+ }
- if (md == NULL) {
- ext4_unlock_group(sb, group);
- md = kmalloc(sizeof(*md), GFP_NOFS);
- if (md == NULL)
- return -ENOMEM;
- md->num = 0;
- md->group = group;
-
- ext4_lock_group(sb, group);
- if (db->bb_md_cur == NULL) {
- spin_lock(&sbi->s_md_lock);
- list_add(&md->list, &sbi->s_active_transaction);
- spin_unlock(&sbi->s_md_lock);
- /* protect buddy cache from being freed,
- * otherwise we'll refresh it from
- * on-disk bitmap and lose not-yet-available
- * blocks */
- page_cache_get(e4b->bd_buddy_page);
- page_cache_get(e4b->bd_bitmap_page);
- db->bb_md_cur = md;
- db->bb_tid = handle->h_transaction->t_tid;
- mb_debug("new md 0x%p for group %lu\n",
- md, md->group);
- } else {
- kfree(md);
- md = db->bb_md_cur;
- }
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, &db->bb_free_root);
+
+ /* Now try to see the extent can be merged to left and right */
+ node = rb_prev(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(entry, new_entry)) {
+ new_entry->start_blk = entry->start_blk;
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
+ }
- BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
- md->blocks[md->num] = block + i;
- md->num++;
- if (md->num == EXT4_BB_MAX_BLOCKS) {
- /* no more space, put full container on a sb's list */
- db->bb_md_cur = NULL;
+ node = rb_next(new_node);
+ if (node) {
+ entry = rb_entry(node, struct ext4_free_data, node);
+ if (can_merge(new_entry, entry)) {
+ new_entry->count += entry->count;
+ rb_erase(node, &(db->bb_free_root));
+ spin_lock(&sbi->s_md_lock);
+ list_del(&entry->list);
+ spin_unlock(&sbi->s_md_lock);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
+ /* Add the extent to transaction's private list */
+ spin_lock(&sbi->s_md_lock);
+ list_add(&new_entry->list, &handle->h_transaction->t_private_list);
+ spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
@@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
*freed = 0;
- ext4_mb_poll_new_transaction(sb, handle);
-
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b3b4828f8b8..b5dff1fff1e 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "group.h"
@@ -98,23 +100,29 @@
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
-#ifdef EXT4_BB_MAX_BLOCKS
-#undef EXT4_BB_MAX_BLOCKS
-#endif
-#define EXT4_BB_MAX_BLOCKS 30
+struct ext4_free_data {
+ /* this links the free block information from group_info */
+ struct rb_node node;
-struct ext4_free_metadata {
- ext4_group_t group;
- unsigned short num;
- ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ /* this links the free block information from ext4_sb_info */
struct list_head list;
+
+ /* group which free block extent belongs */
+ ext4_group_t group;
+
+ /* free block extent */
+ ext4_grpblk_t start_blk;
+ ext4_grpblk_t count;
+
+ /* transaction which freed this extent */
+ tid_t t_tid;
};
struct ext4_group_info {
unsigned long bb_state;
- unsigned long bb_tid;
- struct ext4_free_metadata *bb_md_cur;
+ struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;
@@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
-static void ext4_mb_free_committed_blocks(struct super_block *);
static void ext4_mb_return_to_preallocation(struct inode *inode,
struct ext4_buddy *e4b, sector_t block,
int count);
@@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
struct super_block *, struct ext4_prealloc_space *pa);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dea8f13c2fd..9b2b2bc4ec1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
-int ext4_update_compat_feature(handle_t *handle,
- struct super_block *sb, __u32 compat)
-{
- int err = 0;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_COMPAT_FEATURE(sb, compat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_rocompat_feature(handle_t *handle,
- struct super_block *sb, __u32 rocompat)
-{
- int err = 0;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
-int ext4_update_incompat_feature(handle_t *handle,
- struct super_block *sb, __u32 incompat)
-{
- int err = 0;
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) {
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- return err;
- EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
- sb->s_dirt = 1;
- handle->h_sync = 1;
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
- "call ext4_journal_dirty_met adata");
- err = ext4_journal_dirty_metadata(handle,
- EXT4_SB(sb)->s_sbh);
- }
- return err;
-}
-
/*
* Open the external journal device
*/
@@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
- Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
@@ -915,7 +855,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_inode_readahead_blks
};
@@ -933,8 +873,6 @@ static const match_table_t tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_nouid32, "nouid32"},
- {Opt_nocheck, "nocheck"},
- {Opt_nocheck, "check=none"},
{Opt_debug, "debug"},
{Opt_oldalloc, "oldalloc"},
{Opt_orlov, "orlov"},
@@ -973,8 +911,6 @@ static const match_table_t tokens = {
{Opt_extents, "extents"},
{Opt_noextents, "noextents"},
{Opt_i_version, "i_version"},
- {Opt_mballoc, "mballoc"},
- {Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
@@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_nouid32:
set_opt(sbi->s_mount_opt, NO_UID32);
break;
- case Opt_nocheck:
- clear_opt(sbi->s_mount_opt, CHECK);
- break;
case Opt_debug:
set_opt(sbi->s_mount_opt, DEBUG);
break;
@@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb)
if (block_bitmap < first_block || block_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Block bitmap for group %lu not in group "
- "(block %llu)!", i, block_bitmap);
+ "(block %llu)!\n", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
if (inode_bitmap < first_block || inode_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode bitmap for group %lu not in group "
- "(block %llu)!", i, inode_bitmap);
+ "(block %llu)!\n", i, inode_bitmap);
return 0;
}
inode_table = ext4_inode_table(sb, gdp);
@@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb)
inode_table + sbi->s_itb_per_group - 1 > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode table for group %lu not in group "
- "(block %llu)!", i, inode_table);
+ "(block %llu)!\n", i, inode_table);
return 0;
}
spin_lock(sb_bgl_lock(sbi, i));
@@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
*
* Note, this does *not* consider any metadata overhead for vfs i_blocks.
*/
-static loff_t ext4_max_size(int blkbits)
+static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
loff_t res;
loff_t upper_limit = MAX_LFS_FILESIZE;
/* small i_blocks in vfs inode? */
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
* CONFIG_LSF is not enabled implies the inode
* i_block represent total blocks in 512 bytes
@@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
* block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
* We need to be 1 filesystem block less than the 2^48 sector limit.
*/
-static loff_t ext4_max_bitmap_size(int bits)
+static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
loff_t res = EXT4_NDIR_BLOCKS;
int meta_blocks;
@@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
* total number of 512 bytes blocks of the file
*/
- if (sizeof(blkcnt_t) < sizeof(u64)) {
+ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * CONFIG_LSF is not enabled implies the inode
- * i_block represent total blocks in 512 bytes
- * 32 == size of vfs inode i_blocks * 8
+ * !has_huge_files or CONFIG_LSF is not enabled
+ * implies the inode i_block represent total blocks in
+ * 512 bytes 32 == size of vfs inode i_blocks * 8
*/
upper_limit = (1LL << 32) - 1;
@@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int blocksize;
int db_count;
int i;
- int needs_recovery;
+ int needs_recovery, has_huge_files;
__le32 features;
__u64 blocks_count;
int err;
@@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ if (has_huge_files) {
/*
* Large file size enabled file system can only be
* mount if kernel is build with CONFIG_LSF
@@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
- sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
+ sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ has_huge_files);
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
@@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available.\n");
}
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
+ ext4_ext_init(sb);
+ err = ext4_mb_init(sb, needs_recovery);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+ err);
+ goto failed_mount4;
+ }
+
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
- printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
- "requested data journaling mode\n");
- clear_opt(sbi->s_mount_opt, DELALLOC);
- } else if (test_opt(sb, DELALLOC))
- printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
-
- ext4_ext_init(sb);
- err = ext4_mb_init(sb, needs_recovery);
- if (err) {
- printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
- err);
- goto failed_mount4;
- }
-
lock_kernel();
return 0;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 2bada6bbc31..34930a964b8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -101,6 +101,8 @@ void fuse_finish_open(struct inode *inode, struct file *file,
file->f_op = &fuse_direct_io_file_operations;
if (!(outarg->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
+ if (outarg->open_flags & FOPEN_NONSEEKABLE)
+ nonseekable_open(inode, file);
ff->fh = outarg->fh;
file->private_data = fuse_file_get(ff);
}
@@ -1448,6 +1450,9 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
mutex_lock(&inode->i_mutex);
switch (origin) {
case SEEK_END:
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (retval)
+ return retval;
offset += i_size_read(inode);
break;
case SEEK_CUR:
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 3a876076bdd..35accfdd747 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -6,6 +6,9 @@
See the file COPYING.
*/
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
#include <linux/fuse.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -655,3 +658,5 @@ void fuse_set_nowrite(struct inode *inode);
void fuse_release_nowrite(struct inode *inode);
u64 fuse_get_attr_version(struct fuse_conn *fc);
+
+#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6a84388cacf..54b1f0e1ef5 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -865,7 +865,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (is_bdev) {
fc->destroy_req = fuse_request_alloc();
if (!fc->destroy_req)
- goto err_put_root;
+ goto err_free_init_req;
}
mutex_lock(&fuse_mutex);
@@ -895,6 +895,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
err_unlock:
mutex_unlock(&fuse_mutex);
+ err_free_init_req:
fuse_request_free(init_req);
err_put_root:
dput(root_dentry);
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fec8f61227f..0022eec63cd 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -199,6 +199,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
goto done;
}
+ if (inode->i_ino == HFSPLUS_EXT_CNID)
+ return -EIO;
+
mutex_lock(&HFSPLUS_I(inode).extents_lock);
res = hfsplus_ext_read_extent(inode, ablock);
if (!res) {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b085d64a2b6..963be644297 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -254,6 +254,8 @@ static int hfsplus_file_open(struct inode *inode, struct file *file)
{
if (HFSPLUS_IS_RSRC(inode))
inode = HFSPLUS_I(inode).rsrc_inode;
+ if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+ return -EOVERFLOW;
atomic_inc(&HFSPLUS_I(inode).opencnt);
return 0;
}
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
new file mode 100644
index 00000000000..4e28beeed15
--- /dev/null
+++ b/fs/jbd/Kconfig
@@ -0,0 +1,30 @@
+config JBD
+ tristate
+ help
+ This is a generic journalling layer for block devices. It is
+ currently used by the ext3 file system, but it could also be
+ used to add journal support to other file systems or block
+ devices such as RAID or LVM.
+
+ If you are using the ext3 file system, you need to say Y here.
+ If you are not using ext3 then you will probably want to say N.
+
+ To compile this device as a module, choose M here: the module will be
+ called jbd. If you are compiling ext3 into the kernel, you
+ cannot compile this code as a module.
+
+config JBD_DEBUG
+ bool "JBD (ext3) debugging support"
+ depends on JBD && DEBUG_FS
+ help
+ If you are using the ext3 journaled file system (or potentially any
+ other file system/device using JBD), this option allows you to
+ enable debugging output while the system is running, in order to
+ help track down any problems you are having. By default the
+ debugging output will be turned off.
+
+ If you select Y here, then you will be able to turn on debugging
+ with "echo N > /sys/kernel/debug/jbd/jbd-debug", where N is a
+ number between 1 and 5, the higher the number, the more debugging
+ output is generated. To turn debugging off again, do
+ "echo 0 > /sys/kernel/debug/jbd/jbd-debug".
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ae08c057e75..25719d902c5 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal)
printk(KERN_WARNING
"JBD: Detected IO errors while flushing file data "
"on %s\n", bdevname(journal->j_fs_dev, b));
+ if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+ journal_abort(journal, err);
err = 0;
}
@@ -518,9 +520,10 @@ void journal_commit_transaction(journal_t *journal)
jh = commit_transaction->t_buffers;
/* If we're in abort mode, we just un-journal the buffer and
- release it for background writing. */
+ release it. */
if (is_journal_aborted(journal)) {
+ clear_buffer_jbddirty(jh2bh(jh));
JBUFFER_TRACE(jh, "journal is aborting: refile");
journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
@@ -762,6 +765,9 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
+ if (err)
+ journal_abort(journal, err);
+
jbd_debug(3, "JBD: commit phase 6\n");
if (journal_write_commit_record(journal, commit_transaction))
@@ -852,6 +858,8 @@ restart_loop:
if (buffer_jbddirty(bh)) {
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__journal_insert_checkpoint(jh, commit_transaction);
+ if (is_journal_aborted(journal))
+ clear_buffer_jbddirty(bh);
JBUFFER_TRACE(jh, "refile for checkpoint writeback");
__journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 0540ca27a44..d15cd6e7251 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
journal_t *journal = handle->h_transaction->t_journal;
int need_brelse = 0;
struct journal_head *jh;
+ int ret = 0;
if (is_handle_aborted(handle))
- return 0;
+ return ret;
jh = journal_add_journal_head(bh);
JBUFFER_TRACE(jh, "entry");
@@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
time if it is redirtied */
}
- /* journal_clean_data_list() may have got there first */
+ /*
+ * We cannot remove the buffer with io error from the
+ * committing transaction, because otherwise it would
+ * miss the error and the commit would not abort.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ ret = -EIO;
+ goto no_journal;
+ }
+
if (jh->b_transaction != NULL) {
JBUFFER_TRACE(jh, "unfile from commit");
__journal_temp_unlink_buffer(jh);
@@ -1108,7 +1118,7 @@ no_journal:
}
JBUFFER_TRACE(jh, "exit");
journal_put_journal_head(jh);
- return 0;
+ return ret;
}
/**
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
new file mode 100644
index 00000000000..f32f346f4b0
--- /dev/null
+++ b/fs/jbd2/Kconfig
@@ -0,0 +1,33 @@
+config JBD2
+ tristate
+ select CRC32
+ help
+ This is a generic journaling layer for block devices that support
+ both 32-bit and 64-bit block numbers. It is currently used by
+ the ext4 and OCFS2 filesystems, but it could also be used to add
+ journal support to other file systems or block devices such
+ as RAID or LVM.
+
+ If you are using ext4 or OCFS2, you need to say Y here.
+ If you are not using ext4 or OCFS2 then you will
+ probably want to say N.
+
+ To compile this device as a module, choose M here. The module will be
+ called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
+ you cannot compile this code as a module.
+
+config JBD2_DEBUG
+ bool "JBD2 (ext4) debugging support"
+ depends on JBD2 && DEBUG_FS
+ help
+ If you are using the ext4 journaled file system (or
+ potentially any other filesystem/device using JBD2), this option
+ allows you to enable debugging output while the system is running,
+ in order to help track down any problems you are having.
+ By default, the debugging output will be turned off.
+
+ If you select Y here, then you will be able to turn on debugging
+ with "echo N > /sys/kernel/debug/jbd2/jbd2-debug", where N is a
+ number between 1 and 5. The higher the number, the more debugging
+ output is generated. To turn debugging off again, do
+ "echo 0 > /sys/kernel/debug/jbd2/jbd2-debug".
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0abe02c4242..8b119e16aa3 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -995,6 +995,9 @@ restart_loop:
}
spin_unlock(&journal->j_list_lock);
+ if (journal->j_commit_callback)
+ journal->j_commit_callback(journal, commit_transaction);
+
trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
journal->j_devname, commit_transaction->t_tid,
journal->j_tail_sequence);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5d540588fa..39b7805a599 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
INIT_LIST_HEAD(&transaction->t_inode_list);
+ INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
new file mode 100644
index 00000000000..6ae169cd8fa
--- /dev/null
+++ b/fs/jffs2/Kconfig
@@ -0,0 +1,188 @@
+config JFFS2_FS
+ tristate "Journalling Flash File System v2 (JFFS2) support"
+ select CRC32
+ depends on MTD
+ help
+ JFFS2 is the second generation of the Journalling Flash File System
+ for use on diskless embedded devices. It provides improved wear
+ levelling, compression and support for hard links. You cannot use
+ this on normal block devices, only on 'MTD' devices.
+
+ Further information on the design and implementation of JFFS2 is
+ available at <http://sources.redhat.com/jffs2/>.
+
+config JFFS2_FS_DEBUG
+ int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
+ depends on JFFS2_FS
+ default "0"
+ help
+ This controls the amount of debugging messages produced by the JFFS2
+ code. Set it to zero for use in production systems. For evaluation,
+ testing and debugging, it's advisable to set it to one. This will
+ enable a few assertions and will print debugging messages at the
+ KERN_DEBUG loglevel, where they won't normally be visible. Level 2
+ is unlikely to be useful - it enables extra debugging in certain
+ areas which at one point needed debugging, but when the bugs were
+ located and fixed, the detailed messages were relegated to level 2.
+
+ If reporting bugs, please try to have available a full dump of the
+ messages at debug level 1 while the misbehaviour was occurring.
+
+config JFFS2_FS_WRITEBUFFER
+ bool "JFFS2 write-buffering support"
+ depends on JFFS2_FS
+ default y
+ help
+ This enables the write-buffering support in JFFS2.
+
+ This functionality is required to support JFFS2 on the following
+ types of flash devices:
+ - NAND flash
+ - NOR flash with transparent ECC
+ - DataFlash
+
+config JFFS2_FS_WBUF_VERIFY
+ bool "Verify JFFS2 write-buffer reads"
+ depends on JFFS2_FS_WRITEBUFFER
+ default n
+ help
+ This causes JFFS2 to read back every page written through the
+ write-buffer, and check for errors.
+
+config JFFS2_SUMMARY
+ bool "JFFS2 summary support (EXPERIMENTAL)"
+ depends on JFFS2_FS && EXPERIMENTAL
+ default n
+ help
+ This feature makes it possible to use summary information
+ for faster filesystem mount.
+
+ The summary information can be inserted into a filesystem image
+ by the utility 'sumtool'.
+
+ If unsure, say 'N'.
+
+config JFFS2_FS_XATTR
+ bool "JFFS2 XATTR support (EXPERIMENTAL)"
+ depends on JFFS2_FS && EXPERIMENTAL
+ default n
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+ bool "JFFS2 POSIX Access Control Lists"
+ depends on JFFS2_FS_XATTR
+ default y
+ select FS_POSIX_ACL
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the Posix ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
+
+config JFFS2_FS_SECURITY
+ bool "JFFS2 Security Labels"
+ depends on JFFS2_FS_XATTR
+ default y
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the jffs2 filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
+config JFFS2_COMPRESSION_OPTIONS
+ bool "Advanced compression options for JFFS2"
+ depends on JFFS2_FS
+ default n
+ help
+ Enabling this option allows you to explicitly choose which
+ compression modules, if any, are enabled in JFFS2. Removing
+ compressors can mean you cannot read existing file systems,
+ and enabling experimental compressors can mean that you
+ write a file system which cannot be read by a standard kernel.
+
+ If unsure, you should _definitely_ say 'N'.
+
+config JFFS2_ZLIB
+ bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ depends on JFFS2_FS
+ default y
+ help
+ Zlib is designed to be a free, general-purpose, legally unencumbered,
+ lossless data-compression library for use on virtually any computer
+ hardware and operating system. See <http://www.gzip.org/zlib/> for
+ further information.
+
+ Say 'Y' if unsure.
+
+config JFFS2_LZO
+ bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ depends on JFFS2_FS
+ default n
+ help
+ minilzo-based compression. Generally works better than Zlib.
+
+ This feature was added in July, 2007. Say 'N' if you need
+ compatibility with older bootloaders or kernels.
+
+config JFFS2_RTIME
+ bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
+ depends on JFFS2_FS
+ default y
+ help
+ Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
+
+config JFFS2_RUBIN
+ bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
+ depends on JFFS2_FS
+ default n
+ help
+ RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
+
+choice
+ prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
+ default JFFS2_CMODE_PRIORITY
+ depends on JFFS2_FS
+ help
+ You can set here the default compression mode of JFFS2 from
+ the available compression modes. Don't touch if unsure.
+
+config JFFS2_CMODE_NONE
+ bool "no compression"
+ help
+ Uses no compression.
+
+config JFFS2_CMODE_PRIORITY
+ bool "priority"
+ help
+ Tries the compressors in a predefined order and chooses the first
+ successful one.
+
+config JFFS2_CMODE_SIZE
+ bool "size (EXPERIMENTAL)"
+ help
+ Tries all compressors and chooses the one which has the smallest
+ result.
+
+config JFFS2_CMODE_FAVOURLZO
+ bool "Favour LZO"
+ help
+ Tries all compressors and chooses the one which has the smallest
+ result but gives some preference to LZO (which has faster
+ decompression) at the expense of size.
+
+endchoice
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 86739ee53b3..f25e70c1b51 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
}
/* jffs2_compress:
- * @data: Pointer to uncompressed data
- * @cdata: Pointer to returned pointer to buffer for compressed data
+ * @data_in: Pointer to uncompressed data
+ * @cpage_out: Pointer to returned pointer to buffer for compressed data
* @datalen: On entry, holds the amount of data available for compression.
* On exit, expected to hold the amount of data actually compressed.
* @cdatalen: On entry, holds the amount of space available for compressed
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index cd219ef5525..b1aaae823a5 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -311,7 +311,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
/* FIXME: If you care. We'd need to use frags for the target
if it grows much more than this */
if (targetlen > 254)
- return -EINVAL;
+ return -ENAMETOOLONG;
ri = jffs2_alloc_raw_inode();
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dddb2a6c9e2..259461b910a 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
instr->len = c->sector_size;
instr->callback = jffs2_erase_callback;
instr->priv = (unsigned long)(&instr[1]);
- instr->fail_addr = 0xffffffff;
+ instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
((struct erase_priv_struct *)instr->priv)->jeb = jeb;
((struct erase_priv_struct *)instr->priv)->c = c;
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
{
/* For NAND, if the failure did not occur at the device level for a
specific physical page, don't bother updating the bad block table. */
- if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) {
+ if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
/* We had a device-level failure to erase. Let's see if we've
failed too many times. */
if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 086c4383022..249305d65d5 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = 0;
buf->f_ffree = 0;
buf->f_namelen = JFFS2_MAX_NAME_LEN;
+ buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC;
+ buf->f_fsid.val[1] = c->mtd->index;
spin_lock(&c->erase_completion_lock);
avail = c->dirty_size + c->free_size;
@@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
memset(ri, 0, sizeof(*ri));
/* Set OS-specific defaults for new inodes */
- ri->uid = cpu_to_je16(current->fsuid);
+ ri->uid = cpu_to_je16(current_fsuid());
if (dir_i->i_mode & S_ISGID) {
ri->gid = cpu_to_je16(dir_i->i_gid);
if (S_ISDIR(mode))
mode |= S_ISGID;
} else {
- ri->gid = cpu_to_je16(current->fsgid);
+ ri->gid = cpu_to_je16(current_fsgid());
}
/* POSIX ACLs have to be processed now, at least partly.
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index a9bf9603c1b..0875b60b4bf 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
jffs2_sum_reset_collected(c->summary); /* reset collected summary */
+ /* adjust write buffer offset, else we get a non contiguous write bug */
+ if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
+ c->wbuf_ofs = 0xffffffff;
+
D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
return 0;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 0e78b00035e..d9a721e6db7 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
memset(c->wbuf,0xff,c->wbuf_pagesize);
/* adjust write buffer offset, else we get a non contiguous write bug */
- if (SECTOR_ADDR(c->wbuf_ofs) == SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize))
- c->wbuf_ofs += c->wbuf_pagesize;
- else
- c->wbuf_ofs = 0xffffffff;
+ c->wbuf_ofs += c->wbuf_pagesize;
c->wbuf_len = 0;
return 0;
}
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 6a09760c596..c2e9cfd9e5a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -40,6 +40,16 @@ unsigned short nfs_callback_tcpport;
static const int nfs_set_port_min = 0;
static const int nfs_set_port_max = 65535;
+/*
+ * If the kernel has IPv6 support available, always listen for
+ * both AF_INET and AF_INET6 requests.
+ */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static const sa_family_t nfs_callback_family = AF_INET6;
+#else
+static const sa_family_t nfs_callback_family = AF_INET;
+#endif
+
static int param_set_port(const char *val, struct kernel_param *kp)
{
char *endp;
@@ -106,7 +116,7 @@ int nfs_callback_up(void)
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
- AF_INET, NULL);
+ nfs_callback_family, NULL);
ret = -ENOMEM;
if (!serv)
goto out_err;
@@ -116,7 +126,8 @@ int nfs_callback_up(void)
if (ret <= 0)
goto out_err;
nfs_callback_tcpport = ret;
- dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
+ dprintk("NFS: Callback listener port = %u (af %u)\n",
+ nfs_callback_tcpport, nfs_callback_family);
nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
if (IS_ERR(nfs_callback_info.rqst)) {
@@ -149,8 +160,8 @@ out:
mutex_unlock(&nfs_callback_mutex);
return ret;
out_err:
- dprintk("Couldn't create callback socket or server thread; err = %d\n",
- ret);
+ dprintk("NFS: Couldn't create callback socket or server thread; "
+ "err = %d\n", ret);
nfs_callback_info.users--;
goto out;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 5ee23e7058b..7547600b617 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server,
server->nfs_client = clp;
/* Initialise the client representation from the mount data */
- server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+ server->flags = data->flags;
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
@@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void)
INIT_LIST_HEAD(&server->client_link);
INIT_LIST_HEAD(&server->master_link);
- init_waitqueue_head(&server->active_wq);
atomic_set(&server->active, 0);
server->io_stats = nfs_alloc_iostats();
@@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server,
goto error;
/* Initialise the client representation from the mount data */
- server->flags = data->flags & NFS_MOUNT_FLAGMASK;
+ server->flags = data->flags;
server->caps |= NFS_CAP_ATOMIC_OPEN;
if (data->rsize)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 74f92b717f7..efdba2e802d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -156,6 +156,7 @@ typedef struct {
decode_dirent_t decode;
int plus;
unsigned long timestamp;
+ unsigned long gencount;
int timestamp_valid;
} nfs_readdir_descriptor_t;
@@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
struct file *file = desc->file;
struct inode *inode = file->f_path.dentry->d_inode;
struct rpc_cred *cred = nfs_file_cred(file);
- unsigned long timestamp;
+ unsigned long timestamp, gencount;
int error;
dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
@@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
again:
timestamp = jiffies;
+ gencount = nfs_inc_attr_generation_counter();
error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page,
NFS_SERVER(inode)->dtsize, desc->plus);
if (error < 0) {
@@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
goto error;
}
desc->timestamp = timestamp;
+ desc->gencount = gencount;
desc->timestamp_valid = 1;
SetPageUptodate(page);
/* Ensure consistent page alignment of the data.
@@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc)
if (IS_ERR(p))
return PTR_ERR(p);
desc->ptr = p;
- if (desc->timestamp_valid)
+ if (desc->timestamp_valid) {
desc->entry->fattr->time_start = desc->timestamp;
- else
+ desc->entry->fattr->gencount = desc->gencount;
+ } else
desc->entry->fattr->valid &= ~NFS_ATTR_FATTR;
return 0;
}
@@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
struct rpc_cred *cred = nfs_file_cred(file);
struct page *page = NULL;
int status;
- unsigned long timestamp;
+ unsigned long timestamp, gencount;
dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n",
(unsigned long long)*desc->dir_cookie);
@@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
goto out;
}
timestamp = jiffies;
+ gencount = nfs_inc_attr_generation_counter();
status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred,
*desc->dir_cookie, page,
NFS_SERVER(inode)->dtsize,
@@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */
if (status >= 0) {
desc->timestamp = timestamp;
+ desc->gencount = gencount;
desc->timestamp_valid = 1;
if ((status = dir_decode(desc)) == 0)
desc->entry->prev_cookie = *desc->dir_cookie;
@@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
*/
void nfs_force_lookup_revalidate(struct inode *dir)
{
- NFS_I(dir)->cache_change_attribute = jiffies;
+ NFS_I(dir)->cache_change_attribute++;
}
/*
@@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
{
if (IS_ROOT(dentry))
return 1;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+ return 0;
if (!nfs_verify_change_attribute(dir, dentry->d_time))
return 0;
/* Revalidate nfsi->cache_change_attribute before we declare a match */
@@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
/* Don't revalidate a negative dentry if we're creating a new file */
if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0)
return 0;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
+ return 1;
return !nfs_check_verifier(dir, dentry);
}
@@ -1507,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
GFP_KERNEL)) {
pagevec_add(&lru_pvec, page);
- pagevec_lru_add(&lru_pvec);
+ pagevec_lru_add_file(&lru_pvec);
SetPageUptodate(page);
unlock_page(page);
} else
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 78460657f5c..d319b49f8f0 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
struct inode *inode = filp->f_mapping->host;
+
int retval = nfs_revalidate_file_size(inode, filp);
if (retval < 0)
return (loff_t)retval;
- }
- lock_kernel(); /* BKL needed? */
- loff = generic_file_llseek_unlocked(filp, offset, origin);
- unlock_kernel();
+
+ spin_lock(&inode->i_lock);
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
+ spin_unlock(&inode->i_lock);
+ } else
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
return loff;
}
@@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags);
- /*
- * No BSD flocks over NFS allowed.
- * Note: we could try to fake a POSIX lock request here by
- * using ((u32) filp | 0x80000000) or some such as the pid.
- * Not sure whether that would be unique, though, or whether
- * that would break in other places.
- */
if (!(fl->fl_flags & FL_FLOCK))
return -ENOLCK;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 52daefa2f52..b9195c02a86 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
init_special_inode(inode, inode->i_mode, fattr->rdev);
nfsi->read_cache_jiffies = fattr->time_start;
- nfsi->last_updated = now;
- nfsi->cache_change_attribute = now;
+ nfsi->attr_gencount = fattr->gencount;
inode->i_atime = fattr->atime;
inode->i_mtime = fattr->mtime;
inode->i_ctime = fattr->ctime;
@@ -453,6 +452,7 @@ out_big:
void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
{
if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
+ spin_lock(&inode->i_lock);
if ((attr->ia_valid & ATTR_MODE) != 0) {
int mode = attr->ia_mode & S_IALLUGO;
mode |= inode->i_mode & ~S_IALLUGO;
@@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
inode->i_uid = attr->ia_uid;
if ((attr->ia_valid & ATTR_GID) != 0)
inode->i_gid = attr->ia_gid;
- spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
spin_unlock(&inode->i_lock);
}
@@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
}
}
-static int nfs_wait_schedule(void *word)
-{
- if (signal_pending(current))
- return -ERESTARTSYS;
- schedule();
- return 0;
-}
-
-/*
- * Wait for the inode to get unlocked.
- */
-static int nfs_wait_on_inode(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
- int error;
-
- error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
- nfs_wait_schedule, TASK_KILLABLE);
-
- return error;
-}
-
-static void nfs_wake_up_inode(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
-
- clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
-}
-
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
- nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
if (is_bad_inode(inode))
- goto out_nowait;
+ goto out;
if (NFS_STALE(inode))
- goto out_nowait;
-
- status = nfs_wait_on_inode(inode);
- if (status < 0)
goto out;
- status = -ESTALE;
if (NFS_STALE(inode))
goto out;
+ nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
if (status != 0) {
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
@@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
goto out;
}
- spin_lock(&inode->i_lock);
- status = nfs_update_inode(inode, &fattr);
+ status = nfs_refresh_inode(inode, &fattr);
if (status) {
- spin_unlock(&inode->i_lock);
dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
inode->i_sb->s_id,
(long long)NFS_FILEID(inode), status);
goto out;
}
- spin_unlock(&inode->i_lock);
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
nfs_zap_acl_cache(inode);
@@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
(long long)NFS_FILEID(inode));
out:
- nfs_wake_up_inode(inode);
-
- out_nowait:
return status;
}
@@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
return -EIO;
}
- /* Do atomic weak cache consistency updates */
- nfs_wcc_update_inode(inode, fattr);
-
if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
nfsi->change_attr != fattr->change_attr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
@@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if (invalid != 0)
nfsi->cache_validity |= invalid;
- else
- nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_ATIME
- | NFS_INO_REVAL_PAGECACHE);
nfsi->read_cache_jiffies = fattr->time_start;
return 0;
}
+static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+ return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
+}
+
+static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+ return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
+}
+
+static unsigned long nfs_attr_generation_counter;
+
+static unsigned long nfs_read_attr_generation_counter(void)
+{
+ smp_rmb();
+ return nfs_attr_generation_counter;
+}
+
+unsigned long nfs_inc_attr_generation_counter(void)
+{
+ unsigned long ret;
+ smp_rmb();
+ ret = ++nfs_attr_generation_counter;
+ smp_wmb();
+ return ret;
+}
+
+void nfs_fattr_init(struct nfs_fattr *fattr)
+{
+ fattr->valid = 0;
+ fattr->time_start = jiffies;
+ fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
+/**
+ * nfs_inode_attrs_need_update - check if the inode attributes need updating
+ * @inode - pointer to inode
+ * @fattr - attributes
+ *
+ * Attempt to divine whether or not an RPC call reply carrying stale
+ * attributes got scheduled after another call carrying updated ones.
+ *
+ * To do so, the function first assumes that a more recent ctime means
+ * that the attributes in fattr are newer, however it also attempt to
+ * catch the case where ctime either didn't change, or went backwards
+ * (if someone reset the clock on the server) by looking at whether
+ * or not this RPC call was started after the inode was last updated.
+ * Note also the check for wraparound of 'attr_gencount'
+ *
+ * The function returns 'true' if it thinks the attributes in 'fattr' are
+ * more recent than the ones cached in the inode.
+ *
+ */
+static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
+{
+ const struct nfs_inode *nfsi = NFS_I(inode);
+
+ return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
+ nfs_ctime_need_update(inode, fattr) ||
+ nfs_size_need_update(inode, fattr) ||
+ ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
+}
+
+static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+ if (nfs_inode_attrs_need_update(inode, fattr))
+ return nfs_update_inode(inode, fattr);
+ return nfs_check_inode_attributes(inode, fattr);
+}
+
/**
* nfs_refresh_inode - try to update the inode attribute cache
* @inode - pointer to inode
@@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
*/
int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
{
- struct nfs_inode *nfsi = NFS_I(inode);
int status;
if ((fattr->valid & NFS_ATTR_FATTR) == 0)
return 0;
spin_lock(&inode->i_lock);
- if (time_after(fattr->time_start, nfsi->last_updated))
- status = nfs_update_inode(inode, fattr);
- else
- status = nfs_check_inode_attributes(inode, fattr);
-
+ status = nfs_refresh_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
return status;
}
+static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if (S_ISDIR(inode->i_mode))
+ nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ return 0;
+ return nfs_refresh_inode_locked(inode, fattr);
+}
+
/**
* nfs_post_op_update_inode - try to update the inode attribute cache
* @inode - pointer to inode
@@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
- struct nfs_inode *nfsi = NFS_I(inode);
+ int status;
spin_lock(&inode->i_lock);
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- if (S_ISDIR(inode->i_mode))
- nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ status = nfs_post_op_update_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
- return nfs_refresh_inode(inode, fattr);
+ return status;
}
/**
@@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ /* Don't do a WCC update if these attributes are already stale */
+ if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
+ !nfs_inode_attrs_need_update(inode, fattr)) {
+ fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+ goto out_noforce;
+ }
if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
(fattr->valid & NFS_ATTR_WCC_V4) == 0) {
fattr->pre_change_attr = NFS_I(inode)->change_attr;
@@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
fattr->pre_size = i_size_read(inode);
fattr->valid |= NFS_ATTR_WCC;
}
- return nfs_post_op_update_inode(inode, fattr);
+out_noforce:
+ status = nfs_post_op_update_inode_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+ return status;
}
/*
@@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
/* If ctime has changed we should definitely clear access+acl caches */
if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
- invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
} else if (nfsi->change_attr != fattr->change_attr) {
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
@@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_gid != fattr->gid)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ if (inode->i_nlink != fattr->nlink)
+ invalid |= NFS_INO_INVALID_ATTR;
+
inode->i_mode = fattr->mode;
inode->i_nlink = fattr->nlink;
inode->i_uid = fattr->uid;
@@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
- nfsi->last_updated = now;
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
}
- /*
- * Avoid jiffy wraparound issues with nfsi->last_updated
- */
- if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now))
- nfsi->last_updated = nfsi->read_cache_jiffies;
}
invalid &= ~NFS_INO_INVALID_ATTR;
/* Don't invalidate the data if we were to blame */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 24241fcbb98..d212ee41caf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *);
void nfs_zap_acl_cache(struct inode *inode);
/* super.c */
+void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
extern struct file_system_type nfs_xdev_fs_type;
#ifdef CONFIG_NFS_V4
extern struct file_system_type nfs4_xdev_fs_type;
@@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat;
extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct nfs_server *server);
-extern void nfs_sb_deactive(struct nfs_server *server);
+extern void nfs_sb_active(struct super_block *sb);
+extern void nfs_sb_deactive(struct super_block *sb);
/* namespace.c */
extern char *nfs_path(const char *base,
@@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
PAGE_SIZE - 1) >> PAGE_SHIFT;
}
+#define IPV6_SCOPE_DELIMITER '%'
+
+/*
+ * Set the port number in an address. Be agnostic about the address
+ * family.
+ */
+static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
+{
+ struct sockaddr_in *ap = (struct sockaddr_in *)sap;
+ struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ ap->sin_port = htons(port);
+ break;
+ case AF_INET6:
+ ap6->sin6_port = htons(port);
+ break;
+ }
+}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 779d2eb649c..086a6830d78 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -14,6 +14,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/sched.h>
#include <linux/nfs_fs.h>
+#include "internal.h"
#ifdef RPC_DEBUG
# define NFSDBG_FACILITY NFSDBG_MOUNT
@@ -98,7 +99,7 @@ out_call_err:
out_mnt_err:
dprintk("NFS: MNT server returned result %d\n", result.status);
- status = -EACCES;
+ status = nfs_stat_to_errno(result.status);
goto out;
}
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 66df08dd1ca..64a288ee046 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
dprintk("--> nfs_follow_mountpoint()\n");
- BUG_ON(IS_ROOT(dentry));
+ err = -ESTALE;
+ if (IS_ROOT(dentry))
+ goto out_err;
+
dprintk("%s: enter\n", __func__);
dput(nd->path.dentry);
nd->path.dentry = dget(dentry);
@@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
struct nfs_clone_mount *mountdata)
{
#ifdef CONFIG_NFS_V4
- struct vfsmount *mnt = NULL;
+ struct vfsmount *mnt = ERR_PTR(-EINVAL);
switch (server->nfs_client->rpc_ops->version) {
case 2:
case 3:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 423842f51ac..cef62557c87 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
dprintk("NFS call getacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL];
+ nfs_fattr_init(&fattr);
status = rpc_call_sync(server->client_acl, &msg, 0);
dprintk("NFS reply getacl: %d\n", status);
@@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
dprintk("NFS call setacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
+ nfs_fattr_init(&fattr);
status = rpc_call_sync(server->client_acl, &msg, 0);
nfs_access_zap_cache(inode);
nfs_zap_acl_cache(inode);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 1e750e4574a..c55be7a7679 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
}
static int
-nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle,
struct nfs_fsinfo *info)
{
struct rpc_message msg = {
@@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
dprintk("NFS call fsinfo\n");
nfs_fattr_init(info->fattr);
- status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ status = rpc_call_sync(client, &msg, 0);
dprintk("NFS reply fsinfo: %d\n", status);
return status;
}
+/*
+ * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via
+ * nfs_create_server
+ */
+static int
+nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
+ struct nfs_fsinfo *info)
+{
+ int status;
+
+ status = do_proc_fsinfo(server->client, fhandle, info);
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info);
+ return status;
+}
+
static int
nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
struct nfs_pathconf *info)
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index b112857301f..30befc39b3c 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
return 0;
}
-/*
- * Check if the string represents a "valid" IPv4 address
- */
-static inline int valid_ipaddr4(const char *buf)
+static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
+ char *page, char *page2,
+ const struct nfs4_fs_location *location)
{
- int rc, count, in[4];
-
- rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
- if (rc != 4)
- return -EINVAL;
- for (count = 0; count < 4; count++) {
- if (in[count] > 255)
- return -EINVAL;
+ struct vfsmount *mnt = ERR_PTR(-ENOENT);
+ char *mnt_path;
+ int page2len;
+ unsigned int s;
+
+ mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
+ if (IS_ERR(mnt_path))
+ return mnt;
+ mountdata->mnt_path = mnt_path;
+ page2 += strlen(mnt_path) + 1;
+ page2len = PAGE_SIZE - strlen(mnt_path) - 1;
+
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+ struct sockaddr_storage addr;
+
+ if (buf->len <= 0 || buf->len >= PAGE_SIZE)
+ continue;
+
+ mountdata->addr = (struct sockaddr *)&addr;
+
+ if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
+ continue;
+ nfs_parse_ip_address(buf->data, buf->len,
+ mountdata->addr, &mountdata->addrlen);
+ if (mountdata->addr->sa_family == AF_UNSPEC)
+ continue;
+ nfs_set_port(mountdata->addr, NFS_PORT);
+
+ strncpy(page2, buf->data, page2len);
+ page2[page2len] = '\0';
+ mountdata->hostname = page2;
+
+ snprintf(page, PAGE_SIZE, "%s:%s",
+ mountdata->hostname,
+ mountdata->mnt_path);
+
+ mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+ if (!IS_ERR(mnt))
+ break;
}
- return 0;
+ return mnt;
}
/**
@@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
.authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor,
};
char *page = NULL, *page2 = NULL;
- unsigned int s;
int loc, error;
if (locations == NULL || locations->nlocations <= 0)
@@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
goto out;
}
- loc = 0;
- while (loc < locations->nlocations && IS_ERR(mnt)) {
+ for (loc = 0; loc < locations->nlocations; loc++) {
const struct nfs4_fs_location *location = &locations->locations[loc];
- char *mnt_path;
if (location == NULL || location->nservers <= 0 ||
- location->rootpath.ncomponents == 0) {
- loc++;
+ location->rootpath.ncomponents == 0)
continue;
- }
- mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
- if (IS_ERR(mnt_path)) {
- loc++;
- continue;
- }
- mountdata.mnt_path = mnt_path;
-
- s = 0;
- while (s < location->nservers) {
- struct sockaddr_in addr = {
- .sin_family = AF_INET,
- .sin_port = htons(NFS_PORT),
- };
-
- if (location->servers[s].len <= 0 ||
- valid_ipaddr4(location->servers[s].data) < 0) {
- s++;
- continue;
- }
-
- mountdata.hostname = location->servers[s].data;
- addr.sin_addr.s_addr = in_aton(mountdata.hostname),
- mountdata.addr = (struct sockaddr *)&addr;
- mountdata.addrlen = sizeof(addr);
-
- snprintf(page, PAGE_SIZE, "%s:%s",
- mountdata.hostname,
- mountdata.mnt_path);
-
- mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata);
- if (!IS_ERR(mnt)) {
- break;
- }
- s++;
- }
- loc++;
+ mnt = try_location(&mountdata, page, page2, location);
+ if (!IS_ERR(mnt))
+ break;
}
out:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c910413eaec..83e700a2b0c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1659,8 +1659,10 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct nfs_open_context *ctx;
ctx = nfs_file_open_context(sattr->ia_file);
- cred = ctx->cred;
- state = ctx->state;
+ if (ctx) {
+ cred = ctx->cred;
+ state = ctx->state;
+ }
}
status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 4dbb84df1b6..193465210d7 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
dprintk("%s: call getattr\n", __func__);
nfs_fattr_init(fattr);
- status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ status = rpc_call_sync(server->client, &msg, 0);
+ /* Retry with default authentication if different */
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
dprintk("%s: reply getattr: %d\n", __func__, status);
if (status)
return status;
dprintk("%s: call statfs\n", __func__);
msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
msg.rpc_resp = &fsinfo;
- status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
+ status = rpc_call_sync(server->client, &msg, 0);
+ /* Retry with default authentication if different */
+ if (status && server->nfs_client->cl_rpcclient != server->client)
+ status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
dprintk("%s: reply statfs: %d\n", __func__, status);
if (status)
return status;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index ffb697416cb..a3b0061dfd4 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -91,6 +91,7 @@ enum {
/* Mount options that take string arguments */
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
+ Opt_lookupcache,
/* Special mount options */
Opt_userspace, Opt_deprecated, Opt_sloppy,
@@ -154,6 +155,8 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_mounthost, "mounthost=%s" },
{ Opt_mountaddr, "mountaddr=%s" },
+ { Opt_lookupcache, "lookupcache=%s" },
+
{ Opt_err, NULL }
};
@@ -200,6 +203,22 @@ static const match_table_t nfs_secflavor_tokens = {
{ Opt_sec_err, NULL }
};
+enum {
+ Opt_lookupcache_all, Opt_lookupcache_positive,
+ Opt_lookupcache_none,
+
+ Opt_lookupcache_err
+};
+
+static match_table_t nfs_lookupcache_tokens = {
+ { Opt_lookupcache_all, "all" },
+ { Opt_lookupcache_positive, "pos" },
+ { Opt_lookupcache_positive, "positive" },
+ { Opt_lookupcache_none, "none" },
+
+ { Opt_lookupcache_err, NULL }
+};
+
static void nfs_umount_begin(struct super_block *);
static int nfs_statfs(struct dentry *, struct kstatfs *);
@@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru
static int nfs_xdev_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
static void nfs_kill_super(struct super_block *);
-static void nfs_put_super(struct super_block *);
static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
static struct file_system_type nfs_fs_type = {
@@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = {
.alloc_inode = nfs_alloc_inode,
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs_write_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.clear_inode = nfs_clear_inode,
.umount_begin = nfs_umount_begin,
@@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void)
unregister_filesystem(&nfs_fs_type);
}
-void nfs_sb_active(struct nfs_server *server)
+void nfs_sb_active(struct super_block *sb)
{
- atomic_inc(&server->active);
-}
+ struct nfs_server *server = NFS_SB(sb);
-void nfs_sb_deactive(struct nfs_server *server)
-{
- if (atomic_dec_and_test(&server->active))
- wake_up(&server->active_wq);
+ if (atomic_inc_return(&server->active) == 1)
+ atomic_inc(&sb->s_active);
}
-static void nfs_put_super(struct super_block *sb)
+void nfs_sb_deactive(struct super_block *sb)
{
struct nfs_server *server = NFS_SB(sb);
- /*
- * Make sure there are no outstanding ops to this server.
- * If so, wait for them to finish before allowing the
- * unmount to continue.
- */
- wait_event(server->active_wq, atomic_read(&server->active) == 0);
+
+ if (atomic_dec_and_test(&server->active))
+ deactivate_super(sb);
}
/*
@@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb)
}
/*
- * Set the port number in an address. Be agnostic about the address family.
- */
-static void nfs_set_port(struct sockaddr *sap, unsigned short port)
-{
- switch (sap->sa_family) {
- case AF_INET: {
- struct sockaddr_in *ap = (struct sockaddr_in *)sap;
- ap->sin_port = htons(port);
- break;
- }
- case AF_INET6: {
- struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
- ap->sin6_port = htons(port);
- break;
- }
- }
-}
-
-/*
* Sanity-check a server address provided by the mount command.
*
* Address family must be initialized, and address must not be
@@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len,
*addr_len = 0;
}
-#define IPV6_SCOPE_DELIMITER '%'
-
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
- const char *delim,
- struct sockaddr_in6 *sin6)
+static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+ const char *delim,
+ struct sockaddr_in6 *sin6)
{
char *p;
size_t len;
- if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
- return ;
+ if ((string + str_len) == delim)
+ return 1;
+
if (*delim != IPV6_SCOPE_DELIMITER)
- return;
+ return 0;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return 0;
len = (string + str_len) - delim - 1;
p = kstrndup(delim + 1, len, GFP_KERNEL);
@@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
scope_id = dev->ifindex;
dev_put(dev);
} else {
- /* scope_id is set to zero on error */
- strict_strtoul(p, 10, &scope_id);
+ if (strict_strtoul(p, 10, &scope_id) == 0) {
+ kfree(p);
+ return 0;
+ }
}
kfree(p);
+
sin6->sin6_scope_id = scope_id;
dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+ return 1;
}
+
+ return 0;
}
static void nfs_parse_ipv6_address(char *string, size_t str_len,
@@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
sin6->sin6_family = AF_INET6;
*addr_len = sizeof(*sin6);
- if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
- nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
- return;
+ if (in6_pton(string, str_len, addr,
+ IPV6_SCOPE_DELIMITER, &delim) != 0) {
+ if (nfs_parse_ipv6_scope_id(string, str_len,
+ delim, sin6) != 0)
+ return;
}
}
@@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len,
* If there is a problem constructing the new sockaddr, set the address
* family to AF_UNSPEC.
*/
-static void nfs_parse_ip_address(char *string, size_t str_len,
+void nfs_parse_ip_address(char *string, size_t str_len,
struct sockaddr *sap, size_t *addr_len)
{
unsigned int i, colons;
@@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw,
&mnt->mount_server.addrlen);
kfree(string);
break;
+ case Opt_lookupcache:
+ string = match_strdup(args);
+ if (string == NULL)
+ goto out_nomem;
+ token = match_token(string,
+ nfs_lookupcache_tokens, args);
+ kfree(string);
+ switch (token) {
+ case Opt_lookupcache_all:
+ mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE);
+ break;
+ case Opt_lookupcache_positive:
+ mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE;
+ mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG;
+ break;
+ case Opt_lookupcache_none:
+ mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE;
+ break;
+ default:
+ errors++;
+ dfprintk(MOUNT, "NFS: invalid "
+ "lookupcache argument\n");
+ };
+ break;
/*
* Special options
@@ -1558,7 +1584,7 @@ static int nfs_validate_mount_data(void *options,
* Translate to nfs_parsed_mount_data, which nfs_fill_super
* can deal with.
*/
- args->flags = data->flags;
+ args->flags = data->flags & NFS_MOUNT_FLAGMASK;
args->rsize = data->rsize;
args->wsize = data->wsize;
args->timeo = data->timeo;
@@ -2433,7 +2459,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
compare_super = NULL;
/* Get a superblock - note that we may end up sharing one that already exists */
- s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
+ s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto out_err_nosb;
@@ -2518,7 +2544,7 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
compare_super = NULL;
/* Get a superblock - note that we may end up sharing one that already exists */
- s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
+ s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
if (IS_ERR(s)) {
error = PTR_ERR(s);
goto out_err_nosb;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index f089e5839d7..ecc29534777 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata)
nfs_dec_sillycount(data->dir);
nfs_free_unlinkdata(data);
- nfs_sb_deactive(NFS_SB(sb));
+ nfs_sb_deactive(sb);
}
static const struct rpc_call_ops nfs_unlink_ops = {
@@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
.rpc_message = &msg,
.callback_ops = &nfs_unlink_ops,
.callback_data = data,
+ .workqueue = nfsiod_workqueue,
.flags = RPC_TASK_ASYNC,
};
struct rpc_task *task;
@@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
nfs_dec_sillycount(dir);
return 0;
}
- nfs_sb_active(NFS_SERVER(dir));
+ nfs_sb_active(dir->i_sb);
data->args.fh = NFS_FH(dir);
nfs_fattr_init(&data->res.dir_attr);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3229e217c77..9f9845859fc 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
.bdi = mapping->backing_dev_info,
.sync_mode = WB_SYNC_NONE,
.nr_to_write = LONG_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
.for_writepages = 1,
- .range_cyclic = 1,
};
int ret;
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 64965e1c21c..9b0efdad891 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -13,9 +13,7 @@
#include <linux/nls.h>
#include <linux/kernel.h>
#include <linux/errno.h>
-#ifdef CONFIG_KMOD
#include <linux/kmod.h>
-#endif
#include <linux/spinlock.h>
static struct nls_table default_table;
@@ -215,24 +213,7 @@ static struct nls_table *find_nls(char *charset)
struct nls_table *load_nls(char *charset)
{
- struct nls_table *nls;
-#ifdef CONFIG_KMOD
- int ret;
-#endif
-
- nls = find_nls(charset);
- if (nls)
- return nls;
-
-#ifdef CONFIG_KMOD
- ret = request_module("nls_%s", charset);
- if (ret != 0) {
- printk("Unable to load NLS charset %s\n", charset);
- return NULL;
- }
- nls = find_nls(charset);
-#endif
- return nls;
+ return try_then_request_module(find_nls(charset), "nls_%s", charset);
}
void unload_nls(struct nls_table *nls)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d020866d423..3140a4429af 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
pages[nr] = *cached_page;
page_cache_get(*cached_page);
if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
- __pagevec_lru_add(lru_pvec);
+ __pagevec_lru_add_file(lru_pvec);
*cached_page = NULL;
}
index++;
@@ -2084,7 +2084,7 @@ err_out:
OSYNC_METADATA|OSYNC_DATA);
}
}
- pagevec_lru_add(&lru_pvec);
+ pagevec_lru_add_file(&lru_pvec);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
written ? "written" : "status", (unsigned long)written,
(long)status);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index fbeb2f372a9..cfb0c80690a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -195,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
return ERR_PTR(res);
}
+static ssize_t part_partition_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hd_struct *p = dev_to_part(dev);
+
+ return sprintf(buf, "%d\n", p->partno);
+}
+
static ssize_t part_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -260,6 +268,7 @@ ssize_t part_fail_store(struct device *dev,
}
#endif
+static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
@@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail =
#endif
static struct attribute *part_attrs[] = {
+ &dev_attr_partition.attr,
&dev_attr_start.attr,
&dev_attr_size.attr,
&dev_attr_stat.attr,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f4bc0e78953..bb9f4b05703 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -388,20 +388,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
/* add up live thread stats at the group level */
if (whole) {
+ struct task_cputime cputime;
struct task_struct *t = task;
do {
min_flt += t->min_flt;
maj_flt += t->maj_flt;
- utime = cputime_add(utime, task_utime(t));
- stime = cputime_add(stime, task_stime(t));
gtime = cputime_add(gtime, task_gtime(t));
t = next_thread(t);
} while (t != task);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- utime = cputime_add(utime, sig->utime);
- stime = cputime_add(stime, sig->stime);
+ thread_group_cputime(task, &cputime);
+ utime = cputime.utime;
+ stime = cputime.stime;
gtime = cputime_add(gtime, sig->gtime);
}
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 59ea42e1ef0..7ea52c79b2d 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -30,6 +30,7 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
+#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/swap.h>
#include <linux/slab.h>
@@ -136,6 +137,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
unsigned long allowed;
struct vmalloc_info vmi;
long cached;
+ unsigned long pages[NR_LRU_LISTS];
+ int lru;
/*
* display in kilobytes.
@@ -154,51 +157,70 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
get_vmalloc_info(&vmi);
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
/*
* Tagged format, for easy grepping and expansion.
*/
len = sprintf(page,
- "MemTotal: %8lu kB\n"
- "MemFree: %8lu kB\n"
- "Buffers: %8lu kB\n"
- "Cached: %8lu kB\n"
- "SwapCached: %8lu kB\n"
- "Active: %8lu kB\n"
- "Inactive: %8lu kB\n"
+ "MemTotal: %8lu kB\n"
+ "MemFree: %8lu kB\n"
+ "Buffers: %8lu kB\n"
+ "Cached: %8lu kB\n"
+ "SwapCached: %8lu kB\n"
+ "Active: %8lu kB\n"
+ "Inactive: %8lu kB\n"
+ "Active(anon): %8lu kB\n"
+ "Inactive(anon): %8lu kB\n"
+ "Active(file): %8lu kB\n"
+ "Inactive(file): %8lu kB\n"
+#ifdef CONFIG_UNEVICTABLE_LRU
+ "Unevictable: %8lu kB\n"
+ "Mlocked: %8lu kB\n"
+#endif
#ifdef CONFIG_HIGHMEM
- "HighTotal: %8lu kB\n"
- "HighFree: %8lu kB\n"
- "LowTotal: %8lu kB\n"
- "LowFree: %8lu kB\n"
+ "HighTotal: %8lu kB\n"
+ "HighFree: %8lu kB\n"
+ "LowTotal: %8lu kB\n"
+ "LowFree: %8lu kB\n"
#endif
- "SwapTotal: %8lu kB\n"
- "SwapFree: %8lu kB\n"
- "Dirty: %8lu kB\n"
- "Writeback: %8lu kB\n"
- "AnonPages: %8lu kB\n"
- "Mapped: %8lu kB\n"
- "Slab: %8lu kB\n"
- "SReclaimable: %8lu kB\n"
- "SUnreclaim: %8lu kB\n"
- "PageTables: %8lu kB\n"
+ "SwapTotal: %8lu kB\n"
+ "SwapFree: %8lu kB\n"
+ "Dirty: %8lu kB\n"
+ "Writeback: %8lu kB\n"
+ "AnonPages: %8lu kB\n"
+ "Mapped: %8lu kB\n"
+ "Slab: %8lu kB\n"
+ "SReclaimable: %8lu kB\n"
+ "SUnreclaim: %8lu kB\n"
+ "PageTables: %8lu kB\n"
#ifdef CONFIG_QUICKLIST
- "Quicklists: %8lu kB\n"
+ "Quicklists: %8lu kB\n"
#endif
- "NFS_Unstable: %8lu kB\n"
- "Bounce: %8lu kB\n"
- "WritebackTmp: %8lu kB\n"
- "CommitLimit: %8lu kB\n"
- "Committed_AS: %8lu kB\n"
- "VmallocTotal: %8lu kB\n"
- "VmallocUsed: %8lu kB\n"
- "VmallocChunk: %8lu kB\n",
+ "NFS_Unstable: %8lu kB\n"
+ "Bounce: %8lu kB\n"
+ "WritebackTmp: %8lu kB\n"
+ "CommitLimit: %8lu kB\n"
+ "Committed_AS: %8lu kB\n"
+ "VmallocTotal: %8lu kB\n"
+ "VmallocUsed: %8lu kB\n"
+ "VmallocChunk: %8lu kB\n",
K(i.totalram),
K(i.freeram),
K(i.bufferram),
K(cached),
K(total_swapcache_pages),
- K(global_page_state(NR_ACTIVE)),
- K(global_page_state(NR_INACTIVE)),
+ K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
+ K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+ K(pages[LRU_ACTIVE_ANON]),
+ K(pages[LRU_INACTIVE_ANON]),
+ K(pages[LRU_ACTIVE_FILE]),
+ K(pages[LRU_INACTIVE_FILE]),
+#ifdef CONFIG_UNEVICTABLE_LRU
+ K(pages[LRU_UNEVICTABLE]),
+ K(global_page_state(NR_MLOCK)),
+#endif
#ifdef CONFIG_HIGHMEM
K(i.totalhigh),
K(i.freehigh),
@@ -500,17 +522,13 @@ static const struct file_operations proc_vmalloc_operations = {
static int show_stat(struct seq_file *p, void *v)
{
- int i;
+ int i, j;
unsigned long jif;
cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
cputime64_t guest;
u64 sum = 0;
struct timespec boottime;
- unsigned int *per_irq_sum;
-
- per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
- if (!per_irq_sum)
- return -ENOMEM;
+ unsigned int per_irq_sum;
user = nice = system = idle = iowait =
irq = softirq = steal = cputime64_zero;
@@ -519,8 +537,6 @@ static int show_stat(struct seq_file *p, void *v)
jif = boottime.tv_sec;
for_each_possible_cpu(i) {
- int j;
-
user = cputime64_add(user, kstat_cpu(i).cpustat.user);
nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
system = cputime64_add(system, kstat_cpu(i).cpustat.system);
@@ -530,11 +546,10 @@ static int show_stat(struct seq_file *p, void *v)
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
- for (j = 0; j < NR_IRQS; j++) {
- unsigned int temp = kstat_cpu(i).irqs[j];
- sum += temp;
- per_irq_sum[j] += temp;
- }
+
+ for_each_irq_nr(j)
+ sum += kstat_irqs_cpu(j, i);
+
sum += arch_irq_stat_cpu(i);
}
sum += arch_irq_stat();
@@ -576,8 +591,15 @@ static int show_stat(struct seq_file *p, void *v)
}
seq_printf(p, "intr %llu", (unsigned long long)sum);
- for (i = 0; i < NR_IRQS; i++)
- seq_printf(p, " %u", per_irq_sum[i]);
+ /* sum again ? it could be updated? */
+ for_each_irq_nr(j) {
+ per_irq_sum = 0;
+
+ for_each_possible_cpu(i)
+ per_irq_sum += kstat_irqs_cpu(j, i);
+
+ seq_printf(p, " %u", per_irq_sum);
+ }
seq_printf(p,
"\nctxt %llu\n"
@@ -591,7 +613,6 @@ static int show_stat(struct seq_file *p, void *v)
nr_running(),
nr_iowait());
- kfree(per_irq_sum);
return 0;
}
@@ -630,15 +651,14 @@ static const struct file_operations proc_stat_operations = {
*/
static void *int_seq_start(struct seq_file *f, loff_t *pos)
{
- return (*pos <= NR_IRQS) ? pos : NULL;
+ return (*pos <= nr_irqs) ? pos : NULL;
}
+
static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
- if (*pos > NR_IRQS)
- return NULL;
- return pos;
+ return (*pos <= nr_irqs) ? pos : NULL;
}
static void int_seq_stop(struct seq_file *f, void *v)
@@ -646,7 +666,6 @@ static void int_seq_stop(struct seq_file *f, void *v)
/* Nothing to do */
}
-
static const struct seq_operations int_seq_ops = {
.start = int_seq_start,
.next = int_seq_next,
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 841368b87a2..cd9ca67f841 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,9 +32,6 @@ static size_t elfcorebuf_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
struct proc_dir_entry *proc_vmcore = NULL;
/* Reads a page from the oldmem device from given offset. */
@@ -647,7 +644,7 @@ static int __init vmcore_init(void)
int rc = 0;
/* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
- if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX))
+ if (!(is_vmcore_usable()))
return rc;
rc = parse_crash_elf_headers();
if (rc) {
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5145cb9125a..76acdbc3461 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
goto add_error;
if (!pagevec_add(&lru_pvec, page))
- __pagevec_lru_add(&lru_pvec);
+ __pagevec_lru_add_file(&lru_pvec);
unlock_page(page);
}
- pagevec_lru_add(&lru_pvec);
+ pagevec_lru_add_file(&lru_pvec);
return 0;
fsize_exceeded:
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b13123424e4..f031d1c925f 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
default:
diff --git a/fs/seq_file.c b/fs/seq_file.c
index bd20f7f5a93..eba2eabcd2b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -452,17 +452,34 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
{
- size_t len = bitmap_scnprintf_len(nr_bits);
+ if (m->count < m->size) {
+ int len = bitmap_scnprintf(m->buf + m->count,
+ m->size - m->count, bits, nr_bits);
+ if (m->count + len < m->size) {
+ m->count += len;
+ return 0;
+ }
+ }
+ m->count = m->size;
+ return -1;
+}
+EXPORT_SYMBOL(seq_bitmap);
- if (m->count + len < m->size) {
- bitmap_scnprintf(m->buf + m->count, m->size - m->count,
- bits, nr_bits);
- m->count += len;
- return 0;
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+ unsigned int nr_bits)
+{
+ if (m->count < m->size) {
+ int len = bitmap_scnlistprintf(m->buf + m->count,
+ m->size - m->count, bits, nr_bits);
+ if (m->count + len < m->size) {
+ m->count += len;
+ return 0;
+ }
}
m->count = m->size;
return -1;
}
+EXPORT_SYMBOL(seq_bitmap_list);
static void *single_start(struct seq_file *p, loff_t *pos)
{
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 006fc64227d..66f6e58a7e4 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -61,6 +61,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
int size = dentry->d_inode->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
+ char *temp;
if (size) {
if (offs > size)
@@ -69,23 +70,33 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
count = size - offs;
}
+ temp = kmalloc(count, GFP_KERNEL);
+ if (!temp)
+ return -ENOMEM;
+
mutex_lock(&bb->mutex);
count = fill_read(dentry, bb->buffer, offs, count);
- if (count < 0)
- goto out_unlock;
+ if (count < 0) {
+ mutex_unlock(&bb->mutex);
+ goto out_free;
+ }
- if (copy_to_user(userbuf, bb->buffer, count)) {
+ memcpy(temp, bb->buffer, count);
+
+ mutex_unlock(&bb->mutex);
+
+ if (copy_to_user(userbuf, temp, count)) {
count = -EFAULT;
- goto out_unlock;
+ goto out_free;
}
pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
*off = offs + count;
- out_unlock:
- mutex_unlock(&bb->mutex);
+ out_free:
+ kfree(temp);
return count;
}
@@ -118,6 +129,7 @@ static ssize_t write(struct file *file, const char __user *userbuf,
int size = dentry->d_inode->i_size;
loff_t offs = *off;
int count = min_t(size_t, bytes, PAGE_SIZE);
+ char *temp;
if (size) {
if (offs > size)
@@ -126,19 +138,27 @@ static ssize_t write(struct file *file, const char __user *userbuf,
count = size - offs;
}
- mutex_lock(&bb->mutex);
+ temp = kmalloc(count, GFP_KERNEL);
+ if (!temp)
+ return -ENOMEM;
- if (copy_from_user(bb->buffer, userbuf, count)) {
+ if (copy_from_user(temp, userbuf, count)) {
count = -EFAULT;
- goto out_unlock;
+ goto out_free;
}
+ mutex_lock(&bb->mutex);
+
+ memcpy(bb->buffer, temp, count);
+
count = flush_write(dentry, bb->buffer, offs, count);
+ mutex_unlock(&bb->mutex);
+
if (count > 0)
*off = offs + count;
- out_unlock:
- mutex_unlock(&bb->mutex);
+out_free:
+ kfree(temp);
return count;
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index aedaeba82ae..3a05a596e3b 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -370,17 +370,17 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
memset(acxt, 0, sizeof(*acxt));
acxt->parent_sd = parent_sd;
- /* Lookup parent inode. inode initialization and I_NEW
- * clearing are protected by sysfs_mutex. By grabbing it and
- * looking up with _nowait variant, inode state can be
- * determined reliably.
+ /* Lookup parent inode. inode initialization is protected by
+ * sysfs_mutex, so inode existence can be determined by
+ * looking up inode while holding sysfs_mutex.
*/
mutex_lock(&sysfs_mutex);
- inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
- parent_sd);
+ inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test,
+ parent_sd);
+ if (inode) {
+ WARN_ON(inode->i_state & I_NEW);
- if (inode && !(inode->i_state & I_NEW)) {
/* parent inode available */
acxt->parent_inode = inode;
@@ -393,8 +393,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
mutex_lock(&inode->i_mutex);
mutex_lock(&sysfs_mutex);
}
- } else
- iput(inode);
+ }
}
/**
@@ -636,6 +635,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
return sd;
}
+EXPORT_SYMBOL_GPL(sysfs_get_dirent);
static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
const char *name, struct sysfs_dirent **p_sd)
@@ -829,16 +829,12 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name)
if (!new_dentry)
goto out_unlock;
- /* rename kobject and sysfs_dirent */
+ /* rename sysfs_dirent */
error = -ENOMEM;
new_name = dup_name = kstrdup(new_name, GFP_KERNEL);
if (!new_name)
goto out_unlock;
- error = kobject_set_name(kobj, "%s", new_name);
- if (error)
- goto out_unlock;
-
dup_name = sd->s_name;
sd->s_name = new_name;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index c9e4e5091da..1f4a3f87726 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -19,10 +19,18 @@
#include <linux/poll.h>
#include <linux/list.h>
#include <linux/mutex.h>
+#include <linux/limits.h>
#include <asm/uaccess.h>
#include "sysfs.h"
+/* used in crash dumps to help with debugging */
+static char last_sysfs_file[PATH_MAX];
+void sysfs_printk_last_file(void)
+{
+ printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file);
+}
+
/*
* There's one sysfs_buffer for each open file and one
* sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -328,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
struct sysfs_buffer *buffer;
struct sysfs_ops *ops;
int error = -EACCES;
+ char *p;
+
+ p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
+ if (p)
+ memmove(last_sysfs_file, p, strlen(p) + 1);
/* need attr_sd for attr and ops, its parent for kobj */
if (!sysfs_get_active_two(attr_sd))
@@ -440,7 +453,23 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
return POLLERR|POLLPRI;
}
-void sysfs_notify(struct kobject *k, char *dir, char *attr)
+void sysfs_notify_dirent(struct sysfs_dirent *sd)
+{
+ struct sysfs_open_dirent *od;
+
+ spin_lock(&sysfs_open_dirent_lock);
+
+ od = sd->s_attr.open;
+ if (od) {
+ atomic_inc(&od->event);
+ wake_up_interruptible(&od->poll);
+ }
+
+ spin_unlock(&sysfs_open_dirent_lock);
+}
+EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
+
+void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
{
struct sysfs_dirent *sd = k->sd;
@@ -450,19 +479,8 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr)
sd = sysfs_find_dirent(sd, dir);
if (sd && attr)
sd = sysfs_find_dirent(sd, attr);
- if (sd) {
- struct sysfs_open_dirent *od;
-
- spin_lock(&sysfs_open_dirent_lock);
-
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->event);
- wake_up_interruptible(&od->poll);
- }
-
- spin_unlock(&sysfs_open_dirent_lock);
- }
+ if (sd)
+ sysfs_notify_dirent(sd);
mutex_unlock(&sysfs_mutex);
}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 14f0023984d..ab343e371d6 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -16,6 +16,7 @@
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/init.h>
+#include <linux/module.h>
#include "sysfs.h"
@@ -115,3 +116,17 @@ out_err:
sysfs_dir_cachep = NULL;
goto out;
}
+
+#undef sysfs_get
+struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
+{
+ return __sysfs_get(sd);
+}
+EXPORT_SYMBOL_GPL(sysfs_get);
+
+#undef sysfs_put
+void sysfs_put(struct sysfs_dirent *sd)
+{
+ __sysfs_put(sd);
+}
+EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index a5db496f71c..93c6d6b27c4 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -124,7 +124,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name,
struct sysfs_dirent **p_sd);
void sysfs_remove_subdir(struct sysfs_dirent *sd);
-static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
+static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
{
if (sd) {
WARN_ON(!atomic_read(&sd->s_count));
@@ -132,12 +132,14 @@ static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
}
return sd;
}
+#define sysfs_get(sd) __sysfs_get(sd)
-static inline void sysfs_put(struct sysfs_dirent *sd)
+static inline void __sysfs_put(struct sysfs_dirent *sd)
{
if (sd && atomic_dec_and_test(&sd->s_count))
release_sysfs_dirent(sd);
}
+#define sysfs_put(sd) __sysfs_put(sd)
/*
* inode.c
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 73db464cd08..1a4973e1066 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -414,19 +414,21 @@ static int do_budget_space(struct ubifs_info *c)
* @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
* @c->lst.taken_empty_lebs
*
- * @empty_lebs are available because they are empty. @freeable_cnt are
- * available because they contain only free and dirty space and the
- * index allocation always occurs after wbufs are synch'ed.
- * @idx_gc_cnt are available because they are index LEBs that have been
- * garbage collected (including trivial GC) and are awaiting the commit
- * before they can be unmapped - note that the in-the-gaps method will
- * grab these if it needs them. @taken_empty_lebs are empty_lebs that
- * have already been allocated for some purpose (also includes those
- * LEBs on the @idx_gc list).
+ * @c->lst.empty_lebs are available because they are empty.
+ * @c->freeable_cnt are available because they contain only free and
+ * dirty space, @c->idx_gc_cnt are available because they are index
+ * LEBs that have been garbage collected and are awaiting the commit
+ * before they can be used. And the in-the-gaps method will grab these
+ * if it needs them. @c->lst.taken_empty_lebs are empty LEBs that have
+ * already been allocated for some purpose.
*
- * Note, @taken_empty_lebs may temporarily be higher by one because of
- * the way we serialize LEB allocations and budgeting. See a comment in
- * 'ubifs_find_free_space()'.
+ * Note, @c->idx_gc_cnt is included to both @c->lst.empty_lebs (because
+ * these LEBs are empty) and to @c->lst.taken_empty_lebs (because they
+ * are taken until after the commit).
+ *
+ * Note, @c->lst.taken_empty_lebs may temporarily be higher by one
+ * because of the way we serialize LEB allocations and budgeting. See a
+ * comment in 'ubifs_find_free_space()'.
*/
lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
c->lst.taken_empty_lebs;
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 5bb51dac3c1..a0ada596b17 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -91,8 +91,6 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
*
* Note, if the input buffer was not compressed, it is copied to the output
* buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
- *
- * This functions returns %0 on success or a negative error code on failure.
*/
void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
int *compr_type)
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index d7f7645779f..7186400750e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -222,30 +222,38 @@ void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
{
const struct ubifs_inode *ui = ubifs_inode(inode);
- printk(KERN_DEBUG "inode %lu\n", inode->i_ino);
- printk(KERN_DEBUG "size %llu\n",
+ printk(KERN_DEBUG "Dump in-memory inode:");
+ printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino);
+ printk(KERN_DEBUG "\tsize %llu\n",
(unsigned long long)i_size_read(inode));
- printk(KERN_DEBUG "nlink %u\n", inode->i_nlink);
- printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid);
- printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid);
- printk(KERN_DEBUG "atime %u.%u\n",
+ printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink);
+ printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid);
+ printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid);
+ printk(KERN_DEBUG "\tatime %u.%u\n",
(unsigned int)inode->i_atime.tv_sec,
(unsigned int)inode->i_atime.tv_nsec);
- printk(KERN_DEBUG "mtime %u.%u\n",
+ printk(KERN_DEBUG "\tmtime %u.%u\n",
(unsigned int)inode->i_mtime.tv_sec,
(unsigned int)inode->i_mtime.tv_nsec);
- printk(KERN_DEBUG "ctime %u.%u\n",
+ printk(KERN_DEBUG "\tctime %u.%u\n",
(unsigned int)inode->i_ctime.tv_sec,
(unsigned int)inode->i_ctime.tv_nsec);
- printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum);
- printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size);
- printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt);
- printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names);
- printk(KERN_DEBUG "dirty %u\n", ui->dirty);
- printk(KERN_DEBUG "xattr %u\n", ui->xattr);
- printk(KERN_DEBUG "flags %d\n", ui->flags);
- printk(KERN_DEBUG "compr_type %d\n", ui->compr_type);
- printk(KERN_DEBUG "data_len %d\n", ui->data_len);
+ printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum);
+ printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size);
+ printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt);
+ printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names);
+ printk(KERN_DEBUG "\tdirty %u\n", ui->dirty);
+ printk(KERN_DEBUG "\txattr %u\n", ui->xattr);
+ printk(KERN_DEBUG "\tbulk_read %u\n", ui->xattr);
+ printk(KERN_DEBUG "\tsynced_i_size %llu\n",
+ (unsigned long long)ui->synced_i_size);
+ printk(KERN_DEBUG "\tui_size %llu\n",
+ (unsigned long long)ui->ui_size);
+ printk(KERN_DEBUG "\tflags %d\n", ui->flags);
+ printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type);
+ printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read);
+ printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row);
+ printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len);
}
void dbg_dump_node(const struct ubifs_info *c, const void *node)
@@ -647,6 +655,43 @@ void dbg_dump_lprops(struct ubifs_info *c)
}
}
+void dbg_dump_lpt_info(struct ubifs_info *c)
+{
+ int i;
+
+ spin_lock(&dbg_lock);
+ printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz);
+ printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz);
+ printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz);
+ printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz);
+ printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz);
+ printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt);
+ printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght);
+ printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt);
+ printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt);
+ printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt);
+ printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt);
+ printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt);
+ printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits);
+ printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
+ printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
+ printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits);
+ printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits);
+ printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits);
+ printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
+ printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
+ c->nhead_lnum, c->nhead_offs);
+ printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
+ if (c->big_lpt)
+ printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
+ c->lsave_lnum, c->lsave_offs);
+ for (i = 0; i < c->lpt_lebs; i++)
+ printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d "
+ "cmt %d\n", i + c->lpt_first, c->ltab[i].free,
+ c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);
+ spin_unlock(&dbg_lock);
+}
+
void dbg_dump_leb(const struct ubifs_info *c, int lnum)
{
struct ubifs_scan_leb *sleb;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 50315fc5718..33d6b95071e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -224,6 +224,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
void dbg_dump_budg(struct ubifs_info *c);
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
void dbg_dump_lprops(struct ubifs_info *c);
+void dbg_dump_lpt_info(struct ubifs_info *c);
void dbg_dump_leb(const struct ubifs_info *c, int lnum);
void dbg_dump_znode(const struct ubifs_info *c,
const struct ubifs_znode *znode);
@@ -249,6 +250,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
int dbg_check_cats(struct ubifs_info *c);
int dbg_check_ltab(struct ubifs_info *c);
+int dbg_chk_lpt_free_spc(struct ubifs_info *c);
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len);
int dbg_check_synced_i_size(struct inode *inode);
int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
int dbg_check_tnc(struct ubifs_info *c, int extra);
@@ -367,6 +370,7 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
#define dbg_dump_budg(c) ({})
#define dbg_dump_lprop(c, lp) ({})
#define dbg_dump_lprops(c) ({})
+#define dbg_dump_lpt_info(c) ({})
#define dbg_dump_leb(c, lnum) ({})
#define dbg_dump_znode(c, znode) ({})
#define dbg_dump_heap(c, heap, cat) ({})
@@ -379,6 +383,8 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
#define dbg_check_old_index(c, zroot) 0
#define dbg_check_cats(c) 0
#define dbg_check_ltab(c) 0
+#define dbg_chk_lpt_free_spc(c) 0
+#define dbg_chk_lpt_sz(c, action, len) 0
#define dbg_check_synced_i_size(inode) 0
#define dbg_check_dir_size(c, dir) 0
#define dbg_check_tnc(c, x) 0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 3d698e2022b..51cf511d44d 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -147,6 +147,12 @@ static int do_readpage(struct page *page)
err = ret;
if (err != -ENOENT)
break;
+ } else if (block + 1 == beyond) {
+ int dlen = le32_to_cpu(dn->size);
+ int ilen = i_size & (UBIFS_BLOCK_SIZE - 1);
+
+ if (ilen && ilen < dlen)
+ memset(addr + ilen, 0, dlen - ilen);
}
}
if (++i >= UBIFS_BLOCKS_PER_PAGE)
@@ -577,8 +583,262 @@ out:
return copied;
}
+/**
+ * populate_page - copy data nodes into a page for bulk-read.
+ * @c: UBIFS file-system description object
+ * @page: page
+ * @bu: bulk-read information
+ * @n: next zbranch slot
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int populate_page(struct ubifs_info *c, struct page *page,
+ struct bu_info *bu, int *n)
+{
+ int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
+ struct inode *inode = page->mapping->host;
+ loff_t i_size = i_size_read(inode);
+ unsigned int page_block;
+ void *addr, *zaddr;
+ pgoff_t end_index;
+
+ dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+ inode->i_ino, page->index, i_size, page->flags);
+
+ addr = zaddr = kmap(page);
+
+ end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ if (!i_size || page->index > end_index) {
+ hole = 1;
+ memset(addr, 0, PAGE_CACHE_SIZE);
+ goto out_hole;
+ }
+
+ page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ while (1) {
+ int err, len, out_len, dlen;
+
+ if (nn >= bu->cnt) {
+ hole = 1;
+ memset(addr, 0, UBIFS_BLOCK_SIZE);
+ } else if (key_block(c, &bu->zbranch[nn].key) == page_block) {
+ struct ubifs_data_node *dn;
+
+ dn = bu->buf + (bu->zbranch[nn].offs - offs);
+
+ ubifs_assert(dn->ch.sqnum >
+ ubifs_inode(inode)->creat_sqnum);
+
+ len = le32_to_cpu(dn->size);
+ if (len <= 0 || len > UBIFS_BLOCK_SIZE)
+ goto out_err;
+
+ dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ out_len = UBIFS_BLOCK_SIZE;
+ err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
+ le16_to_cpu(dn->compr_type));
+ if (err || len != out_len)
+ goto out_err;
+
+ if (len < UBIFS_BLOCK_SIZE)
+ memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+
+ nn += 1;
+ read = (i << UBIFS_BLOCK_SHIFT) + len;
+ } else if (key_block(c, &bu->zbranch[nn].key) < page_block) {
+ nn += 1;
+ continue;
+ } else {
+ hole = 1;
+ memset(addr, 0, UBIFS_BLOCK_SIZE);
+ }
+ if (++i >= UBIFS_BLOCKS_PER_PAGE)
+ break;
+ addr += UBIFS_BLOCK_SIZE;
+ page_block += 1;
+ }
+
+ if (end_index == page->index) {
+ int len = i_size & (PAGE_CACHE_SIZE - 1);
+
+ if (len && len < read)
+ memset(zaddr + len, 0, read - len);
+ }
+
+out_hole:
+ if (hole) {
+ SetPageChecked(page);
+ dbg_gen("hole");
+ }
+
+ SetPageUptodate(page);
+ ClearPageError(page);
+ flush_dcache_page(page);
+ kunmap(page);
+ *n = nn;
+ return 0;
+
+out_err:
+ ClearPageUptodate(page);
+ SetPageError(page);
+ flush_dcache_page(page);
+ kunmap(page);
+ ubifs_err("bad data node (block %u, inode %lu)",
+ page_block, inode->i_ino);
+ return -EINVAL;
+}
+
+/**
+ * ubifs_do_bulk_read - do bulk-read.
+ * @c: UBIFS file-system description object
+ * @page1: first page
+ *
+ * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ */
+static int ubifs_do_bulk_read(struct ubifs_info *c, struct page *page1)
+{
+ pgoff_t offset = page1->index, end_index;
+ struct address_space *mapping = page1->mapping;
+ struct inode *inode = mapping->host;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct bu_info *bu;
+ int err, page_idx, page_cnt, ret = 0, n = 0;
+ loff_t isize;
+
+ bu = kmalloc(sizeof(struct bu_info), GFP_NOFS);
+ if (!bu)
+ return 0;
+
+ bu->buf_len = c->bulk_read_buf_size;
+ bu->buf = kmalloc(bu->buf_len, GFP_NOFS);
+ if (!bu->buf)
+ goto out_free;
+
+ data_key_init(c, &bu->key, inode->i_ino,
+ offset << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+
+ err = ubifs_tnc_get_bu_keys(c, bu);
+ if (err)
+ goto out_warn;
+
+ if (bu->eof) {
+ /* Turn off bulk-read at the end of the file */
+ ui->read_in_a_row = 1;
+ ui->bulk_read = 0;
+ }
+
+ page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ if (!page_cnt) {
+ /*
+ * This happens when there are multiple blocks per page and the
+ * blocks for the first page we are looking for, are not
+ * together. If all the pages were like this, bulk-read would
+ * reduce performance, so we turn it off for a while.
+ */
+ ui->read_in_a_row = 0;
+ ui->bulk_read = 0;
+ goto out_free;
+ }
+
+ if (bu->cnt) {
+ err = ubifs_tnc_bulk_read(c, bu);
+ if (err)
+ goto out_warn;
+ }
+
+ err = populate_page(c, page1, bu, &n);
+ if (err)
+ goto out_warn;
+
+ unlock_page(page1);
+ ret = 1;
+
+ isize = i_size_read(inode);
+ if (isize == 0)
+ goto out_free;
+ end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+ for (page_idx = 1; page_idx < page_cnt; page_idx++) {
+ pgoff_t page_offset = offset + page_idx;
+ struct page *page;
+
+ if (page_offset > end_index)
+ break;
+ page = find_or_create_page(mapping, page_offset,
+ GFP_NOFS | __GFP_COLD);
+ if (!page)
+ break;
+ if (!PageUptodate(page))
+ err = populate_page(c, page, bu, &n);
+ unlock_page(page);
+ page_cache_release(page);
+ if (err)
+ break;
+ }
+
+ ui->last_page_read = offset + page_idx - 1;
+
+out_free:
+ kfree(bu->buf);
+ kfree(bu);
+ return ret;
+
+out_warn:
+ ubifs_warn("ignoring error %d and skipping bulk-read", err);
+ goto out_free;
+}
+
+/**
+ * ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
+ * @page: page from which to start bulk-read.
+ *
+ * Some flash media are capable of reading sequentially at faster rates. UBIFS
+ * bulk-read facility is designed to take advantage of that, by reading in one
+ * go consecutive data nodes that are also located consecutively in the same
+ * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ */
+static int ubifs_bulk_read(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ pgoff_t index = page->index, last_page_read = ui->last_page_read;
+ int ret = 0;
+
+ ui->last_page_read = index;
+
+ if (!c->bulk_read)
+ return 0;
+ /*
+ * Bulk-read is protected by ui_mutex, but it is an optimization, so
+ * don't bother if we cannot lock the mutex.
+ */
+ if (!mutex_trylock(&ui->ui_mutex))
+ return 0;
+ if (index != last_page_read + 1) {
+ /* Turn off bulk-read if we stop reading sequentially */
+ ui->read_in_a_row = 1;
+ if (ui->bulk_read)
+ ui->bulk_read = 0;
+ goto out_unlock;
+ }
+ if (!ui->bulk_read) {
+ ui->read_in_a_row += 1;
+ if (ui->read_in_a_row < 3)
+ goto out_unlock;
+ /* Three reads in a row, so switch on bulk-read */
+ ui->bulk_read = 1;
+ }
+ ret = ubifs_do_bulk_read(c, page);
+out_unlock:
+ mutex_unlock(&ui->ui_mutex);
+ return ret;
+}
+
static int ubifs_readpage(struct file *file, struct page *page)
{
+ if (ubifs_bulk_read(page))
+ return 0;
do_readpage(page);
unlock_page(page);
return 0;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 47814cde240..717d79c97c5 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -901,11 +901,11 @@ static int get_idx_gc_leb(struct ubifs_info *c)
* it is needed now for this commit.
*/
lp = ubifs_lpt_lookup_dirty(c, lnum);
- if (unlikely(IS_ERR(lp)))
+ if (IS_ERR(lp))
return PTR_ERR(lp);
lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
lp->flags | LPROPS_INDEX, -1);
- if (unlikely(IS_ERR(lp)))
+ if (IS_ERR(lp))
return PTR_ERR(lp);
dbg_find("LEB %d, dirty %d and free %d flags %#x",
lp->lnum, lp->dirty, lp->free, lp->flags);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 02aba36fe3d..0bef6501d58 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -96,6 +96,48 @@ static int switch_gc_head(struct ubifs_info *c)
}
/**
+ * joinup - bring data nodes for an inode together.
+ * @c: UBIFS file-system description object
+ * @sleb: describes scanned LEB
+ * @inum: inode number
+ * @blk: block number
+ * @data: list to which to add data nodes
+ *
+ * This function looks at the first few nodes in the scanned LEB @sleb and adds
+ * them to @data if they are data nodes from @inum and have a larger block
+ * number than @blk. This function returns %0 on success and a negative error
+ * code on failure.
+ */
+static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum,
+ unsigned int blk, struct list_head *data)
+{
+ int err, cnt = 6, lnum = sleb->lnum, offs;
+ struct ubifs_scan_node *snod, *tmp;
+ union ubifs_key *key;
+
+ list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+ key = &snod->key;
+ if (key_inum(c, key) == inum &&
+ key_type(c, key) == UBIFS_DATA_KEY &&
+ key_block(c, key) > blk) {
+ offs = snod->offs;
+ err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0);
+ if (err < 0)
+ return err;
+ list_del(&snod->list);
+ if (err) {
+ list_add_tail(&snod->list, data);
+ blk = key_block(c, key);
+ } else
+ kfree(snod);
+ cnt = 6;
+ } else if (--cnt == 0)
+ break;
+ }
+ return 0;
+}
+
+/**
* move_nodes - move nodes.
* @c: UBIFS file-system description object
* @sleb: describes nodes to move
@@ -116,16 +158,21 @@ static int switch_gc_head(struct ubifs_info *c)
static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
{
struct ubifs_scan_node *snod, *tmp;
- struct list_head large, medium, small;
+ struct list_head data, large, medium, small;
struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
int avail, err, min = INT_MAX;
+ unsigned int blk = 0;
+ ino_t inum = 0;
+ INIT_LIST_HEAD(&data);
INIT_LIST_HEAD(&large);
INIT_LIST_HEAD(&medium);
INIT_LIST_HEAD(&small);
- list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
- struct list_head *lst;
+ while (!list_empty(&sleb->nodes)) {
+ struct list_head *lst = sleb->nodes.next;
+
+ snod = list_entry(lst, struct ubifs_scan_node, list);
ubifs_assert(snod->type != UBIFS_IDX_NODE);
ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -136,7 +183,6 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
if (err < 0)
goto out;
- lst = &snod->list;
list_del(lst);
if (!err) {
/* The node is obsolete, remove it from the list */
@@ -145,15 +191,30 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
}
/*
- * Sort the list of nodes so that large nodes go first, and
- * small nodes go last.
+ * Sort the list of nodes so that data nodes go first, large
+ * nodes go second, and small nodes go last.
*/
- if (snod->len > MEDIUM_NODE_WM)
- list_add(lst, &large);
+ if (key_type(c, &snod->key) == UBIFS_DATA_KEY) {
+ if (inum != key_inum(c, &snod->key)) {
+ if (inum) {
+ /*
+ * Try to move data nodes from the same
+ * inode together.
+ */
+ err = joinup(c, sleb, inum, blk, &data);
+ if (err)
+ goto out;
+ }
+ inum = key_inum(c, &snod->key);
+ blk = key_block(c, &snod->key);
+ }
+ list_add_tail(lst, &data);
+ } else if (snod->len > MEDIUM_NODE_WM)
+ list_add_tail(lst, &large);
else if (snod->len > SMALL_NODE_WM)
- list_add(lst, &medium);
+ list_add_tail(lst, &medium);
else
- list_add(lst, &small);
+ list_add_tail(lst, &small);
/* And find the smallest node */
if (snod->len < min)
@@ -164,6 +225,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
* Join the tree lists so that we'd have one roughly sorted list
* ('large' will be the head of the joined list).
*/
+ list_splice(&data, &large);
list_splice(&medium, large.prev);
list_splice(&small, large.prev);
@@ -653,7 +715,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
*/
while (1) {
lp = ubifs_fast_find_freeable(c);
- if (unlikely(IS_ERR(lp))) {
+ if (IS_ERR(lp)) {
err = PTR_ERR(lp);
goto out;
}
@@ -665,7 +727,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
if (err)
goto out;
lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
- if (unlikely(IS_ERR(lp))) {
+ if (IS_ERR(lp)) {
err = PTR_ERR(lp);
goto out;
}
@@ -680,7 +742,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
/* Record index freeable LEBs for unmapping after commit */
while (1) {
lp = ubifs_fast_find_frdi_idx(c);
- if (unlikely(IS_ERR(lp))) {
+ if (IS_ERR(lp)) {
err = PTR_ERR(lp);
goto out;
}
@@ -696,7 +758,7 @@ int ubifs_gc_start_commit(struct ubifs_info *c)
/* Don't release the LEB until after the next commit */
flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
- if (unlikely(IS_ERR(lp))) {
+ if (IS_ERR(lp)) {
err = PTR_ERR(lp);
kfree(idx_gc);
goto out;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 054363f2b20..01682713af6 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -62,6 +62,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
{
if (!c->ro_media) {
c->ro_media = 1;
+ c->no_chk_data_crc = 0;
ubifs_warn("switched to read-only mode, error %d", err);
dbg_dump_stack();
}
@@ -74,6 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
* @lnum: logical eraseblock number
* @offs: offset within the logical eraseblock
* @quiet: print no messages
+ * @chk_crc: indicates whether to always check the CRC
*
* This function checks node magic number and CRC checksum. This function also
* validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -85,7 +87,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
* or magic.
*/
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
- int offs, int quiet)
+ int offs, int quiet, int chk_crc)
{
int err = -EINVAL, type, node_len;
uint32_t crc, node_crc, magic;
@@ -121,6 +123,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
node_len > c->ranges[type].max_len)
goto out_len;
+ if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc)
+ if (c->no_chk_data_crc)
+ return 0;
+
crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
node_crc = le32_to_cpu(ch->crc);
if (crc != node_crc) {
@@ -722,7 +728,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
goto out;
}
- err = ubifs_check_node(c, buf, lnum, offs, 0);
+ err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
if (err) {
ubifs_err("expected node type %d", type);
return err;
@@ -781,7 +787,7 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
goto out;
}
- err = ubifs_check_node(c, buf, lnum, offs, 0);
+ err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
if (err) {
ubifs_err("expected node type %d", type);
return err;
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 8f747600754..9ee65086f62 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -484,7 +484,7 @@ static inline void key_copy(const struct ubifs_info *c,
* @key2: the second key to compare
*
* This function compares 2 keys and returns %-1 if @key1 is less than
- * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2.
+ * @key2, %0 if the keys are equivalent and %1 if @key1 is greater than @key2.
*/
static inline int keys_cmp(const struct ubifs_info *c,
const union ubifs_key *key1,
@@ -503,6 +503,26 @@ static inline int keys_cmp(const struct ubifs_info *c,
}
/**
+ * keys_eq - determine if keys are equivalent.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and
+ * %0 if not.
+ */
+static inline int keys_eq(const struct ubifs_info *c,
+ const union ubifs_key *key1,
+ const union ubifs_key *key2)
+{
+ if (key1->u32[0] != key2->u32[0])
+ return 0;
+ if (key1->u32[1] != key2->u32[1])
+ return 0;
+ return 1;
+}
+
+/**
* is_hash_key - is a key vulnerable to hash collisions.
* @c: UBIFS file-system description object
* @key: key
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 2ba93da71b6..f27176e9b70 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -125,6 +125,7 @@ static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
}
}
}
+
/* Not greater than parent, so compare to children */
while (1) {
/* Compare to left child */
@@ -460,18 +461,6 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
}
/**
- * ubifs_get_lprops - get reference to LEB properties.
- * @c: the UBIFS file-system description object
- *
- * This function locks lprops. Lprops have to be unlocked by
- * 'ubifs_release_lprops()'.
- */
-void ubifs_get_lprops(struct ubifs_info *c)
-{
- mutex_lock(&c->lp_mutex);
-}
-
-/**
* calc_dark - calculate LEB dark space size.
* @c: the UBIFS file-system description object
* @spc: amount of free and dirty space in the LEB
@@ -576,7 +565,6 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
spin_lock(&c->space_lock);
-
if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
c->lst.taken_empty_lebs -= 1;
@@ -637,31 +625,12 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
c->lst.taken_empty_lebs += 1;
change_category(c, lprops);
-
c->idx_gc_cnt += idx_gc_cnt;
-
spin_unlock(&c->space_lock);
-
return lprops;
}
/**
- * ubifs_release_lprops - release lprops lock.
- * @c: the UBIFS file-system description object
- *
- * This function has to be called after each 'ubifs_get_lprops()' call to
- * unlock lprops.
- */
-void ubifs_release_lprops(struct ubifs_info *c)
-{
- ubifs_assert(mutex_is_locked(&c->lp_mutex));
- ubifs_assert(c->lst.empty_lebs >= 0 &&
- c->lst.empty_lebs <= c->main_lebs);
-
- mutex_unlock(&c->lp_mutex);
-}
-
-/**
* ubifs_get_lp_stats - get lprops statistics.
* @c: UBIFS file-system description object
* @st: return statistics
@@ -1262,7 +1231,6 @@ static int scan_check_cb(struct ubifs_info *c,
}
ubifs_scan_destroy(sleb);
-
return LPT_SCAN_CONTINUE;
out_print:
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 9ff2463177e..db8bd0e518b 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -109,7 +109,8 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
c->lpt_sz += c->ltab_sz;
- c->lpt_sz += c->lsave_sz;
+ if (c->big_lpt)
+ c->lpt_sz += c->lsave_sz;
/* Add wastage */
sz = c->lpt_sz;
@@ -287,25 +288,56 @@ uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
const int k = 32 - nrbits;
uint8_t *p = *addr;
int b = *pos;
- uint32_t val;
+ uint32_t uninitialized_var(val);
+ const int bytes = (nrbits + b + 7) >> 3;
ubifs_assert(nrbits > 0);
ubifs_assert(nrbits <= 32);
ubifs_assert(*pos >= 0);
ubifs_assert(*pos < 8);
if (b) {
- val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) |
- ((uint32_t)p[4] << 24);
+ switch (bytes) {
+ case 2:
+ val = p[1];
+ break;
+ case 3:
+ val = p[1] | ((uint32_t)p[2] << 8);
+ break;
+ case 4:
+ val = p[1] | ((uint32_t)p[2] << 8) |
+ ((uint32_t)p[3] << 16);
+ break;
+ case 5:
+ val = p[1] | ((uint32_t)p[2] << 8) |
+ ((uint32_t)p[3] << 16) |
+ ((uint32_t)p[4] << 24);
+ }
val <<= (8 - b);
val |= *p >> b;
nrbits += b;
- } else
- val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) |
- ((uint32_t)p[3] << 24);
+ } else {
+ switch (bytes) {
+ case 1:
+ val = p[0];
+ break;
+ case 2:
+ val = p[0] | ((uint32_t)p[1] << 8);
+ break;
+ case 3:
+ val = p[0] | ((uint32_t)p[1] << 8) |
+ ((uint32_t)p[2] << 16);
+ break;
+ case 4:
+ val = p[0] | ((uint32_t)p[1] << 8) |
+ ((uint32_t)p[2] << 16) |
+ ((uint32_t)p[3] << 24);
+ break;
+ }
+ }
val <<= k;
val >>= k;
b = nrbits & 7;
- p += nrbits / 8;
+ p += nrbits >> 3;
*addr = p;
*pos = b;
ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5f0b83e20af..eed5a0025d6 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -177,8 +177,6 @@ static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
return 0;
}
}
- dbg_err("last LEB %d", *lnum);
- dump_stack();
return -ENOSPC;
}
@@ -193,6 +191,9 @@ static int layout_cnodes(struct ubifs_info *c)
int lnum, offs, len, alen, done_lsave, done_ltab, err;
struct ubifs_cnode *cnode;
+ err = dbg_chk_lpt_sz(c, 0, 0);
+ if (err)
+ return err;
cnode = c->lpt_cnext;
if (!cnode)
return 0;
@@ -206,6 +207,7 @@ static int layout_cnodes(struct ubifs_info *c)
c->lsave_lnum = lnum;
c->lsave_offs = offs;
offs += c->lsave_sz;
+ dbg_chk_lpt_sz(c, 1, c->lsave_sz);
}
if (offs + c->ltab_sz <= c->leb_size) {
@@ -213,6 +215,7 @@ static int layout_cnodes(struct ubifs_info *c)
c->ltab_lnum = lnum;
c->ltab_offs = offs;
offs += c->ltab_sz;
+ dbg_chk_lpt_sz(c, 1, c->ltab_sz);
}
do {
@@ -226,9 +229,10 @@ static int layout_cnodes(struct ubifs_info *c)
while (offs + len > c->leb_size) {
alen = ALIGN(offs, c->min_io_size);
upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ dbg_chk_lpt_sz(c, 2, alen - offs);
err = alloc_lpt_leb(c, &lnum);
if (err)
- return err;
+ goto no_space;
offs = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
@@ -238,6 +242,7 @@ static int layout_cnodes(struct ubifs_info *c)
c->lsave_lnum = lnum;
c->lsave_offs = offs;
offs += c->lsave_sz;
+ dbg_chk_lpt_sz(c, 1, c->lsave_sz);
continue;
}
if (!done_ltab) {
@@ -245,6 +250,7 @@ static int layout_cnodes(struct ubifs_info *c)
c->ltab_lnum = lnum;
c->ltab_offs = offs;
offs += c->ltab_sz;
+ dbg_chk_lpt_sz(c, 1, c->ltab_sz);
continue;
}
break;
@@ -257,6 +263,7 @@ static int layout_cnodes(struct ubifs_info *c)
c->lpt_offs = offs;
}
offs += len;
+ dbg_chk_lpt_sz(c, 1, len);
cnode = cnode->cnext;
} while (cnode && cnode != c->lpt_cnext);
@@ -265,9 +272,10 @@ static int layout_cnodes(struct ubifs_info *c)
if (offs + c->lsave_sz > c->leb_size) {
alen = ALIGN(offs, c->min_io_size);
upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ dbg_chk_lpt_sz(c, 2, alen - offs);
err = alloc_lpt_leb(c, &lnum);
if (err)
- return err;
+ goto no_space;
offs = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
@@ -276,6 +284,7 @@ static int layout_cnodes(struct ubifs_info *c)
c->lsave_lnum = lnum;
c->lsave_offs = offs;
offs += c->lsave_sz;
+ dbg_chk_lpt_sz(c, 1, c->lsave_sz);
}
/* Make sure to place LPT's own lprops table */
@@ -283,9 +292,10 @@ static int layout_cnodes(struct ubifs_info *c)
if (offs + c->ltab_sz > c->leb_size) {
alen = ALIGN(offs, c->min_io_size);
upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ dbg_chk_lpt_sz(c, 2, alen - offs);
err = alloc_lpt_leb(c, &lnum);
if (err)
- return err;
+ goto no_space;
offs = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
@@ -294,11 +304,23 @@ static int layout_cnodes(struct ubifs_info *c)
c->ltab_lnum = lnum;
c->ltab_offs = offs;
offs += c->ltab_sz;
+ dbg_chk_lpt_sz(c, 1, c->ltab_sz);
}
alen = ALIGN(offs, c->min_io_size);
upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ dbg_chk_lpt_sz(c, 4, alen - offs);
+ err = dbg_chk_lpt_sz(c, 3, alen);
+ if (err)
+ return err;
return 0;
+
+no_space:
+ ubifs_err("LPT out of space");
+ dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
+ "done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+ dbg_dump_lpt_info(c);
+ return err;
}
/**
@@ -333,8 +355,6 @@ static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
*lnum = i + c->lpt_first;
return 0;
}
- dbg_err("last LEB %d", *lnum);
- dump_stack();
return -ENOSPC;
}
@@ -369,12 +389,14 @@ static int write_cnodes(struct ubifs_info *c)
done_lsave = 1;
ubifs_pack_lsave(c, buf + offs, c->lsave);
offs += c->lsave_sz;
+ dbg_chk_lpt_sz(c, 1, c->lsave_sz);
}
if (offs + c->ltab_sz <= c->leb_size) {
done_ltab = 1;
ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
offs += c->ltab_sz;
+ dbg_chk_lpt_sz(c, 1, c->ltab_sz);
}
/* Loop for each cnode */
@@ -392,10 +414,12 @@ static int write_cnodes(struct ubifs_info *c)
alen, UBI_SHORTTERM);
if (err)
return err;
+ dbg_chk_lpt_sz(c, 4, alen - wlen);
}
+ dbg_chk_lpt_sz(c, 2, 0);
err = realloc_lpt_leb(c, &lnum);
if (err)
- return err;
+ goto no_space;
offs = 0;
from = 0;
ubifs_assert(lnum >= c->lpt_first &&
@@ -408,12 +432,14 @@ static int write_cnodes(struct ubifs_info *c)
done_lsave = 1;
ubifs_pack_lsave(c, buf + offs, c->lsave);
offs += c->lsave_sz;
+ dbg_chk_lpt_sz(c, 1, c->lsave_sz);
continue;
}
if (!done_ltab) {
done_ltab = 1;
ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
offs += c->ltab_sz;
+ dbg_chk_lpt_sz(c, 1, c->ltab_sz);
continue;
}
break;
@@ -435,6 +461,7 @@ static int write_cnodes(struct ubifs_info *c)
clear_bit(COW_ZNODE, &cnode->flags);
smp_mb__after_clear_bit();
offs += len;
+ dbg_chk_lpt_sz(c, 1, len);
cnode = cnode->cnext;
} while (cnode && cnode != c->lpt_cnext);
@@ -448,9 +475,10 @@ static int write_cnodes(struct ubifs_info *c)
UBI_SHORTTERM);
if (err)
return err;
+ dbg_chk_lpt_sz(c, 2, alen - wlen);
err = realloc_lpt_leb(c, &lnum);
if (err)
- return err;
+ goto no_space;
offs = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
@@ -461,6 +489,7 @@ static int write_cnodes(struct ubifs_info *c)
done_lsave = 1;
ubifs_pack_lsave(c, buf + offs, c->lsave);
offs += c->lsave_sz;
+ dbg_chk_lpt_sz(c, 1, c->lsave_sz);
}
/* Make sure to place LPT's own lprops table */
@@ -473,9 +502,10 @@ static int write_cnodes(struct ubifs_info *c)
UBI_SHORTTERM);
if (err)
return err;
+ dbg_chk_lpt_sz(c, 2, alen - wlen);
err = realloc_lpt_leb(c, &lnum);
if (err)
- return err;
+ goto no_space;
offs = 0;
ubifs_assert(lnum >= c->lpt_first &&
lnum <= c->lpt_last);
@@ -486,6 +516,7 @@ static int write_cnodes(struct ubifs_info *c)
done_ltab = 1;
ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
offs += c->ltab_sz;
+ dbg_chk_lpt_sz(c, 1, c->ltab_sz);
}
/* Write remaining data in buffer */
@@ -495,6 +526,12 @@ static int write_cnodes(struct ubifs_info *c)
err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
if (err)
return err;
+
+ dbg_chk_lpt_sz(c, 4, alen - wlen);
+ err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size));
+ if (err)
+ return err;
+
c->nhead_lnum = lnum;
c->nhead_offs = ALIGN(offs, c->min_io_size);
@@ -503,7 +540,15 @@ static int write_cnodes(struct ubifs_info *c)
dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
if (c->big_lpt)
dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
return 0;
+
+no_space:
+ ubifs_err("LPT out of space mismatch");
+ dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
+ "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+ dbg_dump_lpt_info(c);
+ return err;
}
/**
@@ -1044,6 +1089,8 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
int pos = 0, node_type, node_len;
uint16_t crc, calc_crc;
+ if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8)
+ return 0;
node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
if (node_type == UBIFS_LPT_NOT_A_NODE)
return 0;
@@ -1156,6 +1203,9 @@ int ubifs_lpt_start_commit(struct ubifs_info *c)
dbg_lp("");
mutex_lock(&c->lp_mutex);
+ err = dbg_chk_lpt_free_spc(c);
+ if (err)
+ goto out;
err = dbg_check_ltab(c);
if (err)
goto out;
@@ -1645,4 +1695,121 @@ int dbg_check_ltab(struct ubifs_info *c)
return 0;
}
+/**
+ * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_chk_lpt_free_spc(struct ubifs_info *c)
+{
+ long long free = 0;
+ int i;
+
+ for (i = 0; i < c->lpt_lebs; i++) {
+ if (c->ltab[i].tgc || c->ltab[i].cmt)
+ continue;
+ if (i + c->lpt_first == c->nhead_lnum)
+ free += c->leb_size - c->nhead_offs;
+ else if (c->ltab[i].free == c->leb_size)
+ free += c->leb_size;
+ }
+ if (free < c->lpt_sz) {
+ dbg_err("LPT space error: free %lld lpt_sz %lld",
+ free, c->lpt_sz);
+ dbg_dump_lpt_info(c);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
+ * @c: the UBIFS file-system description object
+ * @action: action
+ * @len: length written
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
+{
+ long long chk_lpt_sz, lpt_sz;
+ int err = 0;
+
+ switch (action) {
+ case 0:
+ c->chk_lpt_sz = 0;
+ c->chk_lpt_sz2 = 0;
+ c->chk_lpt_lebs = 0;
+ c->chk_lpt_wastage = 0;
+ if (c->dirty_pn_cnt > c->pnode_cnt) {
+ dbg_err("dirty pnodes %d exceed max %d",
+ c->dirty_pn_cnt, c->pnode_cnt);
+ err = -EINVAL;
+ }
+ if (c->dirty_nn_cnt > c->nnode_cnt) {
+ dbg_err("dirty nnodes %d exceed max %d",
+ c->dirty_nn_cnt, c->nnode_cnt);
+ err = -EINVAL;
+ }
+ return err;
+ case 1:
+ c->chk_lpt_sz += len;
+ return 0;
+ case 2:
+ c->chk_lpt_sz += len;
+ c->chk_lpt_wastage += len;
+ c->chk_lpt_lebs += 1;
+ return 0;
+ case 3:
+ chk_lpt_sz = c->leb_size;
+ chk_lpt_sz *= c->chk_lpt_lebs;
+ chk_lpt_sz += len - c->nhead_offs;
+ if (c->chk_lpt_sz != chk_lpt_sz) {
+ dbg_err("LPT wrote %lld but space used was %lld",
+ c->chk_lpt_sz, chk_lpt_sz);
+ err = -EINVAL;
+ }
+ if (c->chk_lpt_sz > c->lpt_sz) {
+ dbg_err("LPT wrote %lld but lpt_sz is %lld",
+ c->chk_lpt_sz, c->lpt_sz);
+ err = -EINVAL;
+ }
+ if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) {
+ dbg_err("LPT layout size %lld but wrote %lld",
+ c->chk_lpt_sz, c->chk_lpt_sz2);
+ err = -EINVAL;
+ }
+ if (c->chk_lpt_sz2 && c->new_nhead_offs != len) {
+ dbg_err("LPT new nhead offs: expected %d was %d",
+ c->new_nhead_offs, len);
+ err = -EINVAL;
+ }
+ lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+ lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+ lpt_sz += c->ltab_sz;
+ if (c->big_lpt)
+ lpt_sz += c->lsave_sz;
+ if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) {
+ dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
+ c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz);
+ err = -EINVAL;
+ }
+ if (err)
+ dbg_dump_lpt_info(c);
+ c->chk_lpt_sz2 = c->chk_lpt_sz;
+ c->chk_lpt_sz = 0;
+ c->chk_lpt_wastage = 0;
+ c->chk_lpt_lebs = 0;
+ c->new_nhead_offs = len;
+ return err;
+ case 4:
+ c->chk_lpt_sz += len;
+ c->chk_lpt_wastage += len;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index 4c12a9215d7..4fa81d867e4 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -310,4 +310,31 @@ static inline int ubifs_tnc_lookup(struct ubifs_info *c,
return ubifs_tnc_locate(c, key, node, NULL, NULL);
}
+/**
+ * ubifs_get_lprops - get reference to LEB properties.
+ * @c: the UBIFS file-system description object
+ *
+ * This function locks lprops. Lprops have to be unlocked by
+ * 'ubifs_release_lprops()'.
+ */
+static inline void ubifs_get_lprops(struct ubifs_info *c)
+{
+ mutex_lock(&c->lp_mutex);
+}
+
+/**
+ * ubifs_release_lprops - release lprops lock.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called after each 'ubifs_get_lprops()' call to
+ * unlock lprops.
+ */
+static inline void ubifs_release_lprops(struct ubifs_info *c)
+{
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+ ubifs_assert(c->lst.empty_lebs >= 0 &&
+ c->lst.empty_lebs <= c->main_lebs);
+ mutex_unlock(&c->lp_mutex);
+}
+
#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index acf5c5fffc6..0ed82479b44 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -87,7 +87,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
dbg_scan("scanning %s", dbg_ntype(ch->node_type));
- if (ubifs_check_node(c, buf, lnum, offs, quiet))
+ if (ubifs_check_node(c, buf, lnum, offs, quiet, 1))
return SCANNED_A_CORRUPT_NODE;
if (ch->node_type == UBIFS_PAD_NODE) {
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9a9220333b3..8780efbf40a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -401,6 +401,16 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
else if (c->mount_opts.unmount_mode == 1)
seq_printf(s, ",norm_unmount");
+ if (c->mount_opts.bulk_read == 2)
+ seq_printf(s, ",bulk_read");
+ else if (c->mount_opts.bulk_read == 1)
+ seq_printf(s, ",no_bulk_read");
+
+ if (c->mount_opts.chk_data_crc == 2)
+ seq_printf(s, ",chk_data_crc");
+ else if (c->mount_opts.chk_data_crc == 1)
+ seq_printf(s, ",no_chk_data_crc");
+
return 0;
}
@@ -408,13 +418,26 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
{
struct ubifs_info *c = sb->s_fs_info;
int i, ret = 0, err;
+ long long bud_bytes;
- if (c->jheads)
+ if (c->jheads) {
for (i = 0; i < c->jhead_cnt; i++) {
err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
if (err && !ret)
ret = err;
}
+
+ /* Commit the journal unless it has too little data */
+ spin_lock(&c->buds_lock);
+ bud_bytes = c->bud_bytes;
+ spin_unlock(&c->buds_lock);
+ if (bud_bytes > c->leb_size) {
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ }
+ }
+
/*
* We ought to call sync for c->ubi but it does not have one. If it had
* it would in turn call mtd->sync, however mtd operations are
@@ -538,6 +561,18 @@ static int init_constants_early(struct ubifs_info *c)
* calculations when reporting free space.
*/
c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
+ /* Buffer size for bulk-reads */
+ c->bulk_read_buf_size = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ;
+ if (c->bulk_read_buf_size > c->leb_size)
+ c->bulk_read_buf_size = c->leb_size;
+ if (c->bulk_read_buf_size > 128 * 1024) {
+ /* Check if we can kmalloc more than 128KiB */
+ void *try = kmalloc(c->bulk_read_buf_size, GFP_KERNEL);
+
+ kfree(try);
+ if (!try)
+ c->bulk_read_buf_size = 128 * 1024;
+ }
return 0;
}
@@ -840,17 +875,29 @@ static int check_volume_empty(struct ubifs_info *c)
*
* Opt_fast_unmount: do not run a journal commit before un-mounting
* Opt_norm_unmount: run a journal commit before un-mounting
+ * Opt_bulk_read: enable bulk-reads
+ * Opt_no_bulk_read: disable bulk-reads
+ * Opt_chk_data_crc: check CRCs when reading data nodes
+ * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
* Opt_err: just end of array marker
*/
enum {
Opt_fast_unmount,
Opt_norm_unmount,
+ Opt_bulk_read,
+ Opt_no_bulk_read,
+ Opt_chk_data_crc,
+ Opt_no_chk_data_crc,
Opt_err,
};
static const match_table_t tokens = {
{Opt_fast_unmount, "fast_unmount"},
{Opt_norm_unmount, "norm_unmount"},
+ {Opt_bulk_read, "bulk_read"},
+ {Opt_no_bulk_read, "no_bulk_read"},
+ {Opt_chk_data_crc, "chk_data_crc"},
+ {Opt_no_chk_data_crc, "no_chk_data_crc"},
{Opt_err, NULL},
};
@@ -888,6 +935,22 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
c->mount_opts.unmount_mode = 1;
c->fast_unmount = 0;
break;
+ case Opt_bulk_read:
+ c->mount_opts.bulk_read = 2;
+ c->bulk_read = 1;
+ break;
+ case Opt_no_bulk_read:
+ c->mount_opts.bulk_read = 1;
+ c->bulk_read = 0;
+ break;
+ case Opt_chk_data_crc:
+ c->mount_opts.chk_data_crc = 2;
+ c->no_chk_data_crc = 0;
+ break;
+ case Opt_no_chk_data_crc:
+ c->mount_opts.chk_data_crc = 1;
+ c->no_chk_data_crc = 1;
+ break;
default:
ubifs_err("unrecognized mount option \"%s\" "
"or missing value", p);
@@ -996,6 +1059,8 @@ static int mount_ubifs(struct ubifs_info *c)
goto out_free;
}
+ c->always_chk_crc = 1;
+
err = ubifs_read_superblock(c);
if (err)
goto out_free;
@@ -1032,8 +1097,6 @@ static int mount_ubifs(struct ubifs_info *c)
/* Create background thread */
c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
- if (!c->bgt)
- c->bgt = ERR_PTR(-EINVAL);
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
@@ -1139,24 +1202,28 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_infos;
+ c->always_chk_crc = 0;
+
ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
c->vi.ubi_num, c->vi.vol_id, c->vi.name);
if (mounted_read_only)
ubifs_msg("mounted read-only");
x = (long long)c->main_lebs * c->leb_size;
- ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
- x, x >> 10, x >> 20, c->main_lebs);
+ ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d "
+ "LEBs)", x, x >> 10, x >> 20, c->main_lebs);
x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
- ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
- x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
- ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
- ubifs_msg("media format %d, latest format %d",
+ ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
+ "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
+ ubifs_msg("media format: %d (latest is %d)",
c->fmt_version, UBIFS_FORMAT_VERSION);
+ ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
+ ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
+ c->report_rp_size, c->report_rp_size >> 10);
dbg_msg("compiled on: " __DATE__ " at " __TIME__);
dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
dbg_msg("LEB size: %d bytes (%d KiB)",
- c->leb_size, c->leb_size / 1024);
+ c->leb_size, c->leb_size >> 10);
dbg_msg("data journal heads: %d",
c->jhead_cnt - NONDATA_JHEADS_CNT);
dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X"
@@ -1282,6 +1349,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
mutex_lock(&c->umount_mutex);
c->remounting_rw = 1;
+ c->always_chk_crc = 1;
/* Check for enough free space */
if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
@@ -1345,20 +1413,20 @@ static int ubifs_remount_rw(struct ubifs_info *c)
/* Create background thread */
c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
- if (!c->bgt)
- c->bgt = ERR_PTR(-EINVAL);
if (IS_ERR(c->bgt)) {
err = PTR_ERR(c->bgt);
c->bgt = NULL;
ubifs_err("cannot spawn \"%s\", error %d",
c->bgt_name, err);
- return err;
+ goto out;
}
wake_up_process(c->bgt);
c->orph_buf = vmalloc(c->leb_size);
- if (!c->orph_buf)
- return -ENOMEM;
+ if (!c->orph_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
/* Check for enough log space */
lnum = c->lhead_lnum + 1;
@@ -1385,6 +1453,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
dbg_gen("re-mounted read-write");
c->vfs_sb->s_flags &= ~MS_RDONLY;
c->remounting_rw = 0;
+ c->always_chk_crc = 0;
mutex_unlock(&c->umount_mutex);
return 0;
@@ -1400,6 +1469,7 @@ out:
c->ileb_buf = NULL;
ubifs_lpt_free(c, 1);
c->remounting_rw = 0;
+ c->always_chk_crc = 0;
mutex_unlock(&c->umount_mutex);
return err;
}
@@ -1408,12 +1478,9 @@ out:
* commit_on_unmount - commit the journal when un-mounting.
* @c: UBIFS file-system description object
*
- * This function is called during un-mounting and it commits the journal unless
- * the "fast unmount" mode is enabled. It also avoids committing the journal if
- * it contains too few data.
- *
- * Sometimes recovery requires the journal to be committed at least once, and
- * this function takes care about this.
+ * This function is called during un-mounting and re-mounting, and it commits
+ * the journal unless the "fast unmount" mode is enabled. It also avoids
+ * committing the journal if it contains too few data.
*/
static void commit_on_unmount(struct ubifs_info *c)
{
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 7634c597088..d27fd918b9c 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -284,7 +284,7 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
}
zn = copy_znode(c, znode);
- if (unlikely(IS_ERR(zn)))
+ if (IS_ERR(zn))
return zn;
if (zbr->len) {
@@ -470,6 +470,10 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
if (node_len != len)
return 0;
+ if (type == UBIFS_DATA_NODE && !c->always_chk_crc)
+ if (c->no_chk_data_crc)
+ return 0;
+
crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
node_crc = le32_to_cpu(ch->crc);
if (crc != node_crc)
@@ -1128,7 +1132,7 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
ubifs_assert(znode == c->zroot.znode);
znode = dirty_cow_znode(c, &c->zroot);
}
- if (unlikely(IS_ERR(znode)) || !p)
+ if (IS_ERR(znode) || !p)
break;
ubifs_assert(path[p - 1] >= 0);
ubifs_assert(path[p - 1] < znode->child_cnt);
@@ -1492,6 +1496,289 @@ out:
}
/**
+ * ubifs_tnc_get_bu_keys - lookup keys for bulk-read.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read parameters and results
+ *
+ * Lookup consecutive data node keys for the same inode that reside
+ * consecutively in the same LEB.
+ */
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
+{
+ int n, err = 0, lnum = -1, uninitialized_var(offs);
+ int uninitialized_var(len);
+ unsigned int block = key_block(c, &bu->key);
+ struct ubifs_znode *znode;
+
+ bu->cnt = 0;
+ bu->blk_cnt = 0;
+ bu->eof = 0;
+
+ mutex_lock(&c->tnc_mutex);
+ /* Find first key */
+ err = ubifs_lookup_level0(c, &bu->key, &znode, &n);
+ if (err < 0)
+ goto out;
+ if (err) {
+ /* Key found */
+ len = znode->zbranch[n].len;
+ /* The buffer must be big enough for at least 1 node */
+ if (len > bu->buf_len) {
+ err = -EINVAL;
+ goto out;
+ }
+ /* Add this key */
+ bu->zbranch[bu->cnt++] = znode->zbranch[n];
+ bu->blk_cnt += 1;
+ lnum = znode->zbranch[n].lnum;
+ offs = ALIGN(znode->zbranch[n].offs + len, 8);
+ }
+ while (1) {
+ struct ubifs_zbranch *zbr;
+ union ubifs_key *key;
+ unsigned int next_block;
+
+ /* Find next key */
+ err = tnc_next(c, &znode, &n);
+ if (err)
+ goto out;
+ zbr = &znode->zbranch[n];
+ key = &zbr->key;
+ /* See if there is another data key for this file */
+ if (key_inum(c, key) != key_inum(c, &bu->key) ||
+ key_type(c, key) != UBIFS_DATA_KEY) {
+ err = -ENOENT;
+ goto out;
+ }
+ if (lnum < 0) {
+ /* First key found */
+ lnum = zbr->lnum;
+ offs = ALIGN(zbr->offs + zbr->len, 8);
+ len = zbr->len;
+ if (len > bu->buf_len) {
+ err = -EINVAL;
+ goto out;
+ }
+ } else {
+ /*
+ * The data nodes must be in consecutive positions in
+ * the same LEB.
+ */
+ if (zbr->lnum != lnum || zbr->offs != offs)
+ goto out;
+ offs += ALIGN(zbr->len, 8);
+ len = ALIGN(len, 8) + zbr->len;
+ /* Must not exceed buffer length */
+ if (len > bu->buf_len)
+ goto out;
+ }
+ /* Allow for holes */
+ next_block = key_block(c, key);
+ bu->blk_cnt += (next_block - block - 1);
+ if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+ goto out;
+ block = next_block;
+ /* Add this key */
+ bu->zbranch[bu->cnt++] = *zbr;
+ bu->blk_cnt += 1;
+ /* See if we have room for more */
+ if (bu->cnt >= UBIFS_MAX_BULK_READ)
+ goto out;
+ if (bu->blk_cnt >= UBIFS_MAX_BULK_READ)
+ goto out;
+ }
+out:
+ if (err == -ENOENT) {
+ bu->eof = 1;
+ err = 0;
+ }
+ bu->gc_seq = c->gc_seq;
+ mutex_unlock(&c->tnc_mutex);
+ if (err)
+ return err;
+ /*
+ * An enormous hole could cause bulk-read to encompass too many
+ * page cache pages, so limit the number here.
+ */
+ if (bu->blk_cnt > UBIFS_MAX_BULK_READ)
+ bu->blk_cnt = UBIFS_MAX_BULK_READ;
+ /*
+ * Ensure that bulk-read covers a whole number of page cache
+ * pages.
+ */
+ if (UBIFS_BLOCKS_PER_PAGE == 1 ||
+ !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1)))
+ return 0;
+ if (bu->eof) {
+ /* At the end of file we can round up */
+ bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1;
+ return 0;
+ }
+ /* Exclude data nodes that do not make up a whole page cache page */
+ block = key_block(c, &bu->key) + bu->blk_cnt;
+ block &= ~(UBIFS_BLOCKS_PER_PAGE - 1);
+ while (bu->cnt) {
+ if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block)
+ break;
+ bu->cnt -= 1;
+ }
+ return 0;
+}
+
+/**
+ * read_wbuf - bulk-read from a LEB with a wbuf.
+ * @wbuf: wbuf that may overlap the read
+ * @buf: buffer into which to read
+ * @len: read length
+ * @lnum: LEB number from which to read
+ * @offs: offset from which to read
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum,
+ int offs)
+{
+ const struct ubifs_info *c = wbuf->c;
+ int rlen, overlap;
+
+ dbg_io("LEB %d:%d, length %d", lnum, offs, len);
+ ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+ ubifs_assert(!(offs & 7) && offs < c->leb_size);
+ ubifs_assert(offs + len <= c->leb_size);
+
+ spin_lock(&wbuf->lock);
+ overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
+ if (!overlap) {
+ /* We may safely unlock the write-buffer and read the data */
+ spin_unlock(&wbuf->lock);
+ return ubi_read(c->ubi, lnum, buf, offs, len);
+ }
+
+ /* Don't read under wbuf */
+ rlen = wbuf->offs - offs;
+ if (rlen < 0)
+ rlen = 0;
+
+ /* Copy the rest from the write-buffer */
+ memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
+ spin_unlock(&wbuf->lock);
+
+ if (rlen > 0)
+ /* Read everything that goes before write-buffer */
+ return ubi_read(c->ubi, lnum, buf, offs, rlen);
+
+ return 0;
+}
+
+/**
+ * validate_data_node - validate data nodes for bulk-read.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing data node to validate
+ * @zbr: zbranch of data node to validate
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+static int validate_data_node(struct ubifs_info *c, void *buf,
+ struct ubifs_zbranch *zbr)
+{
+ union ubifs_key key1;
+ struct ubifs_ch *ch = buf;
+ int err, len;
+
+ if (ch->node_type != UBIFS_DATA_NODE) {
+ ubifs_err("bad node type (%d but expected %d)",
+ ch->node_type, UBIFS_DATA_NODE);
+ goto out_err;
+ }
+
+ err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0);
+ if (err) {
+ ubifs_err("expected node type %d", UBIFS_DATA_NODE);
+ goto out;
+ }
+
+ len = le32_to_cpu(ch->len);
+ if (len != zbr->len) {
+ ubifs_err("bad node length %d, expected %d", len, zbr->len);
+ goto out_err;
+ }
+
+ /* Make sure the key of the read node is correct */
+ key_read(c, buf + UBIFS_KEY_OFFSET, &key1);
+ if (!keys_eq(c, &zbr->key, &key1)) {
+ ubifs_err("bad key in node at LEB %d:%d",
+ zbr->lnum, zbr->offs);
+ dbg_tnc("looked for key %s found node's key %s",
+ DBGKEY(&zbr->key), DBGKEY1(&key1));
+ goto out_err;
+ }
+
+ return 0;
+
+out_err:
+ err = -EINVAL;
+out:
+ ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ return err;
+}
+
+/**
+ * ubifs_tnc_bulk_read - read a number of data nodes in one go.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read parameters and results
+ *
+ * This functions reads and validates the data nodes that were identified by the
+ * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success,
+ * -EAGAIN to indicate a race with GC, or another negative error code on
+ * failure.
+ */
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
+{
+ int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i;
+ struct ubifs_wbuf *wbuf;
+ void *buf;
+
+ len = bu->zbranch[bu->cnt - 1].offs;
+ len += bu->zbranch[bu->cnt - 1].len - offs;
+ if (len > bu->buf_len) {
+ ubifs_err("buffer too small %d vs %d", bu->buf_len, len);
+ return -EINVAL;
+ }
+
+ /* Do the read */
+ wbuf = ubifs_get_wbuf(c, lnum);
+ if (wbuf)
+ err = read_wbuf(wbuf, bu->buf, len, lnum, offs);
+ else
+ err = ubi_read(c->ubi, lnum, bu->buf, offs, len);
+
+ /* Check for a race with GC */
+ if (maybe_leb_gced(c, lnum, bu->gc_seq))
+ return -EAGAIN;
+
+ if (err && err != -EBADMSG) {
+ ubifs_err("failed to read from LEB %d:%d, error %d",
+ lnum, offs, err);
+ dbg_dump_stack();
+ dbg_tnc("key %s", DBGKEY(&bu->key));
+ return err;
+ }
+
+ /* Validate the nodes read */
+ buf = bu->buf;
+ for (i = 0; i < bu->cnt; i++) {
+ err = validate_data_node(c, buf, &bu->zbranch[i]);
+ if (err)
+ return err;
+ buf = buf + ALIGN(bu->zbranch[i].len, 8);
+ }
+
+ return 0;
+}
+
+/**
* do_lookup_nm- look up a "hashed" node.
* @c: UBIFS file-system description object
* @key: node key to lookup
@@ -1675,7 +1962,7 @@ static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
{
struct ubifs_znode *zn, *zi, *zp;
int i, keep, move, appending = 0;
- union ubifs_key *key = &zbr->key;
+ union ubifs_key *key = &zbr->key, *key1;
ubifs_assert(n >= 0 && n <= c->fanout);
@@ -1716,20 +2003,33 @@ again:
zn->level = znode->level;
/* Decide where to split */
- if (znode->level == 0 && n == c->fanout &&
- key_type(c, key) == UBIFS_DATA_KEY) {
- union ubifs_key *key1;
-
- /*
- * If this is an inode which is being appended - do not split
- * it because no other zbranches can be inserted between
- * zbranches of consecutive data nodes anyway.
- */
- key1 = &znode->zbranch[n - 1].key;
- if (key_inum(c, key1) == key_inum(c, key) &&
- key_type(c, key1) == UBIFS_DATA_KEY &&
- key_block(c, key1) == key_block(c, key) - 1)
- appending = 1;
+ if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) {
+ /* Try not to split consecutive data keys */
+ if (n == c->fanout) {
+ key1 = &znode->zbranch[n - 1].key;
+ if (key_inum(c, key1) == key_inum(c, key) &&
+ key_type(c, key1) == UBIFS_DATA_KEY)
+ appending = 1;
+ } else
+ goto check_split;
+ } else if (appending && n != c->fanout) {
+ /* Try not to split consecutive data keys */
+ appending = 0;
+check_split:
+ if (n >= (c->fanout + 1) / 2) {
+ key1 = &znode->zbranch[0].key;
+ if (key_inum(c, key1) == key_inum(c, key) &&
+ key_type(c, key1) == UBIFS_DATA_KEY) {
+ key1 = &znode->zbranch[n].key;
+ if (key_inum(c, key1) != key_inum(c, key) ||
+ key_type(c, key1) != UBIFS_DATA_KEY) {
+ keep = n;
+ move = c->fanout - keep;
+ zi = znode;
+ goto do_split;
+ }
+ }
+ }
}
if (appending) {
@@ -1759,6 +2059,8 @@ again:
zbr->znode->parent = zn;
}
+do_split:
+
__set_bit(DIRTY_ZNODE, &zn->flags);
atomic_long_inc(&c->dirty_zn_cnt);
@@ -1785,14 +2087,11 @@ again:
/* Insert new znode (produced by spitting) into the parent */
if (zp) {
- i = n;
+ if (n == 0 && zi == znode && znode->iip == 0)
+ correct_parent_keys(c, znode);
+
/* Locate insertion point */
n = znode->iip + 1;
- if (appending && n != c->fanout)
- appending = 0;
-
- if (i == 0 && zi == znode && znode->iip == 0)
- correct_parent_keys(c, znode);
/* Tail recursion */
zbr->key = zn->zbranch[0].key;
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index a25c1cc1f8d..b48db999903 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -480,8 +480,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
}
/* Make sure the key of the read node is correct */
- key_read(c, key, &key1);
- if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) {
+ key_read(c, node + UBIFS_KEY_OFFSET, &key1);
+ if (!keys_eq(c, key, &key1)) {
ubifs_err("bad key in node at LEB %d:%d",
zbr->lnum, zbr->offs);
dbg_tnc("looked for key %s found node's key %s",
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index a9ecbd9af20..0b378042a3a 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -75,7 +75,6 @@
*/
#define UBIFS_BLOCK_SIZE 4096
#define UBIFS_BLOCK_SHIFT 12
-#define UBIFS_BLOCK_MASK 0x00000FFF
/* UBIFS padding byte pattern (must not be first or last byte of node magic) */
#define UBIFS_PADDING_BYTE 0xCE
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 17c620b93ee..a7bd32fa15b 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -142,6 +142,9 @@
/* Maximum expected tree height for use by bottom_up_buf */
#define BOTTOM_UP_HEIGHT 64
+/* Maximum number of data nodes to bulk-read */
+#define UBIFS_MAX_BULK_READ 32
+
/*
* Lockdep classes for UBIFS inode @ui_mutex.
*/
@@ -328,9 +331,10 @@ struct ubifs_gced_idx_leb {
* this inode
* @dirty: non-zero if the inode is dirty
* @xattr: non-zero if this is an extended attribute inode
+ * @bulk_read: non-zero if bulk-read should be used
* @ui_mutex: serializes inode write-back with the rest of VFS operations,
- * serializes "clean <-> dirty" state changes, protects @dirty,
- * @ui_size, and @xattr_size
+ * serializes "clean <-> dirty" state changes, serializes bulk-read,
+ * protects @dirty, @bulk_read, @ui_size, and @xattr_size
* @ui_lock: protects @synced_i_size
* @synced_i_size: synchronized size of inode, i.e. the value of inode size
* currently stored on the flash; used only for regular file
@@ -338,6 +342,8 @@ struct ubifs_gced_idx_leb {
* @ui_size: inode size used by UBIFS when writing to flash
* @flags: inode flags (@UBIFS_COMPR_FL, etc)
* @compr_type: default compression type used for this inode
+ * @last_page_read: page number of last page read (for bulk read)
+ * @read_in_a_row: number of consecutive pages read in a row (for bulk read)
* @data_len: length of the data attached to the inode
* @data: inode's data
*
@@ -379,12 +385,15 @@ struct ubifs_inode {
unsigned int xattr_names;
unsigned int dirty:1;
unsigned int xattr:1;
+ unsigned int bulk_read:1;
struct mutex ui_mutex;
spinlock_t ui_lock;
loff_t synced_i_size;
loff_t ui_size;
int flags;
int compr_type;
+ pgoff_t last_page_read;
+ pgoff_t read_in_a_row;
int data_len;
void *data;
};
@@ -698,8 +707,8 @@ struct ubifs_jhead {
* struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
* @key: key
* @znode: znode address in memory
- * @lnum: LEB number of the indexing node
- * @offs: offset of the indexing node within @lnum
+ * @lnum: LEB number of the target node (indexing node or data node)
+ * @offs: target node offset within @lnum
* @len: target node length
*/
struct ubifs_zbranch {
@@ -744,6 +753,28 @@ struct ubifs_znode {
};
/**
+ * struct bu_info - bulk-read information
+ * @key: first data node key
+ * @zbranch: zbranches of data nodes to bulk read
+ * @buf: buffer to read into
+ * @buf_len: buffer length
+ * @gc_seq: GC sequence number to detect races with GC
+ * @cnt: number of data nodes for bulk read
+ * @blk_cnt: number of data blocks including holes
+ * @oef: end of file reached
+ */
+struct bu_info {
+ union ubifs_key key;
+ struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ];
+ void *buf;
+ int buf_len;
+ int gc_seq;
+ int cnt;
+ int blk_cnt;
+ int eof;
+};
+
+/**
* struct ubifs_node_range - node length range description data structure.
* @len: fixed node length
* @min_len: minimum possible node length
@@ -862,9 +893,13 @@ struct ubifs_orphan {
/**
* struct ubifs_mount_opts - UBIFS-specific mount options information.
* @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
+ * @bulk_read: enable bulk-reads
+ * @chk_data_crc: check CRCs when reading data nodes
*/
struct ubifs_mount_opts {
unsigned int unmount_mode:2;
+ unsigned int bulk_read:2;
+ unsigned int chk_data_crc:2;
};
/**
@@ -905,13 +940,12 @@ struct ubifs_mount_opts {
* @cmt_state: commit state
* @cs_lock: commit state lock
* @cmt_wq: wait queue to sleep on if the log is full and a commit is running
+ *
* @fast_unmount: do not run journal commit before un-mounting
* @big_lpt: flag that LPT is too big to write whole during commit
- * @check_lpt_free: flag that indicates LPT GC may be needed
- * @nospace: non-zero if the file-system does not have flash space (used as
- * optimization)
- * @nospace_rp: the same as @nospace, but additionally means that even reserved
- * pool is full
+ * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
+ * recovery)
+ * @bulk_read: enable bulk-reads
*
* @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
* @calc_idx_sz
@@ -935,6 +969,7 @@ struct ubifs_mount_opts {
* @mst_node: master node
* @mst_offs: offset of valid master node
* @mst_mutex: protects the master node area, @mst_node, and @mst_offs
+ * @bulk_read_buf_size: buffer size for bulk-reads
*
* @log_lebs: number of logical eraseblocks in the log
* @log_bytes: log size in bytes
@@ -977,12 +1012,17 @@ struct ubifs_mount_opts {
* but which still have to be taken into account because
* the index has not been committed so far
* @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
- * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst;
+ * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
+ * @nospace, and @nospace_rp;
* @min_idx_lebs: minimum number of LEBs required for the index
* @old_idx_sz: size of index on flash
* @calc_idx_sz: temporary variable which is used to calculate new index size
* (contains accurate new index size at end of TNC commit start)
* @lst: lprops statistics
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ * optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ * pool is full
*
* @page_budget: budget for a page
* @inode_budget: budget for an inode
@@ -1061,6 +1101,7 @@ struct ubifs_mount_opts {
* @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
* @dirty_nn_cnt: number of dirty nnodes
* @dirty_pn_cnt: number of dirty pnodes
+ * @check_lpt_free: flag that indicates LPT GC may be needed
* @lpt_sz: LPT size
* @lpt_nod_buf: buffer for an on-flash nnode or pnode
* @lpt_buf: buffer of LEB size used by LPT
@@ -1102,6 +1143,7 @@ struct ubifs_mount_opts {
* @rcvrd_mst_node: recovered master node to write when mounting ro to rw
* @size_tree: inode size information for recovery
* @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
+ * @always_chk_crc: always check CRCs (while mounting and remounting rw)
* @mount_opts: UBIFS-specific mount options
*
* @dbg_buf: a buffer of LEB size used for debugging purposes
@@ -1146,11 +1188,11 @@ struct ubifs_info {
int cmt_state;
spinlock_t cs_lock;
wait_queue_head_t cmt_wq;
+
unsigned int fast_unmount:1;
unsigned int big_lpt:1;
- unsigned int check_lpt_free:1;
- unsigned int nospace:1;
- unsigned int nospace_rp:1;
+ unsigned int no_chk_data_crc:1;
+ unsigned int bulk_read:1;
struct mutex tnc_mutex;
struct ubifs_zbranch zroot;
@@ -1175,6 +1217,7 @@ struct ubifs_info {
struct ubifs_mst_node *mst_node;
int mst_offs;
struct mutex mst_mutex;
+ int bulk_read_buf_size;
int log_lebs;
long long log_bytes;
@@ -1218,6 +1261,8 @@ struct ubifs_info {
unsigned long long old_idx_sz;
unsigned long long calc_idx_sz;
struct ubifs_lp_stats lst;
+ unsigned int nospace:1;
+ unsigned int nospace_rp:1;
int page_budget;
int inode_budget;
@@ -1294,6 +1339,7 @@ struct ubifs_info {
int lpt_drty_flgs;
int dirty_nn_cnt;
int dirty_pn_cnt;
+ int check_lpt_free;
long long lpt_sz;
void *lpt_nod_buf;
void *lpt_buf;
@@ -1335,6 +1381,7 @@ struct ubifs_info {
struct ubifs_mst_node *rcvrd_mst_node;
struct rb_root size_tree;
int remounting_rw;
+ int always_chk_crc;
struct ubifs_mount_opts mount_opts;
#ifdef CONFIG_UBIFS_FS_DEBUG
@@ -1347,6 +1394,12 @@ struct ubifs_info {
unsigned long fail_timeout;
unsigned int fail_cnt;
unsigned int fail_cnt_max;
+ long long chk_lpt_sz;
+ long long chk_lpt_sz2;
+ long long chk_lpt_wastage;
+ int chk_lpt_lebs;
+ int new_nhead_lnum;
+ int new_nhead_offs;
#endif
};
@@ -1377,7 +1430,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
int offs, int dtype);
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
- int offs, int quiet);
+ int offs, int quiet, int chk_crc);
void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
int ubifs_io_init(struct ubifs_info *c);
@@ -1490,6 +1543,8 @@ void destroy_old_idx(struct ubifs_info *c);
int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
int lnum, int offs);
int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
+int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu);
+int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu);
/* tnc_misc.c */
struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
@@ -1586,12 +1641,10 @@ int ubifs_lpt_post_commit(struct ubifs_info *c);
void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
/* lprops.c */
-void ubifs_get_lprops(struct ubifs_info *c);
const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
const struct ubifs_lprops *lp,
int free, int dirty, int flags,
int idx_gc_cnt);
-void ubifs_release_lprops(struct ubifs_info *c);
void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
int cat);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 649bec78b64..cfd31e229c8 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -446,7 +446,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
int type;
xent = ubifs_tnc_next_ent(c, &key, &nm);
- if (unlikely(IS_ERR(xent))) {
+ if (IS_ERR(xent)) {
err = PTR_ERR(xent);
break;
}